00001
00013 #include <GKlib.h>
00014
00015
00017
00018 typedef struct {
00019 int minfreq;
00020 int maxfreq;
00021 int minlen;
00022 int maxlen;
00023 int tnitems;
00024
00025
00026 void (*callback)(void *stateptr, int nitems, int *itemids, int ntrans, int *transids);
00027 void *stateptr;
00028
00029
00030 int *rmarker;
00031 gk_ikv_t *cand;
00032 } isparams_t;
00033
00034
00035
00037
00038 void itemsets_find_frequent_itemsets(isparams_t *params, gk_csr_t *mat,
00039 int preflen, int *prefix);
00040 gk_csr_t *itemsets_project_matrix(isparams_t *param, gk_csr_t *mat, int cid);
00041
00042
00043
00044
00046
00047 void gk_find_frequent_itemsets(int ntrans, ssize_t *tranptr, int *tranind,
00048 int minfreq, int maxfreq, int minlen, int maxlen,
00049 void (*process_itemset)(void *stateptr, int nitems, int *itemids,
00050 int ntrans, int *transids),
00051 void *stateptr)
00052 {
00053 ssize_t i;
00054 gk_csr_t *mat, *pmat;
00055 isparams_t params;
00056 int *pattern;
00057
00058
00059 mat = gk_csr_Create();
00060 mat->nrows = ntrans;
00061 mat->ncols = tranind[gk_iargmax(tranptr[ntrans], tranind)]+1;
00062 mat->rowptr = gk_zcopy(ntrans+1, tranptr, gk_zmalloc(ntrans+1, "gk_find_frequent_itemsets: mat.rowptr"));
00063 mat->rowind = gk_icopy(tranptr[ntrans], tranind, gk_imalloc(tranptr[ntrans], "gk_find_frequent_itemsets: mat.rowind"));
00064 mat->colids = gk_iincset(mat->ncols, 0, gk_imalloc(mat->ncols, "gk_find_frequent_itemsets: mat.colids"));
00065
00066
00067 params.minfreq = minfreq;
00068 params.maxfreq = (maxfreq == -1 ? mat->nrows : maxfreq);
00069 params.minlen = minlen;
00070 params.maxlen = (maxlen == -1 ? mat->ncols : maxlen);
00071 params.tnitems = mat->ncols;
00072 params.callback = process_itemset;
00073 params.stateptr = stateptr;
00074 params.rmarker = gk_ismalloc(mat->nrows, 0, "gk_find_frequent_itemsets: rmarker");
00075 params.cand = gk_ikvmalloc(mat->ncols, "gk_find_frequent_itemsets: cand");
00076
00077
00078 gk_csr_CreateIndex(mat, GK_CSR_COL);
00079 pmat = itemsets_project_matrix(¶ms, mat, -1);
00080 gk_csr_Free(&mat);
00081
00082 pattern = gk_imalloc(pmat->ncols, "gk_find_frequent_itemsets: pattern");
00083 itemsets_find_frequent_itemsets(¶ms, pmat, 0, pattern);
00084
00085 gk_csr_Free(&pmat);
00086 gk_free((void **)&pattern, ¶ms.rmarker, ¶ms.cand, LTERM);
00087
00088 }
00089
00090
00091
00092
00094
00095 void itemsets_find_frequent_itemsets(isparams_t *params, gk_csr_t *mat,
00096 int preflen, int *prefix)
00097 {
00098 ssize_t i;
00099 gk_csr_t *cmat;
00100
00101
00102 for (i=0; i<mat->ncols; i++) {
00103 prefix[preflen] = mat->colids[i];
00104
00105 if (preflen+1 >= params->minlen)
00106 (*params->callback)(params->stateptr, preflen+1, prefix,
00107 mat->colptr[i+1]-mat->colptr[i], mat->colind+mat->colptr[i]);
00108
00109 if (preflen+1 < params->maxlen) {
00110 cmat = itemsets_project_matrix(params, mat, i);
00111 itemsets_find_frequent_itemsets(params, cmat, preflen+1, prefix);
00112 gk_csr_Free(&cmat);
00113 }
00114 }
00115
00116 }
00117
00118
00119
00127
00128 gk_csr_t *itemsets_project_matrix(isparams_t *params, gk_csr_t *mat, int cid)
00129 {
00130 ssize_t i, j, k, ii, pnnz;
00131 int nrows, ncols, pnrows, pncols;
00132 ssize_t *colptr, *pcolptr;
00133 int *colind, *colids, *pcolind, *pcolids, *rmarker;
00134 gk_csr_t *pmat;
00135 gk_ikv_t *cand;
00136
00137 nrows = mat->nrows;
00138 ncols = mat->ncols;
00139 colptr = mat->colptr;
00140 colind = mat->colind;
00141 colids = mat->colids;
00142
00143 rmarker = params->rmarker;
00144 cand = params->cand;
00145
00146
00147
00148 pmat = gk_csr_Create();
00149 pmat->nrows = pnrows = (cid == -1 ? nrows : colptr[cid+1]-colptr[cid]);
00150
00151
00152
00153 if (cid == -1) {
00154 gk_iset(nrows, 1, rmarker);
00155 }
00156 else {
00157 for (i=colptr[cid]; i<colptr[cid+1]; i++)
00158 rmarker[colind[i]] = 1;
00159 }
00160
00161
00162
00163 for (pncols=0, pnnz=0, i=cid+1; i<ncols; i++) {
00164 for (k=0, j=colptr[i]; j<colptr[i+1]; j++) {
00165 k += rmarker[colind[j]];
00166 }
00167 if (k >= params->minfreq && k <= params->maxfreq) {
00168 cand[pncols].val = i;
00169 cand[pncols++].key = k;
00170 pnnz += k;
00171 }
00172 }
00173
00174
00175 gk_ikvsorti(pncols, cand);
00176
00177
00178
00179 pmat->ncols = pncols;
00180 pmat->colids = pcolids = gk_imalloc(pncols, "itemsets_project_matrix: pcolids");
00181 pmat->colptr = pcolptr = gk_zmalloc(pncols+1, "itemsets_project_matrix: pcolptr");
00182 pmat->colind = pcolind = gk_imalloc(pnnz, "itemsets_project_matrix: pcolind");
00183
00184
00185
00186 pcolptr[0] = 0;
00187 for (pnnz=0, ii=0; ii<pncols; ii++) {
00188 i = cand[ii].val;
00189 for (j=colptr[i]; j<colptr[i+1]; j++) {
00190 if (rmarker[colind[j]])
00191 pcolind[pnnz++] = colind[j];
00192 }
00193
00194 pcolids[ii] = colids[i];
00195 pcolptr[ii+1] = pnnz;
00196 }
00197
00198
00199
00200 if (cid == -1) {
00201 gk_iset(nrows, 0, rmarker);
00202 }
00203 else {
00204 for (i=colptr[cid]; i<colptr[cid+1]; i++)
00205 rmarker[colind[i]] = 0;
00206 }
00207
00208
00209 return pmat;
00210 }