00001
00010 #include <GKlib.h>
00011
00012 #define OMPMINOPS 50000
00013
00014
00018
00019 gk_csr_t *gk_csr_Create()
00020 {
00021 gk_csr_t *mat;
00022
00023 mat = (gk_csr_t *)gk_malloc(sizeof(gk_csr_t), "gk_csr_Create: mat");
00024
00025 gk_csr_Init(mat);
00026
00027 return mat;
00028 }
00029
00030
00031
00035
00036 void gk_csr_Init(gk_csr_t *mat)
00037 {
00038 memset(mat, 0, sizeof(gk_csr_t));
00039 mat->nrows = mat->ncols = -1;
00040 }
00041
00042
00043
00047
00048 void gk_csr_Free(gk_csr_t **mat)
00049 {
00050 if (*mat == NULL)
00051 return;
00052 gk_csr_FreeContents(*mat);
00053 gk_free((void **)mat, LTERM);
00054 }
00055
00056
00057
00062
00063 void gk_csr_FreeContents(gk_csr_t *mat)
00064 {
00065 gk_free((void *)&mat->rowptr, &mat->rowind, &mat->rowval, &mat->rowids,
00066 &mat->colptr, &mat->colind, &mat->colval, &mat->colids,
00067 &mat->rnorms, &mat->cnorms, &mat->rsums, &mat->csums,
00068 &mat->rsizes, &mat->csizes, &mat->rvols, &mat->cvols,
00069 &mat->rwgts, &mat->cwgts,
00070 LTERM);
00071 }
00072
00073
00074
00079
00080 gk_csr_t *gk_csr_Dup(gk_csr_t *mat)
00081 {
00082 gk_csr_t *nmat;
00083
00084 nmat = gk_csr_Create();
00085
00086 nmat->nrows = mat->nrows;
00087 nmat->ncols = mat->ncols;
00088
00089
00090 if (mat->rowptr)
00091 nmat->rowptr = gk_zcopy(mat->nrows+1, mat->rowptr,
00092 gk_zmalloc(mat->nrows+1, "gk_csr_Dup: rowptr"));
00093 if (mat->rowids)
00094 nmat->rowids = gk_icopy(mat->nrows, mat->rowids,
00095 gk_imalloc(mat->nrows, "gk_csr_Dup: rowids"));
00096 if (mat->rnorms)
00097 nmat->rnorms = gk_fcopy(mat->nrows, mat->rnorms,
00098 gk_fmalloc(mat->nrows, "gk_csr_Dup: rnorms"));
00099 if (mat->rowind)
00100 nmat->rowind = gk_icopy(mat->rowptr[mat->nrows], mat->rowind,
00101 gk_imalloc(mat->rowptr[mat->nrows], "gk_csr_Dup: rowind"));
00102 if (mat->rowval)
00103 nmat->rowval = gk_fcopy(mat->rowptr[mat->nrows], mat->rowval,
00104 gk_fmalloc(mat->rowptr[mat->nrows], "gk_csr_Dup: rowval"));
00105
00106
00107 if (mat->colptr)
00108 nmat->colptr = gk_zcopy(mat->ncols+1, mat->colptr,
00109 gk_zmalloc(mat->ncols+1, "gk_csr_Dup: colptr"));
00110 if (mat->colids)
00111 nmat->colids = gk_icopy(mat->ncols, mat->colids,
00112 gk_imalloc(mat->ncols, "gk_csr_Dup: colids"));
00113 if (mat->cnorms)
00114 nmat->cnorms = gk_fcopy(mat->ncols, mat->cnorms,
00115 gk_fmalloc(mat->ncols, "gk_csr_Dup: cnorms"));
00116 if (mat->colind)
00117 nmat->colind = gk_icopy(mat->colptr[mat->ncols], mat->colind,
00118 gk_imalloc(mat->colptr[mat->ncols], "gk_csr_Dup: colind"));
00119 if (mat->colval)
00120 nmat->colval = gk_fcopy(mat->colptr[mat->ncols], mat->colval,
00121 gk_fmalloc(mat->colptr[mat->ncols], "gk_csr_Dup: colval"));
00122
00123 return nmat;
00124 }
00125
00126
00127
00134
00135 gk_csr_t *gk_csr_ExtractSubmatrix(gk_csr_t *mat, int rstart, int nrows)
00136 {
00137 ssize_t i;
00138 gk_csr_t *nmat;
00139
00140 if (rstart+nrows > mat->nrows)
00141 return NULL;
00142
00143 nmat = gk_csr_Create();
00144
00145 nmat->nrows = nrows;
00146 nmat->ncols = mat->ncols;
00147
00148
00149 if (mat->rowptr)
00150 nmat->rowptr = gk_zcopy(nrows+1, mat->rowptr+rstart,
00151 gk_zmalloc(nrows+1, "gk_csr_ExtractSubmatrix: rowptr"));
00152 for (i=nrows; i>=0; i--)
00153 nmat->rowptr[i] -= nmat->rowptr[0];
00154 ASSERT(nmat->rowptr[0] == 0);
00155
00156 if (mat->rowids)
00157 nmat->rowids = gk_icopy(nrows, mat->rowids+rstart,
00158 gk_imalloc(nrows, "gk_csr_ExtractSubmatrix: rowids"));
00159 if (mat->rnorms)
00160 nmat->rnorms = gk_fcopy(nrows, mat->rnorms+rstart,
00161 gk_fmalloc(nrows, "gk_csr_ExtractSubmatrix: rnorms"));
00162
00163 if (mat->rsums)
00164 nmat->rsums = gk_fcopy(nrows, mat->rsums+rstart,
00165 gk_fmalloc(nrows, "gk_csr_ExtractSubmatrix: rsums"));
00166
00167 ASSERT(nmat->rowptr[nrows] == mat->rowptr[rstart+nrows]-mat->rowptr[rstart]);
00168 if (mat->rowind)
00169 nmat->rowind = gk_icopy(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
00170 mat->rowind+mat->rowptr[rstart],
00171 gk_imalloc(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
00172 "gk_csr_ExtractSubmatrix: rowind"));
00173 if (mat->rowval)
00174 nmat->rowval = gk_fcopy(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
00175 mat->rowval+mat->rowptr[rstart],
00176 gk_fmalloc(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
00177 "gk_csr_ExtractSubmatrix: rowval"));
00178
00179 return nmat;
00180 }
00181
00182
00183
00190
00191 gk_csr_t *gk_csr_ExtractRows(gk_csr_t *mat, int nrows, int *rind)
00192 {
00193 ssize_t i, ii, j, nnz;
00194 gk_csr_t *nmat;
00195
00196 nmat = gk_csr_Create();
00197
00198 nmat->nrows = nrows;
00199 nmat->ncols = mat->ncols;
00200
00201 for (nnz=0, i=0; i<nrows; i++)
00202 nnz += mat->rowptr[rind[i]+1]-mat->rowptr[rind[i]];
00203
00204 nmat->rowptr = gk_zmalloc(nmat->nrows+1, "gk_csr_ExtractPartition: rowptr");
00205 nmat->rowind = gk_imalloc(nnz, "gk_csr_ExtractPartition: rowind");
00206 nmat->rowval = gk_fmalloc(nnz, "gk_csr_ExtractPartition: rowval");
00207
00208 nmat->rowptr[0] = 0;
00209 for (nnz=0, j=0, ii=0; ii<nrows; ii++) {
00210 i = rind[ii];
00211 gk_icopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowind+mat->rowptr[i], nmat->rowind+nnz);
00212 gk_fcopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowval+mat->rowptr[i], nmat->rowval+nnz);
00213 nnz += mat->rowptr[i+1]-mat->rowptr[i];
00214 nmat->rowptr[++j] = nnz;
00215 }
00216 ASSERT(j == nmat->nrows);
00217
00218 return nmat;
00219 }
00220
00221
00222
00229
00230 gk_csr_t *gk_csr_ExtractPartition(gk_csr_t *mat, int *part, int pid)
00231 {
00232 ssize_t i, j, nnz;
00233 gk_csr_t *nmat;
00234
00235 nmat = gk_csr_Create();
00236
00237 nmat->nrows = 0;
00238 nmat->ncols = mat->ncols;
00239
00240 for (nnz=0, i=0; i<mat->nrows; i++) {
00241 if (part[i] == pid) {
00242 nmat->nrows++;
00243 nnz += mat->rowptr[i+1]-mat->rowptr[i];
00244 }
00245 }
00246
00247 nmat->rowptr = gk_zmalloc(nmat->nrows+1, "gk_csr_ExtractPartition: rowptr");
00248 nmat->rowind = gk_imalloc(nnz, "gk_csr_ExtractPartition: rowind");
00249 nmat->rowval = gk_fmalloc(nnz, "gk_csr_ExtractPartition: rowval");
00250
00251 nmat->rowptr[0] = 0;
00252 for (nnz=0, j=0, i=0; i<mat->nrows; i++) {
00253 if (part[i] == pid) {
00254 gk_icopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowind+mat->rowptr[i], nmat->rowind+nnz);
00255 gk_fcopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowval+mat->rowptr[i], nmat->rowval+nnz);
00256 nnz += mat->rowptr[i+1]-mat->rowptr[i];
00257 nmat->rowptr[++j] = nnz;
00258 }
00259 }
00260 ASSERT(j == nmat->nrows);
00261
00262 return nmat;
00263 }
00264
00265
00266
00276
00277 gk_csr_t **gk_csr_Split(gk_csr_t *mat, int *color)
00278 {
00279 ssize_t i, j;
00280 int nrows, ncolors;
00281 ssize_t *rowptr;
00282 int *rowind;
00283 float *rowval;
00284 gk_csr_t **smats;
00285
00286 nrows = mat->nrows;
00287 rowptr = mat->rowptr;
00288 rowind = mat->rowind;
00289 rowval = mat->rowval;
00290
00291 ncolors = gk_imax(rowptr[nrows], color)+1;
00292
00293 smats = (gk_csr_t **)gk_malloc(sizeof(gk_csr_t *)*ncolors, "gk_csr_Split: smats");
00294 for (i=0; i<ncolors; i++) {
00295 smats[i] = gk_csr_Create();
00296 smats[i]->nrows = mat->nrows;
00297 smats[i]->ncols = mat->ncols;
00298 smats[i]->rowptr = gk_zsmalloc(nrows+1, 0, "gk_csr_Split: smats[i]->rowptr");
00299 }
00300
00301 for (i=0; i<nrows; i++) {
00302 for (j=rowptr[i]; j<rowptr[i+1]; j++)
00303 smats[color[j]]->rowptr[i]++;
00304 }
00305 for (i=0; i<ncolors; i++)
00306 MAKECSR(j, nrows, smats[i]->rowptr);
00307
00308 for (i=0; i<ncolors; i++) {
00309 smats[i]->rowind = gk_imalloc(smats[i]->rowptr[nrows], "gk_csr_Split: smats[i]->rowind");
00310 smats[i]->rowval = gk_fmalloc(smats[i]->rowptr[nrows], "gk_csr_Split: smats[i]->rowval");
00311 }
00312
00313 for (i=0; i<nrows; i++) {
00314 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
00315 smats[color[j]]->rowind[smats[color[j]]->rowptr[i]] = rowind[j];
00316 smats[color[j]]->rowval[smats[color[j]]->rowptr[i]] = rowval[j];
00317 smats[color[j]]->rowptr[i]++;
00318 }
00319 }
00320
00321 for (i=0; i<ncolors; i++)
00322 SHIFTCSR(j, nrows, smats[i]->rowptr);
00323
00324 return smats;
00325 }
00326
00327
00328
00348
00349 gk_csr_t *gk_csr_Read(char *filename, int format, int readvals, int numbering)
00350 {
00351 ssize_t i, k, l;
00352 size_t nfields, nrows, ncols, nnz, fmt, ncon;
00353 size_t lnlen;
00354 ssize_t *rowptr;
00355 int *rowind, ival;
00356 float *rowval=NULL, fval;
00357 int readsizes, readwgts;
00358 char *line=NULL, *head, *tail, fmtstr[256];
00359 FILE *fpin;
00360 gk_csr_t *mat=NULL;
00361
00362
00363 if (!gk_fexists(filename))
00364 gk_errexit(SIGERR, "File %s does not exist!\n", filename);
00365
00366 if (format == GK_CSR_FMT_BINROW) {
00367 mat = gk_csr_Create();
00368
00369 fpin = gk_fopen(filename, "rb", "gk_csr_Read: fpin");
00370 if (fread(&(mat->nrows), sizeof(int32_t), 1, fpin) != 1)
00371 gk_errexit(SIGERR, "Failed to read the nrows from file %s!\n", filename);
00372 if (fread(&(mat->ncols), sizeof(int32_t), 1, fpin) != 1)
00373 gk_errexit(SIGERR, "Failed to read the ncols from file %s!\n", filename);
00374 mat->rowptr = gk_zmalloc(mat->nrows+1, "gk_csr_Read: rowptr");
00375 if (fread(mat->rowptr, sizeof(ssize_t), mat->nrows+1, fpin) != mat->nrows+1)
00376 gk_errexit(SIGERR, "Failed to read the rowptr from file %s!\n", filename);
00377 mat->rowind = gk_imalloc(mat->rowptr[mat->nrows], "gk_csr_Read: rowind");
00378 if (fread(mat->rowind, sizeof(int32_t), mat->rowptr[mat->nrows], fpin) != mat->rowptr[mat->nrows])
00379 gk_errexit(SIGERR, "Failed to read the rowind from file %s!\n", filename);
00380 if (readvals == 1) {
00381 mat->rowval = gk_fmalloc(mat->rowptr[mat->nrows], "gk_csr_Read: rowval");
00382 if (fread(mat->rowval, sizeof(float), mat->rowptr[mat->nrows], fpin) != mat->rowptr[mat->nrows])
00383 gk_errexit(SIGERR, "Failed to read the rowval from file %s!\n", filename);
00384 }
00385
00386 gk_fclose(fpin);
00387 return mat;
00388 }
00389
00390 if (format == GK_CSR_FMT_BINCOL) {
00391 mat = gk_csr_Create();
00392
00393 fpin = gk_fopen(filename, "rb", "gk_csr_Read: fpin");
00394 if (fread(&(mat->nrows), sizeof(int32_t), 1, fpin) != 1)
00395 gk_errexit(SIGERR, "Failed to read the nrows from file %s!\n", filename);
00396 if (fread(&(mat->ncols), sizeof(int32_t), 1, fpin) != 1)
00397 gk_errexit(SIGERR, "Failed to read the ncols from file %s!\n", filename);
00398 mat->colptr = gk_zmalloc(mat->ncols+1, "gk_csr_Read: colptr");
00399 if (fread(mat->colptr, sizeof(ssize_t), mat->ncols+1, fpin) != mat->ncols+1)
00400 gk_errexit(SIGERR, "Failed to read the colptr from file %s!\n", filename);
00401 mat->colind = gk_imalloc(mat->colptr[mat->ncols], "gk_csr_Read: colind");
00402 if (fread(mat->colind, sizeof(int32_t), mat->colptr[mat->ncols], fpin) != mat->colptr[mat->ncols])
00403 gk_errexit(SIGERR, "Failed to read the colind from file %s!\n", filename);
00404 if (readvals) {
00405 mat->colval = gk_fmalloc(mat->colptr[mat->ncols], "gk_csr_Read: colval");
00406 if (fread(mat->colval, sizeof(float), mat->colptr[mat->ncols], fpin) != mat->colptr[mat->ncols])
00407 gk_errexit(SIGERR, "Failed to read the colval from file %s!\n", filename);
00408 }
00409
00410 gk_fclose(fpin);
00411 return mat;
00412 }
00413
00414
00415 if (format == GK_CSR_FMT_CLUTO) {
00416 fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
00417 do {
00418 if (gk_getline(&line, &lnlen, fpin) <= 0)
00419 gk_errexit(SIGERR, "Premature end of input file: file:%s\n", filename);
00420 } while (line[0] == '%');
00421
00422 if (sscanf(line, "%zu %zu %zu", &nrows, &ncols, &nnz) != 3)
00423 gk_errexit(SIGERR, "Header line must contain 3 integers.\n");
00424
00425 readsizes = 0;
00426 readwgts = 0;
00427 readvals = 1;
00428 numbering = 1;
00429 }
00430 else if (format == GK_CSR_FMT_METIS) {
00431 fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
00432 do {
00433 if (gk_getline(&line, &lnlen, fpin) <= 0)
00434 gk_errexit(SIGERR, "Premature end of input file: file:%s\n", filename);
00435 } while (line[0] == '%');
00436
00437 fmt = ncon = 0;
00438 nfields = sscanf(line, "%zu %zu %zu %zu", &nrows, &nnz, &fmt, &ncon);
00439 if (nfields < 2)
00440 gk_errexit(SIGERR, "Header line must contain at least 2 integers (#vtxs and #edges).\n");
00441
00442 ncols = nrows;
00443 nnz *= 2;
00444
00445 if (fmt > 111)
00446 gk_errexit(SIGERR, "Cannot read this type of file format [fmt=%zu]!\n", fmt);
00447
00448 sprintf(fmtstr, "%03zu", fmt%1000);
00449 readsizes = (fmtstr[0] == '1');
00450 readwgts = (fmtstr[1] == '1');
00451 readvals = (fmtstr[2] == '1');
00452 numbering = 1;
00453 ncon = (ncon == 0 ? 1 : ncon);
00454 }
00455 else {
00456 readsizes = 0;
00457 readwgts = 0;
00458
00459 gk_getfilestats(filename, &nrows, &nnz, NULL, NULL);
00460
00461 if (readvals == 1 && nnz%2 == 1)
00462 gk_errexit(SIGERR, "Error: The number of numbers (%zd %d) in the input file is not even.\n", nnz, readvals);
00463 if (readvals == 1)
00464 nnz = nnz/2;
00465 fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
00466 }
00467
00468 mat = gk_csr_Create();
00469
00470 mat->nrows = nrows;
00471
00472 rowptr = mat->rowptr = gk_zmalloc(nrows+1, "gk_csr_Read: rowptr");
00473 rowind = mat->rowind = gk_imalloc(nnz, "gk_csr_Read: rowind");
00474 if (readvals != 2)
00475 rowval = mat->rowval = gk_fsmalloc(nnz, 1.0, "gk_csr_Read: rowval");
00476
00477 if (readsizes)
00478 mat->rsizes = gk_fsmalloc(nrows, 0.0, "gk_csr_Read: rsizes");
00479
00480 if (readwgts)
00481 mat->rwgts = gk_fsmalloc(nrows*ncon, 0.0, "gk_csr_Read: rwgts");
00482
00483
00484
00485
00486 numbering = (numbering ? - 1 : 0);
00487 for (ncols=0, rowptr[0]=0, k=0, i=0; i<nrows; i++) {
00488 do {
00489 if (gk_getline(&line, &lnlen, fpin) == -1)
00490 gk_errexit(SIGERR, "Premature end of input file: file while reading row %d\n", i);
00491 } while (line[0] == '%');
00492
00493 head = line;
00494 tail = NULL;
00495
00496
00497 if (readsizes) {
00498 #ifdef __MSC__
00499 mat->rsizes[i] = (float)strtod(head, &tail);
00500 #else
00501 mat->rsizes[i] = strtof(head, &tail);
00502 #endif
00503 if (tail == head)
00504 gk_errexit(SIGERR, "The line for vertex %zd does not have size information\n", i+1);
00505 if (mat->rsizes[i] < 0)
00506 errexit("The size for vertex %zd must be >= 0\n", i+1);
00507 head = tail;
00508 }
00509
00510
00511 if (readwgts) {
00512 for (l=0; l<ncon; l++) {
00513 #ifdef __MSC__
00514 mat->rwgts[i*ncon+l] = (float)strtod(head, &tail);
00515 #else
00516 mat->rwgts[i*ncon+l] = strtof(head, &tail);
00517 #endif
00518 if (tail == head)
00519 errexit("The line for vertex %zd does not have enough weights "
00520 "for the %d constraints.\n", i+1, ncon);
00521 if (mat->rwgts[i*ncon+l] < 0)
00522 errexit("The weight vertex %zd and constraint %zd must be >= 0\n", i+1, l);
00523 head = tail;
00524 }
00525 }
00526
00527
00528
00529 while (1) {
00530 ival = (int)strtol(head, &tail, 0);
00531 if (tail == head)
00532 break;
00533 head = tail;
00534
00535 if ((rowind[k] = ival + numbering) < 0)
00536 gk_errexit(SIGERR, "Error: Invalid column number %d at row %zd.\n", ival, i);
00537
00538 ncols = gk_max(rowind[k], ncols);
00539
00540 if (readvals == 1) {
00541 #ifdef __MSC__
00542 fval = (float)strtod(head, &tail);
00543 #else
00544 fval = strtof(head, &tail);
00545 #endif
00546 if (tail == head)
00547 gk_errexit(SIGERR, "Value could not be found for column! Row:%zd, NNZ:%zd\n", i, k);
00548 head = tail;
00549
00550 rowval[k] = fval;
00551 }
00552 k++;
00553 }
00554 rowptr[i+1] = k;
00555 }
00556
00557 if (format == GK_CSR_FMT_METIS) {
00558 ASSERT(ncols+1 == mat->nrows);
00559 mat->ncols = mat->nrows;
00560 }
00561 else {
00562 mat->ncols = ncols+1;
00563 }
00564
00565 if (k != nnz)
00566 gk_errexit(SIGERR, "gk_csr_Read: Something wrong with the number of nonzeros in "
00567 "the input file. NNZ=%zd, ActualNNZ=%zd.\n", nnz, k);
00568
00569 gk_fclose(fpin);
00570
00571 gk_free((void **)&line, LTERM);
00572
00573 return mat;
00574 }
00575
00576
00577
00590
00591 void gk_csr_Write(gk_csr_t *mat, char *filename, int format, int writevals, int numbering)
00592 {
00593 ssize_t i, j;
00594 FILE *fpout;
00595
00596 if (format == GK_CSR_FMT_BINROW) {
00597 if (filename == NULL)
00598 gk_errexit(SIGERR, "The filename parameter cannot be NULL.\n");
00599 fpout = gk_fopen(filename, "wb", "gk_csr_Write: fpout");
00600
00601 fwrite(&(mat->nrows), sizeof(int32_t), 1, fpout);
00602 fwrite(&(mat->ncols), sizeof(int32_t), 1, fpout);
00603 fwrite(mat->rowptr, sizeof(ssize_t), mat->nrows+1, fpout);
00604 fwrite(mat->rowind, sizeof(int32_t), mat->rowptr[mat->nrows], fpout);
00605 if (writevals)
00606 fwrite(mat->rowval, sizeof(float), mat->rowptr[mat->nrows], fpout);
00607
00608 gk_fclose(fpout);
00609 return;
00610 }
00611
00612 if (format == GK_CSR_FMT_BINCOL) {
00613 if (filename == NULL)
00614 gk_errexit(SIGERR, "The filename parameter cannot be NULL.\n");
00615 fpout = gk_fopen(filename, "wb", "gk_csr_Write: fpout");
00616
00617 fwrite(&(mat->nrows), sizeof(int32_t), 1, fpout);
00618 fwrite(&(mat->ncols), sizeof(int32_t), 1, fpout);
00619 fwrite(mat->colptr, sizeof(ssize_t), mat->ncols+1, fpout);
00620 fwrite(mat->colind, sizeof(int32_t), mat->colptr[mat->ncols], fpout);
00621 if (writevals)
00622 fwrite(mat->colval, sizeof(float), mat->colptr[mat->ncols], fpout);
00623
00624 gk_fclose(fpout);
00625 return;
00626 }
00627
00628 if (filename)
00629 fpout = gk_fopen(filename, "w", "gk_csr_Write: fpout");
00630 else
00631 fpout = stdout;
00632
00633 if (format == GK_CSR_FMT_CLUTO) {
00634 fprintf(fpout, "%d %d %zd\n", mat->nrows, mat->ncols, mat->rowptr[mat->nrows]);
00635 writevals = 1;
00636 numbering = 1;
00637 }
00638
00639 for (i=0; i<mat->nrows; i++) {
00640 for (j=mat->rowptr[i]; j<mat->rowptr[i+1]; j++) {
00641 fprintf(fpout, " %d", mat->rowind[j]+(numbering ? 1 : 0));
00642 if (writevals)
00643 fprintf(fpout, " %f", mat->rowval[j]);
00644 }
00645 fprintf(fpout, "\n");
00646 }
00647 if (filename)
00648 gk_fclose(fpout);
00649 }
00650
00651
00652
00668
00669 gk_csr_t *gk_csr_Prune(gk_csr_t *mat, int what, int minf, int maxf)
00670 {
00671 ssize_t i, j, nnz;
00672 int nrows, ncols;
00673 ssize_t *rowptr, *nrowptr;
00674 int *rowind, *nrowind, *collen;
00675 float *rowval, *nrowval;
00676 gk_csr_t *nmat;
00677
00678 nmat = gk_csr_Create();
00679
00680 nrows = nmat->nrows = mat->nrows;
00681 ncols = nmat->ncols = mat->ncols;
00682
00683 rowptr = mat->rowptr;
00684 rowind = mat->rowind;
00685 rowval = mat->rowval;
00686
00687 nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_Prune: nrowptr");
00688 nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_Prune: nrowind");
00689 nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_Prune: nrowval");
00690
00691
00692 switch (what) {
00693 case GK_CSR_COL:
00694 collen = gk_ismalloc(ncols, 0, "gk_csr_Prune: collen");
00695
00696 for (i=0; i<nrows; i++) {
00697 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
00698 ASSERT(rowind[j] < ncols);
00699 collen[rowind[j]]++;
00700 }
00701 }
00702 for (i=0; i<ncols; i++)
00703 collen[i] = (collen[i] >= minf && collen[i] <= maxf ? 1 : 0);
00704
00705 nrowptr[0] = 0;
00706 for (nnz=0, i=0; i<nrows; i++) {
00707 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
00708 if (collen[rowind[j]]) {
00709 nrowind[nnz] = rowind[j];
00710 nrowval[nnz] = rowval[j];
00711 nnz++;
00712 }
00713 }
00714 nrowptr[i+1] = nnz;
00715 }
00716 gk_free((void **)&collen, LTERM);
00717 break;
00718
00719 case GK_CSR_ROW:
00720 nrowptr[0] = 0;
00721 for (nnz=0, i=0; i<nrows; i++) {
00722 if (rowptr[i+1]-rowptr[i] >= minf && rowptr[i+1]-rowptr[i] <= maxf) {
00723 for (j=rowptr[i]; j<rowptr[i+1]; j++, nnz++) {
00724 nrowind[nnz] = rowind[j];
00725 nrowval[nnz] = rowval[j];
00726 }
00727 }
00728 nrowptr[i+1] = nnz;
00729 }
00730 break;
00731
00732 default:
00733 gk_csr_Free(&nmat);
00734 gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
00735 return NULL;
00736 }
00737
00738 return nmat;
00739 }
00740
00741
00742
00758
00759 gk_csr_t *gk_csr_LowFilter(gk_csr_t *mat, int what, int norm, float fraction)
00760 {
00761 ssize_t i, j, nnz;
00762 int nrows, ncols, ncand, maxlen=0;
00763 ssize_t *rowptr, *colptr, *nrowptr;
00764 int *rowind, *colind, *nrowind;
00765 float *rowval, *colval, *nrowval, rsum, tsum;
00766 gk_csr_t *nmat;
00767 gk_fkv_t *cand;
00768
00769 nmat = gk_csr_Create();
00770
00771 nrows = nmat->nrows = mat->nrows;
00772 ncols = nmat->ncols = mat->ncols;
00773
00774 rowptr = mat->rowptr;
00775 rowind = mat->rowind;
00776 rowval = mat->rowval;
00777 colptr = mat->colptr;
00778 colind = mat->colind;
00779 colval = mat->colval;
00780
00781 nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_LowFilter: nrowptr");
00782 nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_LowFilter: nrowind");
00783 nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_LowFilter: nrowval");
00784
00785
00786 switch (what) {
00787 case GK_CSR_COL:
00788 if (mat->colptr == NULL)
00789 gk_errexit(SIGERR, "Cannot filter columns when column-based structure has not been created.\n");
00790
00791 gk_zcopy(nrows+1, rowptr, nrowptr);
00792
00793 for (i=0; i<ncols; i++)
00794 maxlen = gk_max(maxlen, colptr[i+1]-colptr[i]);
00795
00796 #pragma omp parallel private(i, j, ncand, rsum, tsum, cand)
00797 {
00798 cand = gk_fkvmalloc(maxlen, "gk_csr_LowFilter: cand");
00799
00800 #pragma omp for schedule(static)
00801 for (i=0; i<ncols; i++) {
00802 for (tsum=0.0, ncand=0, j=colptr[i]; j<colptr[i+1]; j++, ncand++) {
00803 cand[ncand].val = colind[j];
00804 cand[ncand].key = colval[j];
00805 tsum += (norm == 1 ? colval[j] : colval[j]*colval[j]);
00806 }
00807 gk_fkvsortd(ncand, cand);
00808
00809 for (rsum=0.0, j=0; j<ncand && rsum<=fraction*tsum; j++) {
00810 rsum += (norm == 1 ? cand[j].key : cand[j].key*cand[j].key);
00811 nrowind[nrowptr[cand[j].val]] = i;
00812 nrowval[nrowptr[cand[j].val]] = cand[j].key;
00813 nrowptr[cand[j].val]++;
00814 }
00815 }
00816
00817 gk_free((void **)&cand, LTERM);
00818 }
00819
00820
00821 for (nnz=0, i=0; i<nrows; i++) {
00822 for (j=rowptr[i]; j<nrowptr[i]; j++, nnz++) {
00823 nrowind[nnz] = nrowind[j];
00824 nrowval[nnz] = nrowval[j];
00825 }
00826 nrowptr[i] = nnz;
00827 }
00828 SHIFTCSR(i, nrows, nrowptr);
00829
00830 break;
00831
00832 case GK_CSR_ROW:
00833 if (mat->rowptr == NULL)
00834 gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
00835
00836 for (i=0; i<nrows; i++)
00837 maxlen = gk_max(maxlen, rowptr[i+1]-rowptr[i]);
00838
00839 #pragma omp parallel private(i, j, ncand, rsum, tsum, cand)
00840 {
00841 cand = gk_fkvmalloc(maxlen, "gk_csr_LowFilter: cand");
00842
00843 #pragma omp for schedule(static)
00844 for (i=0; i<nrows; i++) {
00845 for (tsum=0.0, ncand=0, j=rowptr[i]; j<rowptr[i+1]; j++, ncand++) {
00846 cand[ncand].val = rowind[j];
00847 cand[ncand].key = rowval[j];
00848 tsum += (norm == 1 ? rowval[j] : rowval[j]*rowval[j]);
00849 }
00850 gk_fkvsortd(ncand, cand);
00851
00852 for (rsum=0.0, j=0; j<ncand && rsum<=fraction*tsum; j++) {
00853 rsum += (norm == 1 ? cand[j].key : cand[j].key*cand[j].key);
00854 nrowind[rowptr[i]+j] = cand[j].val;
00855 nrowval[rowptr[i]+j] = cand[j].key;
00856 }
00857 nrowptr[i+1] = rowptr[i]+j;
00858 }
00859
00860 gk_free((void **)&cand, LTERM);
00861 }
00862
00863
00864 nrowptr[0] = nnz = 0;
00865 for (i=0; i<nrows; i++) {
00866 for (j=rowptr[i]; j<nrowptr[i+1]; j++, nnz++) {
00867 nrowind[nnz] = nrowind[j];
00868 nrowval[nnz] = nrowval[j];
00869 }
00870 nrowptr[i+1] = nnz;
00871 }
00872
00873 break;
00874
00875 default:
00876 gk_csr_Free(&nmat);
00877 gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
00878 return NULL;
00879 }
00880
00881 return nmat;
00882 }
00883
00884
00885
00900
00901 gk_csr_t *gk_csr_TopKPlusFilter(gk_csr_t *mat, int what, int topk, float keepval)
00902 {
00903 ssize_t i, j, k, nnz;
00904 int nrows, ncols, ncand;
00905 ssize_t *rowptr, *colptr, *nrowptr;
00906 int *rowind, *colind, *nrowind;
00907 float *rowval, *colval, *nrowval;
00908 gk_csr_t *nmat;
00909 gk_fkv_t *cand;
00910
00911 nmat = gk_csr_Create();
00912
00913 nrows = nmat->nrows = mat->nrows;
00914 ncols = nmat->ncols = mat->ncols;
00915
00916 rowptr = mat->rowptr;
00917 rowind = mat->rowind;
00918 rowval = mat->rowval;
00919 colptr = mat->colptr;
00920 colind = mat->colind;
00921 colval = mat->colval;
00922
00923 nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_LowFilter: nrowptr");
00924 nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_LowFilter: nrowind");
00925 nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_LowFilter: nrowval");
00926
00927
00928 switch (what) {
00929 case GK_CSR_COL:
00930 if (mat->colptr == NULL)
00931 gk_errexit(SIGERR, "Cannot filter columns when column-based structure has not been created.\n");
00932
00933 cand = gk_fkvmalloc(nrows, "gk_csr_LowFilter: cand");
00934
00935 gk_zcopy(nrows+1, rowptr, nrowptr);
00936 for (i=0; i<ncols; i++) {
00937 for (ncand=0, j=colptr[i]; j<colptr[i+1]; j++, ncand++) {
00938 cand[ncand].val = colind[j];
00939 cand[ncand].key = colval[j];
00940 }
00941 gk_fkvsortd(ncand, cand);
00942
00943 k = gk_min(topk, ncand);
00944 for (j=0; j<k; j++) {
00945 nrowind[nrowptr[cand[j].val]] = i;
00946 nrowval[nrowptr[cand[j].val]] = cand[j].key;
00947 nrowptr[cand[j].val]++;
00948 }
00949 for (; j<ncand; j++) {
00950 if (cand[j].key < keepval)
00951 break;
00952
00953 nrowind[nrowptr[cand[j].val]] = i;
00954 nrowval[nrowptr[cand[j].val]] = cand[j].key;
00955 nrowptr[cand[j].val]++;
00956 }
00957 }
00958
00959
00960 for (nnz=0, i=0; i<nrows; i++) {
00961 for (j=rowptr[i]; j<nrowptr[i]; j++, nnz++) {
00962 nrowind[nnz] = nrowind[j];
00963 nrowval[nnz] = nrowval[j];
00964 }
00965 nrowptr[i] = nnz;
00966 }
00967 SHIFTCSR(i, nrows, nrowptr);
00968
00969 gk_free((void **)&cand, LTERM);
00970 break;
00971
00972 case GK_CSR_ROW:
00973 if (mat->rowptr == NULL)
00974 gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
00975
00976 cand = gk_fkvmalloc(ncols, "gk_csr_LowFilter: cand");
00977
00978 nrowptr[0] = 0;
00979 for (nnz=0, i=0; i<nrows; i++) {
00980 for (ncand=0, j=rowptr[i]; j<rowptr[i+1]; j++, ncand++) {
00981 cand[ncand].val = rowind[j];
00982 cand[ncand].key = rowval[j];
00983 }
00984 gk_fkvsortd(ncand, cand);
00985
00986 k = gk_min(topk, ncand);
00987 for (j=0; j<k; j++, nnz++) {
00988 nrowind[nnz] = cand[j].val;
00989 nrowval[nnz] = cand[j].key;
00990 }
00991 for (; j<ncand; j++, nnz++) {
00992 if (cand[j].key < keepval)
00993 break;
00994
00995 nrowind[nnz] = cand[j].val;
00996 nrowval[nnz] = cand[j].key;
00997 }
00998 nrowptr[i+1] = nnz;
00999 }
01000
01001 gk_free((void **)&cand, LTERM);
01002 break;
01003
01004 default:
01005 gk_csr_Free(&nmat);
01006 gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
01007 return NULL;
01008 }
01009
01010 return nmat;
01011 }
01012
01013
01014
01030
01031 gk_csr_t *gk_csr_ZScoreFilter(gk_csr_t *mat, int what, float zscore)
01032 {
01033 ssize_t i, j, nnz;
01034 int nrows;
01035 ssize_t *rowptr, *nrowptr;
01036 int *rowind, *nrowind;
01037 float *rowval, *nrowval, avgwgt;
01038 gk_csr_t *nmat;
01039
01040 nmat = gk_csr_Create();
01041
01042 nmat->nrows = mat->nrows;
01043 nmat->ncols = mat->ncols;
01044
01045 nrows = mat->nrows;
01046 rowptr = mat->rowptr;
01047 rowind = mat->rowind;
01048 rowval = mat->rowval;
01049
01050 nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_ZScoreFilter: nrowptr");
01051 nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_ZScoreFilter: nrowind");
01052 nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_ZScoreFilter: nrowval");
01053
01054
01055 switch (what) {
01056 case GK_CSR_COL:
01057 gk_errexit(SIGERR, "This has not been implemented yet.\n");
01058 break;
01059
01060 case GK_CSR_ROW:
01061 if (mat->rowptr == NULL)
01062 gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
01063
01064 nrowptr[0] = 0;
01065 for (nnz=0, i=0; i<nrows; i++) {
01066 avgwgt = zscore/(rowptr[i+1]-rowptr[i]);
01067 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
01068 if (rowval[j] > avgwgt) {
01069 nrowind[nnz] = rowind[j];
01070 nrowval[nnz] = rowval[j];
01071 nnz++;
01072 }
01073 }
01074 nrowptr[i+1] = nnz;
01075 }
01076 break;
01077
01078 default:
01079 gk_csr_Free(&nmat);
01080 gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
01081 return NULL;
01082 }
01083
01084 return nmat;
01085 }
01086
01087
01088
01097
01098 void gk_csr_CompactColumns(gk_csr_t *mat)
01099 {
01100 ssize_t i;
01101 int nrows, ncols, nncols;
01102 ssize_t *rowptr;
01103 int *rowind, *colmap;
01104 gk_ikv_t *clens;
01105
01106 nrows = mat->nrows;
01107 ncols = mat->ncols;
01108 rowptr = mat->rowptr;
01109 rowind = mat->rowind;
01110
01111 colmap = gk_imalloc(ncols, "gk_csr_CompactColumns: colmap");
01112
01113 clens = gk_ikvmalloc(ncols, "gk_csr_CompactColumns: clens");
01114 for (i=0; i<ncols; i++) {
01115 clens[i].key = 0;
01116 clens[i].val = i;
01117 }
01118
01119 for (i=0; i<rowptr[nrows]; i++)
01120 clens[rowind[i]].key++;
01121 gk_ikvsortd(ncols, clens);
01122
01123 for (nncols=0, i=0; i<ncols; i++) {
01124 if (clens[i].key > 0)
01125 colmap[clens[i].val] = nncols++;
01126 else
01127 break;
01128 }
01129
01130 for (i=0; i<rowptr[nrows]; i++)
01131 rowind[i] = colmap[rowind[i]];
01132
01133 mat->ncols = nncols;
01134
01135 gk_free((void **)&colmap, &clens, LTERM);
01136 }
01137
01138
01139
01145
01146 void gk_csr_SortIndices(gk_csr_t *mat, int what)
01147 {
01148 int n, nn=0;
01149 ssize_t *ptr;
01150 int *ind;
01151 float *val;
01152
01153 switch (what) {
01154 case GK_CSR_ROW:
01155 if (!mat->rowptr)
01156 gk_errexit(SIGERR, "Row-based view of the matrix does not exists.\n");
01157
01158 n = mat->nrows;
01159 ptr = mat->rowptr;
01160 ind = mat->rowind;
01161 val = mat->rowval;
01162 break;
01163
01164 case GK_CSR_COL:
01165 if (!mat->colptr)
01166 gk_errexit(SIGERR, "Column-based view of the matrix does not exists.\n");
01167
01168 n = mat->ncols;
01169 ptr = mat->colptr;
01170 ind = mat->colind;
01171 val = mat->colval;
01172 break;
01173
01174 default:
01175 gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
01176 return;
01177 }
01178
01179 #pragma omp parallel if (n > 100)
01180 {
01181 ssize_t i, j, k;
01182 gk_ikv_t *cand;
01183 float *tval;
01184
01185 #pragma omp single
01186 for (i=0; i<n; i++)
01187 nn = gk_max(nn, ptr[i+1]-ptr[i]);
01188
01189 cand = gk_ikvmalloc(nn, "gk_csr_SortIndices: cand");
01190 tval = gk_fmalloc(nn, "gk_csr_SortIndices: tval");
01191
01192 #pragma omp for schedule(static)
01193 for (i=0; i<n; i++) {
01194 for (k=0, j=ptr[i]; j<ptr[i+1]; j++) {
01195 if (j > ptr[i] && ind[j] < ind[j-1])
01196 k = 1;
01197 cand[j-ptr[i]].val = j-ptr[i];
01198 cand[j-ptr[i]].key = ind[j];
01199 tval[j-ptr[i]] = val[j];
01200 }
01201 if (k) {
01202 gk_ikvsorti(ptr[i+1]-ptr[i], cand);
01203 for (j=ptr[i]; j<ptr[i+1]; j++) {
01204 ind[j] = cand[j-ptr[i]].key;
01205 val[j] = tval[cand[j-ptr[i]].val];
01206 }
01207 }
01208 }
01209
01210 gk_free((void **)&cand, &tval, LTERM);
01211 }
01212
01213 }
01214
01215
01216
01222
01223 void gk_csr_CreateIndex(gk_csr_t *mat, int what)
01224 {
01225
01226 ssize_t i, j, k, nf, nr;
01227 ssize_t *fptr, *rptr;
01228 int *find, *rind;
01229 float *fval, *rval;
01230
01231 switch (what) {
01232 case GK_CSR_COL:
01233 nf = mat->nrows;
01234 fptr = mat->rowptr;
01235 find = mat->rowind;
01236 fval = mat->rowval;
01237
01238 if (mat->colptr) gk_free((void **)&mat->colptr, LTERM);
01239 if (mat->colind) gk_free((void **)&mat->colind, LTERM);
01240 if (mat->colval) gk_free((void **)&mat->colval, LTERM);
01241
01242 nr = mat->ncols;
01243 rptr = mat->colptr = gk_zsmalloc(nr+1, 0, "gk_csr_CreateIndex: rptr");
01244 rind = mat->colind = gk_imalloc(fptr[nf], "gk_csr_CreateIndex: rind");
01245 rval = mat->colval = (fval ? gk_fmalloc(fptr[nf], "gk_csr_CreateIndex: rval") : NULL);
01246 break;
01247 case GK_CSR_ROW:
01248 nf = mat->ncols;
01249 fptr = mat->colptr;
01250 find = mat->colind;
01251 fval = mat->colval;
01252
01253 if (mat->rowptr) gk_free((void **)&mat->rowptr, LTERM);
01254 if (mat->rowind) gk_free((void **)&mat->rowind, LTERM);
01255 if (mat->rowval) gk_free((void **)&mat->rowval, LTERM);
01256
01257 nr = mat->nrows;
01258 rptr = mat->rowptr = gk_zsmalloc(nr+1, 0, "gk_csr_CreateIndex: rptr");
01259 rind = mat->rowind = gk_imalloc(fptr[nf], "gk_csr_CreateIndex: rind");
01260 rval = mat->rowval = (fval ? gk_fmalloc(fptr[nf], "gk_csr_CreateIndex: rval") : NULL);
01261 break;
01262 default:
01263 gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
01264 return;
01265 }
01266
01267
01268 for (i=0; i<nf; i++) {
01269 for (j=fptr[i]; j<fptr[i+1]; j++)
01270 rptr[find[j]]++;
01271 }
01272 MAKECSR(i, nr, rptr);
01273
01274 if (rptr[nr] > 6*nr) {
01275 for (i=0; i<nf; i++) {
01276 for (j=fptr[i]; j<fptr[i+1]; j++)
01277 rind[rptr[find[j]]++] = i;
01278 }
01279 SHIFTCSR(i, nr, rptr);
01280
01281 if (fval) {
01282 for (i=0; i<nf; i++) {
01283 for (j=fptr[i]; j<fptr[i+1]; j++)
01284 rval[rptr[find[j]]++] = fval[j];
01285 }
01286 SHIFTCSR(i, nr, rptr);
01287 }
01288 }
01289 else {
01290 if (fval) {
01291 for (i=0; i<nf; i++) {
01292 for (j=fptr[i]; j<fptr[i+1]; j++) {
01293 k = find[j];
01294 rind[rptr[k]] = i;
01295 rval[rptr[k]++] = fval[j];
01296 }
01297 }
01298 }
01299 else {
01300 for (i=0; i<nf; i++) {
01301 for (j=fptr[i]; j<fptr[i+1]; j++)
01302 rind[rptr[find[j]]++] = i;
01303 }
01304 }
01305 SHIFTCSR(i, nr, rptr);
01306 }
01307 }
01308
01309
01310
01318
01319 void gk_csr_Normalize(gk_csr_t *mat, int what, int norm)
01320 {
01321 ssize_t i, j;
01322 int n;
01323 ssize_t *ptr;
01324 float *val, sum;
01325
01326 if (what&GK_CSR_ROW && mat->rowval) {
01327 n = mat->nrows;
01328 ptr = mat->rowptr;
01329 val = mat->rowval;
01330
01331 #pragma omp parallel if (ptr[n] > OMPMINOPS)
01332 {
01333 #pragma omp for private(j,sum) schedule(static)
01334 for (i=0; i<n; i++) {
01335 for (sum=0.0, j=ptr[i]; j<ptr[i+1]; j++){
01336 if (norm == 2)
01337 sum += val[j]*val[j];
01338 else if (norm == 1)
01339 sum += val[j];
01340 }
01341 if (sum > 0) {
01342 if (norm == 2)
01343 sum=1.0/sqrt(sum);
01344 else if (norm == 1)
01345 sum=1.0/sum;
01346 for (j=ptr[i]; j<ptr[i+1]; j++)
01347 val[j] *= sum;
01348
01349 }
01350 }
01351 }
01352 }
01353
01354 if (what&GK_CSR_COL && mat->colval) {
01355 n = mat->ncols;
01356 ptr = mat->colptr;
01357 val = mat->colval;
01358
01359 #pragma omp parallel if (ptr[n] > OMPMINOPS)
01360 {
01361 #pragma omp for private(j,sum) schedule(static)
01362 for (i=0; i<n; i++) {
01363 for (sum=0.0, j=ptr[i]; j<ptr[i+1]; j++)
01364 if (norm == 2)
01365 sum += val[j]*val[j];
01366 else if (norm == 1)
01367 sum += val[j];
01368 if (sum > 0) {
01369 if (norm == 2)
01370 sum=1.0/sqrt(sum);
01371 else if (norm == 1)
01372 sum=1.0/sum;
01373 for (j=ptr[i]; j<ptr[i+1]; j++)
01374 val[j] *= sum;
01375 }
01376 }
01377 }
01378 }
01379 }
01380
01381
01382
01388
01389 void gk_csr_Scale(gk_csr_t *mat, int type)
01390 {
01391 ssize_t i, j;
01392 int nrows, ncols, nnzcols, bgfreq;
01393 ssize_t *rowptr;
01394 int *rowind, *collen;
01395 float *rowval, *cscale, maxtf;
01396
01397 nrows = mat->nrows;
01398 rowptr = mat->rowptr;
01399 rowind = mat->rowind;
01400 rowval = mat->rowval;
01401
01402 switch (type) {
01403 case GK_CSR_MAXTF:
01404 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
01405 {
01406 #pragma omp for private(j, maxtf) schedule(static)
01407 for (i=0; i<nrows; i++) {
01408 maxtf = fabs(rowval[rowptr[i]]);
01409 for (j=rowptr[i]; j<rowptr[i+1]; j++)
01410 maxtf = (maxtf < fabs(rowval[j]) ? fabs(rowval[j]) : maxtf);
01411
01412 for (j=rowptr[i]; j<rowptr[i+1]; j++)
01413 rowval[j] = .5 + .5*rowval[j]/maxtf;
01414 }
01415 }
01416 break;
01417
01418 case GK_CSR_MAXTF2:
01419 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
01420 {
01421 #pragma omp for private(j, maxtf) schedule(static)
01422 for (i=0; i<nrows; i++) {
01423 maxtf = fabs(rowval[rowptr[i]]);
01424 for (j=rowptr[i]; j<rowptr[i+1]; j++)
01425 maxtf = (maxtf < fabs(rowval[j]) ? fabs(rowval[j]) : maxtf);
01426
01427 for (j=rowptr[i]; j<rowptr[i+1]; j++)
01428 rowval[j] = .1 + .9*rowval[j]/maxtf;
01429 }
01430 }
01431 break;
01432
01433 case GK_CSR_SQRT:
01434 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
01435 {
01436 #pragma omp for private(j) schedule(static)
01437 for (i=0; i<nrows; i++) {
01438 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
01439 if (rowval[j] != 0.0)
01440 rowval[j] = .1+sign(rowval[j], sqrt(fabs(rowval[j])));
01441 }
01442 }
01443 }
01444 break;
01445
01446 case GK_CSR_POW25:
01447 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
01448 {
01449 #pragma omp for private(j) schedule(static)
01450 for (i=0; i<nrows; i++) {
01451 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
01452 if (rowval[j] != 0.0)
01453 rowval[j] = .1+sign(rowval[j], sqrt(sqrt(fabs(rowval[j]))));
01454 }
01455 }
01456 }
01457 break;
01458
01459 case GK_CSR_POW65:
01460 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
01461 {
01462 #pragma omp for private(j) schedule(static)
01463 for (i=0; i<nrows; i++) {
01464 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
01465 if (rowval[j] != 0.0)
01466 rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .65));
01467 }
01468 }
01469 }
01470 break;
01471
01472 case GK_CSR_POW75:
01473 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
01474 {
01475 #pragma omp for private(j) schedule(static)
01476 for (i=0; i<nrows; i++) {
01477 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
01478 if (rowval[j] != 0.0)
01479 rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .75));
01480 }
01481 }
01482 }
01483 break;
01484
01485 case GK_CSR_POW85:
01486 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
01487 {
01488 #pragma omp for private(j) schedule(static)
01489 for (i=0; i<nrows; i++) {
01490 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
01491 if (rowval[j] != 0.0)
01492 rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .85));
01493 }
01494 }
01495 }
01496 break;
01497
01498 case GK_CSR_LOG:
01499 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
01500 {
01501 double logscale = 1.0/log(2.0);
01502 #pragma omp for schedule(static,32)
01503 for (i=0; i<rowptr[nrows]; i++) {
01504 if (rowval[i] != 0.0)
01505 rowval[i] = 1+(rowval[i]>0.0 ? log(rowval[i]) : -log(-rowval[i]))*logscale;
01506 }
01507 #ifdef XXX
01508 #pragma omp for private(j) schedule(static)
01509 for (i=0; i<nrows; i++) {
01510 for (j=rowptr[i]; j<rowptr[i+1]; j++) {
01511 if (rowval[j] != 0.0)
01512 rowval[j] = 1+(rowval[j]>0.0 ? log(rowval[j]) : -log(-rowval[j]))*logscale;
01513
01514 }
01515 }
01516 #endif
01517 }
01518 break;
01519
01520 case GK_CSR_IDF:
01521 ncols = mat->ncols;
01522 cscale = gk_fmalloc(ncols, "gk_csr_Scale: cscale");
01523 collen = gk_ismalloc(ncols, 0, "gk_csr_Scale: collen");
01524
01525 for (i=0; i<nrows; i++) {
01526 for (j=rowptr[i]; j<rowptr[i+1]; j++)
01527 collen[rowind[j]]++;
01528 }
01529
01530 #pragma omp parallel if (ncols > OMPMINOPS)
01531 {
01532 #pragma omp for schedule(static)
01533 for (i=0; i<ncols; i++)
01534 cscale[i] = (collen[i] > 0 ? log(1.0*nrows/collen[i]) : 0.0);
01535 }
01536
01537 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
01538 {
01539 #pragma omp for private(j) schedule(static)
01540 for (i=0; i<nrows; i++) {
01541 for (j=rowptr[i]; j<rowptr[i+1]; j++)
01542 rowval[j] *= cscale[rowind[j]];
01543 }
01544 }
01545
01546 gk_free((void **)&cscale, &collen, LTERM);
01547 break;
01548
01549 case GK_CSR_IDF2:
01550 ncols = mat->ncols;
01551 cscale = gk_fmalloc(ncols, "gk_csr_Scale: cscale");
01552 collen = gk_ismalloc(ncols, 0, "gk_csr_Scale: collen");
01553
01554 for (i=0; i<nrows; i++) {
01555 for (j=rowptr[i]; j<rowptr[i+1]; j++)
01556 collen[rowind[j]]++;
01557 }
01558
01559 nnzcols = 0;
01560 #pragma omp parallel if (ncols > OMPMINOPS)
01561 {
01562 #pragma omp for schedule(static) reduction(+:nnzcols)
01563 for (i=0; i<ncols; i++)
01564 nnzcols += (collen[i] > 0 ? 1 : 0);
01565
01566 bgfreq = gk_max(10, (ssize_t)(.5*rowptr[nrows]/nnzcols));
01567 printf("nnz: %zd, nnzcols: %d, bgfreq: %d\n", rowptr[nrows], nnzcols, bgfreq);
01568
01569 #pragma omp for schedule(static)
01570 for (i=0; i<ncols; i++)
01571 cscale[i] = (collen[i] > 0 ? log(1.0*(nrows+2*bgfreq)/(bgfreq+collen[i])) : 0.0);
01572 }
01573
01574 #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
01575 {
01576 #pragma omp for private(j) schedule(static)
01577 for (i=0; i<nrows; i++) {
01578 for (j=rowptr[i]; j<rowptr[i+1]; j++)
01579 rowval[j] *= cscale[rowind[j]];
01580 }
01581 }
01582
01583 gk_free((void **)&cscale, &collen, LTERM);
01584 break;
01585
01586 default:
01587 gk_errexit(SIGERR, "Unknown scaling type of %d\n", type);
01588 }
01589
01590 }
01591
01592
01593
01599
01600 void gk_csr_ComputeSums(gk_csr_t *mat, int what)
01601 {
01602 ssize_t i;
01603 int n;
01604 ssize_t *ptr;
01605 float *val, *sums;
01606
01607 switch (what) {
01608 case GK_CSR_ROW:
01609 n = mat->nrows;
01610 ptr = mat->rowptr;
01611 val = mat->rowval;
01612
01613 if (mat->rsums)
01614 gk_free((void **)&mat->rsums, LTERM);
01615
01616 sums = mat->rsums = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: sums");
01617 break;
01618 case GK_CSR_COL:
01619 n = mat->ncols;
01620 ptr = mat->colptr;
01621 val = mat->colval;
01622
01623 if (mat->csums)
01624 gk_free((void **)&mat->csums, LTERM);
01625
01626 sums = mat->csums = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: sums");
01627 break;
01628 default:
01629 gk_errexit(SIGERR, "Invalid sum type of %d.\n", what);
01630 return;
01631 }
01632
01633 #pragma omp parallel for if (ptr[n] > OMPMINOPS) schedule(static)
01634 for (i=0; i<n; i++)
01635 sums[i] = gk_fsum(ptr[i+1]-ptr[i], val+ptr[i], 1);
01636 }
01637
01638
01639
01645
01646 void gk_csr_ComputeSquaredNorms(gk_csr_t *mat, int what)
01647 {
01648 ssize_t i;
01649 int n;
01650 ssize_t *ptr;
01651 float *val, *norms;
01652
01653 switch (what) {
01654 case GK_CSR_ROW:
01655 n = mat->nrows;
01656 ptr = mat->rowptr;
01657 val = mat->rowval;
01658
01659 if (mat->rnorms) gk_free((void **)&mat->rnorms, LTERM);
01660
01661 norms = mat->rnorms = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: norms");
01662 break;
01663 case GK_CSR_COL:
01664 n = mat->ncols;
01665 ptr = mat->colptr;
01666 val = mat->colval;
01667
01668 if (mat->cnorms) gk_free((void **)&mat->cnorms, LTERM);
01669
01670 norms = mat->cnorms = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: norms");
01671 break;
01672 default:
01673 gk_errexit(SIGERR, "Invalid norm type of %d.\n", what);
01674 return;
01675 }
01676
01677 #pragma omp parallel for if (ptr[n] > OMPMINOPS) schedule(static)
01678 for (i=0; i<n; i++)
01679 norms[i] = gk_fdot(ptr[i+1]-ptr[i], val+ptr[i], 1, val+ptr[i], 1);
01680 }
01681
01682
01683
01696
01697 float gk_csr_ComputeSimilarity(gk_csr_t *mat, int i1, int i2, int what, int simtype)
01698 {
01699 int nind1, nind2;
01700 int *ind1, *ind2;
01701 float *val1, *val2, stat1, stat2, sim;
01702
01703 switch (what) {
01704 case GK_CSR_ROW:
01705 if (!mat->rowptr)
01706 gk_errexit(SIGERR, "Row-based view of the matrix does not exists.\n");
01707 nind1 = mat->rowptr[i1+1]-mat->rowptr[i1];
01708 nind2 = mat->rowptr[i2+1]-mat->rowptr[i2];
01709 ind1 = mat->rowind + mat->rowptr[i1];
01710 ind2 = mat->rowind + mat->rowptr[i2];
01711 val1 = mat->rowval + mat->rowptr[i1];
01712 val2 = mat->rowval + mat->rowptr[i2];
01713 break;
01714
01715 case GK_CSR_COL:
01716 if (!mat->colptr)
01717 gk_errexit(SIGERR, "Column-based view of the matrix does not exists.\n");
01718 nind1 = mat->colptr[i1+1]-mat->colptr[i1];
01719 nind2 = mat->colptr[i2+1]-mat->colptr[i2];
01720 ind1 = mat->colind + mat->colptr[i1];
01721 ind2 = mat->colind + mat->colptr[i2];
01722 val1 = mat->colval + mat->colptr[i1];
01723 val2 = mat->colval + mat->colptr[i2];
01724 break;
01725
01726 default:
01727 gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
01728 return 0.0;
01729 }
01730
01731
01732 switch (simtype) {
01733 case GK_CSR_COS:
01734 case GK_CSR_JAC:
01735 sim = stat1 = stat2 = 0.0;
01736 i1 = i2 = 0;
01737 while (i1<nind1 && i2<nind2) {
01738 if (i1 == nind1) {
01739 stat2 += val2[i2]*val2[i2];
01740 i2++;
01741 }
01742 else if (i2 == nind2) {
01743 stat1 += val1[i1]*val1[i1];
01744 i1++;
01745 }
01746 else if (ind1[i1] < ind2[i2]) {
01747 stat1 += val1[i1]*val1[i1];
01748 i1++;
01749 }
01750 else if (ind1[i1] > ind2[i2]) {
01751 stat2 += val2[i2]*val2[i2];
01752 i2++;
01753 }
01754 else {
01755 sim += val1[i1]*val2[i2];
01756 stat1 += val1[i1]*val1[i1];
01757 stat2 += val2[i2]*val2[i2];
01758 i1++;
01759 i2++;
01760 }
01761 }
01762 if (simtype == GK_CSR_COS)
01763 sim = (stat1*stat2 > 0.0 ? sim/sqrt(stat1*stat2) : 0.0);
01764 else
01765 sim = (stat1+stat2-sim > 0.0 ? sim/(stat1+stat2-sim) : 0.0);
01766 break;
01767
01768 case GK_CSR_MIN:
01769 sim = stat1 = stat2 = 0.0;
01770 i1 = i2 = 0;
01771 while (i1<nind1 && i2<nind2) {
01772 if (i1 == nind1) {
01773 stat2 += val2[i2];
01774 i2++;
01775 }
01776 else if (i2 == nind2) {
01777 stat1 += val1[i1];
01778 i1++;
01779 }
01780 else if (ind1[i1] < ind2[i2]) {
01781 stat1 += val1[i1];
01782 i1++;
01783 }
01784 else if (ind1[i1] > ind2[i2]) {
01785 stat2 += val2[i2];
01786 i2++;
01787 }
01788 else {
01789 sim += gk_min(val1[i1],val2[i2]);
01790 stat1 += val1[i1];
01791 stat2 += val2[i2];
01792 i1++;
01793 i2++;
01794 }
01795 }
01796 sim = (stat1+stat2-sim > 0.0 ? sim/(stat1+stat2-sim) : 0.0);
01797
01798 break;
01799
01800 case GK_CSR_AMIN:
01801 sim = stat1 = stat2 = 0.0;
01802 i1 = i2 = 0;
01803 while (i1<nind1 && i2<nind2) {
01804 if (i1 == nind1) {
01805 stat2 += val2[i2];
01806 i2++;
01807 }
01808 else if (i2 == nind2) {
01809 stat1 += val1[i1];
01810 i1++;
01811 }
01812 else if (ind1[i1] < ind2[i2]) {
01813 stat1 += val1[i1];
01814 i1++;
01815 }
01816 else if (ind1[i1] > ind2[i2]) {
01817 stat2 += val2[i2];
01818 i2++;
01819 }
01820 else {
01821 sim += gk_min(val1[i1],val2[i2]);
01822 stat1 += val1[i1];
01823 stat2 += val2[i2];
01824 i1++;
01825 i2++;
01826 }
01827 }
01828 sim = (stat1 > 0.0 ? sim/stat1 : 0.0);
01829
01830 break;
01831
01832 default:
01833 gk_errexit(SIGERR, "Unknown similarity measure %d\n", simtype);
01834 return -1;
01835 }
01836
01837 return sim;
01838
01839 }
01840
01841
01842
01868
01869 int gk_csr_GetSimilarRows(gk_csr_t *mat, int nqterms, int *qind,
01870 float *qval, int simtype, int nsim, float minsim, gk_fkv_t *hits,
01871 int *i_marker, gk_fkv_t *i_cand)
01872 {
01873 ssize_t i, ii, j, k;
01874 int nrows, ncols, ncand;
01875 ssize_t *colptr;
01876 int *colind, *marker;
01877 float *colval, *rnorms, mynorm, *rsums, mysum;
01878 gk_fkv_t *cand;
01879
01880 if (nqterms == 0)
01881 return 0;
01882
01883 nrows = mat->nrows;
01884 ncols = mat->ncols;
01885 colptr = mat->colptr;
01886 colind = mat->colind;
01887 colval = mat->colval;
01888
01889 marker = (i_marker ? i_marker : gk_ismalloc(nrows, -1, "gk_csr_SimilarRows: marker"));
01890 cand = (i_cand ? i_cand : gk_fkvmalloc(nrows, "gk_csr_SimilarRows: cand"));
01891
01892 switch (simtype) {
01893 case GK_CSR_COS:
01894 for (ncand=0, ii=0; ii<nqterms; ii++) {
01895 i = qind[ii];
01896 if (i < ncols) {
01897 for (j=colptr[i]; j<colptr[i+1]; j++) {
01898 k = colind[j];
01899 if (marker[k] == -1) {
01900 cand[ncand].val = k;
01901 cand[ncand].key = 0;
01902 marker[k] = ncand++;
01903 }
01904 cand[marker[k]].key += colval[j]*qval[ii];
01905 }
01906 }
01907 }
01908 break;
01909
01910 case GK_CSR_JAC:
01911 for (ncand=0, ii=0; ii<nqterms; ii++) {
01912 i = qind[ii];
01913 if (i < ncols) {
01914 for (j=colptr[i]; j<colptr[i+1]; j++) {
01915 k = colind[j];
01916 if (marker[k] == -1) {
01917 cand[ncand].val = k;
01918 cand[ncand].key = 0;
01919 marker[k] = ncand++;
01920 }
01921 cand[marker[k]].key += colval[j]*qval[ii];
01922 }
01923 }
01924 }
01925
01926 rnorms = mat->rnorms;
01927 mynorm = gk_fdot(nqterms, qval, 1, qval, 1);
01928
01929 for (i=0; i<ncand; i++)
01930 cand[i].key = cand[i].key/(rnorms[cand[i].val]+mynorm-cand[i].key);
01931 break;
01932
01933 case GK_CSR_MIN:
01934 for (ncand=0, ii=0; ii<nqterms; ii++) {
01935 i = qind[ii];
01936 if (i < ncols) {
01937 for (j=colptr[i]; j<colptr[i+1]; j++) {
01938 k = colind[j];
01939 if (marker[k] == -1) {
01940 cand[ncand].val = k;
01941 cand[ncand].key = 0;
01942 marker[k] = ncand++;
01943 }
01944 cand[marker[k]].key += gk_min(colval[j], qval[ii]);
01945 }
01946 }
01947 }
01948
01949 rsums = mat->rsums;
01950 mysum = gk_fsum(nqterms, qval, 1);
01951
01952 for (i=0; i<ncand; i++)
01953 cand[i].key = cand[i].key/(rsums[cand[i].val]+mysum-cand[i].key);
01954 break;
01955
01956
01957 case GK_CSR_AMIN:
01958 for (ncand=0, ii=0; ii<nqterms; ii++) {
01959 i = qind[ii];
01960 if (i < ncols) {
01961 for (j=colptr[i]; j<colptr[i+1]; j++) {
01962 k = colind[j];
01963 if (marker[k] == -1) {
01964 cand[ncand].val = k;
01965 cand[ncand].key = 0;
01966 marker[k] = ncand++;
01967 }
01968 cand[marker[k]].key += gk_min(colval[j], qval[ii]);
01969 }
01970 }
01971 }
01972
01973 mysum = gk_fsum(nqterms, qval, 1);
01974
01975 for (i=0; i<ncand; i++)
01976 cand[i].key = cand[i].key/mysum;
01977 break;
01978
01979 default:
01980 gk_errexit(SIGERR, "Unknown similarity measure %d\n", simtype);
01981 return -1;
01982 }
01983
01984
01985 for (j=0, i=0; i<ncand; i++) {
01986 marker[cand[i].val] = -1;
01987 if (cand[i].key >= minsim)
01988 cand[j++] = cand[i];
01989 }
01990 ncand = j;
01991
01992 if (nsim == -1 || nsim >= ncand) {
01993 nsim = ncand;
01994 }
01995 else {
01996 nsim = gk_min(nsim, ncand);
01997 gk_dfkvkselect(ncand, nsim, cand);
01998 gk_fkvsortd(nsim, cand);
01999 }
02000
02001 gk_fkvcopy(nsim, cand, hits);
02002
02003 if (i_marker == NULL)
02004 gk_free((void **)&marker, LTERM);
02005 if (i_cand == NULL)
02006 gk_free((void **)&cand, LTERM);
02007
02008 return nsim;
02009 }
02010