001: /*
002:  * Copyright (C) 2002 Jeffrey R. Hoy <jrh26@cornell.edu>
003:  * 
004:  * This program is free software; you can redistribute it and/or
005:  * modify it under the terms of the GNU General Public License
006:  * as published by the Free Software Foundation; either version 2
007:  * of the License, or (at your option) any later version.
008:  * 
009:  * This program is distributed in the hope that it will be useful,
010:  * but WITHOUT ANY WARRANTY; without even the implied warranty of
011:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
012:  * GNU General Public License for more details.
013:  * 
014:  * You should have received a copy of the GNU General Public License
015:  * along with this program; if not, write to the Free Software
016:  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
017:  */
018: 
019: 
020: 
021: #include <stdio.h>
022: #include <fstream.h>
023: #include <stdlib.h>
024: #include <string.h>
025: 
026: #include <math.h>
027: #include <time.h>
028: 
029: // Parameters
030: typedef /*unsigned __int64*/ int DWORD64; // change this to use 64-bit values
031: const double percentReads = 0.95; // amount of data to cover
032: int cutoff = 50; // max sequence length
033: const int numFiles = 500; // number of files to build at once
034: #define BUFF_SIZE 1500000 // DWORD64s
035: #define HASH_SIZE 1048576 // integers
036: #define LISTBLOCK_SIZE 1048576 // integers
037: const int writeBuffSize = 20000;
038: #define MAX_ASTERIK 0
039: #define ASTERIK 99999999 // special number to mark '*'
040: 
041: // Global variables
042: int minSupport;
043: int minSupportTotal;
044: int* globalBlock;  // final stat information
045: int blockSize = 0;
046: int blockAlloc = 0;
047: int blockMax = 10000; // initial num lines in global stat block
048: const int lineLength = cutoff + 2; // cutoff for sequence, one for count, one for length
049: int globalSequenceLength; // used for binary search
050: #define SYNTAX "Syntax:\n  mine.exe datafile outfile minSupportIndividual minSupportTotal "
051: 
052: int checkAndMerge(int* line1, int* line2, int len);
053: void addBlockData(int length, int count, int* label);
054: 
055: 
056: class Sequence {
057: 
058: private:
059: 
060:         int sequenceLength;
061:         int numSequences;
062:         DWORD64* data;
063:         char* fileName;
064: 
065: 
066: public:
067: 
068:         Sequence(char* fileNameIn); // constructor for length 1
069:         Sequence(Sequence* seqLenN, Sequence* seqLen1, char* fileNameIn); // constructor for length >1
070:         ~Sequence() {
071:                 if (numSequences > 1 && data != NULL)
072:                         delete [] data;
073:         }
074: 
075:         int getNumSequences() {
076:                 return numSequences;
077:         }
078: 
079:         
080:         int getSequenceLength() {
081:                 return sequenceLength;
082:         }
083: 
084:         const DWORD64* getData() {
085:                 return data;
086:         }
087: 
088:         void addToBlock();
089: 
090: };
091: 
092: 
093: 
094: 
095: // Comparison for size 1 sorting, smaller first
096: int compare1(const void* a, const void* b) {
097:         if (*(DWORD64*)a > *(DWORD64*)b)
098:                 return 1;
099:         else if (*(DWORD64*)a == *(DWORD64*)b)
100:                 return 0;
101:         else
102:                 return -1;
103: }
104: 
105: 
106: 
107: // Comparison for global block sorting, larger first
108: int compare2(const void* a, const void* b) {
109:         return ((int*)b)[cutoff] - ((int*)a)[cutoff]; // switch (ret positive) if b is larger
110: }
111: 
112: 
113: // Comparison for finding most significant loads, larger first
114: int compare3(const void* a, const void* b) {
115:         if (*(DWORD64*)a < *(DWORD64*)b)
116:                 return 1;
117:         else if (*(DWORD64*)a == *(DWORD64*)b)
118:                 return 0;
119:         else
120:                 return -1;
121: }
122: 
123: // For binary search on sorted sequences
124: int search1(const void* a, const void* b) {
125:         int i;
126:         for (i = 0; i < globalSequenceLength; i++) {
127:                 if ( ((DWORD64*)a)[i] < ((DWORD64*)b)[i] )
128:                         return -1;
129:                 else if ( ((DWORD64*)a)[i] > ((DWORD64*)b)[i] )
130:                         return 1;
131:         }
132: 
133:         return 0;
134: }
135: 
136: 
137: 
138: 
139: // Builds all sequences of length 1
140: Sequence :: Sequence(char* fileNameIn) {
141:         int index;
142:         int intsRead;
143:         int listIndex = 0;
144:         int numValues = 0;
145:         int check;
146: 
147:         sequenceLength = 1;
148:         fileName = fileNameIn;
149: 
150:         // probabilistic counting:
151:         //   hash everthing into large buckets,
152:         //   mark buckets that have more than minimum support
153:         //   hash again, make list of address that might have min support and counts
154:         //   sort and pack
155: 
156:         DWORD64* hashBlock = new DWORD64 [HASH_SIZE];
157:         if (hashBlock == NULL) { cout << "Insufficient Memory for hashBlock"; return; }
158:         memset(hashBlock, 0, HASH_SIZE*sizeof(DWORD64));
159: 
160:         ifstream* traceFile = new ifstream(fileName, ios::in | ios::binary);
161:         if (!traceFile) { cout << "Unable to open trace file" << endl; return; }
162: 
163:         DWORD64* readBuff = new DWORD64 [BUFF_SIZE];
164:         if (readBuff == NULL) { cout << "Insufficient Memory for readBuff" << endl; return; }
165: 
166:         while (!traceFile->eof()) {
167:                 traceFile->read((char*)readBuff, BUFF_SIZE*sizeof(DWORD64));
168: 
169:                 intsRead = traceFile->gcount() / sizeof(DWORD64);
170: 
171:                 for (index = 0; index < intsRead; index++) {
172:                         if (readBuff[index] % HASH_SIZE > HASH_SIZE || readBuff[index] % HASH_SIZE < 0) {
173:                                 cout << "Problem with mod and hashing the data:" << endl;
174:                                 cout << *(int*)&readBuff[index] << " " << *(((int*)&readBuff[index])+1)<< endl;
175:                                 cout << index << endl;
176:                         } else
177:                         hashBlock[readBuff[index] % HASH_SIZE]++;
178:                 }
179:         }
180:         traceFile->close();
181: 
182:         traceFile->open(fileName, ios::in | ios::binary);
183:         if (!traceFile) { cout << "Unable to open trace file" << endl; return; }
184: 
185:         DWORD64* listBlock = new DWORD64 [LISTBLOCK_SIZE*2];
186:         if (listBlock == NULL) { cout << "Insufficient Memory for listBlock, change LISTBLOCK_SIZE" << endl; return; }
187:         memset(listBlock, 0, LISTBLOCK_SIZE*2*sizeof(DWORD64));
188: 
189:         while (!traceFile->eof()) {
190:                 traceFile->read((char*)readBuff, BUFF_SIZE*sizeof(DWORD64));
191: 
192:                 intsRead = traceFile->gcount() / sizeof(DWORD64);
193: 
194:                 for (index = 0; index < intsRead; index++) {
195:                         if (hashBlock[readBuff[index] % HASH_SIZE] >= minSupport) {
196:                                 int found = 0;
197:                                 for (check = 0; check < listIndex; check++) {
198:                                         if (listBlock[check*2] == readBuff[index]) {
199:                                                 found = 1;
200:                                                 listBlock[check*2+1]++;
201:                                         }
202:                                 }
203:                                 if (!found) {
204:                                         listBlock[listIndex*2] = readBuff[index];
205:                                         listBlock[listIndex*2+1]++;
206:                                         listIndex++;
207:                                         if (listIndex == 3000) {
208:                                                 cout << "3000 sequences length 1 thus far, this might get slow" << endl;
209:                                         }
210:                                         if (listIndex >= LISTBLOCK_SIZE) {
211:                                                 cout << "Need more memory for listBlock... Program exit" << endl;
212:                                         }
213:                                 }
214: 
215:                         }
216:                 }
217:         }
218:         traceFile->close();
219:         delete [] readBuff;
220: 
221: 
222:         // pack it up
223:         numSequences = 0;
224:         for (index = 0; index < listIndex; index++) 
225:                 if (listBlock[index*2+1] >= minSupport) 
226:                         numSequences++;
227:         data = new DWORD64[numSequences*2*sizeof(DWORD64)];
228:         if (data == NULL) { cout << "Insufficient Memory for data" << endl; return; }
229:         memset(data, 0, numSequences*2*sizeof(DWORD64));
230: 
231:         // pack it up
232:         numSequences = 0;
233:         for (index = 0; index < listIndex; index++) {
234:                 if (listBlock[index*2+1] >= minSupport) {
235:                         data[numSequences*2] = listBlock[index*2];
236:                         data[numSequences*2+1] = listBlock[index*2+1];
237:                         numSequences++;
238:                 }
239:         }
240: 
241: #if MAX_ASTERIK > 0
242:         // Added code for wildcard
243:         data[numSequences*2] = ASTERIK;
244:         numSequences++;
245: #endif
246: 
247:         // sort in increasing order
248:         qsort(data, numSequences, sizeof(DWORD64)*2, compare1);
249: 
250: cout << "Num potential sequences len1 is " << numSequences << endl;
251: 
252:         delete [] hashBlock;
253:         delete [] listBlock;
254:         delete traceFile;
255: }
256: 
257: 
258: 
259: 
260: 
261: 
262: 
263: // Builds all sequences of length n+1
264: Sequence :: Sequence(Sequence* seqLenN, Sequence* seqLen1, char* fileNameIn) {
265: 
266:         int i,j,k,h;
267: 
268:         int lenSeqN = seqLenN->getSequenceLength();
269:         int numSeqN = seqLenN->getNumSequences();
270:         int numSeq = 0;
271:         const DWORD64* seqLenNData = seqLenN->getData();
272: 
273:         sequenceLength = lenSeqN+1;
274:         fileName = fileNameIn;
275: 
276:         // build larger sequences
277:         // just get count this time
278:         numSeq = 0;
279:         for (i = 0; i < numSeqN; i++) {
280:                 for (j = 0; j < numSeqN; j++) {
281:                         if (memcmp(&seqLenNData[i*(lenSeqN+1)+1], &seqLenNData[j*(lenSeqN+1)], (lenSeqN-1)*sizeof(DWORD64)) == 0) { // inner match found
282:                                 numSeq++;
283:                         }
284:                 }
285:         }
286: 
287:         DWORD64* tempData = new DWORD64[numSeq * (sequenceLength+1)];
288:         if (tempData == NULL) { cout << "Insufficient Memory2" << endl; exit(1); }
289:         memset(tempData, 0, sizeof(DWORD64)*numSeq*(sequenceLength+1));
290: 
291:         // now actually build
292:         numSeq = 0;
293:         for (i = 0; i < numSeqN; i++) {
294:                 for (j = 0; j < numSeqN; j++) {
295:                         if (memcmp(&seqLenNData[i*(lenSeqN+1)+1], &seqLenNData[j*(lenSeqN+1)], (lenSeqN-1)*sizeof(DWORD64)) == 0) { // inner match found
296: 
297:                                 memcpy(&tempData[numSeq*(sequenceLength+1)], &seqLenNData[i*(lenSeqN+1)], lenSeqN*sizeof(DWORD64));
298:                                 tempData[numSeq*(sequenceLength+1) + (sequenceLength-1)] = seqLenNData[j*(lenSeqN+1)+(lenSeqN-1)];
299: 
300:                                 int astcount = 0;
301:                                 for (k = 0; k < sequenceLength; k++) {
302:                                         if (tempData[numSeq*(sequenceLength+1)+k] == ASTERIK)
303:                                                 astcount++;
304:                                 }
305: 
306:                                 if (astcount <= MAX_ASTERIK) // accept new sequence
307:                                         numSeq++;
308:                         }
309:                 }
310:         }
311: 
312:         cout << "Num potential sequences length "<<sequenceLength<<" is " << numSeq << endl;
313: 
314:         ifstream* traceFile = new ifstream(fileName, ios::in | ios::binary);
315:         if (traceFile->fail()) { cout << "Unable to open trace file, program exit" << endl;        return;        }
316: 
317:         int intsRead = 0;
318: 
319:         DWORD64* buff = new DWORD64 [BUFF_SIZE];
320:         if (buff == NULL) { cout << "Insufficient Memory for readBuff" << endl;        return;        }
321: 
322:         while (!traceFile->eof()) {
323:                 traceFile->read((char*)(&buff[intsRead]), (BUFF_SIZE-intsRead)*sizeof(DWORD64));
324:                 intsRead += traceFile->gcount() / sizeof(DWORD64);
325: 
326:                 // new optimized code to handle wildcard searches
327:                 globalSequenceLength = sequenceLength; // so search knows number of elements
328:                 DWORD64* place = NULL;
329:                 DWORD64 temp1 = 0, temp2 = 0;
330: 
331:                 for (h = 0; h < intsRead-sequenceLength+1; h++) {
332: 
333:                         // using #'s because this is the hotspot
334:                         // This could be rewitten so handle any number of asteriks, but it would be more complex code
335:                         // The search space is too large for more than 2 asteriks, so it shouldn't matter
336:                         #if MAX_ASTERIK >= 3
337:                                 cout << "Only handling 2 asteriks for now" << endl;
338:                         #endif
339: 
340:                         #if MAX_ASTERIK >= 2
341:                                 for (i = 0; i < sequenceLength-1; i++) {
342:                                         temp1 = buff[h + i];
343:                                         buff[h + i] = ASTERIK;
344:                                         for (j = i+1; j < sequenceLength; j++) {
345:                                                 temp2 = buff[h + j];
346:                                                 buff[h + j] = ASTERIK;
347: 
348:                                                 place = (DWORD64*)bsearch(&buff[h], tempData, numSeq, sizeof(DWORD64)*(sequenceLength+1), search1);
349:                                                 if (place != NULL) // found
350:                                                         place[sequenceLength]++;
351: 
352:                                                 buff[h + j] = temp2;
353: 
354:                                         }
355:                                         buff[h + i] = temp1;
356:                                 }
357:                         #endif
358: 
359:                         #if MAX_ASTERIK >= 1
360:                                 //change so it only looks up if it misses the others?
361:                                 for (i = 0; i < sequenceLength; i++) {
362:                                         temp1 = buff[h + i];
363:                                         buff[h + i] = ASTERIK;
364: 
365:                                         place = (DWORD64*)bsearch(&buff[h], tempData, numSeq, sizeof(DWORD64)*(sequenceLength+1), search1);
366:                                         if (place != NULL) // found
367:                                                 place[sequenceLength]++;
368: 
369:                                         buff[h + i] = temp1;
370:                                 }
371:                         #endif
372: 
373:                         // also handle original search
374:                         place = (DWORD64*)bsearch(&buff[h], tempData, numSeq, sizeof(DWORD64)*(sequenceLength+1), search1);
375:                         if (place != NULL) // found
376:                                 place[sequenceLength]++;
377:                 }
378: 
379:                 int intsToCopy = sequenceLength-1;  // need to overlap the buffer
380:                 memcpy(&buff[0], &buff[intsRead-intsToCopy], (intsToCopy)*sizeof(DWORD64));
381:                 intsRead = intsToCopy;
382: 
383:         }
384: 
385:         traceFile->close();
386: 
387: 
388: 
389:         // pack the data by removing unsupported sequences
390:         numSequences = 0;
391:         for (i = 0; i < numSeq; i++) {
392:                 if (tempData[i*(sequenceLength+1) + sequenceLength] >= minSupport)
393:                         numSequences++;
394:         }
395: 
396:         data = new DWORD64 [numSequences * (sequenceLength+1)];
397:         if (data == NULL) { cout << "Insufficient Memory3" << endl;        return;        }
398: 
399:         numSequences = 0;
400:         for (i = 0; i < numSeq; i++) {
401:                 if (tempData[i*(sequenceLength+1) + sequenceLength] >= minSupport) {
402:                         memcpy(&data[numSequences * (sequenceLength+1)], &tempData[i * (sequenceLength+1)], (sequenceLength+1)*sizeof(DWORD64));
403:                         numSequences++;
404:                 }
405:         }
406: 
407:         delete [] buff; 
408:         delete [] tempData;
409:         delete traceFile;
410: 
411: }
412: 
413: 
414: 
415: 
416: void Sequence :: addToBlock() {
417:         int i,j,k;
418:         int found;
419:         int label[200] = {0};
420:         int labelCount = 0;
421: 
422:         for (i = 0; i < numSequences; i++) {
423:                 
424:                 // get labels for block line
425:                 labelCount = 0;
426:                 for (j = 0; j < sequenceLength; j++) {
427:                         found = 0;
428:                         for (k = 0; k < j; k++) {
429:                                 if (data[i*(sequenceLength+1) + j] == data[i*(sequenceLength+1) + k]) { // repeated element
430:                                         found = 1;
431:                                         label[j] = label[k];
432:                                         break;
433:                                 }
434:                         }
435:                         if (!found) {
436:                                 if (data[i*(sequenceLength+1) + j] == ASTERIK) {
437:                                         label[j] = ASTERIK;
438:                                 } else {
439:                                         labelCount++;
440:                                         label[j] = labelCount;
441:                                 }
442:                         }
443:                 }
444:                 addBlockData(sequenceLength, (int)data[i*(sequenceLength+1) + sequenceLength], label);
445:         }
446: 
447: }
448: 
449: 
450: 
451: 
452: 
453: 
454: 
455: 
456: 
457: 
458: // global block update
459: // Global block structure:
460: // blockMax lines each of length linelength
461: //  each line has 0 to cutoff labels, count, and length
462: //
463: 
464: void addBlockData(int length, int count, int* label) {
465: 
466:         int i;
467: 
468:         // search for line
469:         // could actually hash this if it is a bottleneck
470:         for (i = 0; i < blockSize; i++) {
471:                 if (memcmp(&globalBlock[i*lineLength], label, cutoff*sizeof(int)) == 0) {
472:                         globalBlock[i*lineLength + cutoff] += count;
473:                         return;
474:                 }
475:         }
476: 
477:         // not found
478:         memcpy(&globalBlock[blockSize*lineLength], label, cutoff*sizeof(int));
479:         globalBlock[blockSize*lineLength + cutoff] += count;
480:         globalBlock[blockSize*lineLength + cutoff + 1] = length;
481: 
482:         blockSize++;
483: 
484:         // check if will need more memory for the block
485:         if (blockSize*lineLength >= blockAlloc) {
486:                 cout << "Global info block full, reallocating..." << endl;
487: 
488:                 blockAlloc = blockAlloc * 4;
489:                 int* globalBlockNew = new int [blockAlloc];
490:                 if (globalBlockNew == NULL) { cout << "Insufficient memory for global information block, program exit" << endl; exit(1); }
491:                 cout << "Done reallocating1..." << endl;
492:                 memset(globalBlockNew, 0, blockAlloc*sizeof(int));
493:                 cout << "Done reallocating2..." << endl;
494: 
495:                 memcpy(globalBlockNew, globalBlock, blockSize*lineLength*sizeof(int));
496:                 cout << "Done reallocating3..." << endl;
497:                 delete [] globalBlock ;
498:                 globalBlock = globalBlockNew;
499: 
500:                 cout << "Done reallocating..." << endl;
501: 
502:         }
503: }
504: 
505: 
506: 
507: 
508: // Normalize pattern by starting on 'A'
509: void flatten(int* a, int len) {
510: 
511:         int i;
512:         int newcount = 1;
513:         int* change = new int[cutoff];
514: 
515:         for (i = 0; i < len; i++)
516:                 change[i] = -1;
517: 
518:         for (i = 0; i < len; i++) {
519:                 if (a[i] != ASTERIK) if (change[a[i]] == -1) {
520:                         change[a[i]] = newcount;
521:                         newcount++;
522:                 }
523:         }
524: 
525:         for (i = 0; i < len; i++) if (a[i] != ASTERIK) {
526:                 a[i] = change[a[i]];
527:         }
528: 
529:         delete [] change;
530: 
531: }
532: 
533: 
534: 
535: // See if pattern is same thing but shifted, if so merge
536: int checkAndMerge(int* line1, int* line2, int len) {
537: 
538:         int i,j;
539:         int found = 0;
540: 
541:         int* a = new int[len];
542:         int* b = new int[len];
543: 
544: 
545:         for (j = 0; j < len; j++)
546:                 b[j] = line2[j];
547: 
548:         for (i = 0; i < len; i++) {
549:                 for (j = 0; j < len; j++) {
550:                         a[j] = line1[(i+j) % len];
551:                 }
552:                 flatten(a, len);
553: 
554:                 found = 1;
555:                 for (j = 0; j < len; j++)
556:                         if (a[j] != b[j])
557:                                 found = 0;
558: 
559:                 if (found) { // take max times appearing
560:                         if (line1[cutoff] > line2[cutoff])
561:                                 line2[cutoff] = 0;
562:                         else
563:                                 line1[cutoff] = 0;
564:                         break;
565:                 }
566:         }
567: 
568:         delete [] a;
569:         delete [] b;
570: 
571:         return found;
572: 
573: }
574: 
575: 
576: 
577: // Formats results to outfile
578: void printBlockData(char* outFileName, char* dataFileName) {
579:         
580:         cout << "Global block size is " << blockSize << endl;
581: 
582:         int i,j,k;
583:         int templine[1000] = {0};
584: 
585:         // combine patterns that are really similiar, like AAAB and AABA and ABAA and ABBB
586:         for (i = 1; i <= cutoff; i++) {
587:                 for (j = 0; j < blockSize-1; j++) {
588:                         if (globalBlock[j*lineLength + cutoff + 1] == i && globalBlock[j*lineLength + cutoff] > 0)
589:                                 for (k = j+1; k < blockSize; k++) {
590:                                         // check that length is good
591:                                         if (globalBlock[k*lineLength + cutoff + 1] == i && globalBlock[k*lineLength + cutoff] > 0)
592:                                                 checkAndMerge(&globalBlock[j*lineLength], &globalBlock[k*lineLength], i);
593:                         }
594:                 }
595:         }
596: 
597:         // sorts the whole thing, ignoring length at this point
598:         qsort(globalBlock, blockSize, lineLength*sizeof(int), compare2);
599: 
600: 
601:         ofstream outFile;
602:         outFile.open(outFileName, ios::out | ios::trunc);
603:         if (outFile.fail()) {
604:                 cout << "Can't open output file: " << outFileName << endl;
605:                 return;
606:         }
607: 
608:         outFile << "Data File Name: " << dataFileName << endl;
609:         outFile << "Minimum support for each load: " << minSupport << endl;
610:         outFile << "Minimum support over all loads: " << minSupportTotal << endl;
611: 
612:         for (i = 1; i <= cutoff; i++) {
613:                 outFile << "Sequences Length " << i << endl;
614:                 for (j = 0; j < blockSize; j++) {
615:                         if (globalBlock[j*lineLength + cutoff + 1] == i && globalBlock[j*lineLength + cutoff] >= minSupportTotal) {
616:                                 outFile << "  Count: " << globalBlock[j*lineLength + cutoff] << " Sequence:";
617:                                 for (k = 0; k < i; k++) {
618:                                         if (globalBlock[j*lineLength + k] == ASTERIK)
619:                                                 outFile << " " << '*';
620:                                         else
621:                                                 outFile << " " << (char)((globalBlock[j*lineLength + k]-1) + 'A');
622:                                 }
623:                                 outFile << endl;
624:                         }
625:                 }
626:         }
627: 
628: 
629:         outFile.close();
630: 
631: }
632: 
633: 
634: 
635: 
636: // Analyzes a single load file, building sequences from 1 to cutoff
637: void Start(char* fileName) {
638: 
639:         Sequence* seqLenN = new Sequence(fileName);
640:         seqLenN->addToBlock();
641: 
642:         while (seqLenN->getNumSequences() > 0 && seqLenN->getSequenceLength() < cutoff) {
643:                 Sequence* seqLenNPlus1 = new Sequence(seqLenN, seqLenN, fileName);
644: 
645:                 delete seqLenN;
646:                 seqLenN = seqLenNPlus1;
647:                 seqLenN->addToBlock();
648:         }
649:         delete seqLenN;
650: 
651: }
652: 
653: 
654: 
655: 
656: 
657: 
658: 
659: void separateLoads(char* dataFileName, char* outFileName) {
660: 
661:         cout << "Execution Beginning" << endl;
662:         cout << "Max Sequence Length: " << cutoff << endl;
663: 
664:         int totalLoads = 0;
665:         int temp = 0;
666:         DWORD64 totalReads = 0;
667:         DWORD64 countReads = 0;
668: 
669:         int readsUsed = -1;
670:         int i;
671: 
672:         ifstream* traceFile = new ifstream(dataFileName, ios::in | ios::binary);
673:         if (!traceFile) {
674:                 cout << "Unable to open trace file, program exit" << endl;
675:                 return;
676:         }
677: 
678:         //
679:         // Obtain numbers of most significant loads
680:         // Variable loadGroup has an int for each load.  It stores the new file number if the
681:         //  load is significant.  Otherwise it stores 0.
682:         //
683:         
684:         traceFile->read((char*)&totalLoads, sizeof(int));
685:         cout << (int)totalLoads << " Total Loads" << endl;
686: 
687:         DWORD64* loadNums = new DWORD64[(int)totalLoads*2];
688:         if (loadNums == NULL) cout << "Not enough memory for loadNums" << endl;
689:         memset(loadNums, 0, sizeof(DWORD64)*(int)totalLoads*2);
690: 
691:         int* readBuff = new int [BUFF_SIZE*2];
692:         if (readBuff == NULL) cout << "Not enough memory for readBuff" << endl;
693:         while (!traceFile->eof()) {
694:                 traceFile->read((char*)readBuff, BUFF_SIZE*2*sizeof(int));
695: 
696:                 int valsRead = traceFile->gcount() / sizeof(DWORD64);
697: 
698:                 for (i = 0; i < valsRead; i+=2)
699:                         loadNums[readBuff[i]*2]++;
700:         }
701:         traceFile->close();
702: 
703:         for (i = 0; i < totalLoads; i++) {
704:                 loadNums[i*2+1] = i;
705:         }
706: 
707:         // sort, larger in front
708:         qsort(loadNums, (int)totalLoads, sizeof(DWORD64)*2, compare3);
709: 
710:         for (i = 0; i < totalLoads; i++)
711:                 totalReads+=loadNums[i*2];
712: 
713:         for (i = 0; i < totalLoads; i++) {
714:                 countReads+=loadNums[i*2];
715:                 if (((double)(int)countReads) / ((double)(int)totalReads) > percentReads) {
716:                         readsUsed = i+1;
717:                         break;
718:                 }
719:         }
720: 
721:         cout << readsUsed << " Loads Used" << endl;
722:         if (readsUsed == -1) { cout << "Bad number of loads.  Perhaps file does not exist?  Program exit." << endl;        exit(1); }
723:         DWORD64* loadGroup = new DWORD64 [(int)totalLoads];
724:         if (loadGroup == NULL) cout << "Not enough memory for loadGroup" << endl;
725:         memset(loadGroup, 0, sizeof(DWORD64)*(int)totalLoads);
726: 
727:         for (i = 0; i < readsUsed; i++)
728:                 loadGroup[loadNums[i*2+1]] = i+1;  // assign group numbers to loads that will be used
729:         delete [] loadNums;
730: 
731: 
732: 
733:         //
734:         // Build a file for each of the significant loads
735:         // and analyze the values from that load
736:         //
737: 
738:         int* count = new int[numFiles];
739:         if (count == NULL) cout << "Not enough memory for count" << endl;
740:         memset(count, 0, sizeof(int)*numFiles);
741: 
742:         DWORD64* buff = new DWORD64[numFiles*writeBuffSize];
743:         if (buff == NULL) { cout << "Not enough memory for buff" << endl; return; }
744:         memset(buff, 0, sizeof(DWORD64)*numFiles*writeBuffSize);
745:         
746:         int base = 0;
747:         while (base < readsUsed) { // take it a group of files at a time
748: 
749:                 memset(buff, 0, sizeof(DWORD64)*numFiles*writeBuffSize);
750:                 memset(count, 0, sizeof(int)*numFiles);
751: 
752:                 int max = base + numFiles;
753:                 if (readsUsed < max)
754:                         max = readsUsed;
755: 
756:                 cout << "Building loads " << base << " to " << max-1 << endl;
757: 
758:                 ofstream* tempFile = new ofstream[numFiles];
759:                 char filename[100] = {0};
760: 
761:                 for (i = 0; i < max-base; i++) {
762:                         sprintf(filename, "load%04d.data", i);
763:                         tempFile[i].open(filename, ios::out | ios::binary);
764:                 }
765:                 DWORD64 group;
766:                 DWORD64 mgroup;
767:                 traceFile->open(dataFileName, ios::in | ios::binary);
768:                 traceFile->read((char*)&temp, sizeof(int));
769:                  while (!traceFile->eof()) {
770:                         traceFile->read((char*)readBuff, BUFF_SIZE*2*sizeof(int));
771:                         int valsRead = traceFile->gcount() / sizeof(int);
772: 
773:                         for (i = 0; i < valsRead; i+=2) {
774:                                 if (loadGroup[readBuff[i]] > 0) { // used in file
775:                                         group = loadGroup[readBuff[i]]-1; // get load number
776:                                         if (base <= group && group < max) { // if file is being built
777:                                                 mgroup = group-base;
778:                                                 buff[mgroup*writeBuffSize + count[mgroup]] = readBuff[i+1];
779:                                                 count[mgroup]++;
780:                                                 if (count[mgroup] == writeBuffSize) {
781:                                                         tempFile[mgroup].write((char*)&buff[mgroup*writeBuffSize], sizeof(DWORD64)*count[mgroup]);
782:                                                         count[mgroup]=0;
783:                                                 }
784:                                         }
785:                                 }
786:                         }
787:                 }
788:                 traceFile->close();
789: 
790:                 for (i = 0; i < max-base; i++) {
791:                         tempFile[i].write((char*)&buff[i*writeBuffSize], sizeof(DWORD64)*count[i]);
792:                         tempFile[i].close();
793:                 }
794: 
795:                  for (i = 0; i < max-base; i++) {
796:                         sprintf(filename, "load%04d.data", i);
797:                         Start(filename);
798:                         cout << "Analysis of Load " << i+base << " completed" << endl;
799:                 }
800: 
801:                 delete [] tempFile;
802:                 base += numFiles;
803:         }
804: 
805:         delete [] loadGroup;
806:         delete [] readBuff;
807:         delete [] buff;
808:         delete [] count;
809:         delete traceFile;
810: 
811:         cout << "Printing block data to " << outFileName << " ..." << endl;
812: 
813:         printBlockData(outFileName, dataFileName);
814: 
815:         cout << "Execution Complete" << endl;
816:         return;
817: 
818: }
819: 
820: 
821: 
822: 
823: 
824: 
825: 
826: 
827: int main(int argc, char* argv[]) {
828: 
829:         if (argc != 5) { cout << SYNTAX << endl; return 1; }
830: 
831:         char* dataFileName = argv[1];
832:         char* outFileName = argv[2];
833: 
834:         minSupport = atoi(argv[3]);
835:         minSupportTotal = atoi(argv[4]);
836: 
837:         if (minSupport > minSupportTotal) { cout << SYNTAX << endl; return 1; }
838: 
839:         blockAlloc = blockMax * lineLength;
840:         globalBlock = new int [blockAlloc];
841:         if (globalBlock == NULL) { cout << "Insufficient memory for global information block" << endl; return 1; }
842:         memset(globalBlock, 0, blockAlloc*sizeof(int));
843: 
844:         separateLoads(dataFileName, outFileName);
845: 
846:         delete [] globalBlock;
847: 
848:         return 0;
849: 
850: }
851: 
852: