// File:  preproc.C 
// Created by: Pradeep Shenoy (purdy@cse.iitb.ernet.in)
// Last modified: 10 June 1999
//
// Description: 
// 	Performs the level-1 and level-2 counting, and writes the first-level
// snakes (in mixed fashion) to disk. Returns the large 2-itemsets in an
// array of Cand_t structures to the calling function.
//  

#include "include/global.h"
#include "include/snakes.h"
#include "include/largeset.h"
#include "include/tidheap.h"
#include <fstream.h>

// Shared itemset container, declared extern here. This file inserts every
// large 1-itemset and every large 2-itemset it finds into it.
extern LargeSet_t *Large;  // Put all 1- and 2-large itemsets here.

// Input interface to the database flat file. preprocess() calls this
// repeatedly, treating a return value > 0 as the number of item ids
// stored in `items` and any other value as the end of the input.
extern  int get_tuple(ifstream& inpfd, int *items);

// Pass over the snakes for 1-large items, and count level2 candidates.
// Defined further down in this file; declared here so that preprocess()
// can call it.
Cand_t *CountLevelTwo(int *largeItems, int nitems, int *nlarge2);


// Runs the first database pass and then hands over to the level-2 count.
//
// Reads every tuple of DATAFILE through get_tuple(), counts how often each
// item occurs, and appends the tuple's TID to that item's write snake, which
// is set up on the file STORDATA. Items whose count satisfies
// Global::hasMinSup() are inserted into Large as 1-itemsets and passed on to
// CountLevelTwo().
//
// Side effects: creates/truncates STORDATA, sets Global::MAXTID to the
// number of tuples read, prints the level-1 count on cout, and calls exit(1)
// if either file cannot be opened or a tuple holds an item id outside
// [0, Global::MAXSN).
//
// nlarge2: out-parameter, forwarded to CountLevelTwo(), which stores the
//          number of entries of the returned array in it.
// Returns: the array produced by CountLevelTwo().
extern Cand_t * preprocess(int * nlarge2){

    // Per-item occurrence counters, indexed by item id.
    int sncount[Global::MAXSN];

    // Open the snake storage file for writing. Every WSN below is handed
    // this descriptor, so a failed open must stop the run right here rather
    // than be passed on as a negative descriptor.
    int fd = open(STORDATA, O_RDWR|O_CREAT|O_TRUNC, 0600);
    if (fd < 0)
    {
        cout << "Couldn't open snake storage file: " << STORDATA << endl;
        exit(1);
    }

    // Initialise one write snake per possible item id.
    int l1_counter = Global::MAXSN; // common counter for all WSN instances.
    WSN *snakes = new WSN[Global::MAXSN];
    for(int item = 0; item < Global::MAXSN; item++){
        snakes[item].init_write(fd, item, &l1_counter);
        sncount[item] = 0;
    }

    //---- added by Vikram
    ifstream inp_fd;

#ifdef __GNUC__
    inp_fd.open(DATAFILE, ios::in|ios::binary);
#else //SPARCompiler doesn't support this
    inp_fd.open(DATAFILE);
#endif
    if (! inp_fd)
    {
        cout << "Couldn't open database file: " << DATAFILE << endl;
        exit(1);
    }

    //---- added by Vikram
// Original code commented by Vikram for working with Linux
//    int inp_fd = open(DATAFILE, O_RDONLY);


    // Do the first pass over the database -- count 1-items.
    // TIDs are handed out in reading order, starting at 1.
    int nitems, trans_ID = 0;
    int items[Global::MAX_TLEN + 100]; // Safety factor
    while( (nitems = get_tuple(inp_fd, items)) > 0){
        trans_ID++;
        for (int i = 0; i < nitems; i++){
            const int id = items[i];

            // The id indexes two arrays of Global::MAXSN entries; reject
            // anything outside that range instead of writing out of bounds.
            if (id < 0 || id >= Global::MAXSN)
            {
                cout << "Item id out of range in transaction "
                     << trans_ID << ": " << id << endl;
                exit(1);
            }

            sncount[id]++;
            snakes[id].addToSnake(trans_ID);
        }
    }
    Global::MAXTID = trans_ID;	// Number of tuples in the databases.

    // The database is not read again in this function; release it before
    // the level-2 pass instead of holding it until return.
    inp_fd.close();


    // Find the large items of level 1. Large ones are recorded in Large and
    // collected, in increasing id order, for the level-2 pass; for all
    // others DiscardLastBuffer() is called on their snake.
    int nlarge1 = 0;
    int largeItems[Global::MAXSN];
    for(int item = 0; item < Global::MAXSN; item++){
        if(Global::hasMinSup(sncount[item])){
            Large -> Insert(&item, 1, sncount[item]);
            largeItems[nlarge1++] = item;
        }
        else {
            snakes[item].DiscardLastBuffer();
        }
    }

    cout << "Level1: " << nlarge1 << endl;

    // NOTE(review): the snakes are destroyed before the descriptor is
    // closed, presumably because a WSN may still use fd on destruction --
    // keep this order.
    delete [] snakes;
    close(fd);

    // Count level2 candidates using the written out snakes.
    return CountLevelTwo(largeItems, nlarge1, nlarge2);
}


// Adds one to l2count[tuple[i]][tuple[j]] for every pair i < j among the
// first tlen entries of tuple, i.e. counts every pair of entries collected
// for one reconstructed transaction.
static void CountTuplePairs(int **l2count, const int *tuple, int tlen){
    for (int i = 0; i < tlen; i++)
        for (int j = i+1; j < tlen; j++)
            l2count[tuple[i]][tuple[j]]++;
}


// Counts the level-2 candidates. This is done by maintaining an l1 x l1
// array of counters, where l1 = number of large single items. The function
// makes one pass over the snakes of these large items, merges them by TID to
// recreate the tuples, and counts all pairs of large items in each tuple.
// If the number of candidates is really large, the counter array would have
// to be partitioned and multiple passes made over the snakes. This is
// currently not implemented; the assert below caps nitems instead.
//
// largeItems: ids of the large single items; entry i selects the snake
//             that reader i is initialised with.
// nitems:     number of entries in largeItems (must be < 6000).
// nlarge2:    out-parameter, receives the number of entries in the
//             returned array.
// Returns:    a new[]-allocated array of *nlarge2 Cand_t entries, one per
//             large 2-itemset. All `itemset` pointers point into a single
//             shared new[]-allocated int array, so they must not be freed
//             one by one. Every returned itemset is also inserted into Large.
//
// Calls exit(1) if STORDATA cannot be opened for reading.
Cand_t *CountLevelTwo(int *largeItems, int nitems, int *nlarge2){

    assert(nitems < 6000);  // Will run out of memory otherwise.

    // Open the snake file written during the first pass. All readers are
    // handed this descriptor, so give up immediately if it is not available.
    int inp_fd = open(STORDATA, O_RDONLY);
    if (inp_fd < 0)
    {
        cout << "Couldn't open snake storage file: " << STORDATA << endl;
        exit(1);
    }

    // Initialise the read snakes and seed the heap with each snake's
    // first TID. Heap entries are keyed by reader index, not by item id.
    RSN *snakes = new RSN[nitems];
    TidHeap tidheap(nitems);
    for (int i = 0; i < nitems; i++){
        snakes[i].init_read(inp_fd, largeItems[i]);
        tidheap.insert(i, snakes[i].getNextTid());
    }

    // Initialise the counter array: one contiguous nitems x nitems block,
    // with l2count[i] pointing at the start of row i.
    int **l2count = new int *[nitems];
    int *a = new int[nitems*nitems];
    for (int i = 0; i < nitems; i++){
        l2count[i] = &a[nitems*i];
        for (int j = 0; j < nitems; j++)
            l2count[i][j] = 0;
    }

    // The merge works this way: the heap holds the latest TID entry of each
    // snake. At each step the least-TID entry is deleted and its snake
    // (reader index) is added to the tuple being rebuilt for that TID; then
    // the snake's next TID, if any, is put back into the heap. Every time a
    // tuple is completed, all pairs of entries in it are counted.
    //
    // Entries of one tuple are stored in the order the heap delivers them,
    // so a pair may be counted in either [i][j] or [j][i]; the two cells are
    // added together when the counts are read below.

    int tuple[Global::MAX_TLEN+100];
    int tlen = 0, lastTID = 0;
    heapbuc_t buc = tidheap.deleteMIN();
    while (buc.TID > 0){
        if (buc.TID != lastTID){
            // A new transaction starts: flush the finished one. On the
            // very first entry tlen is 0 and nothing is counted.
            CountTuplePairs(l2count, tuple, tlen);
            tlen = 0;
            lastTID = buc.TID;
        }

        // Guard the fixed-size tuple buffer.
        assert(tlen < Global::MAX_TLEN + 100);
        tuple[tlen++] = buc.readid;

        const int TID = snakes[buc.readid].getNextTid();
        if (TID > 0)
            tidheap.insert(buc.readid, TID);

        buc = tidheap.deleteMIN();
    }

    // The last transaction -- not flushed inside the loop.
    CountTuplePairs(l2count, tuple, tlen);

    // We're done with the snakes.
    delete [] snakes;
    close(inp_fd);


    // First find how many large 2 itemsets there are.
    int l2large = 0;
    for (int i = 0; i < nitems; i++)
        for (int j = i+1; j < nitems; j++)
            if (Global::hasMinSup(l2count[i][j] + l2count[j][i]))
                l2large++;


    // Fill in the array of large 2 itemsets -- to be returned.
    // Short note: $tmpint$ is a "snaked" array, i.e., bits of it are used
    // as small arrays in each candidate-struct. This saves several
    // mallocs--one for each of these small arrays.

    Cand_t *l2array = new Cand_t[l2large];
    int *tmpint = new int[l2large*2];
    int k = 0, count = 0;
    for (int i = 0; i < nitems; i++)
        for (int j = i+1; j < nitems; j++){
            const int freq = l2count[i][j] + l2count[j][i];
            if (!Global::hasMinSup(freq))
                continue;

            l2array[count].itemset = &tmpint[k];
            l2array[count].frequency = freq;
            tmpint[k++] = l2array[count].genID1 = largeItems[i];
            tmpint[k++] = l2array[count].genID2 = largeItems[j];

            Large -> Insert(l2array[count].itemset, 2,
                            l2array[count].frequency);

            count++;
        }

    delete [] a;
    delete [] l2count;

    *nlarge2 = l2large;
    return l2array;
}

