/***********************************************************************
 AUTHOR: Vikram Pudi
 DESCRIPTION:
     ARMOR: Association Rule Mining based on ORacle

    Copyright (C) 2003 Database Systems Lab, SERC, IISc, Bangalore.

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
***********************************************************************/
#include "data.h"
#include "tidsetdag.h"
#include "mymath.h" //for ceil()
#include <stdlib.h> //for atol()

int main(int argc, char *argv[])
{
    if (argc < 2)
    {
	cerr << "Usage: " << argv[0]
	     << " <dataset-file> <minsup> [<output-file>]" << endl;
	return 1;
    }

    Data db(argv[1]);
    long mincount = atol(argv[2]);
    char *nullfile = "/dev/null";
    char *outname = (argc < 3)? nullfile : argv[3];
    int partition_size = 100000;

    //---------- db pass to determine noRows and maxItem
//    cout << "\nPreprocessing..." << endl;
    Itemset iset;
    int noRows = 0, maxItem = -1;
    while (db >> iset)
    {
	noRows++;
	if (iset.size() > 0 && maxItem < iset.back())
	    maxItem = iset.back();
    }

//    cout << noRows << " transactions." << endl;
//    cout << maxItem + 1 << " items." << endl;
    initHist(maxItem+2);
    int noPartitions = (int) ceil(noRows / (double)partition_size);
    double minsupp = (mincount-0.5) / (double) noRows;

    //---------- first phase
//    cout << "\nPhase 1" << endl;

    db.rewind();
    TidsetDag D(maxItem+1);
    vector<Itemset> partition;
    partition.resize(partition_size);
    int pno, count;
    for (pno = 0; pno < noPartitions; pno++)
    {
	Itemset t;
	count = 0;
	while (count < partition_size && db >> t)
	{
	    D.crunch(t, count, minsupp);
	    partition[count].swap(t);
	    count++;
	}

	partition.resize(count);

	D.update(pno, partition_size, partition, minsupp);

//	cout << "Number of nodes in dag after partition " <<
//		pno << ": " << D.size() << endl;
    }

    //output current memory usage
    partition = vector<Itemset>(); //free memory

    //---------- second phase
//    cout << "\nPhase 2" << endl;

    ofstream outfile(outname);
    if ( !outfile ) {
//	cout << "Not opening output file." << endl;
//	return 1;
    }

    long tuple = (pno-1)*partition_size + count;
    outfile << " (" << noRows << ")\n";
    bool sets2; //are there any 2-itemsets in NB
    sets2 = D.outputLNB(outfile, pno-1, minsupp, partition_size, count);

    db.rewind();

    for (pno = 0; pno < noPartitions; pno++)
    {
//	if (sets2 == false && D.size() == D.noItems())
	if (D.size() == D.noItems())
	    break;

	Itemset t;
	count = 0;
	while (count < partition_size && db >> t)
	{
	    D.crunchNB(t, count, minsupp);
	    count++;
	}

	sets2 = D.incrCountNB(outfile, pno, mincount, count);

//	cout << "Number of nodes in dag after partition " <<
//		pno << ": " << D.size() << endl;
    }

    printNoItemsets();
    return 0;
}
