/**********************************************************************
    Copyright (C) 2004 Database Systems Lab, Supercomputer Education and
    Research Centre, Indian Institute of Science, Bangalore, INDIA.
    http://dsl.serc.iisc.ernet.in

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
***********************************************************************/


/********************* Interface to the Database ***********************
 AUTHOR: Vikram Pudi

 DESCRIPTION:

 For our purposes so far, a database need only contain (in some form) a
 sequence of itemsets.

 An itemset is a set of items.  An item is an elementary entity that is
 not further defined, except that it comes from a predefined set of
 possible entities.

 It is assumed that the items in an itemset are in ascending order.  An
 itemset is implemented as a vector of items, as defined in the
 standard template library (stl) of C++.

 Item ids are assumed to start from 0 till some max-1.  Even though we
 do not use this fact here, it will be used later.  It is the job of
 this file to convert items to ids from 0 to max-1, if they are not
 already of that form in the database.

 Along with each database file, there must be another file which
 contains information about the database, such as number of items,
 tuples, etc.  We call this file a metafile.  The syntax of this
 file is as follows --

	pathname of the database file
	number of items
	number of tuples

 An alternative syntax, which is useful for incremental mining
 algorithms is --

	pathname of the database file
	number of items
	number of tuples in part 1
	number of tuples in part 2
	.
	.
	.
 
 There can be more than one metafile per database file.  This is useful
 for comparing incremental algorithms versus non-incremental ones.

 The database is designed as a class "Database" and behaves like an
 input stream of itemsets.  To open the database, we pass a string
 containing the pathname of the metafile to the constructor.  For the
 alternative syntax of the metafile as defined above, we may also pass
 which part of the database we want, as a second argument to the
 constructor.  To open the first part, we have part=1.  To open the
 second part (increment), we can have part=2, etc.

 Since a database behaves like an input stream of itemsets, we can have
 code like --

 	Itemset i;  Database db(metafile); //metafile = pathname of the metafile
	db >> i;

 The operation "Database >>" returns the database, which can then be
 tested to see if any more input operations can be done. The state can
 be tested merely by using the name of the database. So the following
 will perform a pass over the database --

	while (db >> i)
 	{
	    //use i
	}
***********************************************************************/

#ifndef DATABASE_H_
#define DATABASE_H_

#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fstream.h>
#include "item.h"
#include "taxonomy.h"

//Capacity of the read buffer, counted in ints.  The value is kept exactly
//as it was (8192 * sizeof(Item)) because code outside this file may use it.
const int BUFSIZE = 8192 * sizeof(Item);

/*
 Database -- forward-only input stream of itemsets stored in a binary file.

 On disk every tuple is a run of ints: a header of three ints, the last
 of which is the number of items, followed by that many item ids (see
 writeTransaction() for the writer side).  The file is read in blocks
 into buf; cur_buf_pos always indexes the first ITEM of the next tuple,
 so the item count of that tuple sits at buf[cur_buf_pos-1] and the tuple
 itself starts at buf[cur_buf_pos-3].

 NOTE(review): fd is never closed.  Adding a destructor that closes it
 also needs a decision on copying (a copy would share the descriptor), so
 it is left as is here.
*/
class Database {
    int fd; //the actual database file descriptor
    int buf[BUFSIZE]; //block of raw ints read from the data file
    int cur_blk_size; //number of valid ints currently held in buf
    int cur_buf_pos; //index in buf of the first item of the next tuple
    off_t start_pos; //byte offset in the data file where the selected
		     //part begins (0 for part 1)

    int noAttributes; //no of items
    int databaseSize; //no of tuples
    int cursor; //tuples requested so far in the current pass
    AncesTable table; //for generalized rules

    //Reads up to 'capacity' ints from fd into 'dest' and returns how many
    //whole ints arrived.  End of file and read errors both yield 0, so a
    //failed read can never turn into a bogus (negative or huge) block size.
    int read_ints(int *dest, int capacity)
    {
	if (capacity <= 0)
	    return 0;

	ssize_t nbytes = read(fd, (char *)dest, capacity * sizeof(buf[0]));
	if (nbytes <= 0)
	    return 0;

	return (int)(nbytes / (ssize_t)sizeof(buf[0]));
    }

    //Loads a fresh block from the current file position; the first tuple
    //of the block then starts at buf[0], i.e. its items start at buf[3].
    void get_first_blk()
    {
	cur_blk_size = read_ints(buf, BUFSIZE);
	cur_buf_pos = 3;
    }

    //Refills the buffer while keeping the tuple that has not been
    //consumed yet: whatever is left from buf[cur_buf_pos-3] onwards is
    //moved to the front and the rest of the buffer is topped up from fd.
    void get_next_blk()
    {
       int res = cur_blk_size - cur_buf_pos + 3;

       if (res > 0)
       { //copy partial transaction to beginning of buffer
	  //memmove, not memcpy: source and destination overlap whenever the
	  //pending tuple already sits at (or near) the start of the buffer
	  memmove(buf, buf + cur_buf_pos - 3, res * sizeof(buf[0]));
	  cur_blk_size = res;
       }
       else //no partial transaction in buffer
	  cur_blk_size = 0;

       int kept = cur_blk_size;
       cur_blk_size = kept + read_ints(buf + kept, BUFSIZE - kept);

       if (cur_blk_size > 0)
	  cur_buf_pos = 3;
    }

public :
    Database(char *metadata,int);
	//metadata is name of a file containing information about data
        //the int will tell which part of the database file is needed
        //1 is for the 1st part, etc. ... useful for incremental mining

    int size() const { return databaseSize; }
    int rows() const { return size(); }
    int columns() const { return noAttributes; }
    AncesTable& tax() { return table; }

    void rewind() //goto first tuple of the selected part
	{ lseek(fd, start_pos, SEEK_SET); cursor = 0; get_first_blk(); }

    void skip(); //skip the next tuple

    friend Database& operator>>(Database& d, Itemset& i);
    	//operator gets the next tuple and puts it into itemset i

    operator bool() const { return (cursor <= size()); }
    	//returns true if the input stream is good(), i.e. there is
    	//input to be seen.
};

/*
 Opens the database described by the metafile 'metadata'.

 The metafile holds, separated by whitespace: the pathname of the data
 file, optionally the pathname of a taxonomy file (detected because the
 token after the data file name is not a number), the number of items and
 then one tuple count per part.  'part' (1-based, default 1) selects which
 part becomes the contents of this stream; all earlier parts are skipped.

 Any failure (metafile or data file cannot be opened, tuple count of a
 needed part missing) prints a message on cout and terminates the
 process with exit(1).
*/
inline Database::Database(char *metadata, int part = 1 )
{
    ifstream mf(metadata);
    if (! mf)
    {
	cout << "Couldn't open metafile: " << metadata << endl;
	exit(1);
    }

    char datafile[256];
    mf.width(sizeof(datafile)); //never write past the end of datafile
    mf >> datafile;
    mf >> noAttributes;
    if (! mf)
    {
	//the token was not a number: it is the name of a taxonomy file
	mf.clear();
	char taxfile[256];
	mf.width(sizeof(taxfile)); //never write past the end of taxfile
	mf >> taxfile;
	Taxonomy taxonomy(taxfile);
	table.construct(taxonomy);
	mf >> noAttributes;
    }

    fd = open(datafile, O_RDONLY);
    if (fd < 0)
    {
	cout << "Couldn't open database file: " << datafile << endl;
	exit(1);
    }

    //skip() walks the buffer, so the buffer has to be filled before any
    //earlier part can be skipped
    start_pos = 0;
    get_first_blk();

    for (int n = 0; n < part; n++)
    {
	mf >> databaseSize;
	if (! mf)
	{
	    cout << "Couldn't read size of part " << (n + 1)
		 << " from metafile: " << metadata << endl;
	    exit(1);
	}
	cursor = 0;
	if (n < part-1)
	    for (int count = 0; count < databaseSize; count++)
		skip();
    }

    if (part > 1)
    {
	//The file position is already past everything that was buffered;
	//step back over the ints that are buffered but not yet consumed to
	//find where the selected part starts, then begin cleanly from there
	//so that rewind() returns to the same place later on.
	int unread = cur_blk_size - (cur_buf_pos - 3);
	if (unread < 0)
	    unread = 0;
	start_pos = lseek(fd, 0, SEEK_CUR)
		    - (off_t)unread * (off_t)sizeof(buf[0]);
	rewind();
    }
}

/*
 Reads the next tuple of d into i and returns d.

 Once all size() tuples have been delivered, the call only advances the
 cursor, which makes d test false and leaves i untouched.  If the data
 file turns out to hold less than the metafile promised (header or items
 of the tuple cannot be brought into the buffer, or the stored item count
 is negative), the stream is likewise put into the "no more input" state
 instead of handing out stale or out-of-range buffer contents.
*/
inline Database& operator>>(Database& d, Itemset& i)
{
    if (++d.cursor > d.size())
	return d;

    //The 3-int header of this tuple must be buffered before its item
    //count can be trusted; try one refill if it is not.
    if (d.cur_buf_pos > d.cur_blk_size)
	d.get_next_blk();
    if (d.cur_buf_pos > d.cur_blk_size)
    {
	d.cursor = d.size() + 1;
	return d;
    }

    const int noItems = d.buf[d.cur_buf_pos-1];

    //Refill when this tuple plus the header of the following one does not
    //fit in what is buffered (same test as cur_buf_pos + noItems + 3 >
    //cur_blk_size, written so that a huge count cannot overflow).
    if (noItems >= 0 && noItems > d.cur_blk_size - d.cur_buf_pos - 3)
	d.get_next_blk();

    //Still not completely buffered, or a nonsensical count: give up.
    if (noItems < 0 || noItems > d.cur_blk_size - d.cur_buf_pos)
    {
	d.cursor = d.size() + 1;
	return d;
    }

    i.from_int_array(d.buf + d.cur_buf_pos, noItems);
    d.cur_buf_pos += (noItems + 3);

    return d;
}

/*
 Advances past the next tuple without decoding it.

 Follows the same rules as operator>>: beyond the last tuple only the
 cursor moves, and a tuple that cannot be brought into the buffer (or
 whose stored item count is negative) puts the stream into the "no more
 input" state rather than moving the buffer position by a garbage amount.
*/
inline void Database::skip()
{
    if (++cursor > size())
	return;

    //The 3-int header of this tuple must be buffered before its item
    //count can be trusted; try one refill if it is not.
    if (cur_buf_pos > cur_blk_size)
	get_next_blk();
    if (cur_buf_pos > cur_blk_size)
    {
	cursor = size() + 1;
	return;
    }

    const int noItems = buf[cur_buf_pos-1];

    //Refill when this tuple plus the header of the following one does not
    //fit in what is buffered (overflow-safe form of the original test).
    if (noItems >= 0 && noItems > cur_blk_size - cur_buf_pos - 3)
	get_next_blk();

    //Still not completely buffered, or a nonsensical count: give up.
    if (noItems < 0 || noItems > cur_blk_size - cur_buf_pos)
    {
	cursor = size() + 1;
	return;
    }

    cur_buf_pos += (noItems + 3);
}

//Writes 'stuff' to fp as sizeof(int) raw bytes, most significant byte
//first, whatever the byte order of the host is.
inline void write_big_endian(ofstream &fp, int stuff)
{
    //Work on the unsigned bit pattern: shifting 0xff into the sign bit of
    //an int, or shifting a negative int right, is not portable.
    const unsigned int bits = (unsigned int)stuff;

    char bytes[sizeof(int)];
    for (int i = 0; i < (int)sizeof(int); i++)
	bytes[sizeof(int)-i-1] = (char)((bits >> (8*i)) & 0xffu);
    fp.write(bytes, sizeof(int));
}

/*
 Appends itemset i to fp as one tuple in the byte order of the host:
 three header ints (customer id, transaction id, number of items)
 followed by the id of every item.  The customer id is simply set to tid.

 If the stream is in a failed state after the writes, a message is
 printed on cerr and the process is terminated with exit(1).

 Declared inline because this definition lives in a header.
*/
inline void writeTransaction(Itemset& i, int tid, ofstream &fp)
{
    const int nitems = i.size();

    //header: customer id (same as tid), transaction id, item count
    int header[3];
    header[0] = tid;
    header[1] = tid;
    header[2] = nitems;
    fp.write((char *)header, sizeof(header));

    for (int pos = 0; pos < nitems; pos++)
    {
	int item = i[pos].id;
	fp.write((char *)&item, sizeof(int));
    }

    if (! fp)
    {
	cerr << "Cannot write to output file!" << endl;
	exit(1);
    }
}

/*
 Appends itemset i to fp as one tuple with every int stored most
 significant byte first: three header ints (customer id, transaction id,
 number of items) followed by the id of every item.  The customer id is
 simply set to tid.

 If the stream is in a failed state after the writes, a message is
 printed on cerr and the process is terminated with exit(1).

 Declared inline because this definition lives in a header.
*/
inline void writeTransactionBigEndian(Itemset& i, int tid, ofstream &fp)
{
    const int nitems = i.size();

    write_big_endian(fp, tid); //customer id (same as tid)
    write_big_endian(fp, tid); //transaction id
    write_big_endian(fp, nitems);

    for (int pos = 0; pos < nitems; pos++)
	write_big_endian(fp, i[pos].id);

    if (! fp)
    {
	cerr << "Cannot write to output file!" << endl;
	exit(1);
    }
}

#endif
