/*
	Code for accessing vectors on disk in svmlight format
	(http://svmlight.joachims.org/). This code has a GPL license. Please
	let me know if you find any bugs.
	
	Filip Radlinski, 11 Feb 2003
	filip@cs.cornell.edu
*/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "DT_data.h"

/* State of one fileset slot. DT_CLOSED is 0, so the zero-initialised
	 static slot table starts out with every slot closed. */
enum FILETYPE {
	DT_CLOSED    = 0,	/* slot is free, no streams are open */
	DT_TWO_FILES = 1,	/* vectors in docfile, labels in labelfile */
	DT_ONE_FILE  = 2	/* docfile only; label is read from each line */
};

/* One open fileset: a document file, an optional label file, and a
	 count of the records read so far. */
struct DT_file {
	enum FILETYPE type;	/* DT_CLOSED when the slot is unused */
	FILE *docfile;		/* stream holding the vectors */
	FILE *labelfile;	/* stream holding the labels (two-file sets only) */
	int pos;		/* records returned by DT_next since open/rewind */
};

/* Index of the slot that the next DT_open will use. DT_open sets it to
	 MAX_FILES once every slot is taken; DT_close sets it to the slot that
	 was just released. */
static int cur_pair;
/* Table of fileset slots, indexed by the number DT_open returns. */
static struct DT_file datafiles[MAX_FILES];

/* Initialize data variables */
void DT_init() {
	int i;
	
	cur_pair = 0;
	
	for(i=0; i<MAX_FILES; i++) {
		datafiles[i].type = DT_CLOSED;
		datafiles[i].docfile = 0;
		datafiles[i].labelfile = 0;
		datafiles[i].pos = 0;
	}
}

/* Print out a vector on stdout as "dimension:value " pairs on a single
	 line, followed by a newline. vec must not be NULL. */
void DT_print(struct DT_vector *vec) {
	int idx = 0;

	while (idx < vec->size) {
		printf("%i:%.4e ", vec->dimensions[idx], vec->values[idx]);
		idx++;
	}

	/* Terminate the line once, after all pairs */
	printf("\n");
}

/* Open the docfile and labelfile */
int DT_open(char *docfile, char *labelfile) {

	int rv, i, filetype;

	if (labelfile == NULL)
		filetype = DT_ONE_FILE;
	else
		filetype = DT_TWO_FILES;
	
	/* Check that don't have too many pairs open */
	if (cur_pair >= MAX_FILES)
		return -1;
	
	datafiles[cur_pair].docfile = fopen(docfile,"r");
	if (datafiles[cur_pair].docfile==0) {
		printf("Error opening doc file %s\n", docfile);
		return -1;
	}

	if (filetype == DT_TWO_FILES) {
		datafiles[cur_pair].labelfile = fopen(labelfile,"r");
		if (datafiles[cur_pair].labelfile == 0) {
			printf("Error opening label file %s", labelfile);
			fclose(datafiles[cur_pair].docfile);
			datafiles[cur_pair].docfile = 0;
			return -1;
		}
	}

	datafiles[cur_pair].type = filetype;
	datafiles[cur_pair].pos = 0;
	
	rv = cur_pair;

	/* cur_pair points to the next free one */
	for(i=0; (i<MAX_FILES)&&(cur_pair==rv); i++) {
		if (datafiles[i].type == DT_CLOSED)
			cur_pair = i;
	}
	
	/* Ran out of storage space */
	if (cur_pair == rv)
		cur_pair = MAX_FILES;

	return rv;
}

/* Close the docfile and labelfile for this docno */
void DT_close(int docno) {

	cur_pair = docno;

	/* Close current pair and free it up */
	if (datafiles[docno].type != DT_CLOSED) {
		fclose(datafiles[docno].docfile);
		datafiles[docno].docfile = 0;
		if (datafiles[docno].type == DT_TWO_FILES) {
			fclose(datafiles[docno].labelfile);
			datafiles[docno].labelfile = 0;
		}
		datafiles[docno].pos = 0;
		datafiles[docno].type = DT_CLOSED;
	}
}

/* Rewind the docfile and labelfile for this docno. Return 1 on success,
	 and 0 on failure. */
int DT_rewind(int docno) {

	if (datafiles[docno].type != DT_CLOSED) {
		rewind(datafiles[docno].docfile);
		if (datafiles[docno].type == DT_TWO_FILES)
			rewind(datafiles[docno].labelfile);
		datafiles[docno].pos = 0;
		return 1;
	}
	return 0;
}

/* Get the next document vector and label */
struct DT_vector *DT_next(int docno) {

	char temp_doc[MAX_VEC_STR_LEN];
	char temp_label[MAX_VEC_STR_LEN];
	char *temp_doc_p;
	int i;
	struct DT_file *d;
	struct DT_vector *dp;

	d = &(datafiles[docno]);
	dp = malloc(sizeof(struct DT_file));

	if (docno < 0) {
		printf("Error: Invalid fileset %i\n", docno);
		return NULL;
	}
	
	/* Not open */
	if (d->type == DT_CLOSED) {
		printf("Error: Fileset %i not open\n", docno);
		return NULL;
	}

	/* End of file */
	if (feof(d->docfile) ||
			((d->type == DT_TWO_FILES) && (feof(d->labelfile)))) {
		return NULL;
	}

	/* Read a line from the document file */
	fgets(temp_doc, MAX_VEC_STR_LEN, d->docfile);
	temp_doc_p = temp_doc;

	/* End of file - usually get the error after trying to read
		 one line too many. */
	if (feof(d->docfile) ||
			((d->type == DT_TWO_FILES) && (feof(d->labelfile)))) {
		return NULL;
	}
	
	/* If one file, get label from start of line */
	if (d->type == DT_ONE_FILE) {
		sscanf(temp_doc_p, "%s", temp_label);
		temp_doc_p += strlen(temp_label)+1;
		dp->label = malloc(strlen(temp_label)+1);
		memcpy(dp->label, temp_label, strlen(temp_label)+1);
	}

	/* Count the elements - one for each ':' */
	dp->size=0;
	for(i=0; i<strlen(temp_doc_p)-1; i++)
		if (temp_doc_p[i] == ':')
			dp->size++;
	
	dp->dimensions = malloc(sizeof(int)*dp->size);
	dp->values = malloc(sizeof(double)*dp->size);
	
	/* Store vector, While we can fit it and we haven't read everything */
	for(i=0; i<dp->size; i++) {
		sscanf(temp_doc_p, "%i:%lf", &(dp->dimensions[i]), &(dp->values[i]));

		/* Skip over this value pair, and any whitespace */
		temp_doc_p = strchr(temp_doc_p, ' ');
		if (temp_doc_p == NULL) break;
		while((temp_doc_p[0] == ' ')||(temp_doc_p[0] == '\t'))
			temp_doc_p++;
	}
	
	/* Read the label(s) if stored in a second file */
	if (d->type == DT_TWO_FILES) {
		fgets(temp_label, MAX_VEC_STR_LEN, d->labelfile);
		dp->label = malloc(strlen(temp_label)+1);
		memcpy(dp->label, temp_label, strlen(temp_label)+1);
		for(i=0; i<strlen(dp->label); i++)
			if (dp->label[i] == '\n')
				dp->label[i] = 0;
	}
	
	d->pos++;

	return dp;
}

/* Find the maximum word no in the vector. This returns the dimension
	 stored in the last slot, so it presumably relies on the dimensions
	 being stored in increasing order (TODO confirm against the data).
	 Returns -1 for a NULL or empty vector. */
int DT_max(struct DT_vector *vec) {
	int max_dim = -1;

	if ((vec != NULL) && (vec->size != 0))
		max_dim = vec->dimensions[vec->size-1];

	return max_dim;
}

/* Get the (n+1)th record */
struct DT_vector *DT_find(int docno, int n) {

	int i;
	struct DT_vector *dp;
	
	/* Only rewind if we have passed what we're looking for */
	if (n <= datafiles[docno].pos) 
		DT_rewind(docno);
	else
		n -= datafiles[docno].pos + 1;
	
	for(i=0; i<n; i++) {
		dp = DT_next(docno);
		if (dp==NULL)
			return NULL;
		else if (i!=n-1)
			DT_free(dp);
	}
	
	return dp;
}

/* Free a datapoint: releases the label, values and dimensions buffers
	 and then the vector itself. A NULL vec is ignored. */
void DT_free(struct DT_vector *vec) {

	if (vec != NULL) {
		/* free() accepts NULL, so unset members need no guard */
		free(vec->label);
		free(vec->values);
		free(vec->dimensions);
		free(vec);
	}
}
