/*
 * DataSet.java
 *
 *  Class that computes common decision tree statistics (entropy,
 *    information gain) on particular sets of data
 */

import java.util.*;

/**
 * A set of labelled rows together with the statistics needed to grow a
 * decision tree: label entropy, the information gain of splitting on each
 * feature, and the partitions produced by the best split.
 *
 * Each row is stored as a Vector whose element 0 is the String label and
 * whose elements 1..n are the Float feature values.
 *
 * The class is Serializable and declares no serialVersionUID, so the
 * instance field declarations below are kept stable on purpose.
 *
 * @author  cnh
 */
public class DataSet implements java.io.Serializable {
	// Value held by max_IG (and returned by getIG()) when calcInfoGain()
	// found no unused feature to split on.
	private static final double NO_GAIN = -10000000.0;

	private LinkedList ll = new LinkedList();	// the rows, in insertion order
	
	private DataSet maxDS[]; 		// These are subsets (partitions)
	private double max_IG = 0.0;
	private int maxIGFeatureIndex;
	private int numFeatures = 0;
	private Vector usedFeatures;	// Integer indices of features already split on
	private Vector[] featureValues; // contains list of all known values for each feature 

	/**
	 * Creates an empty data set with no features marked as used.
	 *
	 * @param numFeatures number of feature columns each row carries
	 */
	public DataSet(int numFeatures) {
		this.numFeatures = numFeatures;
		featureValues = new Vector[numFeatures];
		for (int i = 0; i < numFeatures; i++)
			featureValues[i] = new Vector();
		usedFeatures = new Vector();
	}
	
	/**
	 * Creates an empty data set in which some features are already used.
	 *
	 * @param numFeatures  number of feature columns each row carries
	 * @param usedFeatures Integer indices of features to skip when looking
	 *                     for a split; the Vector is stored as given, not copied
	 */
	public DataSet(int numFeatures, Vector usedFeatures) {
		this(numFeatures);
		this.usedFeatures = usedFeatures;
	}

	/**
	 * Adds a row that is already in internal format.
	 *
	 * @param v row with the String label at index 0 followed by the Float
	 *          feature values; the Vector is stored as given, not copied
	 */
	public void add(Vector v) {
		registerRow(v);
	}
	
	/**
	 * Adds a row given as separate feature values and label.
	 *
	 * @param feature feature values, one per feature column
	 * @param label   class label of the row
	 */
	public void add(Float[] feature, String label) {
		Vector v = new Vector(feature.length + 1); // Store in vector format
		v.add(label);
		for (int i = 0; i < feature.length; i++)
			v.add(feature[i]);
		registerRow(v);
	}

	// Records every feature value of the row in the per-feature list of
	// known values (if not already there), then appends the row.
	private void registerRow(Vector v) {
		for (int i = 1; i < v.size(); i++) {
			Float feature = (Float) v.get(i);
			if (!featureValues[i - 1].contains(feature))
				featureValues[i - 1].add(feature);
		}
		ll.add(v);
	}

	/**
	 * @return the number of rows in this data set
	 */
	public int getSize() {
		return ll.size();
	}

	// Counts how often each label occurs. Distinct labels are appended to
	// 'labels' in first-seen order; the returned array holds the matching
	// counts at the same indices. The array is sized by the row count, which
	// bounds the number of distinct labels, so any number of classes fits.
	private int[] countLabels(Vector labels) {
		int[] counts = new int[ll.size()];
		for (Iterator it = ll.iterator(); it.hasNext();) {
			Vector row = (Vector) it.next();
			String label = (String) row.get(0);
			int index = labels.indexOf(label);
			if (index < 0) {
				index = labels.size();
				labels.add(label);
			}
			counts[index]++;
		}
		return counts;
	}

	// Shannon entropy of the label distribution, in bits:
	//   H = - sum_i p_i * log2(p_i)
	// An empty data set has entropy 0.
	private double calcEntropy() {
		int total = ll.size();
		if (total == 0)
			return 0.0;

		Vector labels = new Vector();
		int[] counts = countLabels(labels);

		double ln2 = Math.log(2.0);
		double entropy = 0.0;
		for (int i = 0; i < labels.size(); i++) {
			// counts[i] >= 1 for every recorded label, so p > 0 and log is finite
			double p = (double) counts[i] / (double) total;
			entropy -= p * (Math.log(p) / ln2);
		}
		return entropy;
	}

	// Splits the rows into one sub data set per known value of the given
	// feature. Subset k receives the rows whose value equals
	// featureValues[feature].get(k). All subsets share one used-feature
	// list: a copy of this set's list plus the given feature.
	private DataSet[] partitionBy(int feature) {
		int possibleValues = getValueCount(feature);

		Vector newUsedFeatures = (Vector) usedFeatures.clone();
		newUsedFeatures.add(Integer.valueOf(feature));

		DataSet[] subDS = new DataSet[possibleValues];
		for (int j = 0; j < possibleValues; j++)
			subDS[j] = new DataSet(numFeatures, newUsedFeatures);

		for (Iterator it = ll.iterator(); it.hasNext();) {
			Vector row = (Vector) it.next();
			// feature + 1 since element 0 is the label
			Float value = (Float) row.get(feature + 1);
			int index = featureValues[feature].indexOf(value);
			if (index == -1) {
				// Only reachable if a row was modified after it was added.
				throw new IllegalStateException(
						"Internal Error: feature value not in possible feature values");
			}
			subDS[index].add(row);
		}
		return subDS;
	}

	/**
	 * Finds the unused feature whose split yields the highest information
	 * gain and keeps the resulting partitions for retrieval through
	 * getSubset, getBranchFactor and getBranchValue.
	 *
	 * Ties are resolved in favour of the lowest feature index.
	 *
	 * @return the index of the best feature, or -1 if every feature is
	 *         already used (or there are no features)
	 * @throws IllegalStateException if a row holds a feature value that was
	 *         never registered for that feature
	 */
	public int calcInfoGain() {
		double entropy = this.calcEntropy();
		double total = (double) this.getSize();

		int maxVar = -1;
		max_IG = NO_GAIN;
		maxDS = null;	// drop partitions left over from an earlier call
		
		// Loop through each attribute and look for maximum information gain
		for (int i = 0; i < numFeatures; i++) {
			
			// if feature is used already... skip
			if (usedFeatures.contains(Integer.valueOf(i)))
				continue;

			DataSet[] subDS = partitionBy(i);

			// Gain = parent entropy minus the size-weighted entropy of the subsets
			double IG = entropy;
			for (int k = 0; k < subDS.length; k++) {
				IG = IG - subDS[k].calcEntropy() *
						  ((double) subDS[k].getSize() / total);
			}

			if (IG > max_IG) {
				max_IG = IG;
				maxVar = i;
				maxDS = subDS;
			}
		}

		maxIGFeatureIndex = maxVar;
		return maxVar;
	}

	// Guards the accessors that are only meaningful after a successful split.
	private void requireSplit() {
		if (maxDS == null || maxIGFeatureIndex < 0)
			throw new IllegalStateException(
					"calcInfoGain() must be called first and must have found a feature to split on");
	}

	/**
	 * Must be called after calculating information gain.
	 *
	 * @param i branch index, 0 <= i < getBranchFactor()
	 * @return the partition holding the rows of branch i of the best split
	 * @throws IllegalStateException if no split has been computed
	 */
	public DataSet getSubset(int i) {
		requireSplit();
		return maxDS[i];
	}

	/**
	 * @return the information gain (in bits) of the best split found by the
	 *         last calcInfoGain() call; 0.0 before any call, and -10000000.0
	 *         if that call found no unused feature
	 */
	public double getIG() {
		return max_IG;
	}

	/**
	 * @return the label shared by every row, or "" if the data set is empty
	 *         or contains more than one label
	 */
	public String uniformLabel() {
		if (getSize() == 0)
			return "";

		Iterator it = ll.iterator();
		String first = (String) ((Vector) it.next()).get(0);
		while (it.hasNext()) {
			String label = (String) ((Vector) it.next()).get(0);
			if (!label.equals(first))
				return "";
		}
		return first;
	}

	/**
	 * @return the most frequent label, the earliest-seen one on a tie, or ""
	 *         if the data set is empty
	 */
	public String majorityLabel() {
		if (getSize() == 0)
			return "";

		Vector labels = new Vector();
		int[] counts = countLabels(labels);

		int best = 0;
		for (int j = 1; j < labels.size(); j++) {
			if (counts[j] > counts[best])
				best = j;
		}
		return (String) labels.get(best);
	}

	/**
	 * @param featureIndex zero-based feature index
	 * @return the number of distinct values seen so far for that feature
	 */
	public int getValueCount(int featureIndex) {
		return featureValues[featureIndex].size();
	}
	
	/**
	 * Must be called after calculating information gain.
	 *
	 * @return the number of partitions produced by the best split
	 * @throws IllegalStateException if no split has been computed
	 */
	public int getBranchFactor() {
		requireSplit();
		return maxDS.length;
	}
	
	/**
	 * Must be called after calculating information gain.
	 *
	 * @param branchIndex branch index, 0 <= branchIndex < getBranchFactor()
	 * @return the value of the best feature that selects the given branch
	 * @throws IllegalStateException if no split has been computed
	 */
	public Float getBranchValue(int branchIndex) {
		requireSplit();
		return (Float) featureValues[maxIGFeatureIndex].get(branchIndex);
	}
	
	/**
	 * @return the internal list of used feature indices (not a copy)
	 */
	public Vector getUsedFeatures() {
		return usedFeatures;
	}
}
