
/ Shannon entropy, in bits, of the empirical distribution of the items of list
/ list: any list; each distinct item counts as one outcome
/ returns: float, the sum over outcomes of p * neg log2 p
myentropy:{[list]
 freq: count each group list; / occurrences of each distinct item
 p: (value freq) % count list; / relative frequency of each item
 bits: (log p) % log 2; / natural log rescaled to base 2
 sum p * neg bits }

/ conditional entropy, in bits, of list2 given list1
/ list1, list2: lists paired item by item
/ returns: the entropy of list2 inside each group of equal list1 values,
/          weighted by the share of items that fall in that group
mycondentropy:{[list1; list2]
 idx: group list1; / distinct list1 value -> positions where it occurs
 weights: (value count each idx) % count list1; / share of items per group
 parts: myentropy each list2 idx; / entropy of list2 restricted to each group
 sum weights * parts}

/ information gained about list2 by knowing list1:
/ the entropy of list2 minus the conditional entropy of list2 given list1
infogain:{[list1; list2]
  before: myentropy[list2];
  after: mycondentropy[list1; list2];
  before - after}

/ sample data: three binary source columns, each an 8-item pattern repeated
/ twice (16 rows), and the 16-row target column they are meant to predict
x: 16 # 1 1 0 0 1 1 1 1
y: 16 # 0 1 0 0 1 0 1 1
z: 16 # 1 1 1 0 0 0 1 1
target: 1 1 0 0 1 0 1 1 0 1 1 0 1 1 1 0

/ headers[i] is the printed name of lists[i]
headers: `x`y`z
lists:(x; y; z)



/ one-level decision tree (stump): picks the source list with the highest
/ information gain about target and emits one line per distinct value of it
/ lists: list of source lists, in the same order as the global headers
/ target: list to predict, paired item by item with each source list
/ level: number of leading spaces on each output line
/ returns: list of strings; a single blank line when there is nothing to split
mydectreestump:{[lists; target; level]
 if[0 = count lists; :enlist "  "]; / no source lists: return now, the old guard only assigned and fell through
 if[0 = myentropy[target]; :enlist "  "]; / target already has a single value: nothing to split
 igs: infogain[;target] each lists;
 i: igs ? max igs; / first list with the highest gain
 out: ((level # " "), string headers[i]),/: string distinct lists[i];
 out, out} / each line is emitted twice, exactly as before

/ output file that receives the tree built from the full sample data
myfile: `:tmpout 
/ to check the stump on its own, uncomment the next two lines:
/ x: mydectreestump[lists; target; 2]
/ myfile 0: x
/ to use the debug variants, which write through the handle h, uncomment:
/ myfile 0: enlist "   "
/ h: (neg) hopen myfile
 
 
 

/ debugging variant of the decision tree: writes each line through the global
/ handle h as soon as it is known instead of collecting the lines
/ lists: all source lists; remaininglistids: indices of lists still usable for splitting
/ target: list to predict; level: current indentation depth
/ returns: a two-space string at a leaf, otherwise the list of results of the sub-trees
mydectreedebug:{[lists; remaininglistids; target; level]
 ent: myentropy[target]; / computed once and reused by the three checks below
 nomore: 0 = count remaininglistids; / no source list left to split on
 pad: (level+1) # " ";
 if[0 = ent; h pad,raze string (distinct target), (" happens "), (count target), (" times.")];
 if[(0 < ent) & nomore; h pad,/: (raze each) (string each) (distinct target),'(`$"_is correct_"),/:(value count each group target),\: (`$"_times.")];
 if[(0 = ent) | nomore; :"  "];
 gains: infogain[;target] each lists[remaininglistids];
 i: remaininglistids[gains ? max gains]; / best of the remaining lists, mapped back to its index in lists
 preparedecdebug[lists; remaininglistids except i; target; level+1; i] each distinct lists[i]}

/ helper for mydectreedebug: handles one branch, where lists[i] equals val
/ writes the branch header through h, then recurses on the matching rows
/ returns: whatever mydectreedebug returns for that branch
preparedecdebug:{[lists; remaininglistids; target; level; i; val]
  banner: (level # " "), (string headers[i]), "_", (string val), ":";
  h enlist banner;
  rows: where val = lists[i]; / positions that belong to this branch
  mydectreedebug[lists[;rows]; remaininglistids; target[rows]; level]}
 

/ builds a decision tree for target from the source lists and returns it as text
/ lists: all source lists; remaininglistids: indices of lists still usable for splitting
/ target: list to predict; level: current indentation depth
/ returns: list of strings, one per printed line of the tree
mydectree:{[lists; remaininglistids; target; level]
 pad: (level+1) # " ";
 if[0 = myentropy[target]; :enlist pad,raze string (distinct target), (" happens "), (count target), (" times.")];
 if[0 = count remaininglistids; :pad,/: (raze each) (string each) (distinct target),'(`$"_is correct_"),/:(value count each group target),\: (`$"_times.")];
 gains: infogain[;target] each lists[remaininglistids];
 i: remaininglistids[gains ? max gains]; / best of the remaining lists, mapped back to its index in lists
 raze preparedec[lists; remaininglistids except i; target; level+1; i] each distinct lists[i]}

/ helper for mydectree: handles one branch, where lists[i] equals val
/ returns: the branch header line followed by the lines of the sub-tree
preparedec:{[lists; remaininglistids; target; level; i; val]
  rows: where val = lists[i]; / positions that belong to this branch
  banner: (level # " "), (string headers[i]), "_", (string val), ":";
  (enlist banner), mydectree[lists[;rows]; remaininglistids; target[rows]; level]}
 
 
/ for testing of the basic functionality
/ note: this rebinds the global x from the sample column to the tree text;
/ lists was built before this point and keeps the original column
x: mydectree[lists; til count lists; target; 0]
myfile 0: x
/ end testing of basic functionality
/ write a title line to the cross-validation report, then open a handle on the same file;
/ mydectreedebug, preparedecdebug, crossvalsimple and crossvaltest all write through h
myfile2: `:tmpoutcrossval
myfile2 0: enlist "Cross-validation of decision tree"
h: (neg) hopen myfile2


/ simple cross-validation: builds ten trees, each from a random sample of
/ floor 0.8 * count target rows drawn without repetition, and writes every tree through h
/ no held-out testing is done here
/ returns: the number of rounds
crossvalsimple:{[lists; target]
 rounds: 10;
 total: count target;
 ntrain: floor 0.8 * total; / sample size, the same in every round
 do[rounds;
   chosen: (neg ntrain) ? total; / distinct random row numbers
   h mydectree[lists[;chosen]; til count lists; target[chosen]; 0]];
 rounds}


/ decision tree that also scores itself: returns the tree text like mydectree
/ and adds to the globals numcorrecttraining and numcorrecttest at every leaf
/ lists, target: training columns and training target
/ remaininglistids: indices of lists still usable for splitting; level: indentation depth
/ liststest, targettest: the test rows that reached this node
/ returns: list of strings, one per printed line of the tree
mydectreetest:{[lists; remaininglistids; target; level; liststest; targettest]
 pad: (level+1) # " ";
 uniq: distinct target;
 if[1 = count uniq;
   numcorrecttraining+: count target; / every training row at this leaf is predicted correctly
   numcorrecttest+: count targettest[where targettest = first uniq];
   :enlist pad,raze string uniq, (" happens "), (count target), (" times.")];
 if[0 = count remaininglistids;
   trip: calccorrect[target]; / (majority value; total; number equal to the majority value)
   numcorrecttraining+: trip[2];
   numcorrecttest+: count targettest[where targettest = trip[0]];
   :pad,/: (raze each) (string each) uniq,'(`$"_is correct_"),/:(value count each group target),\: (`$"_times.")];
 gains: infogain[;target] each lists[remaininglistids];
 i: remaininglistids[gains ? max gains]; / best of the remaining lists, mapped back to its index in lists
 raze preparedectest[lists; remaininglistids except i; target; level+1; i; liststest; targettest] each distinct lists[i]}

/ helper for mydectreetest: handles one branch, where lists[i] equals val
/ narrows both the training rows and the test rows to that branch and recurses
/ returns: the branch header line followed by the lines of the sub-tree
preparedectest:{[lists; remaininglistids; target; level; i; liststest; targettest; val]
  out: enlist (level # " "), (string headers[i]), ("_"), (string val) , (":");
  myind: where lists[i] = val; / training rows in this branch
  myindtest: where liststest[i] = val; / test rows in this branch
  / the test columns must be narrowed by row, liststest[;myindtest], just like lists[;myind];
  / plain liststest[myindtest] would pick whole columns by row number instead
  out,: mydectreetest[lists[;myind]; remaininglistids; target[myind]; level; liststest[;myindtest]; targettest[myindtest]];
  out}
 

/ cross-validation with held-out testing
/ each of the ten rounds draws floor 0.8 * count target distinct rows for training,
/ keeps the other rows for testing, builds the tree with mydectreetest and writes
/ the tree plus the training and test accuracy through h
/ numcorrecttraining and numcorrecttest are globals that mydectreetest adds to
/ returns: the number of rounds
crossvaltest:{[lists; target]
 num: 10;
 do[num;
   tot: count target;
   myind: (neg floor 0.8 * tot) ? tot;
   myind: asc myind; / keep training rows in original order; indexing by rank only permutes, it does not sort
   myindtest: (til tot) except myind;
   mylists: lists[;myind];
   mytarget: target[myind];
   myliststest: lists[;myindtest];
   mytargettest: target[myindtest];
   numcorrecttraining:: 0; / a global, reset for this round
   numcorrecttest:: 0; / a global, reset for this round
   h mydectreetest[mylists; til count mylists; mytarget; 0; myliststest; mytargettest];
   h enlist ("Training correct is: "),(string (numcorrecttraining % (count myind)));
   h enlist ("Test correct is: "),(string (numcorrecttest % (count myindtest)))];
 num}



 
 
/ majority vote over target, which may be of any length
/ returns: (majority value; total number of items; number of items equal to the majority value)
/ when several values tie, the one that appears first in target wins
calccorrect:{[target]
  tally: count each group target; / distinct value -> occurrences
  best: max value tally;
  winner: (key tally)[(value tally) ? best]; / first value that reaches the highest count
  (winner; sum value tally; best)}
 

/ run the train/test cross-validation; the trees and accuracies are written through h
x: crossvaltest[lists;target]

/ x now holds the value returned by crossvaltest, its number of rounds
x

"Look at tmpoutcrossval"
