mirror of
https://github.com/hsokolowski/iTree.git
synced 2026-04-22 22:36:54 -04:00
534 lines
15 KiB
JavaScript
534 lines
15 KiB
JavaScript
// @ts-check
|
|
|
|
/**
 * @typedef {Object} DecisionTreeBuilder
 * @property {Array<Object>} trainingSet
 * @property {Array<string>} allAttributes
 * @property {Array<string>} allClasses
 * @property {number} minItemsCount
 * @property {string} categoryAttr
 * @property {number} entropyThrehold
 * @property {number} maxTreeDepth
 * @property {Array<string>} ignoredAttributes
 * @property {string} algorithm
 */
|
|
|
|
/**
 * Recursively builds one node of a decision tree for `_builder.trainingSet`.
 *
 * A leaf is returned when the depth budget is exhausted, when the set holds no
 * more than `minItemsCount` items, when the entropy of the set is at or below
 * `entropyThrehold` (unless `isChanged` is set), or when no usable split was
 * found. Otherwise the candidate split with the smallest `maxDif` among the
 * enabled algorithms (`c45`, `tsp`, `tspw`) is applied and both halves are
 * built recursively with the depth budget reduced by one.
 *
 * @param {DecisionTreeBuilder} _builder - build configuration; it is shallow-copied, never mutated
 * @param {boolean} isChanged - when true, the entropy-threshold leaf check is skipped for this node
 *   (recursive calls always use the default `false`)
 * @param {*} changedAttribute1 - accepted for interface compatibility; not read by this function
 * @param {*} changedAttribute2 - accepted for interface compatibility; not read by this function
 * @returns {Object} a leaf `{ category, quality, matchedCount, notMatchedCount, trainingSet2 }`
 *   or an inner node `{ attr2, pivot, predicateName, match, notMatch, matchedCount,
 *   notMatchedCount, nodeSet, weight }`
 */
function buildDecisionTreeMix(
  _builder,
  isChanged = false,
  changedAttribute1 = null,
  changedAttribute2 = null
) {
  // Work on a shallow copy so the training set and depth can be swapped for
  // the recursive calls without touching the caller's object.
  const builder = { ..._builder };
  const {
    trainingSet,
    minItemsCount,
    categoryAttr,
    entropyThrehold,
    maxTreeDepth,
    ignoredAttributes,
    algorithm,
    allClasses,
  } = builder;

  // LEAF: depth budget exhausted or too few items left to split.
  if (maxTreeDepth === 0 || trainingSet.length <= minItemsCount) {
    return buildLeafNode(trainingSet, categoryAttr);
  }

  // LEAF: the set is already homogeneous enough.
  const initialEntropy = entropy(trainingSet, categoryAttr);
  if (initialEntropy <= entropyThrehold && !isChanged) {
    console.log('initialEntropy ' + initialEntropy + '<=' + entropyThrehold + ' entropyThrehold');
    return buildLeafNode(trainingSet, categoryAttr);
  }

  const attributes = builder.allAttributes.filter(
    el => el !== categoryAttr && !ignoredAttributes.includes(el)
  );

  // One candidate split per enabled algorithm.
  const arrayOfTests = [];
  if (algorithm.includes('c45')) arrayOfTests.push(C45Dif(trainingSet, categoryAttr, ignoredAttributes));
  if (algorithm.includes('tsp')) arrayOfTests.push(TSPDif(allClasses, attributes, trainingSet, categoryAttr));
  if (algorithm.includes('tspw'))
    arrayOfTests.push(TSPWDif(allClasses, attributes, trainingSet, categoryAttr));

  // Keep the candidate with the smallest maxDif. Missing candidates and NaN
  // scores never win, so `lowest` may legitimately stay undefined.
  let lowest;
  let min = 1000;
  for (const candidate of arrayOfTests) {
    if (candidate && candidate.maxDif < min) {
      lowest = candidate;
      min = candidate.maxDif;
    }
  }

  // LEAF: no algorithm was enabled or none produced a comparable candidate.
  // Without this guard the destructuring below throws a TypeError.
  if (!lowest) {
    return buildLeafNode(trainingSet, categoryAttr);
  }

  const { maxDif, match, notMatch, attribute1, attribute2, direction, L_weight } = lowest;

  // LEAF: the winning candidate reports a falsy score.
  // NOTE(review): for the 'tsp'/'tspw' candidates a score of 0 looks like a
  // perfectly pure split, yet it is turned into a leaf here - confirm intent.
  if (!maxDif) {
    return buildLeafNode(trainingSet, categoryAttr);
  }

  // LEAF: the split leaves one side empty, so it separates nothing.
  if (match.length === 0 || notMatch.length === 0) {
    return buildLeafNode(trainingSet, categoryAttr);
  }

  builder.maxTreeDepth = maxTreeDepth - 1;

  builder.trainingSet = match;
  const matchSubTree = buildDecisionTreeMix(builder);

  builder.trainingSet = notMatch;
  const notMatchSubTree = buildDecisionTreeMix(builder);

  return {
    attr2: attribute1,
    pivot: attribute2,
    predicateName: direction,
    match: matchSubTree,
    notMatch: notMatchSubTree,
    matchedCount: match.length,
    notMatchedCount: notMatch.length,
    nodeSet: match.concat(notMatch),
    // Only the weighted test reports L_weight; other candidates yield null.
    weight: L_weight ? L_weight.toFixed(3) : null,
  };
}

/**
 * Builds a leaf node labelled with the most frequent class of `trainingSet`.
 *
 * @param {Array<Object>} trainingSet - items that ended up in this leaf
 * @param {string} categoryAttr - name of the attribute holding the class label
 * @returns {Object} leaf with the winning label, its share in percent
 *   (string with two decimals), hit/miss counts and the items themselves
 */
function buildLeafNode(trainingSet, categoryAttr) {
  const category = mostFrequentValue(trainingSet, categoryAttr);

  // mostFrequentValue hands the label back as an object key, i.e. a string.
  // Compare string forms so that non-string labels (e.g. numbers) are counted
  // too; a strict comparison against the raw value would never match them.
  const label = String(category);
  let matchedCount = 0;
  for (const element of trainingSet) {
    if (String(element[categoryAttr]) === label) matchedCount++;
  }

  const quality = (matchedCount / trainingSet.length) * 100;

  return {
    category,
    quality: quality.toFixed(2),
    matchedCount,
    notMatchedCount: trainingSet.length - matchedCount,
    trainingSet2: trainingSet,
  };
}
|
|
|
|
/**
 * Tallies how many items carry each distinct value of `attr`.
 *
 * @param {Array<Object>} items - objects to inspect
 * @param {string} attr - name of the attribute to tally
 * @returns {Object} map from attribute value (used as an object key) to the
 *   number of items holding that value
 */
function countUniqueValues(items, attr) {
  // Seed every distinct value with zero so each key exists as an own
  // property before the counting pass increments it.
  const tally = Object.fromEntries(items.map(entry => [entry[attr], 0]));

  for (const entry of items) {
    tally[entry[attr]] += 1;
  }

  return tally;
}
|
|
|
|
/**
 * Finds the value of `attr` that occurs most often among `items`.
 *
 * @param {Array<Object>} items - objects to inspect
 * @param {string} attr - name of the attribute to examine
 * @returns {string | undefined} the winning value as an object key (string);
 *   on a tie the value encountered first in key order wins; `undefined` when
 *   `items` is empty
 */
function mostFrequentValue(items, attr) {
  const tally = countUniqueValues(items, attr);

  let winner;
  let winnerCount = 0;

  for (const [candidate, occurrences] of Object.entries(tally)) {
    // Strictly greater: an equal count never displaces the earlier winner.
    if (occurrences > winnerCount) {
      winnerCount = occurrences;
      winner = candidate;
    }
  }

  return winner;
}
|
|
|
|
// Worker entry point: every incoming message triggers one tree build and the
// resulting tree is posted back to the sender.
/** @type {Worker} */
// @ts-ignore
const context = self; //eslint-disable-line
context.onmessage = event => {
  console.log('received message', event);

  // Fields missing from the payload fall back to the builder's own defaults.
  const {
    _builder,
    isChanged = false,
    changedAttribute1 = null,
    changedAttribute2 = null,
  } = event.data;

  context.postMessage(
    buildDecisionTreeMix(_builder, isChanged, changedAttribute1, changedAttribute2)
  );
};
|
|
|
|
/**
 * Searches every ordered pair of distinct attributes for the test
 * `item[a] < item[b]` whose two-way split has the lowest weighted Gini
 * impurity.
 *
 * @param {Array<string>} allClasses - class labels; a label's position is its counter index
 * @param {Array<string>} attributes - attribute names to pair up
 * @param {Array<Object>} trainingSet - items to split
 * @param {string} categoryAttr - name of the attribute holding the class label
 * @returns {{maxDif: number, attribute1: (string|number), attribute2: (string|number),
 *   match: Array<Object>, notMatch: Array<Object>, direction: string}}
 *   the best split found; when no pair improves on the initial score of 100
 *   the attributes stay -1 and both lists stay empty
 */
function TSPDif(allClasses, attributes, trainingSet, categoryAttr) {
  const direction = '<';
  const classCount = allClasses.length;
  const total = trainingSet.length;

  // Sum of squared class shares on one side of a split; an empty side scores 0.
  const squaredShares = (counts, size) => {
    let acc = 0;
    for (let k = 0; k < classCount; k++) {
      const share = size === 0 ? 0 : counts[k] / size;
      acc += share * share;
    }
    return acc;
  };

  let maxDif = 100;
  /** @type {string | number} */
  let attribute1 = -1;
  /** @type {string | number} */
  let attribute2 = -1;
  let match = [];
  let notMatch = [];

  for (const first of attributes) {
    for (const second of attributes) {
      if (first === second) continue;

      const below = [];
      const others = [];
      const belowCounts = new Array(classCount).fill(0);
      const otherCounts = new Array(classCount).fill(0);

      // Route each item to one side and count its class there.
      for (const row of trainingSet) {
        const classIndex = allClasses.indexOf(row[categoryAttr]);

        if (row[first] < row[second]) {
          below.push(row);
          belowCounts[classIndex]++;
        } else {
          others.push(row);
          otherCounts[classIndex]++;
        }
      }

      // Size-weighted impurity of the two sides.
      const impurity =
        (others.length / total) * (1 - squaredShares(otherCounts, others.length)) +
        (below.length / total) * (1 - squaredShares(belowCounts, below.length));

      // Strictly lower only: on a tie the earlier pair is kept.
      if (impurity < maxDif) {
        maxDif = impurity;
        attribute1 = first;
        attribute2 = second;
        match = below;
        notMatch = others;
      }
    }
  }

  return { maxDif, attribute1, attribute2, match, notMatch, direction };
}
|
|
|
|
/**
 * Weighted variant of the attribute-pair test: searches every ordered pair of
 * distinct attributes for the test `item[a] < weight * item[b]` whose two-way
 * split has the lowest weighted Gini impurity. For each pair, `weight` is the
 * ratio of the two attributes' averages over the training set.
 *
 * The result is returned only after ALL pairs have been evaluated, and a
 * result object is always returned, even when `attributes` is empty.
 *
 * @param {Array<string>} allClasses - class labels; a label's position is its counter index
 * @param {Array<string>} attributes - attribute names to pair up
 * @param {Array<Object>} trainingSet - items to split
 * @param {string} categoryAttr - name of the attribute holding the class label
 * @returns {{maxDif: number, attribute1: (string|number), attribute2: (string|number),
 *   match: Array<Object>, notMatch: Array<Object>, direction: string, L_weight: number}}
 *   the best split found; when no pair improves on the initial score of 100
 *   the attributes stay -1, both lists stay empty and `L_weight` stays 0
 */
function TSPWDif(allClasses, attributes, trainingSet, categoryAttr) {
  const direction = '<';
  const classCount = allClasses.length;

  let maxDif = 100;
  /** @type {string | number} */
  let attribute1 = -1;
  /** @type {string | number} */
  let attribute2 = -1;
  let match = [];
  let notMatch = [];
  let L_weight = 0;

  for (const attr1 of attributes) {
    for (const attr2 of attributes) {
      if (attr1 === attr2) continue;

      // Averages of both attributes. The coercing global isNaN is used on
      // purpose so numeric strings are accepted before parseFloat.
      // NOTE(review): rows skipped here still count in the divisor below -
      // confirm that is intended.
      let sum1 = 0;
      let sum2 = 0;
      for (const element of trainingSet) {
        if (!isNaN(element[attr1]) && !isNaN(element[attr2])) {
          sum1 += parseFloat(element[attr1]);
          sum2 += parseFloat(element[attr2]);
        }
      }
      sum1 /= trainingSet.length;
      sum2 /= trainingSet.length;
      const weight = sum1 / sum2;

      // Division: route each item to one side and count its class there.
      const leftList = [];
      const rightList = [];
      const classMatrix = [new Array(classCount).fill(0), new Array(classCount).fill(0)];

      for (const element of trainingSet) {
        const classIndex = allClasses.indexOf(element[categoryAttr]);
        if (element[attr1] < weight * element[attr2]) {
          leftList.push(element);
          classMatrix[0][classIndex]++;
        } else {
          rightList.push(element);
          classMatrix[1][classIndex]++;
        }
      }

      const left = leftList.length;
      const right = rightList.length;

      // Sum of squared class shares per side; an empty side scores 0.
      let rankL = 0;
      let rankR = 0;
      for (let k = 0; k < classCount; k++) {
        const probL = left === 0 ? 0 : classMatrix[0][k] / left;
        const probR = right === 0 ? 0 : classMatrix[1][k] / right;

        rankL += probL * probL;
        rankR += probR * probR;
      }

      // Size-weighted impurity of the two sides.
      const currentDif =
        (right / trainingSet.length) * (1 - rankR) + (left / trainingSet.length) * (1 - rankL);

      // Strictly lower only: on a tie the earlier pair is kept.
      if (currentDif < maxDif) {
        maxDif = currentDif;
        attribute1 = attr1;
        attribute2 = attr2;
        match = leftList;
        notMatch = rightList;
        L_weight = weight;
      }
    }
  }

  // Must stay after BOTH loops: returning inside the outer loop would stop
  // the search after the first attribute.
  return { maxDif, attribute1, attribute2, match, notMatch, direction, L_weight };
}
|
|
|
|
/**
 * Finds the single-attribute test with the highest information gain.
 *
 * Every attribute value present in the training set is tried as a pivot:
 * values that parse as numbers are tested with `>=`, all others with `==`.
 * Each distinct attribute/predicate/pivot combination is evaluated once.
 *
 * @param {Array<Object>} trainingSet - items to split
 * @param {string} categoryAttr - name of the attribute holding the class label
 * @param {Array<string>} ignoredAttributes - attribute names that must not be tested
 * @returns {{maxDif: number, attribute1: (string|undefined), attribute2: *,
 *   match: (Array<Object>|undefined), notMatch: (Array<Object>|undefined),
 *   direction: (string|undefined)}} the best split; when nothing yields a
 *   positive gain `maxDif` is 0 and the remaining fields are undefined
 */
function C45Dif(trainingSet, categoryAttr, ignoredAttributes) {
  const seen = new Set();
  const baseEntropy = entropy(trainingSet, categoryAttr);

  let best = { gain: 0 };

  // Items are visited from last to first; with the strict comparison below
  // this decides which of several equally good tests is kept.
  for (let row = trainingSet.length - 1; row >= 0; row--) {
    const item = trainingSet[row];

    for (const attr in item) {
      if (attr === categoryAttr || ignoredAttributes.includes(attr)) continue;

      // The current value of the attribute becomes the pivot; numeric-looking
      // values are converted so they are compared as numbers.
      const raw = item[attr];
      const pivot = isNaN(raw) ? raw : parseFloat(raw);

      // Ordering only makes sense for numbers; anything else is tested for equality.
      const predicateName = typeof pivot == 'number' ? '>=' : '==';

      // Skip attribute/predicate/pivot combinations that were already evaluated.
      const signature = attr + predicateName + pivot;
      if (seen.has(signature)) continue;
      seen.add(signature);

      const predicate = predicates[predicateName];
      const candidate = split(trainingSet, attr, predicate, pivot);

      // Size-weighted entropy of the two subsets.
      const matchEntropy = entropy(candidate.match, categoryAttr);
      const notMatchEntropy = entropy(candidate.notMatch, categoryAttr);
      const splitEntropy =
        (matchEntropy * candidate.match.length + notMatchEntropy * candidate.notMatch.length) /
        trainingSet.length;

      const gain = baseEntropy - splitEntropy;
      if (gain > best.gain) {
        best = { ...candidate, predicateName, predicate, attribute: attr, pivot, gain };
      }
    }
  }

  return {
    maxDif: best.gain,
    attribute1: best.attribute,
    attribute2: best.pivot,
    match: best.match,
    notMatch: best.notMatch,
    direction: best.predicateName,
  };
}
|
|
|
|
/**
 * Computes the entropy of the distribution of `attr` values among `items`,
 * using the natural logarithm.
 *
 * @param items - array of objects
 *
 * @param attr - name of the attribute whose value distribution is measured;
 *               it is read from every object
 *
 * @returns {number} the entropy; 0 for an empty array
 */
function entropy(items, attr) {
  // Number of occurrences of each distinct attribute value.
  const occurrences = Object.values(countUniqueValues(items, attr));

  return occurrences.reduce((total, count) => {
    const share = count / items.length;
    return total + -share * Math.log(share);
  }, 0);
}
|
|
|
|
/**
 * Partitions an array of objects by testing one attribute of each object
 * against a pivot.
 *
 * Objects for which `predicate(item[attr], pivot)` is truthy go to `match`,
 * all others to `notMatch`. The input is walked from its last element to its
 * first, so both result arrays hold their items in reverse input order.
 *
 * @param items - array of objects
 *
 * @param attr - name of the attribute read from each object
 *
 * @param predicate - function(x, y) returning a truthy or falsy value
 *
 * @param pivot - passed as the second argument of the predicate,
 *                e.g. predicate(item[attr], pivot)
 *
 * @returns {{match: Array, notMatch: Array}} the two partitions
 */
function split(items, attr, predicate, pivot) {
  const buckets = { match: [], notMatch: [] };

  for (let idx = items.length; idx-- > 0; ) {
    const candidate = items[idx];
    const target = predicate(candidate[attr], pivot) ? buckets.match : buckets.notMatch;
    target.push(candidate);
  }

  return buckets;
}
|
|
|
|
// Comparison functions looked up by predicate name.
var predicates = {
  // Strict equality: no type coercion between the two operands.
  '==': (a, b) => a === b,
  // Greater-or-equal using the language's regular relational comparison.
  '>=': (a, b) => a >= b,
};
|