# Functions to calculate Gini and entropy impurity scores for decision-tree
# nodes, plus the information gain of a split.
#
# A "node" is a dict mapping class label -> number of samples of that class,
# e.g. {'Red': 3, 'Blue': 4}.

# Imports
from math import log


def calcpercent(node):
    """Return the sample count of *node* and the fraction of each class.

    Args:
        node: dict mapping class label -> sample count.

    Returns:
        Tuple ``(nodesum, percents)`` where ``nodesum`` is the sum of all
        counts and ``percents`` maps each label to ``count / nodesum``.

    Raises:
        ZeroDivisionError: if the node holds no samples (counts sum to 0).
    """
    nodesum = sum(node.values())
    percents = {c: v / nodesum for c, v in node.items()}
    return nodesum, percents


def giniscore(node):
    """Compute, print and return the Gini impurity of *node*.

    The score is ``1 - sum(p_i ** 2)`` over the class fractions, rounded to
    three decimals.
    """
    _, percents = calcpercent(node)
    score = round(1 - sum(p ** 2 for p in percents.values()), 3)
    print('Gini Score for node {} : {}'.format(node, score))
    return score


def entropyscore(node):
    """Compute, print and return the entropy (base 2) of *node*.

    The score is ``sum(-p_i * log2(p_i))`` over the class fractions, rounded
    to three decimals.
    """
    _, percents = calcpercent(node)
    # A class with zero samples contributes nothing (lim p->0 of p*log p = 0);
    # skipping it avoids the math domain error that log(0) would raise.
    score = round(sum(-p * log(p, 2) for p in percents.values() if p > 0), 3)
    print('Entropy Score for node {} : {}'.format(node, score))
    return score


def infogain(parent, children, criterion):
    """Compute, print and return the information gain of a split.

    Args:
        parent: class-count dict of the node before the split.
        children: iterable of class-count dicts, one per child node.
        criterion: ``'gini'`` or ``'entropy'``.

    Returns:
        Parent score minus the sample-weighted child scores, rounded to two
        decimals. The per-node scores are the already-rounded values returned
        by ``giniscore`` / ``entropyscore``.

    Raises:
        KeyError: if *criterion* is not ``'gini'`` or ``'entropy'``.
    """
    score = {'gini': giniscore, 'entropy': entropyscore}
    metric = score[criterion]
    parentscore = metric(parent)
    parentsum = sum(parent.values())

    weighted_child_score = 0
    for child in children:
        childsum = sum(child.values())
        if childsum == 0:
            # An empty child has weight 0 in the weighted average, so it
            # cannot affect the gain; scoring it would divide by zero.
            continue
        weighted_child_score += metric(child) * childsum / parentsum

    gain = round((parentscore - weighted_child_score), 2)
    print('Information gain: {}'.format(gain))
    return gain


# Parent node
parent_node = {'Red': 3, 'Blue': 4, 'Green': 5}

# Let's say after the split nodes are
node1 = {'Red': 3, 'Blue': 4}
node2 = {'Green': 5}

gini_gain = infogain(parent_node, [node1, node2], 'gini')
entropy_gain = infogain(parent_node, [node1, node2], 'entropy')
International Corporate Trainer | Technical Consultant | Data Scientist | IoT Solution Architect | Blockchain Architect | Industry 4.0 Architect
Thursday, 16 May 2019
Decision Tree (DT) maths
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment