Thursday 16 May 2019

Decision tree (DT) maths

# Let's create functions to calculate gini and entropy scores

# Imports
from math import log

# calcpercent returns the total number of samples in a node and the fraction (0 to 1) of each class
def calcpercent(node):
    """Return the sample total of a node and the fraction held by each class.

    node maps each class label to its sample count. The result is a tuple
    (total, fractions) where fractions maps every label to count / total.
    """
    total = sum(node.values())
    fractions = {}
    for label, count in node.items():
        fractions[label] = count / total
    return total, fractions

# giniscore calculates the Gini impurity of a node: 1 - sum(p_i**2) over the class fractions p_i
def giniscore(node):
    """Print and return the Gini impurity of a node, rounded to 3 decimals.

    node maps each class label to its sample count. The impurity is one
    minus the sum of the squared class fractions given by calcpercent.
    """
    # Only the per-class fractions are needed; the sample total is discarded.
    _, fractions = calcpercent(node)
    squared_total = sum(p**2 for p in fractions.values())
    score = round(1 - squared_total, 3)
    print('Gini Score for node {} : {}'.format(node, score))
    return score
    
# entropyscore calculates the entropy of a node: -sum(p_i * log2(p_i)) over the class fractions p_i
def entropyscore(node):
    """Print and return the entropy (base 2) of a node, rounded to 3 decimals.

    node maps each class label to its sample count. The entropy is the sum
    of -p * log2(p) over the class fractions p given by calcpercent.

    Classes with a zero count are skipped: log(0) is undefined and would
    raise ValueError, while by convention 0 * log2(0) contributes nothing
    to the entropy.
    """
    # Only the per-class fractions are needed; the sample total is discarded.
    _, fractions = calcpercent(node)
    score = round(
        sum(-p * log(p, 2) for p in fractions.values() if p != 0),
        3,
    )
    print('Entropy Score for node {} : {}'.format(node, score))
    return score

# infogain calculates the information gain given parent node, child nodes and criterion
def infogain(parent, children, criterion):
    """Print and return the information gain of a split, rounded to 2 decimals.

    parent is the class-count mapping of the node being split, children is
    an iterable of class-count mappings for the resulting nodes, and
    criterion is 'gini' or 'entropy' (any other value raises KeyError).

    The gain is the parent's impurity minus the children's impurities, each
    weighted by that child's share of the parent's samples.
    """
    scorers = {'gini': giniscore, 'entropy': entropyscore}
    impurity = scorers[criterion]

    parent_impurity = impurity(parent)
    parent_total = sum(parent.values())

    # Accumulate each child's impurity scaled by its fraction of the parent.
    weighted_children = 0
    for child in children:
        weighted_children += impurity(child) * sum(child.values()) / parent_total

    gain = round(parent_impurity - weighted_children, 2)
    print('Information gain: {}'.format(gain))
    return gain
# Example: class counts at the parent node before the split
parent_node = dict(Red=3, Blue=4, Green=5)

# Suppose the split sends Red and Blue to one child and Green to the other
node1 = dict(Red=3, Blue=4)
node2 = dict(Green=5)
split_nodes = [node1, node2]

# Information gain of that same split under each impurity criterion
gini_gain = infogain(parent_node, split_nodes, 'gini')
entropy_gain = infogain(parent_node, split_nodes, 'entropy')

No comments:

Post a Comment