Thursday, 16 May 2019

Gini and enropy compare

# Performance wise there is not much difference between entropy and gini scores.
# Imports 
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Load Dataset 
# Dataset can be found at: https://www.kaggle.com/uciml/sms-spam-collection-dataset

df = pd.read_csv(r'C:\Users\Manish\Desktop\VNR CDC\Day 4\Decision Tree\sms-spam-collection-dataset\spam.csv', encoding = 'latin-1' )

# Keep only necessary columns
df = df[['v2', 'v1']]

# Rename columns
df.columns = ['SMS', 'Type']
df.head()

# Let's view top 5 rows of the loaded dataset
df.head()

# Let's process the text data 
# Instantiate count vectorizer
countvec = CountVectorizer(ngram_range=(1,4), stop_words='english',  strip_accents='unicode', max_features=1000)
cdf = countvec.fit_transform(df.SMS)

# Instantiate algos
dt_gini = DecisionTreeClassifier(criterion='gini')
dt_entropy = DecisionTreeClassifier(criterion='entropy')

# ests = {'Logistic Regression':lr,'Decision tree': dt,'Random forest': rf, 'Naive Bayes': mnb}
ests = {'Decision tree with gini index': dt_gini, 'Decision tree with entropy': dt_entropy}

for est in ests:
    print("{} score: {}%".format(est, round(cross_val_score(ests[est],X=cdf.toarray(), y=df.Type.values, cv=5).mean()*100, 3)))
    print("\n")

No comments:

Post a Comment