Thursday, 16 May 2019

# Gini and entropy compared

Performance-wise, there is not much difference between the entropy and Gini criteria on this task, as the cross-validation scores below show.

## Imports

```python
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Only needed for the commented-out broader comparison further down
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
```

## Load the dataset

The dataset can be found at: https://www.kaggle.com/uciml/sms-spam-collection-dataset

```python
df = pd.read_csv(r'C:\Users\Manish\Desktop\VNR CDC\Day 4\Decision Tree\sms-spam-collection-dataset\spam.csv',
                 encoding='latin-1')

# Keep only the necessary columns and give them readable names
df = df[['v2', 'v1']]
df.columns = ['SMS', 'Type']

# View the top 5 rows of the loaded dataset
df.head()
```

## Process the text data

```python
# Instantiate the count vectorizer: each SMS becomes a bag of 1- to 4-grams,
# keeping only the 1000 most frequent features
countvec = CountVectorizer(ngram_range=(1, 4), stop_words='english',
                           strip_accents='unicode', max_features=1000)
cdf = countvec.fit_transform(df.SMS)
```

## Compare the splitting criteria

```python
# Instantiate one tree per splitting criterion
dt_gini = DecisionTreeClassifier(criterion='gini')
dt_entropy = DecisionTreeClassifier(criterion='entropy')

# ests = {'Logistic Regression': lr, 'Decision tree': dt, 'Random forest': rf, 'Naive Bayes': mnb}
ests = {'Decision tree with gini index': dt_gini, 'Decision tree with entropy': dt_entropy}

for name, est in ests.items():
    # Note: sklearn's decision trees also accept the sparse matrix directly;
    # .toarray() densifies it and costs extra memory
    score = cross_val_score(est, X=cdf.toarray(), y=df.Type.values, cv=5).mean()
    print("{} score: {}%".format(name, round(score * 100, 3)))
    print()
```
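For context on what the two criteria actually compute: over the class proportions p_k in a node, Gini impurity is 1 - sum(p_k^2) and entropy is -sum(p_k * log2(p_k)). Both are zero for a pure node and maximal for an even split, which is why they usually rank candidate splits similarly. A minimal sketch (not from the original post; the function names are mine) comparing the two measures on a few class distributions:

```python
import numpy as np

def gini(p):
    """Gini impurity: 1 - sum(p_k^2) over class proportions p_k."""
    p = np.asarray(p, dtype=float)
    return 1.0 - np.sum(p ** 2)

def entropy(p):
    """Shannon entropy: -sum(p_k * log2(p_k)), with 0*log(0) taken as 0."""
    p = np.asarray(p, dtype=float)
    p = p[p > 0]  # drop zero proportions to avoid log2(0)
    return -np.sum(p * np.log2(p))

# Both measures are 0 for a pure node and largest for a 50/50 split
for dist in ([1.0, 0.0], [0.9, 0.1], [0.5, 0.5]):
    print(dist, "gini={:.3f}".format(gini(dist)), "entropy={:.3f}".format(entropy(dist)))
```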
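One caveat about the evaluation above: the CountVectorizer is fitted on the full corpus before cross-validation, so vocabulary statistics from the held-out folds leak into training. A sketch of a leakage-free variant using sklearn's Pipeline, so the vectorizer is refitted inside each fold (same parameters as above; it assumes the `df` built earlier):

```python
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

for criterion in ('gini', 'entropy'):
    # The vectorizer is a pipeline step, so it sees only the training fold
    pipe = Pipeline([
        ('vec', CountVectorizer(ngram_range=(1, 4), stop_words='english',
                                strip_accents='unicode', max_features=1000)),
        ('tree', DecisionTreeClassifier(criterion=criterion)),
    ])
    score = cross_val_score(pipe, X=df.SMS, y=df.Type.values, cv=5).mean()
    print("Decision tree with {}: {:.3f}%".format(criterion, score * 100))
```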