In [1]:
import os
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from time import time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score
In [3]:
# read .csv from provided dataset
csv_filename="zoo.data"

# df=pd.read_csv(csv_filename,index_col=0)
df=pd.read_csv(csv_filename, 
              names=["Animal", "Hair" , "Feathers" , "Eggs" , "Milk" , "Airborne",
                     "Aquatic" , "Predator" , "Toothed" , "Backbone", "Breathes" , "Venomous",
                     "Fins", "Legs", "Tail", "Domestic", "Catsize", "Type" ])
In [4]:
df.head()
Out[4]:
Animal Hair Feathers Eggs Milk Airborne Aquatic Predator Toothed Backbone Breathes Venomous Fins Legs Tail Domestic Catsize Type
0 aardvark 1 0 0 1 0 0 1 1 1 1 0 0 4 0 0 1 1
1 antelope 1 0 0 1 0 0 0 1 1 1 0 0 4 1 0 1 1
2 bass 0 0 1 0 0 1 1 1 1 0 0 1 0 1 0 0 4
3 bear 1 0 0 1 0 0 1 1 1 1 0 0 4 0 0 1 1
4 boar 1 0 0 1 0 0 1 1 1 1 0 0 4 1 0 1 1
In [62]:
df.tail()
Out[62]:
Animal Hair Feathers Eggs Milk Airborne Aquatic Predator Toothed Backbone Breathes Venomous Fins Legs Tail Domestic Catsize Type animals
96 wallaby 1 0 0 1 0 0 0 1 1 1 0 0 0.0 1 0 1 1 95
97 wasp 1 0 1 0 1 0 0 0 0 1 1 0 0.0 0 0 0 6 96
98 wolf 1 0 0 1 0 0 1 1 1 1 0 0 0.0 1 0 1 1 97
99 worm 0 0 1 0 0 0 0 0 0 1 0 0 1.0 0 0 0 7 98
100 wren 0 1 1 0 1 0 0 0 1 1 0 0 0.0 1 0 0 2 99
In [6]:
df['Animal'].unique()
Out[6]:
array(['aardvark', 'antelope', 'bass', 'bear', 'boar', 'buffalo', 'calf',
       'carp', 'catfish', 'cavy', 'cheetah', 'chicken', 'chub', 'clam',
       'crab', 'crayfish', 'crow', 'deer', 'dogfish', 'dolphin', 'dove',
       'duck', 'elephant', 'flamingo', 'flea', 'frog', 'fruitbat',
       'giraffe', 'girl', 'gnat', 'goat', 'gorilla', 'gull', 'haddock',
       'hamster', 'hare', 'hawk', 'herring', 'honeybee', 'housefly',
       'kiwi', 'ladybird', 'lark', 'leopard', 'lion', 'lobster', 'lynx',
       'mink', 'mole', 'mongoose', 'moth', 'newt', 'octopus', 'opossum',
       'oryx', 'ostrich', 'parakeet', 'penguin', 'pheasant', 'pike',
       'piranha', 'pitviper', 'platypus', 'polecat', 'pony', 'porpoise',
       'puma', 'pussycat', 'raccoon', 'reindeer', 'rhea', 'scorpion',
       'seahorse', 'seal', 'sealion', 'seasnake', 'seawasp', 'skimmer',
       'skua', 'slowworm', 'slug', 'sole', 'sparrow', 'squirrel',
       'starfish', 'stingray', 'swan', 'termite', 'toad', 'tortoise',
       'tuatara', 'tuna', 'vampire', 'vole', 'vulture', 'wallaby', 'wasp',
       'wolf', 'worm', 'wren'], dtype=object)
In [7]:
#Convert animal labels to numbers
le_animals = preprocessing.LabelEncoder()
df['animals'] = le_animals.fit_transform(df.Animal)
 
#Get binarized Legs column
#NOTE: assigning the get_dummies() output to a single column keeps only its
#first indicator column (Legs == 0); the original leg count is discarded
df['Legs'] = pd.get_dummies(df.Legs)
#types = pd.get_dummies(df.Type)
 
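If we wanted the full one-hot encoding of Legs rather than a single indicator, a sketch along these lines (our addition, not used in the runs below) would keep one dummy column per leg count:

In [ ]:
# sketch: full one-hot encoding of Legs, one column per leg count
legs_dummies = pd.get_dummies(df.Legs, prefix='Legs')
df_onehot = pd.concat([df.drop('Legs', axis=1), legs_dummies], axis=1)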
In [8]:
df.head()
Out[8]:
Animal Hair Feathers Eggs Milk Airborne Aquatic Predator Toothed Backbone Breathes Venomous Fins Legs Tail Domestic Catsize Type animals
0 aardvark 1 0 0 1 0 0 1 1 1 1 0 0 0.0 0 0 1 1 0
1 antelope 1 0 0 1 0 0 0 1 1 1 0 0 0.0 1 0 1 1 1
2 bass 0 0 1 0 0 1 1 1 1 0 0 1 1.0 1 0 0 4 2
3 bear 1 0 0 1 0 0 1 1 1 1 0 0 0.0 0 0 1 1 3
4 boar 1 0 0 1 0 0 1 1 1 1 0 0 0.0 1 0 1 1 4
In [20]:
df['Type'].unique()
Out[20]:
array([1, 4, 2, 7, 6, 5, 3], dtype=int64)
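For reference, the UCI zoo.names documentation maps these Type codes to animal classes; a small lookup table (our addition, not part of the original run) makes later predictions easier to read:

In [ ]:
# Type code -> class name, per the UCI zoo.names documentation
type_names = {1: 'mammal', 2: 'bird', 3: 'reptile', 4: 'fish',
              5: 'amphibian', 6: 'insect', 7: 'invertebrate'}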
In [10]:
features = list(df.columns[1:])
In [11]:
features
Out[11]:
['Hair',
 'Feathers',
 'Eggs',
 'Milk',
 'Airborne',
 'Aquatic',
 'Predator',
 'Toothed',
 'Backbone',
 'Breathes',
 'Venomous',
 'Fins',
 'Legs',
 'Tail',
 'Domestic',
 'Catsize',
 'Type',
 'animals']
In [12]:
features.remove('Type')
In [13]:
X = df[features]
y = df['Type']
In [14]:
X.head()
Out[14]:
Hair Feathers Eggs Milk Airborne Aquatic Predator Toothed Backbone Breathes Venomous Fins Legs Tail Domestic Catsize animals
0 1 0 0 1 0 0 1 1 1 1 0 0 0.0 0 0 1 0
1 1 0 0 1 0 0 0 1 1 1 0 0 0.0 1 0 1 1
2 0 0 1 0 0 1 1 1 1 0 0 1 1.0 1 0 0 2
3 1 0 0 1 0 0 1 1 1 1 0 0 0.0 0 0 1 3
4 1 0 0 1 0 0 1 1 1 1 0 0 0.0 1 0 1 4
In [17]:
# split the dataset into 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
In [18]:
print X_train.shape, y_train.shape
(60, 17) (60L,)
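Some of the seven types have only a handful of animals, so an unstratified 60/40 split can leave a class almost unrepresented in training (the grid-search warnings later in this notebook come from exactly that). A stratified variant (a sketch, assuming this sklearn version supports the stratify parameter):

In [ ]:
# sketch: stratified split that preserves the class proportions of Type
X_train_s, X_test_s, y_train_s, y_test_s = cross_validation.train_test_split(
    X, y, test_size=0.4, random_state=0, stratify=y)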

Feature importances with forests of trees

This example shows the use of forests of trees to evaluate the importance of features on the zoo classification task. The red bars are the feature importances of the forest, along with their inter-tree variability.

In [23]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d - %s (%f) " % (f + 1, indices[f], features[indices[f]], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure(num=None, figsize=(14, 10), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
Feature ranking:
1. feature 3 - Milk (0.138606) 
2. feature 1 - Feathers (0.138508) 
3. feature 2 - Eggs (0.113904) 
4. feature 7 - Toothed (0.083274) 
5. feature 0 - Hair (0.083251) 
6. feature 8 - Backbone (0.082373) 
7. feature 9 - Breathes (0.076449) 
8. feature 11 - Fins (0.063457) 
9. feature 13 - Tail (0.050104) 
10. feature 12 - Legs (0.048997) 
11. feature 5 - Aquatic (0.035794) 
12. feature 4 - Airborne (0.035151) 
13. feature 16 - animals (0.016738) 
14. feature 15 - Catsize (0.011835) 
15. feature 6 - Predator (0.011787) 
16. feature 10 - Venomous (0.009257) 
17. feature 14 - Domestic (0.000515) 
In [24]:
importances[indices[:5]]
Out[24]:
array([ 0.13860564,  0.13850845,  0.11390376,  0.083274  ,  0.08325078])
In [25]:
for f in range(5):
    print("%d. feature %d - %s (%f)" % (f + 1, indices[f], features[indices[f]] ,importances[indices[f]]))
1. feature 3 - Milk (0.138606)
2. feature 1 - Feathers (0.138508)
3. feature 2 - Eggs (0.113904)
4. feature 7 - Toothed (0.083274)
5. feature 0 - Hair (0.083251)
In [70]:
best_features = []
for i in indices[:5]:
    best_features.append(features[i])
In [71]:
# Plot the top 5 feature importances of the forest
plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(5), importances[indices][:5], 
       color="r",  yerr=std[indices][:5], align="center")
plt.xticks(range(5), best_features)
plt.xlim([-1, 5])
plt.show()
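As a quick follow-up (a sketch, not part of the original run), we can check how much signal the top five features alone carry by retraining the forest on just those columns:

In [ ]:
# sketch: accuracy of a forest restricted to the top-5 features
Xb_train, Xb_test, yb_train, yb_test = cross_validation.train_test_split(
    X[best_features], y, test_size=0.4, random_state=0)
forest5 = ExtraTreesClassifier(n_estimators=250, random_state=0)
print "top-5 accuracy: ", forest5.fit(Xb_train, yb_train).score(Xb_test, yb_test)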

Decision Tree accuracy and time elapsed calculation

In [26]:
t0=time()
print "DecisionTree"

dt = DecisionTreeClassifier(min_samples_split=20,random_state=99)
# dt = DecisionTreeClassifier(min_samples_split=20,max_depth=5,random_state=99)

clf_dt=dt.fit(X_train,y_train)

print "Acurracy: ", clf_dt.score(X_test,y_test)
t1=time()
print "time elapsed: ", t1-t0
DecisionTree
Accuracy:  0.80487804878
time elapsed:  0.018000125885

Cross validation for DT

In [27]:
tt0=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X, y, cv=3)
print scores
print scores.mean()
tt1=time()
print "time elapsed: ", tt1-tt0
print "\n"
cross result========
[ 0.81081081  0.82352941  0.9       ]
0.844780074192
time elapsed:  0.0289998054504


In [54]:
a = np.array([0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 2, 1, 0, 2, 99]).reshape(1, -1)  # one sample, 17 features
In [60]:
x = dt.predict(a)
In [61]:
x
Out[61]:
array([2], dtype=int64)

In [65]:
b = dt.predict_proba(a)
In [66]:
b
Out[66]:
array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.]])
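Read with the type_names lookup added earlier (our addition): the predicted class 2 is 'bird', and predict_proba puts all of the probability mass on that class:

In [ ]:
print type_names[x[0]]   # -> 'bird'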

Tuning our hyperparameters using GridSearch

In [146]:
from sklearn.metrics import classification_report

pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy'))
])

parameters = {
    'clf__max_depth': (5, 25, 50),
    'clf__min_samples_split': (1, 5, 10),
    'clf__min_samples_leaf': (1, 2, 3)
}

# scoring='accuracy' rather than 'f1': plain 'f1' scoring is binary-only in
# this sklearn version, and Type has seven classes
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'

best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])

predictions = grid_search.predict(X_test)

print classification_report(y_test, predictions)
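To see how the candidate settings compared (a sketch using this sklearn version's grid_scores_ attribute; our addition, not part of the original run):

In [ ]:
# sketch: mean and spread of the CV score for each parameter setting tried
for params, mean_score, cv_scores in grid_search.grid_scores_:
    print "%0.3f (+/-%0.3f) for %r" % (mean_score, cv_scores.std(), params)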

Random Forest accuracy and time elapsed calculation

In [76]:
t2=time()
print "RandomForest"
rf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
clf_rf = rf.fit(X_train,y_train)
print "Acurracy: ", clf_rf.score(X_test,y_test)
t3=time()
print "time elapsed: ", t3-t2
RandomForest
Accuracy:  0.90243902439
time elapsed:  1.07899999619

Cross validation for RF

In [78]:
tt2=time()
print "cross result========"
scores = cross_validation.cross_val_score(rf, X, y, cv=3)
print scores
print scores.mean()
tt3=time()
print "time elapsed: ", tt3-tt2
print "\n"
cross result========
[ 0.97297297  0.91176471  0.93333333]
0.939357004063
time elapsed:  3.70700001717


Tuning Models using GridSearch

In [ ]:
pipeline2 = Pipeline([
    ('clf', RandomForestClassifier(criterion='entropy'))
])

parameters = {
    'clf__n_estimators': (5, 25, 50, 100),
    'clf__max_depth': (5, 25, 50),
    'clf__min_samples_split': (1, 5, 10),
    'clf__min_samples_leaf': (1, 2, 3)
}

grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)

grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_

print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])

predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print classification_report(y_test, predictions)
    
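Because each tree in a random forest is fit on a bootstrap sample, the out-of-bag (OOB) score gives a cross-validation-like estimate without a separate hold-out; a sketch (our addition, not part of the original run):

In [ ]:
# sketch: out-of-bag estimate as a cheap alternative to cross validation
rf_oob = RandomForestClassifier(n_estimators=100, oob_score=True, n_jobs=-1)
rf_oob.fit(X, y)
print "OOB score: ", rf_oob.oob_score_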

Naive Bayes accuracy and time elapsed calculation

In [81]:
t4=time()
print "NaiveBayes"
nb = BernoulliNB()
clf_nb=nb.fit(X_train,y_train)
print "Acurracy: ", clf_nb.score(X_test,y_test)
t5=time()
print "time elapsed: ", t5-t4
NaiveBayes
Accuracy:  0.878048780488
time elapsed:  0.0820000171661
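One caveat: BernoulliNB treats every feature as binary, thresholding at its default binarize=0.0, so the label-encoded animals column (0-100) collapses to a nearly constant indicator. A sketch (our addition) scoring the model without that column:

In [ ]:
# sketch: BernoulliNB without the label-encoded animals column,
# which binarize=0.0 reduces to a nearly constant feature
features_bin = [f for f in features if f != 'animals']
scores = cross_validation.cross_val_score(BernoulliNB(), X[features_bin], y, cv=3)
print scores.mean()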

Cross validation for NB

In [83]:
tt4=time()
print "cross result========"
scores = cross_validation.cross_val_score(nb, X,y, cv=3)
print scores
print scores.mean()
tt5=time()
print "time elapsed: ", tt5-tt4
print "\n"
cross result========
[ 0.97297297  0.94117647  0.9       ]
0.93804981452
time elapsed:  0.0400002002716


KNN accuracy and time elapsed calculation

In [86]:
t6=time()
print "KNN"
knn = KNeighborsClassifier(n_neighbors=3)
clf_knn=knn.fit(X_train, y_train)
print "Acurracy: ", clf_knn.score(X_test,y_test) 
t7=time()
print "time elapsed: ", t7-t6
KNN
Accuracy:  0.317073170732
time elapsed:  0.00500011444092

Cross validation for KNN

In [87]:
tt6=time()
print "cross result========"
scores = cross_validation.cross_val_score(knn, X,y, cv=5)
print scores
print scores.mean()
tt7=time()
print "time elapsed: ", tt7-tt6
print "\n"
cross result========
[ 0.18181818  0.28571429  0.33333333  0.31578947  0.16666667]
0.256664388243
time elapsed:  0.0609998703003
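KNN's poor showing here is largely a scaling artifact: Euclidean distance is dominated by the label-encoded animals column (0-100) sitting next to 0/1 features. A sketch (our addition) that standardizes the features in a pipeline first:

In [ ]:
# sketch: standardize features so no single column dominates the distance
knn_scaled = Pipeline([
    ('scale', preprocessing.StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=3))
])
print cross_validation.cross_val_score(knn_scaled, X, y, cv=5).mean()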


Fine-tuning the model using GridSearch

In [92]:
from sklearn import grid_search

knn = KNeighborsClassifier()

parameters = {'n_neighbors':[1,10]}

grid = grid_search.GridSearchCV(knn, parameters, n_jobs=-1, verbose=1, scoring='accuracy')


grid.fit(X_train, y_train)

print 'Best score: %0.3f' % grid.best_score_

print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Best score: 0.517
Best parameters set:
	n_neighbors: 1
             precision    recall  f1-score   support

          1       0.64      0.56      0.60        16
          2       0.40      0.29      0.33         7
          3       1.00      0.25      0.40         4
          4       0.25      0.12      0.17         8
          5       0.20      1.00      0.33         1
          6       0.00      0.00      0.00         3
          7       0.09      0.50      0.15         2

avg / total       0.47      0.37      0.38        41

[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   12.9s finished

SVM accuracy and time elapsed calculation

In [88]:
t7=time()
print "SVM"

svc = SVC()
clf_svc=svc.fit(X_train, y_train)
print "Acurracy: ", clf_svc.score(X_test,y_test) 
t8=time()
print "time elapsed: ", t8-t7
SVM
Accuracy:  0.365853658537
time elapsed:  0.0490000247955

Cross validation for SVM

In [90]:
tt7=time()
print "cross result========"
scores = cross_validation.cross_val_score(svc,X,y, cv=5)
print scores
print scores.mean()
tt8=time()
print "time elapsed: ", tt7-tt6
print "\n"
cross result========
[ 0.40909091  0.23809524  0.38095238  0.42105263  0.38888889]
0.367616009721
time elapsed:  32.3789999485


In [91]:
from sklearn.svm import SVC
from sklearn import grid_search

svc = SVC()

parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

grid = grid_search.GridSearchCV(svc, parameters, n_jobs=-1, verbose=1, scoring='accuracy')


grid.fit(X_train, y_train)

print 'Best score: %0.3f' % grid.best_score_

print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)
C:\Miniconda2\lib\site-packages\sklearn\cross_validation.py:516: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=3.
  % (min_labels, self.n_folds)), Warning)
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   17.5s finished
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best score: 0.850
Best parameters set:
	C: 10
	kernel: 'linear'
             precision    recall  f1-score   support

          1       1.00      1.00      1.00        16
          2       1.00      1.00      1.00         7
          3       1.00      0.25      0.40         4
          4       0.89      1.00      0.94         8
          5       0.33      1.00      0.50         1
          6       1.00      1.00      1.00         3
          7       1.00      1.00      1.00         2

avg / total       0.96      0.93      0.92        41

In [93]:
pipeline = Pipeline([
    ('clf', SVC(kernel='linear', gamma=0.01, C=10))
])

parameters = {
    'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
    'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')

grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_

print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid_search.predict(X_test)
print classification_report(y_test, predictions)
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   18.3s finished
Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best score: 0.867
Best parameters set:
	clf__C: 0.3
	clf__gamma: 0.01
             precision    recall  f1-score   support

          1       1.00      1.00      1.00        16
          2       0.88      1.00      0.93         7
          3       0.00      0.00      0.00         4
          4       0.89      1.00      0.94         8
          5       0.33      1.00      0.50         1
          6       1.00      0.67      0.80         3
          7       0.67      1.00      0.80         2

avg / total       0.83      0.88      0.84        41

C:\Miniconda2\lib\site-packages\sklearn\metrics\classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
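One caveat on this last search: the pipeline fixes kernel='linear', and the linear kernel ignores gamma entirely, so the clf__gamma grid is inert (the "best" 0.01 is simply the first value tried) and only C is really being tuned. A sketch (our addition) that puts the kernel itself in the grid, so gamma becomes meaningful in the RBF case:

In [ ]:
# sketch: search over the kernel too, so gamma matters for the RBF case
parameters = {
    'clf__kernel': ('linear', 'rbf'),
    'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
    'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}
grid_search = GridSearchCV(Pipeline([('clf', SVC())]), parameters,
                           n_jobs=-1, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)
print grid_search.best_params_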