Car Evaluation¶

https://archive.ics.uci.edu/ml/datasets/Car+Evaluation

import os
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from time import time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score , classification_report
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report

# read .csv from provided dataset
csv_filename="car.data"

# df=pd.read_csv(csv_filename,index_col=0)
df=pd.read_csv(csv_filename,
              names=["Buying", "Maintenance" , "Doors" , "Persons" , "Lug-Boot" , "Safety", "Class"])

df.head()

#Convert animal labels to numbers
le = preprocessing.LabelEncoder()
df['Class'] = le.fit_transform(df.Class)

df['Class'].unique()

array([2, 0, 3, 1], dtype=int64)

features = list(df.columns)

features.remove('Class')

for f in features:

    #Get binarized columns
    df[f] = pd.get_dummies(df[f])
 
    # Build new array
# train_data = pd.concat([hour, days, district], axis=1)
# train_data['crime']=crime

df.head()

X = df[features]
y = df['Class']

X.describe()

# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)

print X_train.shape, y_train.shape

(1036, 6) (1036L,)

Feature importances with forests of trees¶

This examples shows the use of forests of trees to evaluate the importance of features on an artificial classification task. The red bars are the feature importances of the forest, along with their inter-trees variability.

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier

# Build a classification task using 3 informative features

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d - %s (%f) " % (f + 1, indices[f], features[indices[f]], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure(num=None, figsize=(14, 10), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

Feature ranking:
1. feature 3 - Milk (0.138606) 
2. feature 1 - Feathers (0.138508) 
3. feature 2 - Eggs (0.113904) 
4. feature 7 - Toothed (0.083274) 
5. feature 0 - Hair (0.083251) 
6. feature 8 - Backbone (0.082373) 
7. feature 9 - Breathes (0.076449) 
8. feature 11 - Fins (0.063457) 
9. feature 13 - Tail (0.050104) 
10. feature 12 - Legs (0.048997) 
11. feature 5 - Aquatic (0.035794) 
12. feature 4 - Airborne (0.035151) 
13. feature 16 - animals (0.016738) 
14. feature 15 - Catsize (0.011835) 
15. feature 6 - Predator (0.011787) 
16. feature 10 - Venomous (0.009257) 
17. feature 14 - Domestic (0.000515)

importances[indices[:5]]

array([ 0.13860564,  0.13850845,  0.11390376,  0.083274  ,  0.08325078])

for f in range(5):
    print("%d. feature %d - %s (%f)" % (f + 1, indices[f], features[indices[f]] ,importances[indices[f]]))

1. feature 3 - Milk (0.138606)
2. feature 1 - Feathers (0.138508)
3. feature 2 - Eggs (0.113904)
4. feature 7 - Toothed (0.083274)
5. feature 0 - Hair (0.083251)

best_features = []
for i in indices[:5]:
    best_features.append(features[i])

# Plot the top 5 feature importances of the forest
plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(5), importances[indices][:5], 
       color="r",  yerr=std[indices][:5], align="center")
plt.xticks(range(5), best_features)
plt.xlim([-1, 5])
plt.show()

Decision Tree accuracy and time elapsed caculation¶

t0=time()
print "DecisionTree"

dt = DecisionTreeClassifier(min_samples_split=20,random_state=99)
# dt = DecisionTreeClassifier(min_samples_split=20,max_depth=5,random_state=99)

clf_dt=dt.fit(X_train,y_train)

print "Acurracy: ", clf_dt.score(X_test,y_test)
t1=time()
print "time elapsed: ", t1-t0

DecisionTree
Acurracy:  0.75289017341
time elapsed:  0.00600004196167

cross validation for DT¶

tt0=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X, y, cv=3)
print scores
print scores.mean()
tt1=time()
print "time elapsed: ", tt1-tt0
print "\n"

cross result========
[ 0.67071057  0.68923611  0.75826087]
0.7060691842
time elapsed:  0.0390000343323

Tuning our hyperparameters using GridSearch¶

from sklearn.metrics import classification_report

pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy'))
])

parameters = {
    'clf__max_depth': (5, 25 , 50),
    'clf__min_samples_split': (1, 5, 10),
    'clf__min_samples_leaf': (1, 2, 3)
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'

best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])

predictions = grid_search.predict(X_test)

print classification_report(y_test, predictions)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   17.7s finished

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best score: 0.732
Best parameters set:
	clf__max_depth: 25
	clf__min_samples_leaf: 1
	clf__min_samples_split: 10
             precision    recall  f1-score   support

          0       0.52      0.49      0.50       155
          1       0.00      0.00      0.00        29
          2       0.82      0.93      0.87       480
          3       0.00      0.00      0.00        28

avg / total       0.68      0.75      0.72       692

C:\Miniconda2\lib\site-packages\sklearn\metrics\classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

Random Forest accuracy and time elapsed caculation¶

t2=time()
print "RandomForest"
rf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
clf_rf = rf.fit(X_train,y_train)
print "Acurracy: ", clf_rf.score(X_test,y_test)
t3=time()
print "time elapsed: ", t3-t2

RandomForest
Acurracy:  0.75289017341
time elapsed:  1.30200004578

cross validation for RF¶

tt2=time()
print "cross result========"
scores = cross_validation.cross_val_score(rf, X, y, cv=3)
print scores
print scores.mean()
tt3=time()
print "time elapsed: ", tt3-tt2
print "\n"

cross result========
[ 0.694974    0.67013889  0.74608696]
0.703733282959
time elapsed:  3.81699991226

Tuning Models using GridSearch¶

pipeline2 = Pipeline([
('clf', RandomForestClassifier(criterion='entropy'))
])

parameters = {
    'clf__n_estimators': (5, 25, 50, 100),
    'clf__max_depth': (5, 25 , 50),
    'clf__min_samples_split': (1, 5, 10),
    'clf__min_samples_leaf': (1, 2, 3)
}

grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)

grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_

print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])

predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print classification_report(y_test, predictions)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:   34.5s finished

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best score: 0.769
Best parameters set:
	clf__max_depth: 5
	clf__min_samples_leaf: 1
	clf__min_samples_split: 10
	clf__n_estimators: 50
Accuracy: 0.754335260116
             precision    recall  f1-score   support

          0       0.52      0.46      0.49       155
          1       0.00      0.00      0.00        29
          2       0.81      0.94      0.87       480
          3       0.00      0.00      0.00        28

avg / total       0.68      0.75      0.71       692

Naive Bayes accuracy and time elapsed caculation¶

t4=time()
print "NaiveBayes"
nb = BernoulliNB()
clf_nb=nb.fit(X_train,y_train)
print "Acurracy: ", clf_nb.score(X_test,y_test)
t5=time()
print "time elapsed: ", t5-t4

NaiveBayes
Acurracy:  0.754335260116
time elapsed:  0.460999965668

cross-validation for NB¶

tt4=time()
print "cross result========"
scores = cross_validation.cross_val_score(nb, X,y, cv=3)
print scores
print scores.mean()
tt5=time()
print "time elapsed: ", tt5-tt4
print "\n"

cross result========
[ 0.70710572  0.56423611  0.76      ]
0.67711394345
time elapsed:  0.0479998588562

KNN accuracy and time elapsed caculation¶

t6=time()
print "KNN"
# knn = KNeighborsClassifier(n_neighbors=3)
knn = KNeighborsClassifier(n_neighbors=3)
clf_knn=knn.fit(X_train, y_train)
print "Acurracy: ", clf_knn.score(X_test,y_test) 
t7=time()
print "time elapsed: ", t7-t6

KNN
Acurracy:  0.705202312139
time elapsed:  0.0299999713898

cross validation for KNN¶

tt6=time()
print "cross result========"
scores = cross_validation.cross_val_score(knn, X,y, cv=5)
print scores
print scores.mean()
tt7=time()
print "time elapsed: ", tt7-tt6
print "\n"

cross result========
[ 0.6849711   0.76011561  0.5         0.71676301  0.72674419]
0.677718779406
time elapsed:  0.125

Fine tuning the model using GridSearch¶

from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search

knn = KNeighborsClassifier()

parameters = {'n_neighbors':[1,10]}

grid = grid_search.GridSearchCV(knn, parameters, n_jobs=-1, verbose=1, scoring='accuracy')


grid.fit(X_train, y_train)

print 'Best score: %0.3f' % grid.best_score_

print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
Best score: 0.704
Best parameters set:
	n_neighbors: 10
             precision    recall  f1-score   support

          0       0.54      0.36      0.43       155
          1       0.00      0.00      0.00        29
          2       0.80      0.94      0.86       480
          3       0.46      0.39      0.42        28

avg / total       0.69      0.75      0.71       692

[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   12.9s finished

SVM accuracy and time elapsed caculation¶

t7=time()
print "SVM"

svc = SVC()
clf_svc=svc.fit(X_train, y_train)
print "Acurracy: ", clf_svc.score(X_test,y_test) 
t8=time()
print "time elapsed: ", t8-t7

SVM
Acurracy:  0.763005780347
time elapsed:  0.120000123978

cross validation for SVM¶

tt7=time()
print "cross result========"
scores = cross_validation.cross_val_score(svc,X,y, cv=5)
print scores
print scores.mean()
tt8=time()
print "time elapsed: ", tt7-tt6
print "\n"

cross result========
[ 0.70231214  0.76878613  0.82947977  0.78323699  0.76162791]
0.769088587176
time elapsed:  23.5260000229

from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search

svc = SVC()

parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

grid = grid_search.GridSearchCV(svc, parameters, n_jobs=-1, verbose=1, scoring='accuracy')


grid.fit(X_train, y_train)

print 'Best score: %0.3f' % grid.best_score_

print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best score: 0.773
Best parameters set:
	C: 1
	kernel: 'linear'
             precision    recall  f1-score   support

          0       0.52      0.52      0.52       155
          1       0.00      0.00      0.00        29
          2       0.83      0.93      0.88       480
          3       0.00      0.00      0.00        28

avg / total       0.69      0.76      0.73       692

[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   17.4s finished

pipeline = Pipeline([
    ('clf', SVC(kernel='linear', gamma=0.01, C=10))
])

parameters = {
    'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
    'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')

grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_

print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid_search.predict(X_test)
print classification_report(y_test, predictions)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   18.2s finished

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best score: 0.773
Best parameters set:
	clf__C: 0.1
	clf__gamma: 0.01
             precision    recall  f1-score   support

          0       0.52      0.52      0.52       155
          1       0.00      0.00      0.00        29
          2       0.83      0.93      0.88       480
          3       0.00      0.00      0.00        28

avg / total       0.69      0.76      0.73       692

	Buying	Maintenance	Doors	Persons	Lug-Boot	Safety	Class
0	vhigh	vhigh	2	2	small	low	unacc
1	vhigh	vhigh	2	2	small	med	unacc
2	vhigh	vhigh	2	2	small	high	unacc
3	vhigh	vhigh	2	2	med	low	unacc
4	vhigh	vhigh	2	2	med	med	unacc

	Buying	Maintenance	Doors	Persons	Lug-Boot	Safety
count	1728.000000	1728.000000	1728.000000	1728.000000	1728.000000	1728.000000
mean	0.250000	0.250000	0.250000	0.333333	0.333333	0.333333
std	0.433138	0.433138	0.433138	0.471541	0.471541	0.471541
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
75%	0.250000	0.250000	0.250000	1.000000	1.000000	1.000000
max	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000

	Doors	Persons	Safety	Class
0	1.0	1.0	0.0	2
1	1.0	1.0	0.0	2
2	1.0	1.0	1.0	2
3	1.0	1.0	0.0	2
4	1.0	1.0	0.0	2

	Doors	Persons	Safety	Class
0	1.0	1.0	0.0	2
1	1.0	1.0	0.0	2
2	1.0	1.0	1.0	2
3	1.0	1.0	0.0	2
4	1.0	1.0	0.0	2

	Doors	Persons	Safety	Class
0	1.0	1.0	0.0	2
1	1.0	1.0	0.0	2
2	1.0	1.0	1.0	2
3	1.0	1.0	0.0	2
4	1.0	1.0	0.0	2