In [1]:
import os
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from time import time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score , classification_report
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report
In [22]:
index=[]
board = ['a','b','c','d','e','f','g']
for i in board:
    for j in range(6):
        index.append(i + str(j+1))
In [26]:
column_names  = index +['Class']
In [28]:
# read .csv from provided dataset
csv_filename="connect-4.data"

# df=pd.read_csv(csv_filename,index_col=0)
df=pd.read_csv(csv_filename, 
              names= column_names)
In [29]:
df.head()
Out[29]:
a1 a2 a3 a4 a5 a6 b1 b2 b3 b4 ... f4 f5 f6 g1 g2 g3 g4 g5 g6 Class
0 b b b b b b b b b b ... b b b b b b b b b win
1 b b b b b b b b b b ... b b b b b b b b b win
2 b b b b b b o b b b ... b b b b b b b b b win
3 b b b b b b b b b b ... b b b b b b b b b win
4 o b b b b b b b b b ... b b b b b b b b b win

5 rows × 43 columns

In [30]:
df['Class'].unique()
Out[30]:
array(['win', 'draw', 'loss'], dtype=object)
In [31]:
#Convert animal labels to numbers
le = preprocessing.LabelEncoder()
for col in df.columns:
    df[col] = le.fit_transform(df[col])
In [32]:
df.head()
Out[32]:
a1 a2 a3 a4 a5 a6 b1 b2 b3 b4 ... f4 f5 f6 g1 g2 g3 g4 g5 g6 Class
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 2
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 2
2 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 2
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 2
4 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 2

5 rows × 43 columns

In [15]:
for col in df.columns:
    df[col] = pd.get_dummies(df[col])
In [16]:
df.head()
Out[16]:
TopLeft TopMiddle TopRight MiddleLeft MiddleMiddle MiddleRight BottomLeft BottomMiddle BottomRight Class
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0
In [35]:
X = df[index]
y = df['Class']
In [36]:
X.head()
Out[36]:
a1 a2 a3 a4 a5 a6 b1 b2 b3 b4 ... f3 f4 f5 f6 g1 g2 g3 g4 g5 g6
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 42 columns

In [37]:
# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
In [38]:
print X_train.shape, y_train.shape
(40534, 42) (40534L,)

Feature importances with forests of trees

This examples shows the use of forests of trees to evaluate the importance of features on an artificial classification task. The red bars are the feature importances of the forest, along with their inter-trees variability.

In [25]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier

# Build a classification task using 3 informative features

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d - %s (%f) " % (f + 1, indices[f], features[indices[f]], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure(num=None, figsize=(14, 10), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
Feature ranking:
1. feature 1 - TopMiddle (0.127575) 
2. feature 5 - MiddleRight (0.125550) 
3. feature 0 - TopLeft (0.117647) 
4. feature 6 - BottomLeft (0.115633) 
5. feature 3 - MiddleLeft (0.113785) 
6. feature 8 - BottomRight (0.112243) 
7. feature 2 - TopRight (0.107885) 
8. feature 7 - BottomMiddle (0.107579) 
9. feature 4 - MiddleMiddle (0.072104) 
In [26]:
importances[indices[:5]]
Out[26]:
array([ 0.12757459,  0.12554975,  0.11764683,  0.11563324,  0.11378505])
In [27]:
for f in range(5):
    print("%d. feature %d - %s (%f)" % (f + 1, indices[f], features[indices[f]] ,importances[indices[f]]))
1. feature 1 - TopMiddle (0.127575)
2. feature 5 - MiddleRight (0.125550)
3. feature 0 - TopLeft (0.117647)
4. feature 6 - BottomLeft (0.115633)
5. feature 3 - MiddleLeft (0.113785)
In [28]:
best_features = []
for i in indices[:5]:
    best_features.append(features[i])
In [29]:
# Plot the top 5 feature importances of the forest
plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(5), importances[indices][:5], 
       color="r",  yerr=std[indices][:5], align="center")
plt.xticks(range(5), best_features)
plt.xlim([-1, 5])
plt.show()

Decision Tree accuracy and time elapsed caculation

In [39]:
t0=time()
print "DecisionTree"

dt = DecisionTreeClassifier(min_samples_split=20,random_state=99)
# dt = DecisionTreeClassifier(min_samples_split=20,max_depth=5,random_state=99)

clf_dt=dt.fit(X_train,y_train)

print "Acurracy: ", clf_dt.score(X_test,y_test)
t1=time()
print "time elapsed: ", t1-t0
DecisionTree
Acurracy:  0.735373570662
time elapsed:  0.519000053406

cross validation for DT

In [41]:
tt0=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X, y, cv=5)
print scores
print scores.mean()
tt1=time()
print "time elapsed: ", tt1-tt0
print "\n"
cross result========
[ 0.66311427  0.5490675   0.46765838  0.50766042  0.39059956]
0.515620024564
time elapsed:  3.68099999428


Tuning our hyperparameters using GridSearch

In [42]:
from sklearn.metrics import classification_report

pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy'))
])

parameters = {
    'clf__max_depth': (5, 25 , 50),
    'clf__min_samples_split': (1, 5, 10),
    'clf__min_samples_leaf': (1, 2, 3)
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'

best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])

predictions = grid_search.predict(X_test)


print classification_report(y_test, predictions)
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   24.7s finished
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best score: 0.713
Best parameters set:
	clf__max_depth: 25
	clf__min_samples_leaf: 1
	clf__min_samples_split: 1
             precision    recall  f1-score   support

          0       0.24      0.26      0.25      2559
          1       0.62      0.61      0.61      6623
          2       0.83      0.83      0.83     17841

avg / total       0.73      0.72      0.72     27023

Exporting Decision Tree to an Image

In [43]:
t0=time()
print "DecisionTree"

dt1 = DecisionTreeClassifier(min_samples_split=1,max_depth=25,min_samples_leaf=1, random_state=99)

clf_dt1=dt1.fit(X_train,y_train)

print "Acurracy: ", clf_dt1.score(X_test,y_test)
t1=time()
print "time elapsed: ", t1-t0

export_graphviz(clf_dt1, 
                out_file='tree.dot', 
                feature_names=features)
DecisionTree
Acurracy:  0.723309773156
time elapsed:  0.575999975204

After we have installed GraphViz on our computer, we can convert the tree.dot file into a PNG file by executing the following command from the command line in the location where we saved the tree.dot file:

dot -Tpng tree.dot -o tree.png

Random Forest accuracy and time elapsed caculation

In [44]:
t2=time()
print "RandomForest"
rf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
clf_rf = rf.fit(X_train,y_train)
print "Acurracy: ", clf_rf.score(X_test,y_test)
t3=time()
print "time elapsed: ", t3-t2
RandomForest
Acurracy:  0.811567923621
time elapsed:  3.51699995995

cross validation for RF

In [46]:
tt2=time()
print "cross result========"
scores = cross_validation.cross_val_score(rf, X, y, cv=5)
print scores
print scores.mean()
tt3=time()
print "time elapsed: ", tt3-tt2
print "\n"
cross result========
[ 0.66267022  0.64298401  0.53826229  0.639775    0.47698001]
0.592134306321
time elapsed:  20.986000061


Tuning Models using GridSearch

In [48]:
pipeline2 = Pipeline([
('clf', RandomForestClassifier(criterion='entropy'))
])

parameters = {
    'clf__n_estimators': (5, 25, 50, 100),
    'clf__max_depth': (5, 25 , 50),
    'clf__min_samples_split': (1, 5, 10),
    'clf__min_samples_leaf': (1, 2, 3)
}

grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)

grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_

print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])

predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print classification_report(y_test, predictions)
    
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:   36.3s finished
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best score: 0.810
Best parameters set:
	clf__max_depth: 50
	clf__min_samples_leaf: 1
	clf__min_samples_split: 1
	clf__n_estimators: 50
Accuracy: 0.825520833333
             precision    recall  f1-score   support

        0.0       0.87      0.86      0.87       253
        1.0       0.74      0.76      0.75       131

avg / total       0.83      0.83      0.83       384

Naive Bayes accuracy and time elapsed caculation

In [47]:
t4=time()
print "NaiveBayes"
nb = BernoulliNB()
clf_nb=nb.fit(X_train,y_train)
print "Acurracy: ", clf_nb.score(X_test,y_test)
t5=time()
print "time elapsed: ", t5-t4
NaiveBayes
Acurracy:  0.635606705399
time elapsed:  0.169000148773

cross-validation for NB

In [48]:
tt4=time()
print "cross result========"
scores = cross_validation.cross_val_score(nb, X,y, cv=3)
print scores
print scores.mean()
tt5=time()
print "time elapsed: ", tt5-tt4
print "\n"
cross result========
[ 0.56918295  0.61983214  0.63531397]
0.608109687094
time elapsed:  0.949999809265


KNN accuracy and time elapsed caculation

In [49]:
t6=time()
print "KNN"
# knn = KNeighborsClassifier(n_neighbors=3)
knn = KNeighborsClassifier(n_neighbors=3)
clf_knn=knn.fit(X_train, y_train)
print "Acurracy: ", clf_knn.score(X_test,y_test) 
t7=time()
print "time elapsed: ", t7-t6
KNN
Acurracy:  0.700921437294
time elapsed:  76.8269999027

cross validation for KNN

In [50]:
tt6=time()
print "cross result========"
scores = cross_validation.cross_val_score(knn, X,y, cv=5)
print scores
print scores.mean()
tt7=time()
print "time elapsed: ", tt7-tt6
print "\n"
cross result========
[ 0.65571344  0.56150089  0.52212848  0.53704389  0.48556625]
0.552390588756
time elapsed:  284.812000036


Fine tuning the model using GridSearch

In [92]:
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search

knn = KNeighborsClassifier()

parameters = {'n_neighbors':[1,10]}

grid = grid_search.GridSearchCV(knn, parameters, n_jobs=-1, verbose=1, scoring='accuracy')


grid.fit(X_train, y_train)

print 'Best score: %0.3f' % grid.best_score_

print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Best score: 0.517
Best parameters set:
	n_neighbors: 1
             precision    recall  f1-score   support

          1       0.64      0.56      0.60        16
          2       0.40      0.29      0.33         7
          3       1.00      0.25      0.40         4
          4       0.25      0.12      0.17         8
          5       0.20      1.00      0.33         1
          6       0.00      0.00      0.00         3
          7       0.09      0.50      0.15         2

avg / total       0.47      0.37      0.38        41

[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   12.9s finished

SVM accuracy and time elapsed caculation

In [ ]:
t7=time()
print "SVM"

svc = SVC()
clf_svc=svc.fit(X_train, y_train)
print "Acurracy: ", clf_svc.score(X_test,y_test) 
t8=time()
print "time elapsed: ", t8-t7

cross validation for SVM

In [ ]:
tt7=time()
print "cross result========"
scores = cross_validation.cross_val_score(svc,X,y, cv=5)
print scores
print scores.mean()
tt8=time()
print "time elapsed: ", tt7-tt6
print "\n"
In [ ]:
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search

svc = SVC()

parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

grid = grid_search.GridSearchCV(svc, parameters, n_jobs=-1, verbose=1, scoring='accuracy')


grid.fit(X_train, y_train)

print 'Best score: %0.3f' % grid.best_score_

print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)
In [ ]:
pipeline = Pipeline([
    ('clf', SVC(kernel='linear', gamma=0.01, C=10))
])

parameters = {
    'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
    'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')

grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_

print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid_search.predict(X_test)
print classification_report(y_test, predictions)