https://archive.ics.uci.edu/ml/datasets/Energy+efficiency
The dataset contains eight attributes (or features, denoted by X1...X8) and two responses (or outcomes, denoted by y1 and y2). The aim is to use the eight features to predict each of the two responses.
Specifically:
import os
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from time import time
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score , classification_report
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report
# read .csv from provided dataset
excel_filename="ENB2012_data.xlsx"
# df=pd.read_csv(csv_filename,index_col=0)
df=pd.read_excel(excel_filename)
df.head()
df.describe()
# handle Y1 and Y2 attrubte to binary [High Hl and CL; and Low HL and Cl]
high = df.Y1 >= 20
low = df.Y1 < 20
df.loc[high,'Y1'] = 1
df.loc[low,'Y1'] = 0
high2 = df.Y2 >= 20
low2 = df.Y2 < 20
df.loc[high2,'Y2'] = 1
df.loc[low2,'Y2'] = 0
df.head()
df.describe()
features=list(df.columns[:-2])
X = df[features]
y1 = df['Y1']
y2 = df['Y2']
y2.unique()
# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y1, test_size=0.4, random_state=0)
X_train2, X_test2, y_train2, y_test2 = cross_validation.train_test_split(X,y2, test_size=0.4, random_state=0)
print X_train.shape, y_train.shape, X_train2.shape, y_train2.shape
This example shows the use of forests of trees to evaluate the importance of features on this classification task. The red bars are the feature importances of the forest, along with their inter-tree variability.
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
# Build a classification task using 3 informative features
# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
random_state=0)
forest.fit(X, y1)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
print("%d. feature %d - %s (%f) " % (f + 1, indices[f], features[indices[f]], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure(num=None, figsize=(14, 10), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
importances[indices[:5]]
for f in range(5):
print("%d. feature %d - %s (%f)" % (f + 1, indices[f], features[indices[f]] ,importances[indices[f]]))
best_features = []
for i in indices[:5]:
best_features.append(features[i])
# Plot the top 5 feature importances of the forest
plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(5), importances[indices][:5],
color="r", yerr=std[indices][:5], align="center")
plt.xticks(range(5), best_features)
plt.xlim([-1, 5])
plt.show()
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
# Build a classification task using 3 informative features
# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
random_state=0)
forest.fit(X, y2)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
print("%d. feature %d - %s (%f) " % (f + 1, indices[f], features[indices[f]], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure(num=None, figsize=(14, 10), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
importances[indices[:5]]
for f in range(5):
print("%d. feature %d - %s (%f)" % (f + 1, indices[f], features[indices[f]] ,importances[indices[f]]))
best_features = []
for i in indices[:5]:
best_features.append(features[i])
# Plot the top 5 feature importances of the forest
plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(5), importances[indices][:5],
color="r", yerr=std[indices][:5], align="center")
plt.xticks(range(5), best_features)
plt.xlim([-1, 5])
plt.show()
t0=time()
print "DecisionTree"
dt = DecisionTreeClassifier(min_samples_split=20,random_state=99)
dt2 = DecisionTreeClassifier(min_samples_split=20,random_state=99)
# dt = DecisionTreeClassifier(min_samples_split=20,max_depth=5,random_state=99)
clf_dt=dt.fit(X_train,y_train)
clf_dt2=dt2.fit(X_train2,y_train2)
print "Acurracy for Y1 ", clf_dt.score(X_test,y_test)
print "Acurracy for Y2 ", clf_dt2.score(X_test2,y_test2)
t1=time()
print "time elapsed: ", t1-t0
tt0=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y1, cv=5)
scores2 = cross_validation.cross_val_score(dt2, X,y2, cv=5)
print scores
print scores.mean()
print scores2
print scores2.mean()
tt1=time()
print "time elapsed: ", tt1-tt0
print "\n"
from sklearn.metrics import classification_report
pipeline = Pipeline([
('clf', DecisionTreeClassifier(criterion='entropy'))
])
parameters = {
'clf__max_depth': (5, 25 , 50),
'clf__min_samples_split': (1, 5, 10),
'clf__min_samples_leaf': (1, 2, 3)
}
# For Y1
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)
# For Y2
grid_search2 = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search2.fit(X_train2, y_train2)
print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid_search.predict(X_test)
print classification_report(y_test, predictions)
print 'Best score: %0.3f' % grid_search2.best_score_
print 'Best parameters set:'
best_parameters2 = grid_search2.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters2[param_name])
predictions2 = grid_search2.predict(X_test2)
print classification_report(y_test2, predictions2)
t2=time()
print "RandomForest"
rf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
rf2 = RandomForestClassifier(n_estimators=100,n_jobs=-1)
clf_rf = rf.fit(X_train,y_train)
clf_rf2 = rf2.fit(X_train2,y_train2)
print "Acurracy for Y1: ", clf_rf.score(X_test,y_test)
print "Acurracy for Y2: ", clf_rf2.score(X_test2,y_test2)
t3=time()
print "time elapsed: ", t3-t2
tt2=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y1, cv=5)
scores2 = cross_validation.cross_val_score(dt, X,y2, cv=5)
print scores
print scores.mean()
print scores2
print scores2.mean()
tt3=time()
print "time elapsed: ", tt3-tt2
print "\n"
roc_auc_score(y_test2,rf2.predict(X_test2))
pipeline2 = Pipeline([
('clf', RandomForestClassifier(criterion='entropy'))
])
parameters = {
'clf__n_estimators': (5, 25, 50, 100),
'clf__max_depth': (5, 25 , 50),
'clf__min_samples_split': (1, 5, 10),
'clf__min_samples_leaf': (1, 2, 3)
}
grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)
grid_search.fit(X_train, y_train)
print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print classification_report(y_test, predictions)
pipeline2 = Pipeline([
('clf', RandomForestClassifier(criterion='entropy'))
])
parameters = {
'clf__n_estimators': (5, 25, 50, 100),
'clf__max_depth': (5, 25 , 50),
'clf__min_samples_split': (1, 5, 10),
'clf__min_samples_leaf': (1, 2, 3)
}
grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)
grid_search.fit(X_train2, y_train2)
print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid_search.predict(X_test2)
print 'Accuracy:', accuracy_score(y_test2, predictions)
print classification_report(y_test2, predictions)
t4=time()
print "NaiveBayes"
nb = BernoulliNB()
nb2 = BernoulliNB()
clf_nb=nb.fit(X_train,y_train)
clf_nb2=nb2.fit(X_train2,y_train2)
print "Acurracy for Y1: ", clf_nb.score(X_test,y_test)
print "Acurracy for Y2: ", clf_nb2.score(X_test2,y_test2)
t5=time()
print "time elapsed: ", t5-t4
tt4=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y1, cv=5)
scores2 = cross_validation.cross_val_score(dt, X,y2, cv=5)
print scores
print scores.mean()
print scores2
print scores2.mean()
tt5=time()
print "time elapsed: ", tt5-tt4
print "\n"
t6=time()
print "KNN"
# knn = KNeighborsClassifier(n_neighbors=3)
knn = KNeighborsClassifier()
knn2 = KNeighborsClassifier()
clf_knn=knn.fit(X_train, y_train)
clf_knn2=knn2.fit(X_train2, y_train2)
print "Acurracy for Y1: ", clf_knn.score(X_test,y_test)
print "Acurracy for Y2: ", clf_knn2.score(X_test2,y_test2)
t7=time()
print "time elapsed: ", t7-t6
tt6=time()
print "cross result========"
scores = cross_validation.cross_val_score(knn, X,y1, cv=5)
scores2 = cross_validation.cross_val_score(knn2, X,y2, cv=5)
print scores
print scores.mean()
print scores2
print scores2.mean()
tt7=time()
print "time elapsed: ", tt7-tt6
print "\n"
t7=time()
print "SVM"
svc = SVC()
svc2 = SVC()
clf_svc=svc.fit(X_train, y_train)
clf_svc2=svc2.fit(X_train2, y_train2)
print "Acurracy for Y1: ", clf_svc.score(X_test,y_test)
print "Acurracy for Y2: ", clf_svc2.score(X_test2,y_test2)
t8=time()
print "time elapsed: ", t8-t7
tt7=time()
print "cross result========"
#scores = cross_validation.cross_val_score(dt, X,y, cv=5)
#print scores
#print scores.mean()
scores = cross_validation.cross_val_score(dt, X,y2, cv=5)
print scores2
print scores2.mean()
tt8=time()
print "time elapsed: ", tt7-tt6
print "\n"
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search
svc = SVC()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
grid = grid_search.GridSearchCV(svc, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid.fit(X_train, y_train)
print 'Best score: %0.3f' % grid.best_score_
print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search
svc = SVC()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
grid = grid_search.GridSearchCV(svc, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid.fit(X_train2, y_train2)
print 'Best score: %0.3f' % grid.best_score_
print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid.predict(X_test2)
print classification_report(y_test2, predictions)
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search
pipeline = Pipeline([
('clf', SVC(kernel='linear', gamma=0.01, C=10))
])
parameters = {
'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}
grid = grid_search.GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid.fit(X_train, y_train)
print 'Best score: %0.3f' % grid.best_score_
print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search
pipeline = Pipeline([
('clf', SVC(kernel='linear', gamma=0.01, C=10))
])
parameters = {
'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}
grid = grid_search.GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid.fit(X_train2, y_train2)
print 'Best score: %0.3f' % grid.best_score_
print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid.predict(X_test2)
print classification_report(y_test2, predictions)
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn. cross_validation import cross_val_score
from sklearn.feature_selection import *
from sklearn import metrics
# read .csv from provided dataset
excel_filename="ENB2012_data.xlsx"
# df=pd.read_csv(csv_filename,index_col=0)
df=pd.read_excel(excel_filename)
df.head()
cols = df.columns
features=list(df.columns[:-2])
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', context='notebook')
sns.pairplot(df, size=2.5);
plt.tight_layout()
# plt.savefig('./figures/scatter.png', dpi=300)
plt.show()
import numpy as np

# Correlation heat-map over all attributes and both responses.
cm = np.corrcoef(df.values.T)
sns.set(font_scale=1.5)
hm = sns.heatmap(cm,
                 cbar=True,
                 annot=True,
                 square=True,
                 fmt='.2f',
                 annot_kws={'size': 15},
                 yticklabels=cols,
                 xticklabels=cols)
plt.tight_layout()
plt.show()

# Plain numpy arrays for the regression models below.
X = df[features].values
y1 = df['Y1'].values
y2 = df['Y2'].values
def lin_regplot(X, y, model):
    """Scatter the data points and overlay the model's fitted line in red."""
    plt.scatter(X, y, c='lightblue')
    plt.plot(X, model.predict(X), color='red', linewidth=2)
    return
from sklearn.cross_validation import train_test_split

# 70/30 split on the continuous targets, one split per response.
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.3, random_state=0)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.3, random_state=0)

# Ordinary least squares on Y1.
slr = LinearRegression()
slr.fit(X_train, y_train)
y_train_pred = slr.predict(X_train)
y_test_pred = slr.predict(X_test)

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),
                                       mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),
                                       r2_score(y_test, y_test_pred)))
# Ordinary least squares on Y2.
slr2 = LinearRegression()
slr2.fit(X_train2, y_train2)
# BUG FIX: the original predicted with the Y1 model (slr); use slr2 so the
# reported errors actually belong to the Y2 fit.
y_train_pred = slr2.predict(X_train2)
y_test_pred = slr2.predict(X_test2)
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
print('MSE train: %.3f, test: %.3f' % (
    mean_squared_error(y_train2, y_train_pred),
    mean_squared_error(y_test2, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
    r2_score(y_train2, y_train_pred),
    r2_score(y_test2, y_test_pred)))
from sklearn.linear_model import Lasso

# L1-penalised linear model on Y1; the sparse coefficient vector shows
# which attributes survive the regularisation.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)
print(lasso.coef_)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),
                                       mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),
                                       r2_score(y_test, y_test_pred)))
Similarly, Ridge regression can be used:
from sklearn.linear_model import Ridge

# L2-penalised linear model on Y1.
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
y_train_pred = ridge.predict(X_train)
y_test_pred = ridge.predict(X_test)
print(ridge.coef_)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),
                                       mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),
                                       r2_score(y_test, y_test_pred)))
Lastly, the ElasticNet implementation allows us to vary the L1 to L2 ratio:
from sklearn.linear_model import ElasticNet

# Mixed L1/L2 penalty on Y1 (l1_ratio balances the two).
en = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X_train, y_train)
y_train_pred = en.predict(X_train)
y_test_pred = en.predict(X_test)
print(en.coef_)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),
                                       mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),
                                       r2_score(y_test, y_test_pred)))
A Lasso Regression model can be initialized as follows:
from sklearn.linear_model import Lasso

# L1-penalised linear model on Y2.
lasso = Lasso(alpha=0.1).fit(X_train2, y_train2)
y_train_pred = lasso.predict(X_train2)
y_test_pred = lasso.predict(X_test2)
print(lasso.coef_)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train2, y_train_pred),
                                       mean_squared_error(y_test2, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train2, y_train_pred),
                                       r2_score(y_test2, y_test_pred)))
Similarly, Ridge regression can be used:
from sklearn.linear_model import Ridge

# L2-penalised linear model on Y2.
ridge = Ridge(alpha=1.0).fit(X_train2, y_train2)
y_train_pred = ridge.predict(X_train2)
y_test_pred = ridge.predict(X_test2)
print(ridge.coef_)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train2, y_train_pred),
                                       mean_squared_error(y_test2, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train2, y_train_pred),
                                       r2_score(y_test2, y_test_pred)))
Lastly, the ElasticNet implementation allows us to vary the L1 to L2 ratio:
from sklearn.linear_model import ElasticNet

# Mixed L1/L2 penalty on Y2.
en = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X_train2, y_train2)
y_train_pred = en.predict(X_train2)
y_test_pred = en.predict(X_test2)
print(en.coef_)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train2, y_train_pred),
                                       mean_squared_error(y_test2, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train2, y_train_pred),
                                       r2_score(y_test2, y_test_pred)))
For example, if we set l1_ratio to 1.0, the ElasticNet regressor would be equal to LASSO regression.
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(max_depth=3)
tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
print "For Y1:"
print('MSE train: %.3f, test: %.3f' % (
mean_squared_error(y_train, y_train_pred),
mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
r2_score(y_train, y_train_pred),
r2_score(y_test, y_test_pred)))
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(max_depth=3)
tree.fit(X_train2, y_train2)
y_train_pred = tree.predict(X_train2)
y_test_pred = tree.predict(X_test2)
print "For Y2:"
print('MSE train: %.3f, test: %.3f' % (
mean_squared_error(y_train2, y_train_pred),
mean_squared_error(y_test2, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
r2_score(y_train2, y_train_pred),
r2_score(y_test2, y_test_pred)))
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(n_estimators=1000,
criterion='mse',
random_state=1,
n_jobs=-1)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)
print "For Y1 :"
print('MSE train: %.3f, test: %.3f' % (
mean_squared_error(y_train, y_train_pred),
mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
r2_score(y_train, y_train_pred),
r2_score(y_test, y_test_pred)))
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(n_estimators=1000,
criterion='mse',
random_state=1,
n_jobs=-1)
forest.fit(X_train2, y_train2)
y_train_pred = forest.predict(X_train2)
y_test_pred = forest.predict(X_test2)
print "For Y2 : "
print('MSE train: %.3f, test: %.3f' % (
mean_squared_error(y_train2, y_train_pred),
mean_squared_error(y_test2, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
r2_score(y_train2, y_train_pred),
r2_score(y_test2, y_test_pred)))
# Residual plot for the Y2 random-forest model fitted just above.
# BUG FIX: the original subtracted the Y1 targets (y_train / y_test) from
# Y2 predictions, so the residuals were meaningless; use the matching Y2
# split (y_train2 / y_test2) throughout.
plt.scatter(y_train_pred,
            y_train_pred - y_train2,
            c='black',
            marker='o',
            s=35,
            alpha=0.5,
            label='Training data')
plt.scatter(y_test_pred,
            y_test_pred - y_test2,
            c='lightgreen',
            marker='s',
            s=35,
            alpha=0.7,
            label='Test data')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=min(y_train2.min(), y_test2.min()), xmax=max(y_train2.max(), y_test2.max()), lw=2, color='red')
plt.xlim([min(y_train2.min(), y_test2.min()), max(y_train2.max(), y_test2.max())])
plt.tight_layout()
plt.show()
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_predictions = regressor.predict(X_test)
print 'R-squared for Y1:', regressor.score(X_test, y_test)
regressor = LinearRegression()
regressor.fit(X_train2, y_train2)
y_predictions = regressor.predict(X_test2)
print 'R-squared for Y2:', regressor.score(X_test2, y_test2)
print "For Y1"
scores = cross_val_score(regressor, X, y1, cv=5)
print "Average of scores: ", scores.mean()
print "Cross validation scores: ", scores
print "For Y2"
scores2 = cross_val_score(regressor, X, y2, cv=5)
print "Average of scores: ", scores2.mean()
print "Cross validation scores: ", scores2
Let's inspect some of the model's predictions and plot the true target values against the predicted values:
# BUG FIX: at this point y_predictions holds the Y2 model's output (the Y1
# predictions were overwritten above), so the original heating plot compared
# Y1 truths with Y2 predictions.  Refit a Y1 model for the heating figure.
y1_predictions = LinearRegression().fit(X_train, y_train).predict(X_test)
plt.scatter(y_test, y1_predictions)
plt.xlabel('True Heating Level')
plt.ylabel('Predicted Heating Level')
plt.title('Predicted Heating Level Against True Heating Level')
plt.show()
# y_predictions is the Y2 (cooling) model's output, matching y_test2.
plt.scatter(y_test2, y_predictions)
plt.xlabel('True Cooling Level')
plt.ylabel('Predicted Cooling Level')
plt.title('Predicted Cooling Level Against True Cooling Level')
plt.show()
# Scaling the features using StandardScaler:
X_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train)
X_test = X_scaler.transform(X_test)
y_test = y_scaler.transform(y_test)
X_train2 = X_scaler.fit_transform(X_train2)
y_train2 = y_scaler.fit_transform(y_train2)
X_test2 = X_scaler.transform(X_test2)
y_test2 = y_scaler.transform(y_test2)
print "For Y1:"
regressor = SGDRegressor(loss='squared_loss')
scores = cross_val_score(regressor, X_train, y_train, cv=5)
print 'Cross validation r-squared scores:', scores
print 'Average cross validation r-squared score:', np.mean(scores)
regressor.fit_transform(X_train, y_train)
print 'Test set r-squared score', regressor.score(X_test, y_test)
print "For Y2:"
regressor = SGDRegressor(loss='squared_loss')
scores = cross_val_score(regressor, X_train2, y_train2, cv=5)
print 'Cross validation r-squared scores:', scores
print 'Average cross validation r-squared score:', np.mean(scores)
regressor.fit_transform(X_train2, y_train2)
print 'Test set r-squared score', regressor.score(X_test2, y_test2)
Sometimes there are a lot of features in the dataset, so before learning, we should try to see which features are more relevant for our learning task, i.e. which of them are better predictors of the target. We will use the SelectKBest method from the feature_selection package, and plot the results.
df.head()
from sklearn.feature_selection import *
fs=SelectKBest(score_func=f_regression,k=3)
X_new=fs.fit_transform(X_train,y_train)
print zip(fs.get_support(),features)
x_min, x_max = X_new[:,0].min(), X_new[:, 0].max()
y_min, y_max = y_train.min(), y_train.max()
fig=plt.figure()
#fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
# Two subplots, unpack the axes array immediately
fig, axes = plt.subplots(1,3)
fig.set_size_inches(12,12)
for i in range(3):
axes[i].set_aspect('equal')
axes[i].set_title('Feature ' + str(i))
axes[i].set_xlabel('Feature')
axes[i].set_ylabel('Target')
axes[i].set_xlim(x_min, x_max)
axes[i].set_ylim(y_min, y_max)
plt.sca(axes[i])
plt.scatter(X_new[:,i],y_train)
from sklearn.feature_selection import *
fs=SelectKBest(score_func=f_regression,k=3)
X_new=fs.fit_transform(X_train2,y_train2)
print zip(fs.get_support(),features)
x_min, x_max = X_new[:,0].min(), X_new[:, 0].max()
y_min, y_max = y_train2.min(), y_train2.max()
fig=plt.figure()
#fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
# Two subplots, unpack the axes array immediately
fig, axes = plt.subplots(1,3)
fig.set_size_inches(12,12)
for i in range(3):
axes[i].set_aspect('equal')
axes[i].set_title('Feature ' + str(i))
axes[i].set_xlabel('Feature')
axes[i].set_ylabel('Target')
axes[i].set_xlim(x_min, x_max)
axes[i].set_ylim(y_min, y_max)
plt.sca(axes[i])
plt.scatter(X_new[:,i],y_train)
from sklearn.cross_validation import *
def train_and_evaluate(clf, X_train, y_train):
clf.fit(X_train, y_train)
print "Coefficient of determination on training set:",clf.score(X_train, y_train)
# create a k-fold croos validation iterator of k=5 folds
cv = KFold(X_train.shape[0], 5, shuffle=True, random_state=33)
scores = cross_val_score(clf, X_train, y_train, cv=cv)
print "Average coefficient of determination using 5-fold crossvalidation:",np.mean(scores)
print "For Y1:"
from sklearn import linear_model
clf_sgd = linear_model.SGDRegressor(loss='squared_loss', penalty=None, random_state=42)
train_and_evaluate(clf_sgd,X_train,y_train)
print clf_sgd.coef_
print "For Y2:"
from sklearn import linear_model
clf_sgd = linear_model.SGDRegressor(loss='squared_loss', penalty=None, random_state=42)
train_and_evaluate(clf_sgd,X_train2,y_train2)
print clf_sgd.coef_
print "For Y1:"
clf_sgd1 = linear_model.SGDRegressor(loss='squared_loss', penalty='l2', random_state=42)
train_and_evaluate(clf_sgd1,X_train,y_train)
print "For Y2:"
clf_sgd1 = linear_model.SGDRegressor(loss='squared_loss', penalty='l2', random_state=42)
train_and_evaluate(clf_sgd1,X_train2,y_train2)
print "For Y1:"
clf_sgd2 = linear_model.SGDRegressor(loss='squared_loss', penalty='l1', random_state=42)
train_and_evaluate(clf_sgd2,X_train,y_train)
print "For Y2:"
clf_sgd2 = linear_model.SGDRegressor(loss='squared_loss', penalty='l1', random_state=42)
train_and_evaluate(clf_sgd2,X_train2,y_train2)
The regression version of SVM can be used instead to find the hyperplane (note how easy it is to change the learning method in scikit-learn!). We will try a linear kernel, a polynomial kernel, and finally, an rbf kernel. For more information on kernels, see http://scikit-learn.org/stable/modules/svm.html#svm-kernels
from sklearn import svm

# Support-vector regression with several kernels, first on Y1 then on Y2.
for X_tr, y_tr in ((X_train, y_train), (X_train2, y_train2)):
    clf_svr = svm.SVR(kernel='linear')
    train_and_evaluate(clf_svr, X_tr, y_tr)
    clf_svr_poly = svm.SVR(kernel='poly')
    train_and_evaluate(clf_svr_poly, X_tr, y_tr)
    clf_svr_rbf = svm.SVR(kernel='rbf')
    train_and_evaluate(clf_svr_rbf, X_tr, y_tr)
    clf_svr_poly2 = svm.SVR(kernel='poly', degree=2)
    train_and_evaluate(clf_svr_poly2, X_tr, y_tr)
Finally, let's try Random Forests again, this time in their Extra Trees regression version
from sklearn import ensemble

# Extra-trees regression on the Y1 training split.
clf_et = ensemble.ExtraTreesRegressor(n_estimators=10, random_state=42)
train_and_evaluate(clf_et, X_train, y_train)
An interesting side effect of random forest regression is that you can measure how 'important' each feature is when predicting the final result
print np.sort(zip(clf_et.feature_importances_,features),axis=0)
Finally, evaluate our classifiers on the testing set
from sklearn import metrics
def measure_performance(X,y,clf, show_accuracy=True,
show_classification_report=True,
show_confusion_matrix=True,
show_r2_score=False):
y_pred=clf.predict(X)
if show_accuracy:
print "Accuracy:{0:.3f}".format(metrics.accuracy_score(y,y_pred)),"\n"
if show_classification_report:
print "Classification report"
print metrics.classification_report(y,y_pred),"\n"
if show_confusion_matrix:
print "Confusion matrix"
print metrics.confusion_matrix(y,y_pred),"\n"
if show_r2_score:
print "Coefficient of determination:{0:.3f}".format(metrics.r2_score(y,y_pred)),"\n"
# R^2 of the Y1 extra-trees model on the held-out test split; the
# classification-style reports are disabled since this is a regressor.
measure_performance(X_test,y_test,clf_et,
show_accuracy=False,
show_classification_report=False,
show_confusion_matrix=False,
show_r2_score=True)
from sklearn import ensemble
clf_et=ensemble.ExtraTreesRegressor(n_estimators=10,random_state=42)
train_and_evaluate(clf_et,X_train2,y_train2)
print np.sort(zip(clf_et.feature_importances_,features),axis=0)
Again, evaluating our classifiers on the testing set:
# R^2 of the Y2 extra-trees model on the held-out test split; the
# classification-style reports are disabled since this is a regressor.
measure_performance(X_test2,y_test2,clf_et,
show_accuracy=False,
show_classification_report=False,
show_confusion_matrix=False,
show_r2_score=True)