https://archive.ics.uci.edu/ml/datasets/Energy+efficiency
The dataset contains eight attributes (or features, denoted by X1...X8) and two responses (or outcomes, denoted by y1 and y2). The aim is to use the eight features to predict each of the two responses.
Specifically:
import os
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from time import time
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score , classification_report
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report
# read .csv from provided dataset
excel_filename="ENB2012_data.xlsx"
# df=pd.read_csv(csv_filename,index_col=0)
df=pd.read_excel(excel_filename)
df.head()
df.describe()
# handle Y1 and Y2 attrubte to binary [High Hl and CL; and Low HL and Cl]
high = df.Y1 >= 20
low = df.Y1 < 20
df.loc[high,'Y1'] = 1
df.loc[low,'Y1'] = 0
high2 = df.Y2 >= 20
low2 = df.Y2 < 20
df.loc[high2,'Y2'] = 1
df.loc[low2,'Y2'] = 0
df.head()
df.describe()
features=list(df.columns[:-2])
X = df[features]
y1 = df['Y1']
y2 = df['Y2']
y2.unique()
# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y1, test_size=0.4, random_state=0)
X_train2, X_test2, y_train2, y_test2 = cross_validation.train_test_split(X,y2, test_size=0.4, random_state=0)
print X_train.shape, y_train.shape, X_train2.shape, y_train2.shape
This example shows the use of forests of trees to evaluate the importance of features on this classification task. The red bars are the feature importances of the forest, along with their inter-tree variability.
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
# Build a classification task using 3 informative features
# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
random_state=0)
forest.fit(X, y1)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
print("%d. feature %d - %s (%f) " % (f + 1, indices[f], features[indices[f]], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure(num=None, figsize=(14, 10), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
importances[indices[:5]]
for f in range(5):
print("%d. feature %d - %s (%f)" % (f + 1, indices[f], features[indices[f]] ,importances[indices[f]]))
best_features = []
for i in indices[:5]:
best_features.append(features[i])
# Plot the top 5 feature importances of the forest
plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(5), importances[indices][:5],
color="r", yerr=std[indices][:5], align="center")
plt.xticks(range(5), best_features)
plt.xlim([-1, 5])
plt.show()
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
# Build a classification task using 3 informative features
# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
random_state=0)
forest.fit(X, y2)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
print("%d. feature %d - %s (%f) " % (f + 1, indices[f], features[indices[f]], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure(num=None, figsize=(14, 10), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
importances[indices[:5]]
for f in range(5):
print("%d. feature %d - %s (%f)" % (f + 1, indices[f], features[indices[f]] ,importances[indices[f]]))
best_features = []
for i in indices[:5]:
best_features.append(features[i])
# Plot the top 5 feature importances of the forest
plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(5), importances[indices][:5],
color="r", yerr=std[indices][:5], align="center")
plt.xticks(range(5), best_features)
plt.xlim([-1, 5])
plt.show()
t0=time()
print "DecisionTree"
dt = DecisionTreeClassifier(min_samples_split=20,random_state=99)
dt2 = DecisionTreeClassifier(min_samples_split=20,random_state=99)
# dt = DecisionTreeClassifier(min_samples_split=20,max_depth=5,random_state=99)
clf_dt=dt.fit(X_train,y_train)
clf_dt2=dt2.fit(X_train2,y_train2)
print "Acurracy for Y1 ", clf_dt.score(X_test,y_test)
print "Acurracy for Y2 ", clf_dt2.score(X_test2,y_test2)
t1=time()
print "time elapsed: ", t1-t0
tt0=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y1, cv=5)
scores2 = cross_validation.cross_val_score(dt2, X,y2, cv=5)
print scores
print scores.mean()
print scores2
print scores2.mean()
tt1=time()
print "time elapsed: ", tt1-tt0
print "\n"
from sklearn.metrics import classification_report
pipeline = Pipeline([
('clf', DecisionTreeClassifier(criterion='entropy'))
])
parameters = {
'clf__max_depth': (5, 25 , 50),
'clf__min_samples_split': (1, 5, 10),
'clf__min_samples_leaf': (1, 2, 3)
}
# For Y1
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)
# For Y2
grid_search2 = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search2.fit(X_train2, y_train2)
print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid_search.predict(X_test)
print classification_report(y_test, predictions)
print 'Best score: %0.3f' % grid_search2.best_score_
print 'Best parameters set:'
best_parameters2 = grid_search2.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters2[param_name])
predictions2 = grid_search2.predict(X_test2)
print classification_report(y_test2, predictions2)
t2=time()
print "RandomForest"
rf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
rf2 = RandomForestClassifier(n_estimators=100,n_jobs=-1)
clf_rf = rf.fit(X_train,y_train)
clf_rf2 = rf2.fit(X_train2,y_train2)
print "Acurracy for Y1: ", clf_rf.score(X_test,y_test)
print "Acurracy for Y2: ", clf_rf2.score(X_test2,y_test2)
t3=time()
print "time elapsed: ", t3-t2
tt2=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y1, cv=5)
scores2 = cross_validation.cross_val_score(dt, X,y2, cv=5)
print scores
print scores.mean()
print scores2
print scores2.mean()
tt3=time()
print "time elapsed: ", tt3-tt2
print "\n"
roc_auc_score(y_test2,rf2.predict(X_test2))
pipeline2 = Pipeline([
('clf', RandomForestClassifier(criterion='entropy'))
])
parameters = {
'clf__n_estimators': (5, 25, 50, 100),
'clf__max_depth': (5, 25 , 50),
'clf__min_samples_split': (1, 5, 10),
'clf__min_samples_leaf': (1, 2, 3)
}
grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)
grid_search.fit(X_train, y_train)
print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print classification_report(y_test, predictions)
pipeline2 = Pipeline([
('clf', RandomForestClassifier(criterion='entropy'))
])
parameters = {
'clf__n_estimators': (5, 25, 50, 100),
'clf__max_depth': (5, 25 , 50),
'clf__min_samples_split': (1, 5, 10),
'clf__min_samples_leaf': (1, 2, 3)
}
grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)
grid_search.fit(X_train2, y_train2)
print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid_search.predict(X_test2)
print 'Accuracy:', accuracy_score(y_test2, predictions)
print classification_report(y_test2, predictions)
t4=time()
print "NaiveBayes"
nb = BernoulliNB()
nb2 = BernoulliNB()
clf_nb=nb.fit(X_train,y_train)
clf_nb2=nb2.fit(X_train2,y_train2)
print "Acurracy for Y1: ", clf_nb.score(X_test,y_test)
print "Acurracy for Y2: ", clf_nb2.score(X_test2,y_test2)
t5=time()
print "time elapsed: ", t5-t4
tt4=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y1, cv=5)
scores2 = cross_validation.cross_val_score(dt, X,y2, cv=5)
print scores
print scores.mean()
print scores2
print scores2.mean()
tt5=time()
print "time elapsed: ", tt5-tt4
print "\n"
t6=time()
print "KNN"
# knn = KNeighborsClassifier(n_neighbors=3)
knn = KNeighborsClassifier()
knn2 = KNeighborsClassifier()
clf_knn=knn.fit(X_train, y_train)
clf_knn2=knn2.fit(X_train2, y_train2)
print "Acurracy for Y1: ", clf_knn.score(X_test,y_test)
print "Acurracy for Y2: ", clf_knn2.score(X_test2,y_test2)
t7=time()
print "time elapsed: ", t7-t6
tt6=time()
print "cross result========"
scores = cross_validation.cross_val_score(knn, X,y1, cv=5)
scores2 = cross_validation.cross_val_score(knn2, X,y2, cv=5)
print scores
print scores.mean()
print scores2
print scores2.mean()
tt7=time()
print "time elapsed: ", tt7-tt6
print "\n"
t7=time()
print "SVM"
svc = SVC()
svc2 = SVC()
clf_svc=svc.fit(X_train, y_train)
clf_svc2=svc2.fit(X_train2, y_train2)
print "Acurracy for Y1: ", clf_svc.score(X_test,y_test)
print "Acurracy for Y2: ", clf_svc2.score(X_test2,y_test2)
t8=time()
print "time elapsed: ", t8-t7
tt7=time()
print "cross result========"
#scores = cross_validation.cross_val_score(dt, X,y, cv=5)
#print scores
#print scores.mean()
scores = cross_validation.cross_val_score(dt, X,y2, cv=5)
print scores2
print scores2.mean()
tt8=time()
print "time elapsed: ", tt7-tt6
print "\n"
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search
svc = SVC()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
grid = grid_search.GridSearchCV(svc, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid.fit(X_train, y_train)
print 'Best score: %0.3f' % grid.best_score_
print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search
svc = SVC()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
grid = grid_search.GridSearchCV(svc, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid.fit(X_train2, y_train2)
print 'Best score: %0.3f' % grid.best_score_
print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid.predict(X_test2)
print classification_report(y_test2, predictions)
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search
pipeline = Pipeline([
('clf', SVC(kernel='linear', gamma=0.01, C=10))
])
parameters = {
'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}
grid = grid_search.GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid.fit(X_train, y_train)
print 'Best score: %0.3f' % grid.best_score_
print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search
pipeline = Pipeline([
('clf', SVC(kernel='linear', gamma=0.01, C=10))
])
parameters = {
'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}
grid = grid_search.GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid.fit(X_train2, y_train2)
print 'Best score: %0.3f' % grid.best_score_
print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid.predict(X_test2)
print classification_report(y_test2, predictions)
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn. cross_validation import cross_val_score
from sklearn.feature_selection import *
from sklearn import metrics
# read .csv from provided dataset
excel_filename="ENB2012_data.xlsx"
# df=pd.read_csv(csv_filename,index_col=0)
df=pd.read_excel(excel_filename)
df.head()
cols = df.columns
features=list(df.columns[:-2])
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', context='notebook')
sns.pairplot(df, size=2.5);
plt.tight_layout()
# plt.savefig('./figures/scatter.png', dpi=300)
plt.show()
import numpy as np

# Correlation heat-map over all attributes and both responses.
cm = np.corrcoef(df.values.T)
sns.set(font_scale=1.5)
hm = sns.heatmap(cm,
                 cbar=True,
                 annot=True,
                 square=True,
                 fmt='.2f',
                 annot_kws={'size': 15},
                 yticklabels=cols,
                 xticklabels=cols)
plt.tight_layout()
plt.show()

# Plain numpy arrays for the regression models below.
X = df[features].values
y1 = df['Y1'].values
y2 = df['Y2'].values
def lin_regplot(X, y, model):
    """Scatter the data points and overlay the model's fitted line in red."""
    plt.scatter(X, y, c='lightblue')
    plt.plot(X, model.predict(X), color='red', linewidth=2)
    return
from sklearn.cross_validation import train_test_split

# 70/30 split on the continuous targets, one split per response.
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.3, random_state=0)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.3, random_state=0)

# Ordinary least squares on Y1.
slr = LinearRegression()
slr.fit(X_train, y_train)
y_train_pred = slr.predict(X_train)
y_test_pred = slr.predict(X_test)

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),
                                       mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),
                                       r2_score(y_test, y_test_pred)))
# Ordinary least squares on Y2.
slr2 = LinearRegression()
slr2.fit(X_train2, y_train2)
# BUG FIX: the original predicted with the Y1 model (slr); use slr2 so the
# reported errors actually belong to the Y2 fit.
y_train_pred = slr2.predict(X_train2)
y_test_pred = slr2.predict(X_test2)
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
print('MSE train: %.3f, test: %.3f' % (
    mean_squared_error(y_train2, y_train_pred),
    mean_squared_error(y_test2, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
    r2_score(y_train2, y_train_pred),
    r2_score(y_test2, y_test_pred)))
from sklearn.linear_model import Lasso

# L1-penalised linear model on Y1; the sparse coefficient vector shows
# which attributes survive the regularisation.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)
print(lasso.coef_)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),
                                       mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),
                                       r2_score(y_test, y_test_pred)))
Similarly, Ridge regression can be used:
from sklearn.linear_model import Ridge

# L2-penalised linear model on Y1.
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
y_train_pred = ridge.predict(X_train)
y_test_pred = ridge.predict(X_test)
print(ridge.coef_)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),
                                       mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),
                                       r2_score(y_test, y_test_pred)))
Lastly, the ElasticNet implementation allows us to vary the L1 to L2 ratio:
from sklearn.linear_model import ElasticNet

# Mixed L1/L2 penalty on Y1 (l1_ratio balances the two).
en = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X_train, y_train)
y_train_pred = en.predict(X_train)
y_test_pred = en.predict(X_test)
print(en.coef_)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),
                                       mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),
                                       r2_score(y_test, y_test_pred)))
A Lasso Regression model can be initialized as follows:
from sklearn.linear_model import Lasso

# L1-penalised linear model on Y2.
lasso = Lasso(alpha=0.1).fit(X_train2, y_train2)
y_train_pred = lasso.predict(X_train2)
y_test_pred = lasso.predict(X_test2)
print(lasso.coef_)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train2, y_train_pred),
                                       mean_squared_error(y_test2, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train2, y_train_pred),
                                       r2_score(y_test2, y_test_pred)))
Similarly, Ridge regression can be used:
from sklearn.linear_model import Ridge

# L2-penalised linear model on Y2.
ridge = Ridge(alpha=1.0).fit(X_train2, y_train2)
y_train_pred = ridge.predict(X_train2)
y_test_pred = ridge.predict(X_test2)
print(ridge.coef_)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train2, y_train_pred),
                                       mean_squared_error(y_test2, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train2, y_train_pred),
                                       r2_score(y_test2, y_test_pred)))
Lastly, the ElasticNet implementation allows us to vary the L1 to L2 ratio:
from sklearn.linear_model import ElasticNet

# Mixed L1/L2 penalty on Y2.
en = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X_train2, y_train2)
y_train_pred = en.predict(X_train2)
y_test_pred = en.predict(X_test2)
print(en.coef_)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train2, y_train_pred),
                                       mean_squared_error(y_test2, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train2, y_train_pred),
                                       r2_score(y_test2, y_test_pred)))
For example, if we set l1_ratio to 1.0, the ElasticNet regressor would be equal to LASSO regression.
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(max_depth=3)
tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
print "For Y1:"
print('MSE train: %.3f, test: %.3f' % (
mean_squared_error(y_train, y_train_pred),
mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
r2_score(y_train, y_train_pred),
r2_score(y_test, y_test_pred)))
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(max_depth=3)
tree.fit(X_train2, y_train2)
y_train_pred = tree.predict(X_train2)
y_test_pred = tree.predict(X_test2)
print "For Y2:"
print('MSE train: %.3f, test: %.3f' % (
mean_squared_error(y_train2, y_train_pred),
mean_squared_error(y_test2, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
r2_score(y_train2, y_train_pred),
r2_score(y_test2, y_test_pred)))
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(n_estimators=1000,
criterion='mse',
random_state=1,
n_jobs=-1)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)
print "For Y1 :"
print('MSE train: %.3f, test: %.3f' % (
mean_squared_error(y_train, y_train_pred),
mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
r2_score(y_train, y_train_pred),
r2_score(y_test, y_test_pred)))
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(n_estimators=1000,
criterion='mse',
random_state=1,
n_jobs=-1)
forest.fit(X_train2, y_train2)
y_train_pred = forest.predict(X_train2)
y_test_pred = forest.predict(X_test2)
print "For Y2 : "
print('MSE train: %.3f, test: %.3f' % (
mean_squared_error(y_train2, y_train_pred),
mean_squared_error(y_test2, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
r2_score(y_train2, y_train_pred),
r2_score(y_test2, y_test_pred)))
# Residual plot for the Y2 random-forest model fitted just above.
# BUG FIX: the original subtracted the Y1 targets (y_train / y_test) from
# Y2 predictions, so the residuals were meaningless; use the matching Y2
# split (y_train2 / y_test2) throughout.
plt.scatter(y_train_pred,
            y_train_pred - y_train2,
            c='black',
            marker='o',
            s=35,
            alpha=0.5,
            label='Training data')
plt.scatter(y_test_pred,
            y_test_pred - y_test2,
            c='lightgreen',
            marker='s',
            s=35,
            alpha=0.7,
            label='Test data')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=min(y_train2.min(), y_test2.min()), xmax=max(y_train2.max(), y_test2.max()), lw=2, color='red')
plt.xlim([min(y_train2.min(), y_test2.min()), max(y_train2.max(), y_test2.max())])
plt.tight_layout()
plt.show()
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_predictions = regressor.predict(X_test)
print 'R-squared for Y1:', regressor.score(X_test, y_test)
regressor = LinearRegression()
regressor.fit(X_train2, y_train2)
y_predictions = regressor.predict(X_test2)
print 'R-squared for Y2:', regressor.score(X_test2, y_test2)
print "For Y1"
scores = cross_val_score(regressor, X, y1, cv=5)
print "Average of scores: ", scores.mean()
print "Cross validation scores: ", scores
print "For Y2"
scores2 = cross_val_score(regressor, X, y2, cv=5)
print "Average of scores: ", scores2.mean()
print "Cross validation scores: ", scores2
Let's inspect some of the model's predictions and plot the true target values against the predicted values:
# BUG FIX: at this point y_predictions holds the Y2 model's output (the Y1
# predictions were overwritten above), so the original heating plot compared
# Y1 truths with Y2 predictions.  Refit a Y1 model for the heating figure.
y1_predictions = LinearRegression().fit(X_train, y_train).predict(X_test)
plt.scatter(y_test, y1_predictions)
plt.xlabel('True Heating Level')
plt.ylabel('Predicted Heating Level')
plt.title('Predicted Heating Level Against True Heating Level')
plt.show()
# y_predictions is the Y2 (cooling) model's output, matching y_test2.
plt.scatter(y_test2, y_predictions)
plt.xlabel('True Cooling Level')
plt.ylabel('Predicted Cooling Level')
plt.title('Predicted Cooling Level Against True Cooling Level')
plt.show()
# Scaling the features using StandardScaler:
X_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train)
X_test = X_scaler.transform(X_test)
y_test = y_scaler.transform(y_test)
X_train2 = X_scaler.fit_transform(X_train2)
y_train2 = y_scaler.fit_transform(y_train2)
X_test2 = X_scaler.transform(X_test2)
y_test2 = y_scaler.transform(y_test2)
print "For Y1:"
regressor = SGDRegressor(loss='squared_loss')
scores = cross_val_score(regressor, X_train, y_train, cv=5)
print 'Cross validation r-squared scores:', scores
print 'Average cross validation r-squared score:', np.mean(scores)
regressor.fit_transform(X_train, y_train)
print 'Test set r-squared score', regressor.score(X_test, y_test)
print "For Y2:"
regressor = SGDRegressor(loss='squared_loss')
scores = cross_val_score(regressor, X_train2, y_train2, cv=5)
print 'Cross validation r-squared scores:', scores
print 'Average cross validation r-squared score:', np.mean(scores)
regressor.fit_transform(X_train2, y_train2)
print 'Test set r-squared score', regressor.score(X_test2, y_test2)
Sometimes there are a lot of features in the dataset, so before learning, we should try to see which features are more relevant for our learning task, i.e. which of them are better predictors of the target. We will use the SelectKBest method from the feature_selection package, and plot the results.
df.head()
from sklearn.feature_selection import *
fs=SelectKBest(score_func=f_regression,k=3)
X_new=fs.fit_transform(X_train,y_train)
print zip(fs.get_support(),features)
x_min, x_max = X_new[:,0].min(), X_new[:, 0].max()
y_min, y_max = y_train.min(), y_train.max()
fig=plt.figure()
#fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
# Two subplots, unpack the axes array immediately
fig, axes = plt.subplots(1,3)
fig.set_size_inches(12,12)
for i in range(3):
axes[i].set_aspect('equal')
axes[i].set_title('Feature ' + str(i))
axes[i].set_xlabel('Feature')
axes[i].set_ylabel('Target')
axes[i].set_xlim(x_min, x_max)
axes[i].set_ylim(y_min, y_max)
plt.sca(axes[i])
plt.scatter(X_new[:,i],y_train)
from sklearn.feature_selection import *
fs=SelectKBest(score_func=f_regression,k=3)
X_new=fs.fit_transform(X_train2,y_train2)
print zip(fs.get_support(),features)
x_min, x_max = X_new[:,0].min(), X_new[:, 0].max()
y_min, y_max = y_train2.min(), y_train2.max()
fig=plt.figure()
#fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
# Two subplots, unpack the axes array immediately
fig, axes = plt.subplots(1,3)
fig.set_size_inches(12,12)
for i in range(3):
axes[i].set_aspect('equal')
axes[i].set_title('Feature ' + str(i))
axes[i].set_xlabel('Feature')
axes[i].set_ylabel('Target')
axes[i].set_xlim(x_min, x_max)
axes[i].set_ylim(y_min, y_max)
plt.sca(axes[i])
plt.scatter(X_new[:,i],y_train)
from sklearn.cross_validation import *
def train_and_evaluate(clf, X_train, y_train):
clf.fit(X_train, y_train)
print "Coefficient of determination on training set:",clf.score(X_train, y_train)
# create a k-fold croos validation iterator of k=5 folds
cv = KFold(X_train.shape[0], 5, shuffle=True, random_state=33)
scores = cross_val_score(clf, X_train, y_train, cv=cv)
print "Average coefficient of determination using 5-fold crossvalidation:",np.mean(scores)
print "For Y1:"
from sklearn import linear_model
clf_sgd = linear_model.SGDRegressor(loss='squared_loss', penalty=None, random_state=42)
train_and_evaluate(clf_sgd,X_train,y_train)
print clf_sgd.coef_
print "For Y2:"
from sklearn import linear_model
clf_sgd = linear_model.SGDRegressor(loss='squared_loss', penalty=None, random_state=42)
train_and_evaluate(clf_sgd,X_train2,y_train2)
print clf_sgd.coef_
print "For Y1:"
clf_sgd1 = linear_model.SGDRegressor(loss='squared_loss', penalty='l2', random_state=42)
train_and_evaluate(clf_sgd1,X_train,y_train)
print "For Y2:"
clf_sgd1 = linear_model.SGDRegressor(loss='squared_loss', penalty='l2', random_state=42)
train_and_evaluate(clf_sgd1,X_train2,y_train2)
print "For Y1:"
clf_sgd2 = linear_model.SGDRegressor(loss='squared_loss', penalty='l1', random_state=42)
train_and_evaluate(clf_sgd2,X_train,y_train)
print "For Y2:"
clf_sgd2 = linear_model.SGDRegressor(loss='squared_loss', penalty='l1', random_state=42)
train_and_evaluate(clf_sgd2,X_train2,y_train2)
The regression version of SVM can be used instead to find the hyperplane (note how easy it is to change the learning method in scikit-learn!). We will try a linear kernel, a polynomial kernel, and finally, an rbf kernel. For more information on kernels, see http://scikit-learn.org/stable/modules/svm.html#svm-kernels
from sklearn import svm

# Support-vector regression with several kernels, first on Y1 then on Y2.
for X_tr, y_tr in ((X_train, y_train), (X_train2, y_train2)):
    clf_svr = svm.SVR(kernel='linear')
    train_and_evaluate(clf_svr, X_tr, y_tr)
    clf_svr_poly = svm.SVR(kernel='poly')
    train_and_evaluate(clf_svr_poly, X_tr, y_tr)
    clf_svr_rbf = svm.SVR(kernel='rbf')
    train_and_evaluate(clf_svr_rbf, X_tr, y_tr)
    clf_svr_poly2 = svm.SVR(kernel='poly', degree=2)
    train_and_evaluate(clf_svr_poly2, X_tr, y_tr)
Finally, let's try Random Forests again, this time in their Extra Trees regression version
from sklearn import ensemble

# Extra-trees regression on the Y1 training split.
clf_et = ensemble.ExtraTreesRegressor(n_estimators=10, random_state=42)
train_and_evaluate(clf_et, X_train, y_train)
An interesting side effect of random forest regression is that you can measure how 'important' each feature is when predicting the final result
print np.sort(zip(clf_et.feature_importances_,features),axis=0)
Finally, evaluate our classifiers on the testing set
from sklearn import metrics
def measure_performance(X,y,clf, show_accuracy=True,
show_classification_report=True,
show_confusion_matrix=True,
show_r2_score=False):
y_pred=clf.predict(X)
if show_accuracy:
print "Accuracy:{0:.3f}".format(metrics.accuracy_score(y,y_pred)),"\n"
if show_classification_report:
print "Classification report"
print metrics.classification_report(y,y_pred),"\n"
if show_confusion_matrix:
print "Confusion matrix"
print metrics.confusion_matrix(y,y_pred),"\n"
if show_r2_score:
print "Coefficient of determination:{0:.3f}".format(metrics.r2_score(y,y_pred)),"\n"
# R^2 of the Y1 extra-trees model on the held-out test split; the
# classification-style reports are disabled since this is a regressor.
measure_performance(X_test,y_test,clf_et,
show_accuracy=False,
show_classification_report=False,
show_confusion_matrix=False,
show_r2_score=True)
from sklearn import ensemble
clf_et=ensemble.ExtraTreesRegressor(n_estimators=10,random_state=42)
train_and_evaluate(clf_et,X_train2,y_train2)
print np.sort(zip(clf_et.feature_importances_,features),axis=0)
Again, evaluating our classifiers on the testing set:
# R^2 of the Y2 extra-trees model on the held-out test split; the
# classification-style reports are disabled since this is a regressor.
measure_performance(X_test2,y_test2,clf_et,
show_accuracy=False,
show_classification_report=False,
show_confusion_matrix=False,
show_r2_score=True)