Plants Clustering¶

https://archive.ics.uci.edu/ml/datasets/Plants

The UCI Plants data is in transactional form: each row contains a Latin name (species or genus) followed by a list of state abbreviations. Note that the code below actually loads `data_Mar_64.txt`, in which each row is a species label followed by 64 numeric features (1600 rows, 100 species).

%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn import preprocessing
import matplotlib
import matplotlib.pyplot as plt

# Column names for the data file loaded in the next cell: the class label
# first, followed by 64 numeric feature columns named f0 .. f63.
# Built with a comprehension so that no variable shadows the built-in `str`.
cols = ['Class'] + ['f{}'.format(i) for i in range(64)]

# Load the feature table, labelling its columns with the names in `cols`.
csv_filename = "data_Mar_64.txt"
df = pd.read_csv(csv_filename, names=cols)

df.head()

df.shape

(1600, 65)

# Show the distinct values of the label column (the species names).
df['Class'].unique()

array(['Acer Campestre', 'Acer Capillipes', 'Acer Circinatum', 'Acer Mono',
       'Acer Opalus', 'Acer Palmatum', 'Acer Pictum', 'Acer Platanoids',
       'Acer Rubrum', 'Acer Rufinerve', 'Acer Saccharinum',
       'Alnus Cordata', 'Alnus Maximowiczii', 'Alnus Rubra',
       'Alnus Sieboldiana', 'Alnus Viridis', 'Arundinaria Simonii',
       'Betula Austrosinensis', 'Betula Pendula', 'Callicarpa Bodinieri',
       'Castanea Sativa', 'Celtis Koraiensis', 'Cercis Siliquastrum',
       'Cornus Chinensis', 'Cornus Controversa', 'Cornus Macrophylla',
       'Cotinus Coggygria', 'Crataegus Monogyna', 'Cytisus Battandieri',
       'Eucalyptus Glaucescens', 'Eucalyptus Neglecta',
       'Eucalyptus Urnigera', 'Fagus Sylvatica', 'Ginkgo Biloba',
       'Ilex Aquifolium', 'Ilex Cornuta', 'Liquidambar Styraciflua',
       'Liriodendron Tulipifera', 'Lithocarpus Cleistocarpus',
       'Lithocarpus Edulis', 'Magnolia Heptapeta', 'Magnolia Salicifolia',
       'Morus Nigra', 'Olea Europaea', 'Phildelphus', 'Populus Adenopoda',
       'Populus Grandidentata', 'Populus Nigra', 'Prunus Avium',
       'Prunus X Shmittii', 'Pterocarya Stenoptera', 'Quercus Afares',
       'Quercus Agrifolia', 'Quercus Alnifolia', 'Quercus Brantii',
       'Quercus Canariensis', 'Quercus Castaneifolia', 'Quercus Cerris',
       'Quercus Chrysolepis', 'Quercus Coccifera', 'Quercus Coccinea',
       'Quercus Crassifolia', 'Quercus Crassipes', 'Quercus Dolicholepis',
       'Quercus Ellipsoidalis', 'Quercus Greggii', 'Quercus Hartwissiana',
       'Quercus Ilex', 'Quercus Imbricaria', 'Quercus Infectoria sub',
       'Quercus Kewensis', 'Quercus Nigra', 'Quercus Palustris',
       'Quercus Phellos', 'Quercus Phillyraeoides', 'Quercus Pontica',
       'Quercus Pubescens', 'Quercus Pyrenaica', 'Quercus Rhysophylla',
       'Quercus Rubra', 'Quercus Semecarpifolia', 'Quercus Shumardii',
       'Quercus Suber', 'Quercus Texana', 'Quercus Trojana',
       'Quercus Variabilis', 'Quercus Vulcanica', 'Quercus x Hispanica',
       'Quercus x Turneri', 'Rhododendron x Russellianum',
       'Salix Fragilis', 'Salix Intergra', 'Sorbus Aria', 'Tilia Oliveri',
       'Tilia Platyphyllos', 'Tilia Tomentosa', 'Ulmus Bergmanniana',
       'Viburnum Tinus', 'Viburnum x Rhytidophylloides', 'Zelkova Serrata'], dtype=object)

# Number of distinct classes in the label column.
len(df['Class'].unique())

100

from sklearn.preprocessing import LabelEncoder

# Swap the species-name strings for integer class ids.
le = LabelEncoder()
le.fit(df['Class'])
df['Class'] = le.transform(df['Class'])

df['Class'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], dtype=int64)

# Preview the frame now that the label column has been encoded.
df.head()

# Every column after the first one (the 'Class' label) is a feature column.
features = df.columns[1:]
features

Index([u'f0', u'f1', u'f2', u'f3', u'f4', u'f5', u'f6', u'f7', u'f8', u'f9',
       u'f10', u'f11', u'f12', u'f13', u'f14', u'f15', u'f16', u'f17', u'f18',
       u'f19', u'f20', u'f21', u'f22', u'f23', u'f24', u'f25', u'f26', u'f27',
       u'f28', u'f29', u'f30', u'f31', u'f32', u'f33', u'f34', u'f35', u'f36',
       u'f37', u'f38', u'f39', u'f40', u'f41', u'f42', u'f43', u'f44', u'f45',
       u'f46', u'f47', u'f48', u'f49', u'f50', u'f51', u'f52', u'f53', u'f54',
       u'f55', u'f56', u'f57', u'f58', u'f59', u'f60', u'f61', u'f62', u'f63'],
      dtype='object')

# Feature matrix and target vector used by the models below.
X, y = df[features], df['Class']

X.head()

# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)

print X_train.shape, y_train.shape

(960, 64) (960L,)

Unsupervised Learning¶

PCA¶

# Distinct encoded class ids present in the target vector.
y.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], dtype=int64)

# Number of feature columns (used as the PCA dimensionality in the next cell).
len(features)

64

from sklearn.decomposition import PCA

# Fit a PCA that keeps as many components as there are feature columns (64).
pca = PCA(n_components=64).fit(X)

# Show the component vectors, then the share of variance each one explains.
print(pca.components_)
print(pca.explained_variance_ratio_)

[[ 0.17301085  0.3678857  -0.0376574  ..., -0.04515881  0.10784595
  -0.02792765]
 [ 0.06860826  0.17120937 -0.16743028 ...,  0.05114625 -0.04738932
   0.00340058]
 [-0.01217028 -0.08481612  0.30296649 ..., -0.04069022  0.17427581
  -0.03918934]
 ..., 
 [-0.00740531 -0.00196931 -0.00802288 ..., -0.00735558  0.00890972
   0.04235162]
 [ 0.0151684   0.01028616  0.01684166 ...,  0.05617535  0.01335735
  -0.00934287]
 [ 0.1249996   0.12499963  0.1249996  ...,  0.12499625  0.12500077
   0.12500567]]
[  3.66346160e-01   1.36479827e-01   8.76136880e-02   6.76087515e-02
   5.62833251e-02   4.56950262e-02   2.62293511e-02   2.23588970e-02
   1.85408968e-02   1.68000379e-02   1.51209237e-02   1.19416550e-02
   9.62145182e-03   8.87215304e-03   8.70710022e-03   7.38789680e-03
   6.71526565e-03   6.42370588e-03   5.45940047e-03   4.81032134e-03
   4.16110779e-03   4.04450437e-03   3.98474616e-03   3.56179763e-03
   3.37395149e-03   3.11290968e-03   2.93953519e-03   2.72151214e-03
   2.66239203e-03   2.57375991e-03   2.44332593e-03   2.31217010e-03
   2.11575025e-03   2.09009210e-03   2.01793326e-03   1.93257495e-03
   1.84261074e-03   1.79722687e-03   1.77056628e-03   1.73038935e-03
   1.64383197e-03   1.55318733e-03   1.46506434e-03   1.39524503e-03
   1.34309348e-03   1.27063003e-03   1.03304595e-03   9.83972274e-04
   9.72012625e-04   8.30449484e-04   7.88002854e-04   6.93523108e-04
   6.00723467e-04   5.70686500e-04   5.22928128e-04   4.52364252e-04
   3.92603544e-04   3.30888609e-04   3.03654919e-04   2.54559631e-04
   1.82047138e-04   1.48132720e-04   6.46620494e-05   5.06768097e-12]

%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(list(pca.explained_variance_ratio_),'-o')
plt.title('Explained variance ratio as function of PCA components')
plt.ylabel('Explained variance ratio')
plt.xlabel('Component')
plt.show()

# Project the features onto the first two principal components.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(X)

# Preview the first ten projected rows.
preview_rows = 10
print(reduced_data[:preview_rows])

[[-0.03307489 -0.02892331]
 [-0.02422727 -0.01839179]
 [-0.02629431 -0.03441262]
 [ 0.00831379  0.00113884]
 [ 0.00117595 -0.05432339]
 [-0.00205065 -0.02189747]
 [ 0.00664743 -0.0258008 ]
 [-0.02080581 -0.00804307]
 [-0.0357074   0.00314164]
 [-0.02317419  0.00840398]]

# KMeans is not imported by any earlier cell, so bring it into scope here;
# otherwise this cell raises NameError on a fresh kernel.
from sklearn.cluster import KMeans

# One cluster per class (100). The seed is fixed so that repeated runs give
# the same centroids, matching the seeded KMeans used later for plotting.
kmeans = KMeans(n_clusters=100, random_state=0)
clusters = kmeans.fit(reduced_data)
print(clusters)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=100, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

# Build a dense grid over the 2-D PCA plane so every grid point can be
# assigned to a cluster and drawn as a decision region.
# NOTE(review): the +/-1 margin is wide compared with the PCA coordinates
# printed above (order 0.01-0.1), so the data will fill only a small part of
# the plot -- confirm this is intended.
first_pc = reduced_data[:, 0]
second_pc = reduced_data[:, 1]
x_min, x_max = first_pc.min() - 1, first_pc.max() + 1
y_min, y_max = second_pc.min() - 1, second_pc.max() + 1

# Grid step along each axis: one thousandth of the plotted range.
hx = (x_max-x_min)/1000.
hy = (y_max-y_min)/1000.
xx, yy = np.meshgrid(np.arange(x_min, x_max, hx),
                     np.arange(y_min, y_max, hy))

# Cluster label for each grid point, from the most recently fitted model.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = clusters.predict(grid_points)

# Cluster centres of the fitted k-means model, in the 2-D PCA coordinates.
centroids = kmeans.cluster_centers_
print('*** K MEANS CENTROIDS ***')
print(centroids)

# Map the centres back through the PCA to the original feature space.
print('*** CENTROIDS TRANSFERED TO ORIGINAL SPACE ***')
centroids_original_space = pca.inverse_transform(centroids)
print(centroids_original_space)

*** K MEANS CENTROIDS ***
[[-0.00569156 -0.02757334]
 [ 0.13246213 -0.01621052]
 [-0.09288891  0.03512336]
 [-0.04629578 -0.14273808]
 [ 0.16744926  0.05257156]
 [ 0.07064058 -0.02604673]
 [-0.02826312  0.01388355]
 [-0.05375892 -0.08335535]
 [-0.11115126  0.05581906]
 [ 0.01562468 -0.07296821]
 [-0.01047396  0.02356011]
 [ 0.11544097  0.06833212]
 [ 0.01681328 -0.00510339]
 [-0.07041002 -0.00715404]
 [ 0.1227513   0.02325   ]
 [ 0.03765457  0.02418866]
 [-0.0727005   0.04570051]
 [ 0.23678466  0.02092281]
 [ 0.09965405 -0.07586469]
 [ 0.02956955 -0.14766104]
 [ 0.24783442  0.09124872]
 [-0.05173007 -0.01858108]
 [-0.07509212 -0.16741198]
 [ 0.15820097  0.02933633]
 [ 0.17652355 -0.08188853]
 [ 0.08251682  0.03693327]
 [-0.07078136 -0.13687759]
 [ 0.19897381  0.06630531]
 [-0.02931185 -0.06391228]
 [ 0.09146325 -0.04824824]
 [-0.08578384  0.01404901]
 [-0.02962707 -0.11551206]
 [ 0.19834241 -0.03538007]
 [-0.05972279  0.03032355]
 [ 0.08670001 -0.00606107]
 [-0.03116005  0.04406629]
 [ 0.02648243 -0.05101348]
 [-0.10810175  0.02162109]
 [-0.10345345  0.07732651]
 [-0.05484872 -0.17262396]
 [-0.11811638  0.07819233]
 [ 0.00570303 -0.1432835 ]
 [-0.01907073 -0.0475412 ]
 [ 0.14628249  0.01832428]
 [ 0.05575008 -0.05731427]
 [ 0.11206784 -0.02420592]
 [ 0.02723779 -0.11093354]
 [ 0.10353602  0.01005479]
 [-0.0224084  -0.01092377]
 [-0.09914013  0.06366138]
 [ 0.03007613 -0.02285616]
 [-0.09979745  0.00107765]
 [-0.08069197  0.05916232]
 [-0.05493688  0.00061032]
 [-0.00936725 -0.07803598]
 [-0.0801698  -0.02510252]
 [ 0.05976957  0.01190185]
 [ 0.05585779 -0.01290171]
 [-0.0017517   0.04210121]
 [ 0.13281497  0.05043614]
 [ 0.18931821  0.02747014]
 [ 0.22638096  0.07670289]
 [-0.06116483 -0.11844676]
 [-0.10376601  0.04464   ]
 [-0.11810946  0.04675272]
 [-0.07741126  0.02926753]
 [-0.03989306  0.02506544]
 [-0.03842906 -0.03845785]
 [-0.12118508  0.06150249]
 [ 0.04853543 -0.08333611]
 [ 0.00320653 -0.04752446]
 [ 0.00705696  0.01755481]
 [ 0.13125472 -0.04330547]
 [ 0.15408349  0.04558488]
 [-0.05532449  0.05504591]
 [ 0.03576639 -0.00278353]
 [-0.07983163 -0.07187959]
 [ 0.18520214  0.05829571]
 [-0.06727929  0.01146632]
 [-0.01335953  0.00502683]
 [ 0.00114023 -0.172542  ]
 [ 0.11054053  0.04174668]
 [ 0.15029777  0.07876001]
 [-0.03478568 -0.09472081]
 [-0.0701767  -0.19748871]
 [ 0.05397147  0.04461728]
 [ 0.15789829 -0.00141685]
 [-0.04004061  0.00684292]
 [-0.08793461  0.04723146]
 [-0.02320017 -0.0277043 ]
 [ 0.21870363  0.05245085]
 [-0.03890206 -0.01079456]
 [ 0.12431625  0.00142029]
 [ 0.09176176 -0.02673565]
 [ 0.00026089 -0.00786204]
 [ 0.04546378 -0.03759898]
 [ 0.01077029 -0.0266246 ]
 [ 0.02483525  0.01424494]
 [-0.0546426  -0.04536985]
 [ 0.17367441  0.03448724]]
*** CENTROIDS TRANSFERED TO ORIGINAL SPACE ***
[[-0.00569156 -0.02757334]
 [ 0.13246213 -0.01621052]
 [-0.09288891  0.03512336]
 [-0.04629578 -0.14273808]
 [ 0.16744926  0.05257156]
 [ 0.07064058 -0.02604673]
 [-0.02826312  0.01388355]
 [-0.05375892 -0.08335535]
 [-0.11115126  0.05581906]
 [ 0.01562468 -0.07296821]
 [-0.01047396  0.02356011]
 [ 0.11544097  0.06833212]
 [ 0.01681328 -0.00510339]
 [-0.07041002 -0.00715404]
 [ 0.1227513   0.02325   ]
 [ 0.03765457  0.02418866]
 [-0.0727005   0.04570051]
 [ 0.23678466  0.02092281]
 [ 0.09965405 -0.07586469]
 [ 0.02956955 -0.14766104]
 [ 0.24783442  0.09124872]
 [-0.05173007 -0.01858108]
 [-0.07509212 -0.16741198]
 [ 0.15820097  0.02933633]
 [ 0.17652355 -0.08188853]
 [ 0.08251682  0.03693327]
 [-0.07078136 -0.13687759]
 [ 0.19897381  0.06630531]
 [-0.02931185 -0.06391228]
 [ 0.09146325 -0.04824824]
 [-0.08578384  0.01404901]
 [-0.02962707 -0.11551206]
 [ 0.19834241 -0.03538007]
 [-0.05972279  0.03032355]
 [ 0.08670001 -0.00606107]
 [-0.03116005  0.04406629]
 [ 0.02648243 -0.05101348]
 [-0.10810175  0.02162109]
 [-0.10345345  0.07732651]
 [-0.05484872 -0.17262396]
 [-0.11811638  0.07819233]
 [ 0.00570303 -0.1432835 ]
 [-0.01907073 -0.0475412 ]
 [ 0.14628249  0.01832428]
 [ 0.05575008 -0.05731427]
 [ 0.11206784 -0.02420592]
 [ 0.02723779 -0.11093354]
 [ 0.10353602  0.01005479]
 [-0.0224084  -0.01092377]
 [-0.09914013  0.06366138]
 [ 0.03007613 -0.02285616]
 [-0.09979745  0.00107765]
 [-0.08069197  0.05916232]
 [-0.05493688  0.00061032]
 [-0.00936725 -0.07803598]
 [-0.0801698  -0.02510252]
 [ 0.05976957  0.01190185]
 [ 0.05585779 -0.01290171]
 [-0.0017517   0.04210121]
 [ 0.13281497  0.05043614]
 [ 0.18931821  0.02747014]
 [ 0.22638096  0.07670289]
 [-0.06116483 -0.11844676]
 [-0.10376601  0.04464   ]
 [-0.11810946  0.04675272]
 [-0.07741126  0.02926753]
 [-0.03989306  0.02506544]
 [-0.03842906 -0.03845785]
 [-0.12118508  0.06150249]
 [ 0.04853543 -0.08333611]
 [ 0.00320653 -0.04752446]
 [ 0.00705696  0.01755481]
 [ 0.13125472 -0.04330547]
 [ 0.15408349  0.04558488]
 [-0.05532449  0.05504591]
 [ 0.03576639 -0.00278353]
 [-0.07983163 -0.07187959]
 [ 0.18520214  0.05829571]
 [-0.06727929  0.01146632]
 [-0.01335953  0.00502683]
 [ 0.00114023 -0.172542  ]
 [ 0.11054053  0.04174668]
 [ 0.15029777  0.07876001]
 [-0.03478568 -0.09472081]
 [-0.0701767  -0.19748871]
 [ 0.05397147  0.04461728]
 [ 0.15789829 -0.00141685]
 [-0.04004061  0.00684292]
 [-0.08793461  0.04723146]
 [-0.02320017 -0.0277043 ]
 [ 0.21870363  0.05245085]
 [-0.03890206 -0.01079456]
 [ 0.12431625  0.00142029]
 [ 0.09176176 -0.02673565]
 [ 0.00026089 -0.00786204]
 [ 0.04546378 -0.03759898]
 [ 0.01077029 -0.0266246 ]
 [ 0.02483525  0.01424494]
 [-0.0546426  -0.04536985]
 [ 0.17367441  0.03448724]]

# Draw the k-means decision regions over the 2-D PCA plane.
# Reshape the flat grid predictions back into the mesh layout for imshow.
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

# Overlay the projected samples (black dots) and the centroids (white crosses).
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
# The title previously named the "seeds" dataset, which is not the data
# being clustered in this notebook.
plt.title('Clustering on the plants dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

Applying agglomerative clustering via scikit-learn¶

from sklearn.cluster import AgglomerativeClustering

# Complete-linkage hierarchical clustering with Euclidean distance, cut at 100 clusters.
ac = AgglomerativeClustering(
    n_clusters=100,
    affinity='euclidean',
    linkage='complete',
)
labels = ac.fit_predict(X)
print('Cluster labels: %s' % labels)

Cluster labels: [ 9 33 33 ..., 47 47 47]

from sklearn.cross_validation import train_test_split

# Rebuild X and y from the frame, then split 75% train / 25% test.
X, y = df[features], df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

K Means¶

from sklearn import cluster
clf = cluster.KMeans(init='k-means++', n_clusters=100, random_state=5)
clf.fit(X_train)
print clf.labels_.shape
print clf.labels_

(1200L,)
[41  7 41 ..., 98 70 58]

# Predict clusters on testing data
y_pred = clf.predict(X_test)

from sklearn import metrics
print "Addjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred))
print "Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)) 
print "Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred))
print "Confusion matrix"
print metrics.confusion_matrix(y_test, y_pred)

Addjusted rand score:0.46
Homogeneity score:0.84 
Completeness score: 0.87 
Confusion matrix
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]

Affinity Propagation¶

# Affinity propagation
aff = cluster.AffinityPropagation()
aff.fit(X_train)
print aff.cluster_centers_indices_.shape

(66L,)

y_pred = aff.predict(X_test)

from sklearn import metrics
print "Addjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred))
print "Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)) 
print "Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred))
print "Confusion matrix"
print metrics.confusion_matrix(y_test, y_pred)

Addjusted rand score:0.37
Homogeneity score:0.77 
Completeness score: 0.87 
Confusion matrix
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]

MeanShift¶

# Mean-shift clustering with default settings (no bandwidth is passed).
ms = cluster.MeanShift()
# Left as the final expression so the fitted estimator is displayed by the cell.
ms.fit(X_train)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

y_pred = ms.predict(X_test)

from sklearn import metrics
print "Addjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred))
print "Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)) 
print "Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred))
print "Confusion matrix"
print metrics.confusion_matrix(y_test, y_pred)

Addjusted rand score:0.00047
Homogeneity score:0.015 
Completeness score: 1.0 
Confusion matrix
[[1 0 0 ..., 0 0 0]
 [3 0 0 ..., 0 0 0]
 [3 0 0 ..., 0 0 0]
 ..., 
 [3 0 0 ..., 0 0 0]
 [3 0 0 ..., 0 0 0]
 [4 0 0 ..., 0 0 0]]

Mixture of Gaussian Models¶

from sklearn import mixture

# Define a heldout dataset to estimate covariance type
X_train_heldout, X_test_heldout, y_train_heldout, y_test_heldout = train_test_split(
        X_train, y_train,test_size=0.25, random_state=42)
for covariance_type in ['spherical','tied','diag','full']:
    gm=mixture.GMM(n_components=100, covariance_type=covariance_type, random_state=42, n_init=5)
    gm.fit(X_train_heldout)
    y_pred=gm.predict(X_test_heldout)
    print "Adjusted rand score for covariance={}:{:.2}".format(covariance_type, 
                                                               metrics.adjusted_rand_score(y_test_heldout, y_pred))

Adjusted rand score for covariance=spherical:0.2
Adjusted rand score for covariance=tied:0.24
Adjusted rand score for covariance=diag:0.2
Adjusted rand score for covariance=full:0.082

# Work on plain arrays here, and replace X with its 2-D PCA projection so the
# clusterings below can be drawn as scatter plots.
y = df['Class'].values
X = df[features].values
pca = PCA(n_components=2)
X = pca.fit_transform(X)

from matplotlib.pyplot import cm

# One RGBA colour per cluster, evenly spaced along the rainbow colormap.
n = 100
c = list(cm.rainbow(np.linspace(0, 1, n)))

# Inspect the last colour in the list.
c[99]

array([  1.00000000e+00,   1.22464680e-16,   6.12323400e-17,
         1.00000000e+00])

# KMeans is not imported by any earlier cell, so bring it into scope here;
# otherwise this cell raises NameError on a fresh kernel.
from sklearn.cluster import KMeans

# Side-by-side scatter plots of the 2-D PCA projection, coloured by the
# cluster assigned by k-means (left) and by complete-linkage agglomerative
# clustering (right).
n = 100
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,10))

km = KMeans(n_clusters=n, random_state=0)
y_km = km.fit_predict(X)

# `color=` is used rather than `c=`: a single RGBA array passed as `c` is
# ambiguous when a cluster happens to contain exactly four points.
for i in range(n):
    ax1.scatter(X[y_km==i,0], X[y_km==i,1], color=c[i], marker='o', s=40, label='cluster{}'.format(i))
ax1.set_title('K-means clustering')

ac = AgglomerativeClustering(n_clusters=100, affinity='euclidean', linkage='complete')
y_ac = ac.fit_predict(X)
for i in range(n):
    ax2.scatter(X[y_ac==i,0], X[y_ac==i,1], color=c[i], marker='o', s=40, label='cluster{}'.format(i))
ax2.set_title('Agglomerative clustering')

# Put a legend below current axis
plt.legend(loc='upper center', bbox_to_anchor=(0, -0.05),
          fancybox=True, shadow=True, ncol=10)

plt.tight_layout()
#plt.savefig('./figures/kmeans_and_ac.png', dpi=300)
plt.show()

Classification¶

import os
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from time import time
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score , classification_report
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report

X = df[features]
y = df['Class']

# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)

print X_train.shape, y_train.shape

(960, 64) (960L,)

Decision Tree accuracy and time elapsed calculation¶

t0=time()
print "DecisionTree"

dt = DecisionTreeClassifier(min_samples_split=20,random_state=99)
# dt = DecisionTreeClassifier(min_samples_split=20,max_depth=5,random_state=99)

clf_dt=dt.fit(X_train,y_train)

print "Acurracy: ", clf_dt.score(X_test,y_test)
t1=time()
print "time elapsed: ", t1-t0

DecisionTree
Acurracy:  0.3890625
time elapsed:  0.18700003624

tt0=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y, cv=5)
print scores
print scores.mean()
tt1=time()
print "time elapsed: ", tt1-tt0
print "\n"

cross result========
[ 0.48        0.45666667  0.42333333  0.42333333  0.43666667]
0.444
time elapsed:  0.757999897003

from sklearn.metrics import classification_report

pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy'))
])

parameters = {
    'clf__max_depth': (5, 25 , 50),
    'clf__min_samples_split': (1, 5, 10),
    'clf__min_samples_leaf': (1, 2, 3)
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'

best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])

predictions = grid_search.predict(X_test)

print classification_report(y_test, predictions)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   36.8s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   38.7s finished

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best score: 0.372
Best parameters set:
	clf__max_depth: 50
	clf__min_samples_leaf: 1
	clf__min_samples_split: 5
             precision    recall  f1-score   support

          0       0.33      0.44      0.38         9
          1       0.33      0.14      0.20         7
          2       0.83      0.71      0.77         7
          3       0.40      0.20      0.27        10
          4       0.29      0.29      0.29         7
          5       0.80      0.80      0.80         5
          6       0.00      0.00      0.00         4
          7       0.67      0.50      0.57         4
          8       0.17      0.40      0.24         5
          9       1.00      0.25      0.40         4
         10       1.00      1.00      1.00         5
         11       0.71      0.62      0.67         8
         12       0.38      0.43      0.40         7
         13       0.42      0.83      0.56         6
         14       0.50      0.60      0.55         5
         15       0.50      0.33      0.40         9
         16       0.62      0.83      0.71         6
         17       0.50      0.25      0.33         4
         18       1.00      0.33      0.50         6
         19       0.38      0.38      0.38         8
         20       0.60      0.50      0.55         6
         21       0.25      0.50      0.33         4
         22       0.64      1.00      0.78         7
         23       0.00      0.00      0.00         3
         24       0.18      0.75      0.29         4
         25       0.00      0.00      0.00         7
         26       0.75      0.38      0.50         8
         27       0.67      0.18      0.29        11
         28       0.50      0.40      0.44         5
         29       0.83      0.42      0.56        12
         30       0.10      0.12      0.11         8
         31       0.29      0.40      0.33         5
         32       0.43      0.33      0.38         9
         33       0.44      0.78      0.56         9
         34       0.54      1.00      0.70         7
         35       0.55      0.60      0.57        10
         36       0.62      0.89      0.73         9
         37       0.20      0.33      0.25         3
         38       0.50      0.50      0.50         6
         39       0.25      0.50      0.33         2
         40       0.50      0.11      0.18         9
         41       0.10      0.17      0.12         6
         42       0.40      0.57      0.47         7
         43       1.00      0.83      0.91         6
         44       0.25      0.17      0.20         6
         45       0.50      0.43      0.46         7
         46       0.80      0.57      0.67         7
         47       0.20      0.17      0.18         6
         48       0.22      0.50      0.31         4
         49       0.53      1.00      0.70         8
         50       0.22      0.33      0.27         6
         51       0.67      0.57      0.62         7
         52       0.50      1.00      0.67         5
         53       0.25      0.33      0.29         3
         54       0.50      0.25      0.33         4
         55       0.29      0.40      0.33         5
         56       0.10      0.17      0.12         6
         57       0.50      0.29      0.36         7
         58       0.43      0.38      0.40         8
         59       0.56      0.71      0.63         7
         60       0.00      0.00      0.00        10
         61       0.00      0.00      0.00         5
         62       0.90      1.00      0.95         9
         63       0.43      0.50      0.46         6
         64       0.17      0.14      0.15         7
         65       0.33      0.20      0.25         5
         66       0.38      1.00      0.55         3
         67       0.67      0.86      0.75         7
         68       0.14      0.29      0.19         7
         69       0.75      0.43      0.55         7
         70       0.00      0.00      0.00         7
         71       0.25      0.25      0.25         4
         72       0.50      0.67      0.57         3
         73       0.00      0.00      0.00        11
         74       0.71      0.56      0.63         9
         75       0.33      0.17      0.22         6
         76       0.60      0.43      0.50         7
         77       0.40      0.33      0.36         6
         78       0.25      0.20      0.22         5
         79       0.71      0.71      0.71         7
         80       0.50      0.12      0.20         8
         81       0.50      0.40      0.44         5
         82       0.40      0.50      0.44         4
         83       0.33      0.50      0.40         6
         84       0.00      0.00      0.00         8
         85       0.33      0.33      0.33         3
         86       0.57      0.67      0.62         6
         87       0.50      0.29      0.36         7
         88       0.67      0.29      0.40         7
         89       1.00      0.50      0.67         6
         90       0.67      0.67      0.67         9
         91       0.50      0.17      0.25         6
         92       0.00      0.00      0.00         3
         93       0.40      0.67      0.50         6
         94       0.67      0.25      0.36         8
         95       0.50      0.25      0.33         8
         96       0.20      0.33      0.25         6
         97       0.00      0.00      0.00         6
         98       0.00      0.00      0.00         2
         99       1.00      0.30      0.46        10

avg / total       0.46      0.42      0.41       640

C:\Miniconda2\lib\site-packages\sklearn\metrics\classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

Random Forest accuracy and time elapsed calculation¶

t2=time()
print "RandomForest"
rf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
clf_rf = rf.fit(X_train,y_train)
print "Acurracy: ", clf_rf.score(X_test,y_test)
t3=time()
print "time elapsed: ", t3-t2

RandomForest
Acurracy:  0.79375
time elapsed:  1.11400008202

tt2=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y, cv=5)
print scores
print scores.mean()
tt3=time()
print "time elapsed: ", tt3-tt2
print "\n"

cross result========
[ 0.48        0.45666667  0.42333333  0.42333333  0.43666667]
0.444
time elapsed:  0.690999984741

pipeline2 = Pipeline([
('clf', RandomForestClassifier(criterion='entropy'))
])

parameters = {
    'clf__n_estimators': (5, 25, 50, 100),
    'clf__max_depth': (5, 25 , 50),
    'clf__min_samples_split': (1, 5, 10),
    'clf__min_samples_leaf': (1, 2, 3)
}

grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)

grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_

print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])

predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print classification_report(y_test, predictions)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  2.5min finished

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best score: 0.797
Best parameters set:
	clf__max_depth: 25
	clf__min_samples_leaf: 1
	clf__min_samples_split: 1
	clf__n_estimators: 100
Accuracy: 0.7625
             precision    recall  f1-score   support

          0       0.71      0.56      0.63         9
          1       0.71      0.71      0.71         7
          2       1.00      1.00      1.00         7
          3       1.00      0.70      0.82        10
          4       0.88      1.00      0.93         7
          5       1.00      1.00      1.00         5
          6       0.67      1.00      0.80         4
          7       1.00      1.00      1.00         4
          8       0.71      1.00      0.83         5
          9       0.50      1.00      0.67         4
         10       1.00      1.00      1.00         5
         11       1.00      0.75      0.86         8
         12       0.70      1.00      0.82         7
         13       0.75      1.00      0.86         6
         14       0.80      0.80      0.80         5
         15       1.00      0.56      0.71         9
         16       0.75      1.00      0.86         6
         17       0.50      0.75      0.60         4
         18       0.83      0.83      0.83         6
         19       0.86      0.75      0.80         8
         20       0.86      1.00      0.92         6
         21       1.00      0.75      0.86         4
         22       0.78      1.00      0.88         7
         23       0.12      0.67      0.21         3
         24       0.36      1.00      0.53         4
         25       0.33      0.14      0.20         7
         26       1.00      0.75      0.86         8
         27       1.00      0.64      0.78        11
         28       0.62      1.00      0.77         5
         29       1.00      0.50      0.67        12
         30       0.33      0.25      0.29         8
         31       1.00      0.60      0.75         5
         32       1.00      0.89      0.94         9
         33       1.00      0.56      0.71         9
         34       0.88      1.00      0.93         7
         35       1.00      1.00      1.00        10
         36       1.00      1.00      1.00         9
         37       0.75      1.00      0.86         3
         38       0.50      0.67      0.57         6
         39       0.25      0.50      0.33         2
         40       0.80      0.44      0.57         9
         41       0.38      0.50      0.43         6
         42       1.00      0.86      0.92         7
         43       0.86      1.00      0.92         6
         44       1.00      0.67      0.80         6
         45       0.88      1.00      0.93         7
         46       1.00      0.86      0.92         7
         47       0.86      1.00      0.92         6
         48       0.67      1.00      0.80         4
         49       0.78      0.88      0.82         8
         50       0.86      1.00      0.92         6
         51       1.00      0.71      0.83         7
         52       0.83      1.00      0.91         5
         53       0.50      1.00      0.67         3
         54       0.40      1.00      0.57         4
         55       1.00      0.80      0.89         5
         56       0.67      0.33      0.44         6
         57       0.83      0.71      0.77         7
         58       0.50      0.12      0.20         8
         59       0.70      1.00      0.82         7
         60       1.00      0.30      0.46        10
         61       0.50      0.80      0.62         5
         62       1.00      1.00      1.00         9
         63       0.86      1.00      0.92         6
         64       1.00      0.43      0.60         7
         65       0.80      0.80      0.80         5
         66       0.60      1.00      0.75         3
         67       1.00      1.00      1.00         7
         68       0.38      0.71      0.50         7
         69       0.80      0.57      0.67         7
         70       0.60      0.43      0.50         7
         71       0.57      1.00      0.73         4
         72       0.60      1.00      0.75         3
         73       1.00      0.09      0.17        11
         74       1.00      0.89      0.94         9
         75       1.00      0.33      0.50         6
         76       1.00      0.86      0.92         7
         77       0.86      1.00      0.92         6
         78       0.25      0.20      0.22         5
         79       1.00      1.00      1.00         7
         80       1.00      0.50      0.67         8
         81       0.71      1.00      0.83         5
         82       0.57      1.00      0.73         4
         83       0.46      1.00      0.63         6
         84       1.00      0.75      0.86         8
         85       0.75      1.00      0.86         3
         86       1.00      1.00      1.00         6
         87       1.00      0.71      0.83         7
         88       1.00      1.00      1.00         7
         89       0.75      1.00      0.86         6
         90       0.90      1.00      0.95         9
         91       1.00      0.83      0.91         6
         92       0.40      0.67      0.50         3
         93       0.71      0.83      0.77         6
         94       0.71      0.62      0.67         8
         95       1.00      0.62      0.77         8
         96       0.67      0.33      0.44         6
         97       0.67      0.33      0.44         6
         98       0.67      1.00      0.80         2
         99       1.00      1.00      1.00        10

avg / total       0.82      0.76      0.76       640

Naive Bayes accuracy and time elapsed calculation¶

# Bernoulli Naive Bayes baseline: fit on the training split and print the
# accuracy on the hold-out split.
# The timed window (t4 -> t5) covers construction, fitting, scoring and the
# prints in between, so statement order matters for the reported time.
t4=time()
print "NaiveBayes"
nb = BernoulliNB()
clf_nb=nb.fit(X_train,y_train)  # handle used for scoring below
print "Acurracy: ", clf_nb.score(X_test,y_test)
t5=time()
print "time elapsed: ", t5-t4

NaiveBayes
Acurracy:  0.36875
time elapsed:  0.119999885559

tt4=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y, cv=5)
print scores
print scores.mean()
tt5=time()
print "time elapsed: ", tt5-tt4
print "\n"

cross result========
[ 0.48        0.45666667  0.42333333  0.42333333  0.43666667]
0.444
time elapsed:  0.680999994278

KNN accuracy and time elapsed calculation¶

# k-nearest-neighbours baseline with the estimator's default settings: fit on
# the training split and print the accuracy on the hold-out split.
# The timed window (t6 -> t7) covers fitting, scoring and the prints between.
t6=time()
print "KNN"
# Alternative configuration kept for reference (not executed):
# knn = KNeighborsClassifier(n_neighbors=3)
knn = KNeighborsClassifier()
clf_knn=knn.fit(X_train, y_train)  # handle used for scoring below
print "Acurracy: ", clf_knn.score(X_test,y_test) 
t7=time()
print "time elapsed: ", t7-t6

KNN
Acurracy:  0.71875
time elapsed:  0.184000015259

tt6=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y, cv=5)
print scores
print scores.mean()
tt7=time()
print "time elapsed: ", tt7-tt6
print "\n"

cross result========
[ 0.48        0.45666667  0.42333333  0.42333333  0.43666667]
0.444
time elapsed:  0.661000013351

SVM accuracy and time elapsed calculation¶

# SVC with its default settings: fit on the training split and print the
# accuracy on the hold-out split, followed by the elapsed time of the cell.
# Note: t7 is re-bound here as the *start* timestamp; the KNN cell above used
# the same name for its end timestamp.
t7=time()
print "SVM"

svc = SVC()
clf_svc=svc.fit(X_train, y_train)  # handle used for scoring below
print "Acurracy: ", clf_svc.score(X_test,y_test) 
t8=time()
print "time elapsed: ", t8-t7

SVM
Acurracy:  0.00625
time elapsed:  0.776000022888

tt7=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y, cv=5)
print scores
print scores.mean()
tt8=time()
print "time elapsed: ", tt7-tt6
print "\n"

cross result========
[ 0.48        0.45666667  0.42333333  0.42333333  0.43666667]
0.444
time elapsed:  10.0399999619

from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search

svc = SVC()

parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

grid = grid_search.GridSearchCV(svc, parameters, n_jobs=-1, verbose=1, scoring='accuracy')


grid.fit(X_train, y_train)

print 'Best score: %0.3f' % grid.best_score_

print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best score: 0.231
Best parameters set:
	C: 10
	kernel: 'linear'
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         9
          1       0.00      0.00      0.00         7
          2       0.00      0.00      0.00         7
          3       0.00      0.00      0.00        10
          4       0.00      0.00      0.00         7
          5       1.00      1.00      1.00         5
          6       0.40      1.00      0.57         4
          7       0.36      1.00      0.53         4
          8       0.00      0.00      0.00         5
          9       0.04      0.25      0.07         4
         10       1.00      1.00      1.00         5
         11       0.00      0.00      0.00         8
         12       0.00      0.00      0.00         7
         13       0.00      0.00      0.00         6
         14       1.00      0.40      0.57         5
         15       0.00      0.00      0.00         9
         16       1.00      0.83      0.91         6
         17       0.14      0.75      0.24         4
         18       0.00      0.00      0.00         6
         19       0.00      0.00      0.00         8
         20       0.00      0.00      0.00         6
         21       0.00      0.00      0.00         4
         22       0.00      0.00      0.00         7
         23       0.04      1.00      0.08         3
         24       0.00      0.00      0.00         4
         25       0.00      0.00      0.00         7
         26       0.00      0.00      0.00         8
         27       0.00      0.00      0.00        11
         28       0.00      0.00      0.00         5
         29       0.00      0.00      0.00        12
         30       0.00      0.00      0.00         8
         31       0.00      0.00      0.00         5
         32       0.00      0.00      0.00         9
         33       0.00      0.00      0.00         9
         34       1.00      0.57      0.73         7
         35       0.00      0.00      0.00        10
         36       1.00      0.89      0.94         9
         37       0.20      1.00      0.33         3
         38       0.00      0.00      0.00         6
         39       0.13      1.00      0.24         2
         40       0.00      0.00      0.00         9
         41       0.00      0.00      0.00         6
         42       0.00      0.00      0.00         7
         43       0.46      1.00      0.63         6
         44       0.00      0.00      0.00         6
         45       0.00      0.00      0.00         7
         46       1.00      0.71      0.83         7
         47       0.00      0.00      0.00         6
         48       0.30      0.75      0.43         4
         49       0.00      0.00      0.00         8
         50       0.00      0.00      0.00         6
         51       0.00      0.00      0.00         7
         52       0.00      0.00      0.00         5
         53       0.05      1.00      0.09         3
         54       0.17      1.00      0.29         4
         55       0.00      0.00      0.00         5
         56       0.00      0.00      0.00         6
         57       0.00      0.00      0.00         7
         58       0.00      0.00      0.00         8
         59       0.00      0.00      0.00         7
         60       0.00      0.00      0.00        10
         61       0.25      0.20      0.22         5
         62       0.00      0.00      0.00         9
         63       0.00      0.00      0.00         6
         64       0.00      0.00      0.00         7
         65       0.33      0.80      0.47         5
         66       0.18      1.00      0.30         3
         67       0.00      0.00      0.00         7
         68       0.00      0.00      0.00         7
         69       0.00      0.00      0.00         7
         70       0.00      0.00      0.00         7
         71       0.11      0.75      0.19         4
         72       0.16      1.00      0.27         3
         73       0.00      0.00      0.00        11
         74       0.00      0.00      0.00         9
         75       0.00      0.00      0.00         6
         76       0.00      0.00      0.00         7
         77       0.46      1.00      0.63         6
         78       0.00      0.00      0.00         5
         79       1.00      0.14      0.25         7
         80       0.00      0.00      0.00         8
         81       0.40      0.80      0.53         5
         82       0.06      0.75      0.12         4
         83       0.00      0.00      0.00         6
         84       0.00      0.00      0.00         8
         85       0.14      1.00      0.25         3
         86       1.00      0.50      0.67         6
         87       0.00      0.00      0.00         7
         88       0.00      0.00      0.00         7
         89       0.00      0.00      0.00         6
         90       0.00      0.00      0.00         9
         91       0.00      0.00      0.00         6
         92       0.04      1.00      0.07         3
         93       0.00      0.00      0.00         6
         94       0.00      0.00      0.00         8
         95       0.00      0.00      0.00         8
         96       0.00      0.00      0.00         6
         97       0.00      0.00      0.00         6
         98       0.02      0.50      0.04         2
         99       0.00      0.00      0.00        10

avg / total       0.12      0.17      0.11       640

[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   18.5s finished

pipeline = Pipeline([
    ('clf', SVC(kernel='rbf', gamma=0.01, C=100))
])

parameters = {
    'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
    'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')

grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_

print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid_search.predict(X_test)
print classification_report(y_test, predictions)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   31.1s finished

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best score: 0.698
Best parameters set:
	clf__C: 30
	clf__gamma: 1
             precision    recall  f1-score   support

          0       0.83      0.56      0.67         9
          1       0.88      1.00      0.93         7
          2       1.00      1.00      1.00         7
          3       1.00      0.20      0.33        10
          4       1.00      1.00      1.00         7
          5       1.00      1.00      1.00         5
          6       0.67      1.00      0.80         4
          7       1.00      1.00      1.00         4
          8       0.71      1.00      0.83         5
          9       0.57      1.00      0.73         4
         10       1.00      1.00      1.00         5
         11       1.00      0.75      0.86         8
         12       0.43      0.86      0.57         7
         13       1.00      1.00      1.00         6
         14       1.00      1.00      1.00         5
         15       0.00      0.00      0.00         9
         16       1.00      1.00      1.00         6
         17       0.60      0.75      0.67         4
         18       0.86      1.00      0.92         6
         19       0.83      0.62      0.71         8
         20       1.00      1.00      1.00         6
         21       1.00      1.00      1.00         4
         22       1.00      1.00      1.00         7
         23       0.16      1.00      0.27         3
         24       0.12      0.75      0.20         4
         25       1.00      0.29      0.44         7
         26       0.50      0.12      0.20         8
         27       1.00      0.73      0.84        11
         28       0.38      0.60      0.46         5
         29       1.00      0.08      0.15        12
         30       0.00      0.00      0.00         8
         31       0.30      0.60      0.40         5
         32       0.89      0.89      0.89         9
         33       1.00      0.89      0.94         9
         34       1.00      1.00      1.00         7
         35       1.00      0.90      0.95        10
         36       1.00      0.89      0.94         9
         37       0.75      1.00      0.86         3
         38       0.67      0.67      0.67         6
         39       0.20      0.50      0.29         2
         40       0.00      0.00      0.00         9
         41       1.00      0.17      0.29         6
         42       0.88      1.00      0.93         7
         43       1.00      1.00      1.00         6
         44       0.57      0.67      0.62         6
         45       0.88      1.00      0.93         7
         46       1.00      1.00      1.00         7
         47       0.75      1.00      0.86         6
         48       1.00      1.00      1.00         4
         49       1.00      1.00      1.00         8
         50       0.75      0.50      0.60         6
         51       1.00      0.71      0.83         7
         52       1.00      0.80      0.89         5
         53       0.60      1.00      0.75         3
         54       0.67      1.00      0.80         4
         55       1.00      1.00      1.00         5
         56       1.00      0.67      0.80         6
         57       0.75      0.86      0.80         7
         58       0.25      0.25      0.25         8
         59       0.88      1.00      0.93         7
         60       0.00      0.00      0.00        10
         61       0.50      0.80      0.62         5
         62       0.69      1.00      0.82         9
         63       1.00      1.00      1.00         6
         64       1.00      1.00      1.00         7
         65       0.80      0.80      0.80         5
         66       0.75      1.00      0.86         3
         67       0.40      0.29      0.33         7
         68       0.40      0.29      0.33         7
         69       0.83      0.71      0.77         7
         70       0.50      0.43      0.46         7
         71       0.60      0.75      0.67         4
         72       1.00      0.67      0.80         3
         73       0.00      0.00      0.00        11
         74       0.89      0.89      0.89         9
         75       1.00      0.50      0.67         6
         76       0.86      0.86      0.86         7
         77       0.86      1.00      0.92         6
         78       0.36      1.00      0.53         5
         79       1.00      1.00      1.00         7
         80       1.00      0.50      0.67         8
         81       0.83      1.00      0.91         5
         82       0.67      1.00      0.80         4
         83       0.40      1.00      0.57         6
         84       1.00      1.00      1.00         8
         85       0.75      1.00      0.86         3
         86       1.00      1.00      1.00         6
         87       0.86      0.86      0.86         7
         88       1.00      1.00      1.00         7
         89       0.42      0.83      0.56         6
         90       0.80      0.89      0.84         9
         91       0.83      0.83      0.83         6
         92       0.33      1.00      0.50         3
         93       0.75      1.00      0.86         6
         94       0.62      0.62      0.62         8
         95       0.00      0.00      0.00         8
         96       0.67      0.67      0.67         6
         97       1.00      0.17      0.29         6
         98       0.25      0.50      0.33         2
         99       1.00      1.00      1.00        10

avg / total       0.75      0.72      0.70       640

Ensemble Learning¶

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator


class MajorityVoteClassifier(BaseEstimator,
                             ClassifierMixin):
    """ A majority vote ensemble classifier

    Parameters
    ----------
    classifiers : array-like, shape = [n_classifiers]
      Different classifiers for the ensemble

    vote : str, {'classlabel', 'probability'} (default='classlabel')
      If 'classlabel' the prediction is based on the argmax of
        class labels. Else if 'probability', the argmax of
        the sum of probabilities is used to predict the class label
        (recommended for calibrated classifiers).

    weights : array-like, shape = [n_classifiers], optional (default=None)
      If a list of `int` or `float` values are provided, the classifiers
      are weighted by importance; Uses uniform weights if `weights=None`.

    Attributes
    ----------
    classes_ : array-like
      Class labels seen by `fit`, as stored by the internal LabelEncoder.

    classifiers_ : list
      Fitted clones of `classifiers` (the originals are never fitted).

    """
    def __init__(self, classifiers, vote='classlabel', weights=None):

        self.classifiers = classifiers
        # Name -> estimator mapping; the names are the prefixes used by
        # get_params() to expose nested parameters ("<name>__<param>").
        self.named_classifiers = {key: value for key, value
                                  in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        """ Fit classifiers.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of training samples.

        y : array-like, shape = [n_samples]
            Vector of target class labels.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If `vote` is not 'probability' or 'classlabel', or if `weights`
            is given and its length differs from the number of classifiers.

        """
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel'"
                             "; got (vote=%r)"
                             % self.vote)

        # Compare against None explicitly: truth-testing a NumPy array of
        # weights raises "The truth value of an array ... is ambiguous".
        if (self.weights is not None
                and len(self.weights) != len(self.classifiers)):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d classifiers'
                             % (len(self.weights), len(self.classifiers)))

        # Use LabelEncoder to ensure class labels start with 0, which
        # is important for np.argmax call in self.predict
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_

        # The encoded target is the same for every member; compute it once.
        y_encoded = self.lablenc_.transform(y)
        self.classifiers_ = []
        for clf in self.classifiers:
            # clone() so the estimators passed by the caller stay unfitted.
            fitted_clf = clone(clf).fit(X, y_encoded)
            self.classifiers_.append(fitted_clf)
        return self

    def predict(self, X):
        """ Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of samples to classify.

        Returns
        -------
        maj_vote : array-like, shape = [n_samples]
            Predicted class labels (in the original label space).

        """
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'classlabel' vote

            #  Collect results from clf.predict calls; after the transpose
            #  each row holds the (encoded) votes of all members for a sample.
            predictions = np.asarray([clf.predict(X)
                                      for clf in self.classifiers_]).T

            # Weighted vote count per sample; the most voted label wins.
            maj_vote = np.apply_along_axis(
                                      lambda x:
                                      np.argmax(np.bincount(x,
                                                weights=self.weights)),
                                      axis=1,
                                      arr=predictions)
        # Map the encoded winners back to the caller's labels.
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote

    def predict_proba(self, X):
        """ Predict class probabilities for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Samples to score, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        avg_proba : array-like, shape = [n_samples, n_classes]
            Weighted average probability for each class per sample.

        """
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_])
        # Average over the classifier axis (axis 0), honouring the weights.
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba

    def get_params(self, deep=True):
        """ Get classifier parameter names for GridSearch

        With ``deep=False`` the constructor parameters are returned. With
        ``deep=True`` the result maps each member's generated name to the
        estimator and each "<name>__<param>" key to that member's parameter.
        """
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)
        else:
            out = self.named_classifiers.copy()
            for name, step in self.named_classifiers.items():
                for key, value in step.get_params(deep=True).items():
                    out['%s__%s' % (name, key)] = value
            return out

from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import Pipeline
import numpy as np

# Three base learners for the voting ensemble.
clf1 = LogisticRegression(penalty='l2',
                          C=0.001,
                          random_state=0)

clf2 = DecisionTreeClassifier(max_depth=1,
                              criterion='entropy',
                              random_state=0)

clf3 = KNeighborsClassifier(n_neighbors=1,
                            p=2,
                            metric='minkowski')

# Logistic regression and KNN get a StandardScaler in front of them;
# the decision tree is used without scaling.
pipe1 = Pipeline([['sc', StandardScaler()],
                  ['clf', clf1]])
pipe3 = Pipeline([['sc', StandardScaler()],
                  ['clf', clf3]])

clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN']

# The target is multiclass (the classification reports in this notebook list
# labels 0-99). scikit-learn's 'roc_auc' scorer only accepts binary targets
# and raises on this data, so the models are compared on accuracy instead.
print('10-fold cross validation:\n')
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
               % (scores.mean(), scores.std(), label))

# Majority Rule (hard) Voting

mv_clf = MajorityVoteClassifier(
                classifiers=[pipe1, clf2, pipe3])

clf_labels += ['Majority Voting']
all_clf = [pipe1, clf2, pipe3, mv_clf]

# The target is multiclass (the classification reports in this notebook list
# labels 0-99). scikit-learn's 'roc_auc' scorer only accepts binary targets
# and raises on this data, so the models are compared on accuracy instead.
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
               % (scores.mean(), scores.std(), label))

# Shows the nested parameter names that can be used in a parameter grid.
mv_clf.get_params()

from sklearn.grid_search import GridSearchCV

# Grid over two nested parameters of the voting ensemble: the depth of the
# decision tree member and the C of the logistic regression inside the first
# pipeline member.
params = {'decisiontreeclassifier__max_depth': [1, 2],
          'pipeline-1__clf__C': [0.001, 0.1, 100.0]}

# Scored by accuracy: the result below is reported as "Accuracy", and the
# 'roc_auc' scorer does not accept the multiclass target used here.
grid = GridSearchCV(estimator=mv_clf,
                    param_grid=params,
                    cv=10,
                    scoring='accuracy')
grid.fit(X_train, y_train)

# Use a dedicated loop name so the `params` grid above is not overwritten
# (re-running the cell would otherwise search over the last candidate only).
for candidate_params, mean_score, scores in grid.grid_scores_:
    print("%0.3f+/-%0.2f %r"
            % (mean_score, scores.std() / 2, candidate_params))

print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)

Bagging -- Building an ensemble of classifiers from bootstrap samples¶

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Base learner: a depth-unlimited entropy tree. It is also fitted on its own
# as the baseline, so it is seeded (same seed as the ensemble) to make the
# baseline accuracy reproducible across runs.
tree = DecisionTreeClassifier(criterion='entropy',
                              max_depth=None,
                              random_state=1)

# 500 trees, each trained on a bootstrap sample drawn with replacement
# (max_samples=1.0) and using all features (max_features=1.0, no feature
# bootstrapping).
bag = BaggingClassifier(base_estimator=tree,
                        n_estimators=500,
                        max_samples=1.0,
                        max_features=1.0,
                        bootstrap=True,
                        bootstrap_features=False,
                        n_jobs=1,
                        random_state=1)

from sklearn.metrics import accuracy_score

# Baseline: the single decision tree, scored on the training and test splits.
tree = tree.fit(X_train, y_train)
y_train_pred, y_test_pred = tree.predict(X_train), tree.predict(X_test)

tree_train, tree_test = (accuracy_score(y_train, y_train_pred),
                         accuracy_score(y_test, y_test_pred))
print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train, tree_test))

# Same evaluation for the bagged ensemble (the y_*_pred names are reused).
bag = bag.fit(X_train, y_train)
y_train_pred, y_test_pred = bag.predict(X_train), bag.predict(X_test)

bag_train, bag_test = (accuracy_score(y_train, y_train_pred),
                       accuracy_score(y_test, y_test_pred))
print('Bagging train/test accuracies %.3f/%.3f' % (bag_train, bag_test))

Leveraging weak learners via adaptive boosting¶

from sklearn.ensemble import AdaBoostClassifier

# Weak learner: a depth-1 entropy tree (decision stump). It is also fitted on
# its own as the baseline, so it is seeded (same seed as the ensemble) to make
# the baseline accuracy reproducible across runs.
tree = DecisionTreeClassifier(criterion='entropy',
                              max_depth=1,
                              random_state=0)

# 500 boosting rounds over the stump with a learning rate of 0.1.
ada = AdaBoostClassifier(base_estimator=tree,
                         n_estimators=500,
                         learning_rate=0.1,
                         random_state=0)

# Baseline: the single decision tree, scored on the training and test splits.
tree = tree.fit(X_train, y_train)
y_train_pred, y_test_pred = tree.predict(X_train), tree.predict(X_test)

tree_train, tree_test = (accuracy_score(y_train, y_train_pred),
                         accuracy_score(y_test, y_test_pred))
print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train, tree_test))

# Same evaluation for the boosted ensemble (the y_*_pred names are reused).
ada = ada.fit(X_train, y_train)
y_train_pred, y_test_pred = ada.predict(X_train), ada.predict(X_test)

ada_train, ada_test = (accuracy_score(y_train, y_train_pred),
                       accuracy_score(y_test, y_test_pred))
print('AdaBoost train/test accuracies %.3f/%.3f' % (ada_train, ada_test))

	Class	f0	f1	f2	f3	f4	f5	f6	f7	f8	...	f54	f55	f56	f57	f58	f59	f60	f61	f62
0	Acer Campestre	0.003906	0.003906	0.027344	0.033203	0.007812	0.017578	0.023438	0.005859	0.000000	...	0.011719	0.000000	0.005859	0.035156	0.027344	0.033203	0.001953	0.000000	0.017578
1	Acer Campestre	0.005859	0.013672	0.027344	0.025391	0.013672	0.029297	0.019531	0.000000	0.001953	...	0.017578	0.000000	0.021484	0.017578	0.046875	0.005859	0.003906	0.003906	0.046875
2	Acer Campestre	0.011719	0.001953	0.027344	0.044922	0.017578	0.042969	0.023438	0.000000	0.003906	...	0.035156	0.000000	0.015625	0.021484	0.056641	0.009766	0.003906	0.000000	0.015625
3	Acer Campestre	0.013672	0.011719	0.037109	0.017578	0.011719	0.087891	0.023438	0.000000	0.000000	...	0.015625	0.001953	0.021484	0.029297	0.033203	0.003906	0.000000	0.001953	0.027344
4	Acer Campestre	0.007812	0.009766	0.027344	0.025391	0.001953	0.005859	0.015625	0.000000	0.005859	...	0.023438	0.001953	0.021484	0.048828	0.056641	0.019531	0.000000	0.000000	0.013672

	f0	f1	f2	f3	f4	f5	f6	f7	f8	...	f54	f55	f56	f57	f58	f59	f60	f61	f62
0	0.003906	0.003906	0.027344	0.033203	0.007812	0.017578	0.023438	0.005859	0.000000	...	0.011719	0.000000	0.005859	0.035156	0.027344	0.033203	0.001953	0.000000	0.017578
1	0.005859	0.013672	0.027344	0.025391	0.013672	0.029297	0.019531	0.000000	0.001953	...	0.017578	0.000000	0.021484	0.017578	0.046875	0.005859	0.003906	0.003906	0.046875
2	0.011719	0.001953	0.027344	0.044922	0.017578	0.042969	0.023438	0.000000	0.003906	...	0.035156	0.000000	0.015625	0.021484	0.056641	0.009766	0.003906	0.000000	0.015625
3	0.013672	0.011719	0.037109	0.017578	0.011719	0.087891	0.023438	0.000000	0.000000	...	0.015625	0.001953	0.021484	0.029297	0.033203	0.003906	0.000000	0.001953	0.027344
4	0.007812	0.009766	0.027344	0.025391	0.001953	0.005859	0.015625	0.000000	0.005859	...	0.023438	0.001953	0.021484	0.048828	0.056641	0.019531	0.000000	0.000000	0.013672

	f0	f1	f2	f3	f4	f5	f6	f7	f8	f9	...	f54	f55	f56	f57	f58	f59	f60	f61	f62
0	0.003906	0.003906	0.027344	0.033203	0.007812	0.017578	0.023438	0.005859	0.000000	0.015625	...	0.011719	0.000000	0.005859	0.035156	0.027344	0.033203	0.001953	0.000000	0.017578
1	0.005859	0.013672	0.027344	0.025391	0.013672	0.029297	0.019531	0.000000	0.001953	0.021484	...	0.017578	0.000000	0.021484	0.017578	0.046875	0.005859	0.003906	0.003906	0.046875
2	0.011719	0.001953	0.027344	0.044922	0.017578	0.042969	0.023438	0.000000	0.003906	0.019531	...	0.035156	0.000000	0.015625	0.021484	0.056641	0.009766	0.003906	0.000000	0.015625
3	0.013672	0.011719	0.037109	0.017578	0.011719	0.087891	0.023438	0.000000	0.000000	0.027344	...	0.015625	0.001953	0.021484	0.029297	0.033203	0.003906	0.000000	0.001953	0.027344
4	0.007812	0.009766	0.027344	0.025391	0.001953	0.005859	0.015625	0.000000	0.005859	0.017578	...	0.023438	0.001953	0.021484	0.048828	0.056641	0.019531	0.000000	0.000000	0.013672