Plants Clustering

https://archive.ics.uci.edu/ml/datasets/Plants

The data is in the transactional form. It contains the Latin names (species or genus) and state abbreviations..Each row contains a Latin name (species or genus) and a list of state abbreviations.

In [72]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn import preprocessing
import matplotlib
import matplotlib.pyplot as plt
In [ ]:
# read .csv from provided dataset
csv_filename="plants.data"

# df=pd.read_csv(csv_filename,index_col=0)
df=pd.read_csv(csv_filename)
In [11]:
cols = ['Class']
for i in range(64):
    str = 'f{}'.format(i)
    cols.append(str)
In [20]:
# read .csv from provided dataset
csv_filename="data_Mar_64.txt"

# df=pd.read_csv(csv_filename,index_col=0)
df=pd.read_csv(csv_filename,names=cols)
In [21]:
df.head()
Out[21]:
Class f0 f1 f2 f3 f4 f5 f6 f7 f8 ... f54 f55 f56 f57 f58 f59 f60 f61 f62 f63
0 Acer Campestre 0.003906 0.003906 0.027344 0.033203 0.007812 0.017578 0.023438 0.005859 0.000000 ... 0.011719 0.000000 0.005859 0.035156 0.027344 0.033203 0.001953 0.000000 0.017578 0.0
1 Acer Campestre 0.005859 0.013672 0.027344 0.025391 0.013672 0.029297 0.019531 0.000000 0.001953 ... 0.017578 0.000000 0.021484 0.017578 0.046875 0.005859 0.003906 0.003906 0.046875 0.0
2 Acer Campestre 0.011719 0.001953 0.027344 0.044922 0.017578 0.042969 0.023438 0.000000 0.003906 ... 0.035156 0.000000 0.015625 0.021484 0.056641 0.009766 0.003906 0.000000 0.015625 0.0
3 Acer Campestre 0.013672 0.011719 0.037109 0.017578 0.011719 0.087891 0.023438 0.000000 0.000000 ... 0.015625 0.001953 0.021484 0.029297 0.033203 0.003906 0.000000 0.001953 0.027344 0.0
4 Acer Campestre 0.007812 0.009766 0.027344 0.025391 0.001953 0.005859 0.015625 0.000000 0.005859 ... 0.023438 0.001953 0.021484 0.048828 0.056641 0.019531 0.000000 0.000000 0.013672 0.0

5 rows × 65 columns

In [22]:
df.shape
Out[22]:
(1600, 65)
In [23]:
df['Class'].unique()
Out[23]:
array(['Acer Campestre', 'Acer Capillipes', 'Acer Circinatum', 'Acer Mono',
       'Acer Opalus', 'Acer Palmatum', 'Acer Pictum', 'Acer Platanoids',
       'Acer Rubrum', 'Acer Rufinerve', 'Acer Saccharinum',
       'Alnus Cordata', 'Alnus Maximowiczii', 'Alnus Rubra',
       'Alnus Sieboldiana', 'Alnus Viridis', 'Arundinaria Simonii',
       'Betula Austrosinensis', 'Betula Pendula', 'Callicarpa Bodinieri',
       'Castanea Sativa', 'Celtis Koraiensis', 'Cercis Siliquastrum',
       'Cornus Chinensis', 'Cornus Controversa', 'Cornus Macrophylla',
       'Cotinus Coggygria', 'Crataegus Monogyna', 'Cytisus Battandieri',
       'Eucalyptus Glaucescens', 'Eucalyptus Neglecta',
       'Eucalyptus Urnigera', 'Fagus Sylvatica', 'Ginkgo Biloba',
       'Ilex Aquifolium', 'Ilex Cornuta', 'Liquidambar Styraciflua',
       'Liriodendron Tulipifera', 'Lithocarpus Cleistocarpus',
       'Lithocarpus Edulis', 'Magnolia Heptapeta', 'Magnolia Salicifolia',
       'Morus Nigra', 'Olea Europaea', 'Phildelphus', 'Populus Adenopoda',
       'Populus Grandidentata', 'Populus Nigra', 'Prunus Avium',
       'Prunus X Shmittii', 'Pterocarya Stenoptera', 'Quercus Afares',
       'Quercus Agrifolia', 'Quercus Alnifolia', 'Quercus Brantii',
       'Quercus Canariensis', 'Quercus Castaneifolia', 'Quercus Cerris',
       'Quercus Chrysolepis', 'Quercus Coccifera', 'Quercus Coccinea',
       'Quercus Crassifolia', 'Quercus Crassipes', 'Quercus Dolicholepis',
       'Quercus Ellipsoidalis', 'Quercus Greggii', 'Quercus Hartwissiana',
       'Quercus Ilex', 'Quercus Imbricaria', 'Quercus Infectoria sub',
       'Quercus Kewensis', 'Quercus Nigra', 'Quercus Palustris',
       'Quercus Phellos', 'Quercus Phillyraeoides', 'Quercus Pontica',
       'Quercus Pubescens', 'Quercus Pyrenaica', 'Quercus Rhysophylla',
       'Quercus Rubra', 'Quercus Semecarpifolia', 'Quercus Shumardii',
       'Quercus Suber', 'Quercus Texana', 'Quercus Trojana',
       'Quercus Variabilis', 'Quercus Vulcanica', 'Quercus x Hispanica',
       'Quercus x Turneri', 'Rhododendron x Russellianum',
       'Salix Fragilis', 'Salix Intergra', 'Sorbus Aria', 'Tilia Oliveri',
       'Tilia Platyphyllos', 'Tilia Tomentosa', 'Ulmus Bergmanniana',
       'Viburnum Tinus', 'Viburnum x Rhytidophylloides', 'Zelkova Serrata'], dtype=object)
In [25]:
len(df['Class'].unique())
Out[25]:
100
In [26]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Class'] = le.fit_transform(df['Class'])
In [41]:
df['Class'].unique()
Out[41]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], dtype=int64)
In [27]:
df.head()
Out[27]:
Class f0 f1 f2 f3 f4 f5 f6 f7 f8 ... f54 f55 f56 f57 f58 f59 f60 f61 f62 f63
0 0 0.003906 0.003906 0.027344 0.033203 0.007812 0.017578 0.023438 0.005859 0.000000 ... 0.011719 0.000000 0.005859 0.035156 0.027344 0.033203 0.001953 0.000000 0.017578 0.0
1 0 0.005859 0.013672 0.027344 0.025391 0.013672 0.029297 0.019531 0.000000 0.001953 ... 0.017578 0.000000 0.021484 0.017578 0.046875 0.005859 0.003906 0.003906 0.046875 0.0
2 0 0.011719 0.001953 0.027344 0.044922 0.017578 0.042969 0.023438 0.000000 0.003906 ... 0.035156 0.000000 0.015625 0.021484 0.056641 0.009766 0.003906 0.000000 0.015625 0.0
3 0 0.013672 0.011719 0.037109 0.017578 0.011719 0.087891 0.023438 0.000000 0.000000 ... 0.015625 0.001953 0.021484 0.029297 0.033203 0.003906 0.000000 0.001953 0.027344 0.0
4 0 0.007812 0.009766 0.027344 0.025391 0.001953 0.005859 0.015625 0.000000 0.005859 ... 0.023438 0.001953 0.021484 0.048828 0.056641 0.019531 0.000000 0.000000 0.013672 0.0

5 rows × 65 columns

In [28]:
features = df.columns[1:]
features
Out[28]:
Index([u'f0', u'f1', u'f2', u'f3', u'f4', u'f5', u'f6', u'f7', u'f8', u'f9',
       u'f10', u'f11', u'f12', u'f13', u'f14', u'f15', u'f16', u'f17', u'f18',
       u'f19', u'f20', u'f21', u'f22', u'f23', u'f24', u'f25', u'f26', u'f27',
       u'f28', u'f29', u'f30', u'f31', u'f32', u'f33', u'f34', u'f35', u'f36',
       u'f37', u'f38', u'f39', u'f40', u'f41', u'f42', u'f43', u'f44', u'f45',
       u'f46', u'f47', u'f48', u'f49', u'f50', u'f51', u'f52', u'f53', u'f54',
       u'f55', u'f56', u'f57', u'f58', u'f59', u'f60', u'f61', u'f62', u'f63'],
      dtype='object')
In [30]:
X = df[features]
y = df['Class']
In [31]:
X.head()
Out[31]:
f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 ... f54 f55 f56 f57 f58 f59 f60 f61 f62 f63
0 0.003906 0.003906 0.027344 0.033203 0.007812 0.017578 0.023438 0.005859 0.000000 0.015625 ... 0.011719 0.000000 0.005859 0.035156 0.027344 0.033203 0.001953 0.000000 0.017578 0.0
1 0.005859 0.013672 0.027344 0.025391 0.013672 0.029297 0.019531 0.000000 0.001953 0.021484 ... 0.017578 0.000000 0.021484 0.017578 0.046875 0.005859 0.003906 0.003906 0.046875 0.0
2 0.011719 0.001953 0.027344 0.044922 0.017578 0.042969 0.023438 0.000000 0.003906 0.019531 ... 0.035156 0.000000 0.015625 0.021484 0.056641 0.009766 0.003906 0.000000 0.015625 0.0
3 0.013672 0.011719 0.037109 0.017578 0.011719 0.087891 0.023438 0.000000 0.000000 0.027344 ... 0.015625 0.001953 0.021484 0.029297 0.033203 0.003906 0.000000 0.001953 0.027344 0.0
4 0.007812 0.009766 0.027344 0.025391 0.001953 0.005859 0.015625 0.000000 0.005859 0.017578 ... 0.023438 0.001953 0.021484 0.048828 0.056641 0.019531 0.000000 0.000000 0.013672 0.0

5 rows × 64 columns

In [32]:
# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
In [33]:
print X_train.shape, y_train.shape
(960, 64) (960L,)

Unsupervised Learning

PCA

In [34]:
y.unique()
Out[34]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], dtype=int64)
In [35]:
len(features)
Out[35]:
64
In [38]:
# Apply PCA with the same number of dimensions as variables in the dataset
from sklearn.decomposition import PCA
pca = PCA(n_components=64)
pca.fit(X)

# Print the components and the amount of variance in the data contained in each dimension
print(pca.components_)
print(pca.explained_variance_ratio_)
[[ 0.17301085  0.3678857  -0.0376574  ..., -0.04515881  0.10784595
  -0.02792765]
 [ 0.06860826  0.17120937 -0.16743028 ...,  0.05114625 -0.04738932
   0.00340058]
 [-0.01217028 -0.08481612  0.30296649 ..., -0.04069022  0.17427581
  -0.03918934]
 ..., 
 [-0.00740531 -0.00196931 -0.00802288 ..., -0.00735558  0.00890972
   0.04235162]
 [ 0.0151684   0.01028616  0.01684166 ...,  0.05617535  0.01335735
  -0.00934287]
 [ 0.1249996   0.12499963  0.1249996  ...,  0.12499625  0.12500077
   0.12500567]]
[  3.66346160e-01   1.36479827e-01   8.76136880e-02   6.76087515e-02
   5.62833251e-02   4.56950262e-02   2.62293511e-02   2.23588970e-02
   1.85408968e-02   1.68000379e-02   1.51209237e-02   1.19416550e-02
   9.62145182e-03   8.87215304e-03   8.70710022e-03   7.38789680e-03
   6.71526565e-03   6.42370588e-03   5.45940047e-03   4.81032134e-03
   4.16110779e-03   4.04450437e-03   3.98474616e-03   3.56179763e-03
   3.37395149e-03   3.11290968e-03   2.93953519e-03   2.72151214e-03
   2.66239203e-03   2.57375991e-03   2.44332593e-03   2.31217010e-03
   2.11575025e-03   2.09009210e-03   2.01793326e-03   1.93257495e-03
   1.84261074e-03   1.79722687e-03   1.77056628e-03   1.73038935e-03
   1.64383197e-03   1.55318733e-03   1.46506434e-03   1.39524503e-03
   1.34309348e-03   1.27063003e-03   1.03304595e-03   9.83972274e-04
   9.72012625e-04   8.30449484e-04   7.88002854e-04   6.93523108e-04
   6.00723467e-04   5.70686500e-04   5.22928128e-04   4.52364252e-04
   3.92603544e-04   3.30888609e-04   3.03654919e-04   2.54559631e-04
   1.82047138e-04   1.48132720e-04   6.46620494e-05   5.06768097e-12]
In [39]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(list(pca.explained_variance_ratio_),'-o')
plt.title('Explained variance ratio as function of PCA components')
plt.ylabel('Explained variance ratio')
plt.xlabel('Component')
plt.show()
In [45]:
# First we reduce the data to two dimensions using PCA to capture variation
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(X)
print(reduced_data[:10])  # print upto 10 elements
[[-0.03307489 -0.02892331]
 [-0.02422727 -0.01839179]
 [-0.02629431 -0.03441262]
 [ 0.00831379  0.00113884]
 [ 0.00117595 -0.05432339]
 [-0.00205065 -0.02189747]
 [ 0.00664743 -0.0258008 ]
 [-0.02080581 -0.00804307]
 [-0.0357074   0.00314164]
 [-0.02317419  0.00840398]]
In [47]:
kmeans = KMeans(n_clusters=100)
clusters = kmeans.fit(reduced_data)
print(clusters)
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=100, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
In [48]:
# Plot the decision boundary by building a mesh grid to populate a graph.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
hx = (x_max-x_min)/1000.
hy = (y_max-y_min)/1000.
xx, yy = np.meshgrid(np.arange(x_min, x_max, hx), np.arange(y_min, y_max, hy))

# Obtain labels for each point in mesh. Use last trained model.
Z = clusters.predict(np.c_[xx.ravel(), yy.ravel()])
In [49]:
# Find the centroids for KMeans or the cluster means for GMM 

centroids = kmeans.cluster_centers_
print('*** K MEANS CENTROIDS ***')
print(centroids)

# TRANSFORM DATA BACK TO ORIGINAL SPACE FOR ANSWERING 7
print('*** CENTROIDS TRANSFERED TO ORIGINAL SPACE ***')
print(pca.inverse_transform(centroids))
*** K MEANS CENTROIDS ***
[[-0.00569156 -0.02757334]
 [ 0.13246213 -0.01621052]
 [-0.09288891  0.03512336]
 [-0.04629578 -0.14273808]
 [ 0.16744926  0.05257156]
 [ 0.07064058 -0.02604673]
 [-0.02826312  0.01388355]
 [-0.05375892 -0.08335535]
 [-0.11115126  0.05581906]
 [ 0.01562468 -0.07296821]
 [-0.01047396  0.02356011]
 [ 0.11544097  0.06833212]
 [ 0.01681328 -0.00510339]
 [-0.07041002 -0.00715404]
 [ 0.1227513   0.02325   ]
 [ 0.03765457  0.02418866]
 [-0.0727005   0.04570051]
 [ 0.23678466  0.02092281]
 [ 0.09965405 -0.07586469]
 [ 0.02956955 -0.14766104]
 [ 0.24783442  0.09124872]
 [-0.05173007 -0.01858108]
 [-0.07509212 -0.16741198]
 [ 0.15820097  0.02933633]
 [ 0.17652355 -0.08188853]
 [ 0.08251682  0.03693327]
 [-0.07078136 -0.13687759]
 [ 0.19897381  0.06630531]
 [-0.02931185 -0.06391228]
 [ 0.09146325 -0.04824824]
 [-0.08578384  0.01404901]
 [-0.02962707 -0.11551206]
 [ 0.19834241 -0.03538007]
 [-0.05972279  0.03032355]
 [ 0.08670001 -0.00606107]
 [-0.03116005  0.04406629]
 [ 0.02648243 -0.05101348]
 [-0.10810175  0.02162109]
 [-0.10345345  0.07732651]
 [-0.05484872 -0.17262396]
 [-0.11811638  0.07819233]
 [ 0.00570303 -0.1432835 ]
 [-0.01907073 -0.0475412 ]
 [ 0.14628249  0.01832428]
 [ 0.05575008 -0.05731427]
 [ 0.11206784 -0.02420592]
 [ 0.02723779 -0.11093354]
 [ 0.10353602  0.01005479]
 [-0.0224084  -0.01092377]
 [-0.09914013  0.06366138]
 [ 0.03007613 -0.02285616]
 [-0.09979745  0.00107765]
 [-0.08069197  0.05916232]
 [-0.05493688  0.00061032]
 [-0.00936725 -0.07803598]
 [-0.0801698  -0.02510252]
 [ 0.05976957  0.01190185]
 [ 0.05585779 -0.01290171]
 [-0.0017517   0.04210121]
 [ 0.13281497  0.05043614]
 [ 0.18931821  0.02747014]
 [ 0.22638096  0.07670289]
 [-0.06116483 -0.11844676]
 [-0.10376601  0.04464   ]
 [-0.11810946  0.04675272]
 [-0.07741126  0.02926753]
 [-0.03989306  0.02506544]
 [-0.03842906 -0.03845785]
 [-0.12118508  0.06150249]
 [ 0.04853543 -0.08333611]
 [ 0.00320653 -0.04752446]
 [ 0.00705696  0.01755481]
 [ 0.13125472 -0.04330547]
 [ 0.15408349  0.04558488]
 [-0.05532449  0.05504591]
 [ 0.03576639 -0.00278353]
 [-0.07983163 -0.07187959]
 [ 0.18520214  0.05829571]
 [-0.06727929  0.01146632]
 [-0.01335953  0.00502683]
 [ 0.00114023 -0.172542  ]
 [ 0.11054053  0.04174668]
 [ 0.15029777  0.07876001]
 [-0.03478568 -0.09472081]
 [-0.0701767  -0.19748871]
 [ 0.05397147  0.04461728]
 [ 0.15789829 -0.00141685]
 [-0.04004061  0.00684292]
 [-0.08793461  0.04723146]
 [-0.02320017 -0.0277043 ]
 [ 0.21870363  0.05245085]
 [-0.03890206 -0.01079456]
 [ 0.12431625  0.00142029]
 [ 0.09176176 -0.02673565]
 [ 0.00026089 -0.00786204]
 [ 0.04546378 -0.03759898]
 [ 0.01077029 -0.0266246 ]
 [ 0.02483525  0.01424494]
 [-0.0546426  -0.04536985]
 [ 0.17367441  0.03448724]]
*** CENTROIDS TRANSFERED TO ORIGINAL SPACE ***
[[-0.00569156 -0.02757334]
 [ 0.13246213 -0.01621052]
 [-0.09288891  0.03512336]
 [-0.04629578 -0.14273808]
 [ 0.16744926  0.05257156]
 [ 0.07064058 -0.02604673]
 [-0.02826312  0.01388355]
 [-0.05375892 -0.08335535]
 [-0.11115126  0.05581906]
 [ 0.01562468 -0.07296821]
 [-0.01047396  0.02356011]
 [ 0.11544097  0.06833212]
 [ 0.01681328 -0.00510339]
 [-0.07041002 -0.00715404]
 [ 0.1227513   0.02325   ]
 [ 0.03765457  0.02418866]
 [-0.0727005   0.04570051]
 [ 0.23678466  0.02092281]
 [ 0.09965405 -0.07586469]
 [ 0.02956955 -0.14766104]
 [ 0.24783442  0.09124872]
 [-0.05173007 -0.01858108]
 [-0.07509212 -0.16741198]
 [ 0.15820097  0.02933633]
 [ 0.17652355 -0.08188853]
 [ 0.08251682  0.03693327]
 [-0.07078136 -0.13687759]
 [ 0.19897381  0.06630531]
 [-0.02931185 -0.06391228]
 [ 0.09146325 -0.04824824]
 [-0.08578384  0.01404901]
 [-0.02962707 -0.11551206]
 [ 0.19834241 -0.03538007]
 [-0.05972279  0.03032355]
 [ 0.08670001 -0.00606107]
 [-0.03116005  0.04406629]
 [ 0.02648243 -0.05101348]
 [-0.10810175  0.02162109]
 [-0.10345345  0.07732651]
 [-0.05484872 -0.17262396]
 [-0.11811638  0.07819233]
 [ 0.00570303 -0.1432835 ]
 [-0.01907073 -0.0475412 ]
 [ 0.14628249  0.01832428]
 [ 0.05575008 -0.05731427]
 [ 0.11206784 -0.02420592]
 [ 0.02723779 -0.11093354]
 [ 0.10353602  0.01005479]
 [-0.0224084  -0.01092377]
 [-0.09914013  0.06366138]
 [ 0.03007613 -0.02285616]
 [-0.09979745  0.00107765]
 [-0.08069197  0.05916232]
 [-0.05493688  0.00061032]
 [-0.00936725 -0.07803598]
 [-0.0801698  -0.02510252]
 [ 0.05976957  0.01190185]
 [ 0.05585779 -0.01290171]
 [-0.0017517   0.04210121]
 [ 0.13281497  0.05043614]
 [ 0.18931821  0.02747014]
 [ 0.22638096  0.07670289]
 [-0.06116483 -0.11844676]
 [-0.10376601  0.04464   ]
 [-0.11810946  0.04675272]
 [-0.07741126  0.02926753]
 [-0.03989306  0.02506544]
 [-0.03842906 -0.03845785]
 [-0.12118508  0.06150249]
 [ 0.04853543 -0.08333611]
 [ 0.00320653 -0.04752446]
 [ 0.00705696  0.01755481]
 [ 0.13125472 -0.04330547]
 [ 0.15408349  0.04558488]
 [-0.05532449  0.05504591]
 [ 0.03576639 -0.00278353]
 [-0.07983163 -0.07187959]
 [ 0.18520214  0.05829571]
 [-0.06727929  0.01146632]
 [-0.01335953  0.00502683]
 [ 0.00114023 -0.172542  ]
 [ 0.11054053  0.04174668]
 [ 0.15029777  0.07876001]
 [-0.03478568 -0.09472081]
 [-0.0701767  -0.19748871]
 [ 0.05397147  0.04461728]
 [ 0.15789829 -0.00141685]
 [-0.04004061  0.00684292]
 [-0.08793461  0.04723146]
 [-0.02320017 -0.0277043 ]
 [ 0.21870363  0.05245085]
 [-0.03890206 -0.01079456]
 [ 0.12431625  0.00142029]
 [ 0.09176176 -0.02673565]
 [ 0.00026089 -0.00786204]
 [ 0.04546378 -0.03759898]
 [ 0.01077029 -0.0266246 ]
 [ 0.02483525  0.01424494]
 [-0.0546426  -0.04536985]
 [ 0.17367441  0.03448724]]
In [50]:
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('Clustering on the seeds dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

Applying agglomerative clustering via scikit-learn

In [52]:
from sklearn.cluster import AgglomerativeClustering

ac = AgglomerativeClustering(n_clusters=100, affinity='euclidean', linkage='complete')
labels = ac.fit_predict(X)
print('Cluster labels: %s' % labels)
Cluster labels: [ 9 33 33 ..., 47 47 47]

In [53]:
from sklearn.cross_validation import train_test_split
X = df[features]
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.25, random_state=42)

K Means

In [55]:
from sklearn import cluster
clf = cluster.KMeans(init='k-means++', n_clusters=100, random_state=5)
clf.fit(X_train)
print clf.labels_.shape
print clf.labels_
(1200L,)
[41  7 41 ..., 98 70 58]
In [56]:
# Predict clusters on testing data
y_pred = clf.predict(X_test)
In [57]:
from sklearn import metrics
print "Addjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred))
print "Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)) 
print "Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred))
print "Confusion matrix"
print metrics.confusion_matrix(y_test, y_pred)
Addjusted rand score:0.46
Homogeneity score:0.84 
Completeness score: 0.87 
Confusion matrix
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]

Affinity Propogation

In [58]:
# Affinity propagation
aff = cluster.AffinityPropagation()
aff.fit(X_train)
print aff.cluster_centers_indices_.shape
(66L,)
In [59]:
y_pred = aff.predict(X_test)
In [60]:
from sklearn import metrics
print "Addjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred))
print "Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)) 
print "Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred))
print "Confusion matrix"
print metrics.confusion_matrix(y_test, y_pred)
Addjusted rand score:0.37
Homogeneity score:0.77 
Completeness score: 0.87 
Confusion matrix
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]

MeanShift

In [61]:
ms = cluster.MeanShift()
ms.fit(X_train)
Out[61]:
MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)
In [62]:
y_pred = ms.predict(X_test)
In [63]:
from sklearn import metrics
print "Addjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred))
print "Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)) 
print "Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred))
print "Confusion matrix"
print metrics.confusion_matrix(y_test, y_pred)
Addjusted rand score:0.00047
Homogeneity score:0.015 
Completeness score: 1.0 
Confusion matrix
[[1 0 0 ..., 0 0 0]
 [3 0 0 ..., 0 0 0]
 [3 0 0 ..., 0 0 0]
 ..., 
 [3 0 0 ..., 0 0 0]
 [3 0 0 ..., 0 0 0]
 [4 0 0 ..., 0 0 0]]

Mixture of Guassian Models

In [65]:
from sklearn import mixture

# Define a heldout dataset to estimate covariance type
X_train_heldout, X_test_heldout, y_train_heldout, y_test_heldout = train_test_split(
        X_train, y_train,test_size=0.25, random_state=42)
for covariance_type in ['spherical','tied','diag','full']:
    gm=mixture.GMM(n_components=100, covariance_type=covariance_type, random_state=42, n_init=5)
    gm.fit(X_train_heldout)
    y_pred=gm.predict(X_test_heldout)
    print "Adjusted rand score for covariance={}:{:.2}".format(covariance_type, 
                                                               metrics.adjusted_rand_score(y_test_heldout, y_pred))
Adjusted rand score for covariance=spherical:0.2
Adjusted rand score for covariance=tied:0.24
Adjusted rand score for covariance=diag:0.2
Adjusted rand score for covariance=full:0.082

In [67]:
X = df[features].values
y= df['Class'].values
pca = PCA(n_components=2)
X = pca.fit_transform(X)
In [77]:
c = []
from matplotlib.pyplot import cm 
n=100
color=iter(cm.rainbow(np.linspace(0,1,n)))
for i in range(n):
    c.append(next(color))
In [80]:
c[99]
Out[80]:
array([  1.00000000e+00,   1.22464680e-16,   6.12323400e-17,
         1.00000000e+00])
In [94]:
n = 100
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,10))

km = KMeans(n_clusters= n , random_state=0)
y_km = km.fit_predict(X)

for i in range(n):
    ax1.scatter(X[y_km==i,0], X[y_km==i,1], c=c[i], marker='o', s=40, label='cluster{}'.format(i))
ax1.set_title('K-means clustering')

ac = AgglomerativeClustering(n_clusters=100, affinity='euclidean', linkage='complete')
y_ac = ac.fit_predict(X)
for i in range(n):
    ax2.scatter(X[y_ac==i,0], X[y_ac==i,1], c=c[i], marker='o', s=40, label='cluster{}'.format(i))
ax2.set_title('Agglomerative clustering')

# Put a legend below current axis
plt.legend(loc='upper center', bbox_to_anchor=(0, -0.05),
          fancybox=True, shadow=True, ncol=10)
    
plt.tight_layout()
#plt.savefig('./figures/kmeans_and_ac.png', dpi=300)
plt.show()

Classification

In [98]:
import os
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from time import time
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score , classification_report
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report
In [97]:
X = df[features]
y = df['Class']
In [103]:
# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
In [104]:
print X_train.shape, y_train.shape
(960, 64) (960L,)

Decision Tree accuracy and time elapsed caculation

In [105]:
t0=time()
print "DecisionTree"

dt = DecisionTreeClassifier(min_samples_split=20,random_state=99)
# dt = DecisionTreeClassifier(min_samples_split=20,max_depth=5,random_state=99)

clf_dt=dt.fit(X_train,y_train)

print "Acurracy: ", clf_dt.score(X_test,y_test)
t1=time()
print "time elapsed: ", t1-t0
DecisionTree
Acurracy:  0.3890625
time elapsed:  0.18700003624
In [106]:
tt0=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y, cv=5)
print scores
print scores.mean()
tt1=time()
print "time elapsed: ", tt1-tt0
print "\n"
cross result========
[ 0.48        0.45666667  0.42333333  0.42333333  0.43666667]
0.444
time elapsed:  0.757999897003


In [107]:
from sklearn.metrics import classification_report

pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy'))
])

parameters = {
    'clf__max_depth': (5, 25 , 50),
    'clf__min_samples_split': (1, 5, 10),
    'clf__min_samples_leaf': (1, 2, 3)
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'

best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])

predictions = grid_search.predict(X_test)

print classification_report(y_test, predictions)
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   36.8s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   38.7s finished
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best score: 0.372
Best parameters set:
	clf__max_depth: 50
	clf__min_samples_leaf: 1
	clf__min_samples_split: 5
             precision    recall  f1-score   support

          0       0.33      0.44      0.38         9
          1       0.33      0.14      0.20         7
          2       0.83      0.71      0.77         7
          3       0.40      0.20      0.27        10
          4       0.29      0.29      0.29         7
          5       0.80      0.80      0.80         5
          6       0.00      0.00      0.00         4
          7       0.67      0.50      0.57         4
          8       0.17      0.40      0.24         5
          9       1.00      0.25      0.40         4
         10       1.00      1.00      1.00         5
         11       0.71      0.62      0.67         8
         12       0.38      0.43      0.40         7
         13       0.42      0.83      0.56         6
         14       0.50      0.60      0.55         5
         15       0.50      0.33      0.40         9
         16       0.62      0.83      0.71         6
         17       0.50      0.25      0.33         4
         18       1.00      0.33      0.50         6
         19       0.38      0.38      0.38         8
         20       0.60      0.50      0.55         6
         21       0.25      0.50      0.33         4
         22       0.64      1.00      0.78         7
         23       0.00      0.00      0.00         3
         24       0.18      0.75      0.29         4
         25       0.00      0.00      0.00         7
         26       0.75      0.38      0.50         8
         27       0.67      0.18      0.29        11
         28       0.50      0.40      0.44         5
         29       0.83      0.42      0.56        12
         30       0.10      0.12      0.11         8
         31       0.29      0.40      0.33         5
         32       0.43      0.33      0.38         9
         33       0.44      0.78      0.56         9
         34       0.54      1.00      0.70         7
         35       0.55      0.60      0.57        10
         36       0.62      0.89      0.73         9
         37       0.20      0.33      0.25         3
         38       0.50      0.50      0.50         6
         39       0.25      0.50      0.33         2
         40       0.50      0.11      0.18         9
         41       0.10      0.17      0.12         6
         42       0.40      0.57      0.47         7
         43       1.00      0.83      0.91         6
         44       0.25      0.17      0.20         6
         45       0.50      0.43      0.46         7
         46       0.80      0.57      0.67         7
         47       0.20      0.17      0.18         6
         48       0.22      0.50      0.31         4
         49       0.53      1.00      0.70         8
         50       0.22      0.33      0.27         6
         51       0.67      0.57      0.62         7
         52       0.50      1.00      0.67         5
         53       0.25      0.33      0.29         3
         54       0.50      0.25      0.33         4
         55       0.29      0.40      0.33         5
         56       0.10      0.17      0.12         6
         57       0.50      0.29      0.36         7
         58       0.43      0.38      0.40         8
         59       0.56      0.71      0.63         7
         60       0.00      0.00      0.00        10
         61       0.00      0.00      0.00         5
         62       0.90      1.00      0.95         9
         63       0.43      0.50      0.46         6
         64       0.17      0.14      0.15         7
         65       0.33      0.20      0.25         5
         66       0.38      1.00      0.55         3
         67       0.67      0.86      0.75         7
         68       0.14      0.29      0.19         7
         69       0.75      0.43      0.55         7
         70       0.00      0.00      0.00         7
         71       0.25      0.25      0.25         4
         72       0.50      0.67      0.57         3
         73       0.00      0.00      0.00        11
         74       0.71      0.56      0.63         9
         75       0.33      0.17      0.22         6
         76       0.60      0.43      0.50         7
         77       0.40      0.33      0.36         6
         78       0.25      0.20      0.22         5
         79       0.71      0.71      0.71         7
         80       0.50      0.12      0.20         8
         81       0.50      0.40      0.44         5
         82       0.40      0.50      0.44         4
         83       0.33      0.50      0.40         6
         84       0.00      0.00      0.00         8
         85       0.33      0.33      0.33         3
         86       0.57      0.67      0.62         6
         87       0.50      0.29      0.36         7
         88       0.67      0.29      0.40         7
         89       1.00      0.50      0.67         6
         90       0.67      0.67      0.67         9
         91       0.50      0.17      0.25         6
         92       0.00      0.00      0.00         3
         93       0.40      0.67      0.50         6
         94       0.67      0.25      0.36         8
         95       0.50      0.25      0.33         8
         96       0.20      0.33      0.25         6
         97       0.00      0.00      0.00         6
         98       0.00      0.00      0.00         2
         99       1.00      0.30      0.46        10

avg / total       0.46      0.42      0.41       640

C:\Miniconda2\lib\site-packages\sklearn\metrics\classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

Random Forest accuracy and time elapsed caculation

In [110]:
t2=time()
print "RandomForest"
rf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
clf_rf = rf.fit(X_train,y_train)
print "Acurracy: ", clf_rf.score(X_test,y_test)
t3=time()
print "time elapsed: ", t3-t2
RandomForest
Acurracy:  0.79375
time elapsed:  1.11400008202
In [111]:
tt2=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y, cv=5)
print scores
print scores.mean()
tt3=time()
print "time elapsed: ", tt3-tt2
print "\n"
cross result========
[ 0.48        0.45666667  0.42333333  0.42333333  0.43666667]
0.444
time elapsed:  0.690999984741


In [112]:
pipeline2 = Pipeline([
('clf', RandomForestClassifier(criterion='entropy'))
])

parameters = {
    'clf__n_estimators': (5, 25, 50, 100),
    'clf__max_depth': (5, 25 , 50),
    'clf__min_samples_split': (1, 5, 10),
    'clf__min_samples_leaf': (1, 2, 3)
}

grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)

grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_

print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])

predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print classification_report(y_test, predictions)
    
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  2.5min finished
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best score: 0.797
Best parameters set:
	clf__max_depth: 25
	clf__min_samples_leaf: 1
	clf__min_samples_split: 1
	clf__n_estimators: 100
Accuracy: 0.7625
             precision    recall  f1-score   support

          0       0.71      0.56      0.63         9
          1       0.71      0.71      0.71         7
          2       1.00      1.00      1.00         7
          3       1.00      0.70      0.82        10
          4       0.88      1.00      0.93         7
          5       1.00      1.00      1.00         5
          6       0.67      1.00      0.80         4
          7       1.00      1.00      1.00         4
          8       0.71      1.00      0.83         5
          9       0.50      1.00      0.67         4
         10       1.00      1.00      1.00         5
         11       1.00      0.75      0.86         8
         12       0.70      1.00      0.82         7
         13       0.75      1.00      0.86         6
         14       0.80      0.80      0.80         5
         15       1.00      0.56      0.71         9
         16       0.75      1.00      0.86         6
         17       0.50      0.75      0.60         4
         18       0.83      0.83      0.83         6
         19       0.86      0.75      0.80         8
         20       0.86      1.00      0.92         6
         21       1.00      0.75      0.86         4
         22       0.78      1.00      0.88         7
         23       0.12      0.67      0.21         3
         24       0.36      1.00      0.53         4
         25       0.33      0.14      0.20         7
         26       1.00      0.75      0.86         8
         27       1.00      0.64      0.78        11
         28       0.62      1.00      0.77         5
         29       1.00      0.50      0.67        12
         30       0.33      0.25      0.29         8
         31       1.00      0.60      0.75         5
         32       1.00      0.89      0.94         9
         33       1.00      0.56      0.71         9
         34       0.88      1.00      0.93         7
         35       1.00      1.00      1.00        10
         36       1.00      1.00      1.00         9
         37       0.75      1.00      0.86         3
         38       0.50      0.67      0.57         6
         39       0.25      0.50      0.33         2
         40       0.80      0.44      0.57         9
         41       0.38      0.50      0.43         6
         42       1.00      0.86      0.92         7
         43       0.86      1.00      0.92         6
         44       1.00      0.67      0.80         6
         45       0.88      1.00      0.93         7
         46       1.00      0.86      0.92         7
         47       0.86      1.00      0.92         6
         48       0.67      1.00      0.80         4
         49       0.78      0.88      0.82         8
         50       0.86      1.00      0.92         6
         51       1.00      0.71      0.83         7
         52       0.83      1.00      0.91         5
         53       0.50      1.00      0.67         3
         54       0.40      1.00      0.57         4
         55       1.00      0.80      0.89         5
         56       0.67      0.33      0.44         6
         57       0.83      0.71      0.77         7
         58       0.50      0.12      0.20         8
         59       0.70      1.00      0.82         7
         60       1.00      0.30      0.46        10
         61       0.50      0.80      0.62         5
         62       1.00      1.00      1.00         9
         63       0.86      1.00      0.92         6
         64       1.00      0.43      0.60         7
         65       0.80      0.80      0.80         5
         66       0.60      1.00      0.75         3
         67       1.00      1.00      1.00         7
         68       0.38      0.71      0.50         7
         69       0.80      0.57      0.67         7
         70       0.60      0.43      0.50         7
         71       0.57      1.00      0.73         4
         72       0.60      1.00      0.75         3
         73       1.00      0.09      0.17        11
         74       1.00      0.89      0.94         9
         75       1.00      0.33      0.50         6
         76       1.00      0.86      0.92         7
         77       0.86      1.00      0.92         6
         78       0.25      0.20      0.22         5
         79       1.00      1.00      1.00         7
         80       1.00      0.50      0.67         8
         81       0.71      1.00      0.83         5
         82       0.57      1.00      0.73         4
         83       0.46      1.00      0.63         6
         84       1.00      0.75      0.86         8
         85       0.75      1.00      0.86         3
         86       1.00      1.00      1.00         6
         87       1.00      0.71      0.83         7
         88       1.00      1.00      1.00         7
         89       0.75      1.00      0.86         6
         90       0.90      1.00      0.95         9
         91       1.00      0.83      0.91         6
         92       0.40      0.67      0.50         3
         93       0.71      0.83      0.77         6
         94       0.71      0.62      0.67         8
         95       1.00      0.62      0.77         8
         96       0.67      0.33      0.44         6
         97       0.67      0.33      0.44         6
         98       0.67      1.00      0.80         2
         99       1.00      1.00      1.00        10

avg / total       0.82      0.76      0.76       640

Naive Bayes accuracy and time elapsed caculation

In [113]:
t4=time()
print "NaiveBayes"
nb = BernoulliNB()
clf_nb=nb.fit(X_train,y_train)
print "Acurracy: ", clf_nb.score(X_test,y_test)
t5=time()
print "time elapsed: ", t5-t4
NaiveBayes
Acurracy:  0.36875
time elapsed:  0.119999885559
In [114]:
tt4=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y, cv=5)
print scores
print scores.mean()
tt5=time()
print "time elapsed: ", tt5-tt4
print "\n"
cross result========
[ 0.48        0.45666667  0.42333333  0.42333333  0.43666667]
0.444
time elapsed:  0.680999994278


KNN accuracy and time elapsed caculation

In [115]:
t6=time()
print "KNN"
# knn = KNeighborsClassifier(n_neighbors=3)
knn = KNeighborsClassifier()
clf_knn=knn.fit(X_train, y_train)
print "Acurracy: ", clf_knn.score(X_test,y_test) 
t7=time()
print "time elapsed: ", t7-t6
KNN
Acurracy:  0.71875
time elapsed:  0.184000015259
In [116]:
tt6=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y, cv=5)
print scores
print scores.mean()
tt7=time()
print "time elapsed: ", tt7-tt6
print "\n"
cross result========
[ 0.48        0.45666667  0.42333333  0.42333333  0.43666667]
0.444
time elapsed:  0.661000013351


SVM accuracy and time elapsed caculation

In [119]:
t7=time()
print "SVM"

svc = SVC()
clf_svc=svc.fit(X_train, y_train)
print "Acurracy: ", clf_svc.score(X_test,y_test) 
t8=time()
print "time elapsed: ", t8-t7
SVM
Acurracy:  0.00625
time elapsed:  0.776000022888
In [120]:
tt7=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X,y, cv=5)
print scores
print scores.mean()
tt8=time()
print "time elapsed: ", tt7-tt6
print "\n"
cross result========
[ 0.48        0.45666667  0.42333333  0.42333333  0.43666667]
0.444
time elapsed:  10.0399999619


In [121]:
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search

svc = SVC()

parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

grid = grid_search.GridSearchCV(svc, parameters, n_jobs=-1, verbose=1, scoring='accuracy')


grid.fit(X_train, y_train)

print 'Best score: %0.3f' % grid.best_score_

print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best score: 0.231
Best parameters set:
	C: 10
	kernel: 'linear'
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         9
          1       0.00      0.00      0.00         7
          2       0.00      0.00      0.00         7
          3       0.00      0.00      0.00        10
          4       0.00      0.00      0.00         7
          5       1.00      1.00      1.00         5
          6       0.40      1.00      0.57         4
          7       0.36      1.00      0.53         4
          8       0.00      0.00      0.00         5
          9       0.04      0.25      0.07         4
         10       1.00      1.00      1.00         5
         11       0.00      0.00      0.00         8
         12       0.00      0.00      0.00         7
         13       0.00      0.00      0.00         6
         14       1.00      0.40      0.57         5
         15       0.00      0.00      0.00         9
         16       1.00      0.83      0.91         6
         17       0.14      0.75      0.24         4
         18       0.00      0.00      0.00         6
         19       0.00      0.00      0.00         8
         20       0.00      0.00      0.00         6
         21       0.00      0.00      0.00         4
         22       0.00      0.00      0.00         7
         23       0.04      1.00      0.08         3
         24       0.00      0.00      0.00         4
         25       0.00      0.00      0.00         7
         26       0.00      0.00      0.00         8
         27       0.00      0.00      0.00        11
         28       0.00      0.00      0.00         5
         29       0.00      0.00      0.00        12
         30       0.00      0.00      0.00         8
         31       0.00      0.00      0.00         5
         32       0.00      0.00      0.00         9
         33       0.00      0.00      0.00         9
         34       1.00      0.57      0.73         7
         35       0.00      0.00      0.00        10
         36       1.00      0.89      0.94         9
         37       0.20      1.00      0.33         3
         38       0.00      0.00      0.00         6
         39       0.13      1.00      0.24         2
         40       0.00      0.00      0.00         9
         41       0.00      0.00      0.00         6
         42       0.00      0.00      0.00         7
         43       0.46      1.00      0.63         6
         44       0.00      0.00      0.00         6
         45       0.00      0.00      0.00         7
         46       1.00      0.71      0.83         7
         47       0.00      0.00      0.00         6
         48       0.30      0.75      0.43         4
         49       0.00      0.00      0.00         8
         50       0.00      0.00      0.00         6
         51       0.00      0.00      0.00         7
         52       0.00      0.00      0.00         5
         53       0.05      1.00      0.09         3
         54       0.17      1.00      0.29         4
         55       0.00      0.00      0.00         5
         56       0.00      0.00      0.00         6
         57       0.00      0.00      0.00         7
         58       0.00      0.00      0.00         8
         59       0.00      0.00      0.00         7
         60       0.00      0.00      0.00        10
         61       0.25      0.20      0.22         5
         62       0.00      0.00      0.00         9
         63       0.00      0.00      0.00         6
         64       0.00      0.00      0.00         7
         65       0.33      0.80      0.47         5
         66       0.18      1.00      0.30         3
         67       0.00      0.00      0.00         7
         68       0.00      0.00      0.00         7
         69       0.00      0.00      0.00         7
         70       0.00      0.00      0.00         7
         71       0.11      0.75      0.19         4
         72       0.16      1.00      0.27         3
         73       0.00      0.00      0.00        11
         74       0.00      0.00      0.00         9
         75       0.00      0.00      0.00         6
         76       0.00      0.00      0.00         7
         77       0.46      1.00      0.63         6
         78       0.00      0.00      0.00         5
         79       1.00      0.14      0.25         7
         80       0.00      0.00      0.00         8
         81       0.40      0.80      0.53         5
         82       0.06      0.75      0.12         4
         83       0.00      0.00      0.00         6
         84       0.00      0.00      0.00         8
         85       0.14      1.00      0.25         3
         86       1.00      0.50      0.67         6
         87       0.00      0.00      0.00         7
         88       0.00      0.00      0.00         7
         89       0.00      0.00      0.00         6
         90       0.00      0.00      0.00         9
         91       0.00      0.00      0.00         6
         92       0.04      1.00      0.07         3
         93       0.00      0.00      0.00         6
         94       0.00      0.00      0.00         8
         95       0.00      0.00      0.00         8
         96       0.00      0.00      0.00         6
         97       0.00      0.00      0.00         6
         98       0.02      0.50      0.04         2
         99       0.00      0.00      0.00        10

avg / total       0.12      0.17      0.11       640

[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   18.5s finished
In [122]:
pipeline = Pipeline([
    ('clf', SVC(kernel='rbf', gamma=0.01, C=100))
])

parameters = {
    'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
    'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')

grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_

print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid_search.predict(X_test)
print classification_report(y_test, predictions)
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   31.1s finished
Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best score: 0.698
Best parameters set:
	clf__C: 30
	clf__gamma: 1
             precision    recall  f1-score   support

          0       0.83      0.56      0.67         9
          1       0.88      1.00      0.93         7
          2       1.00      1.00      1.00         7
          3       1.00      0.20      0.33        10
          4       1.00      1.00      1.00         7
          5       1.00      1.00      1.00         5
          6       0.67      1.00      0.80         4
          7       1.00      1.00      1.00         4
          8       0.71      1.00      0.83         5
          9       0.57      1.00      0.73         4
         10       1.00      1.00      1.00         5
         11       1.00      0.75      0.86         8
         12       0.43      0.86      0.57         7
         13       1.00      1.00      1.00         6
         14       1.00      1.00      1.00         5
         15       0.00      0.00      0.00         9
         16       1.00      1.00      1.00         6
         17       0.60      0.75      0.67         4
         18       0.86      1.00      0.92         6
         19       0.83      0.62      0.71         8
         20       1.00      1.00      1.00         6
         21       1.00      1.00      1.00         4
         22       1.00      1.00      1.00         7
         23       0.16      1.00      0.27         3
         24       0.12      0.75      0.20         4
         25       1.00      0.29      0.44         7
         26       0.50      0.12      0.20         8
         27       1.00      0.73      0.84        11
         28       0.38      0.60      0.46         5
         29       1.00      0.08      0.15        12
         30       0.00      0.00      0.00         8
         31       0.30      0.60      0.40         5
         32       0.89      0.89      0.89         9
         33       1.00      0.89      0.94         9
         34       1.00      1.00      1.00         7
         35       1.00      0.90      0.95        10
         36       1.00      0.89      0.94         9
         37       0.75      1.00      0.86         3
         38       0.67      0.67      0.67         6
         39       0.20      0.50      0.29         2
         40       0.00      0.00      0.00         9
         41       1.00      0.17      0.29         6
         42       0.88      1.00      0.93         7
         43       1.00      1.00      1.00         6
         44       0.57      0.67      0.62         6
         45       0.88      1.00      0.93         7
         46       1.00      1.00      1.00         7
         47       0.75      1.00      0.86         6
         48       1.00      1.00      1.00         4
         49       1.00      1.00      1.00         8
         50       0.75      0.50      0.60         6
         51       1.00      0.71      0.83         7
         52       1.00      0.80      0.89         5
         53       0.60      1.00      0.75         3
         54       0.67      1.00      0.80         4
         55       1.00      1.00      1.00         5
         56       1.00      0.67      0.80         6
         57       0.75      0.86      0.80         7
         58       0.25      0.25      0.25         8
         59       0.88      1.00      0.93         7
         60       0.00      0.00      0.00        10
         61       0.50      0.80      0.62         5
         62       0.69      1.00      0.82         9
         63       1.00      1.00      1.00         6
         64       1.00      1.00      1.00         7
         65       0.80      0.80      0.80         5
         66       0.75      1.00      0.86         3
         67       0.40      0.29      0.33         7
         68       0.40      0.29      0.33         7
         69       0.83      0.71      0.77         7
         70       0.50      0.43      0.46         7
         71       0.60      0.75      0.67         4
         72       1.00      0.67      0.80         3
         73       0.00      0.00      0.00        11
         74       0.89      0.89      0.89         9
         75       1.00      0.50      0.67         6
         76       0.86      0.86      0.86         7
         77       0.86      1.00      0.92         6
         78       0.36      1.00      0.53         5
         79       1.00      1.00      1.00         7
         80       1.00      0.50      0.67         8
         81       0.83      1.00      0.91         5
         82       0.67      1.00      0.80         4
         83       0.40      1.00      0.57         6
         84       1.00      1.00      1.00         8
         85       0.75      1.00      0.86         3
         86       1.00      1.00      1.00         6
         87       0.86      0.86      0.86         7
         88       1.00      1.00      1.00         7
         89       0.42      0.83      0.56         6
         90       0.80      0.89      0.84         9
         91       0.83      0.83      0.83         6
         92       0.33      1.00      0.50         3
         93       0.75      1.00      0.86         6
         94       0.62      0.62      0.62         8
         95       0.00      0.00      0.00         8
         96       0.67      0.67      0.67         6
         97       1.00      0.17      0.29         6
         98       0.25      0.50      0.33         2
         99       1.00      1.00      1.00        10

avg / total       0.75      0.72      0.70       640

Ensemble Learning

In [123]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator


class MajorityVoteClassifier(BaseEstimator, 
                             ClassifierMixin):
    """ A majority vote ensemble classifier

    Parameters
    ----------
    classifiers : array-like, shape = [n_classifiers]
      Different classifiers for the ensemble

    vote : str, {'classlabel', 'probability'} (default='label')
      If 'classlabel' the prediction is based on the argmax of
        class labels. Else if 'probability', the argmax of
        the sum of probabilities is used to predict the class label
        (recommended for calibrated classifiers).

    weights : array-like, shape = [n_classifiers], optional (default=None)
      If a list of `int` or `float` values are provided, the classifiers
      are weighted by importance; Uses uniform weights if `weights=None`.

    """
    def __init__(self, classifiers, vote='classlabel', weights=None):

        self.classifiers = classifiers
        self.named_classifiers = {key: value for key, value
                                  in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        """ Fit classifiers.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of training samples.

        y : array-like, shape = [n_samples]
            Vector of target class labels.

        Returns
        -------
        self : object

        """
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel'"
                             "; got (vote=%r)"
                             % self.vote)

        if self.weights and len(self.weights) != len(self.classifiers):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d classifiers'
                             % (len(self.weights), len(self.classifiers)))

        # Use LabelEncoder to ensure class labels start with 0, which
        # is important for np.argmax call in self.predict
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))
            self.classifiers_.append(fitted_clf)
        return self

    def predict(self, X):
        """ Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of training samples.

        Returns
        ----------
        maj_vote : array-like, shape = [n_samples]
            Predicted class labels.
            
        """
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'classlabel' vote

            #  Collect results from clf.predict calls
            predictions = np.asarray([clf.predict(X)
                                      for clf in self.classifiers_]).T

            maj_vote = np.apply_along_axis(
                                      lambda x:
                                      np.argmax(np.bincount(x,
                                                weights=self.weights)),
                                      axis=1,
                                      arr=predictions)
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote

    def predict_proba(self, X):
        """ Predict class probabilities for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        ----------
        avg_proba : array-like, shape = [n_samples, n_classes]
            Weighted average probability for each class per sample.

        """
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_])
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba

    def get_params(self, deep=True):
        """ Get classifier parameter names for GridSearch"""
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)
        else:
            out = self.named_classifiers.copy()
            for name, step in six.iteritems(self.named_classifiers):
                for key, value in six.iteritems(step.get_params(deep=True)):
                    out['%s__%s' % (name, key)] = value
            return out
In [ ]:
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import Pipeline
import numpy as np

clf1 = LogisticRegression(penalty='l2', 
                          C=0.001, 
                          random_state=0)

clf2 = DecisionTreeClassifier(max_depth=1, 
                              criterion='entropy', 
                              random_state=0)

clf3 = KNeighborsClassifier(n_neighbors=1, 
                            p=2, 
                            metric='minkowski')

pipe1 = Pipeline([['sc', StandardScaler()],
                  ['clf', clf1]])
pipe3 = Pipeline([['sc', StandardScaler()],
                  ['clf', clf3]])

clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN']

print('10-fold cross validation:\n')
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    scores = cross_val_score(estimator=clf, 
                             X=X_train, 
                             y=y_train, 
                             cv=10, 
                             scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]" 
               % (scores.mean(), scores.std(), label))
In [ ]:
# Majority Rule (hard) Voting

mv_clf = MajorityVoteClassifier(
                classifiers=[pipe1, clf2, pipe3])

clf_labels += ['Majority Voting']
all_clf = [pipe1, clf2, pipe3, mv_clf]

for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf, 
                             X=X_train, 
                             y=y_train, 
                             cv=10, 
                             scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]" 
               % (scores.mean(), scores.std(), label))
In [ ]:
mv_clf.get_params()
In [ ]:
from sklearn.grid_search import GridSearchCV

params = {'decisiontreeclassifier__max_depth': [1, 2],
          'pipeline-1__clf__C': [0.001, 0.1, 100.0]}

grid = GridSearchCV(estimator=mv_clf, 
                    param_grid=params, 
                    cv=10, 
                    scoring='roc_auc')
grid.fit(X_train, y_train)

for params, mean_score, scores in grid.grid_scores_:
    print("%0.3f+/-%0.2f %r"
            % (mean_score, scores.std() / 2, params))
In [ ]:
print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)

Bagging -- Building an ensemble of classifiers from bootstrap samples

In [ ]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion='entropy', 
                              max_depth=None)

bag = BaggingClassifier(base_estimator=tree,
                        n_estimators=500, 
                        max_samples=1.0, 
                        max_features=1.0, 
                        bootstrap=True, 
                        bootstrap_features=False, 
                        n_jobs=1, 
                        random_state=1)

from sklearn.metrics import accuracy_score

tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))

bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)

bag_train = accuracy_score(y_train, y_train_pred) 
bag_test = accuracy_score(y_test, y_test_pred) 
print('Bagging train/test accuracies %.3f/%.3f'
      % (bag_train, bag_test))

Leveraging weak learners via adaptive boosting

In [ ]:
from sklearn.ensemble import AdaBoostClassifier

tree = DecisionTreeClassifier(criterion='entropy', 
                              max_depth=1)

ada = AdaBoostClassifier(base_estimator=tree,
                         n_estimators=500, 
                         learning_rate=0.1,
                         random_state=0)

tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))

ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)

ada_train = accuracy_score(y_train, y_train_pred) 
ada_test = accuracy_score(y_test, y_test_pred) 
print('AdaBoost train/test accuracies %.3f/%.3f'
      % (ada_train, ada_test))