Seeds Clustering

https://archive.ics.uci.edu/ml/datasets/seeds

To construct the data, seven geometric parameters of wheat kernels were measured:

  1. area A,
  2. perimeter P,
  3. compactness C = 4πA/P²,
  4. length of kernel,
  5. width of kernel,
  6. asymmetry coefficient,
  7. length of kernel groove.

The Class column identifies each kernel as one of three wheat varieties (Kama, Rosa or Canadian).

All seven parameters are real-valued and continuous.
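
As a sanity check on the formula, compactness can be recomputed from the area and perimeter of the first kernel shown in the preview below (a minimal sketch):

import math

# First kernel in the dataset: A = 15.26, P = 14.84, recorded C = 0.8710
A, P = 15.26, 14.84
C = 4 * math.pi * A / P ** 2
print("C = %.4f" % C)   # about 0.8710, matching the recorded compactness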

In [26]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn import preprocessing
import matplotlib.pyplot as plt
In [7]:
cols = ['Area', 'Perimeter','Compactness','Kernel_Length','Kernel_Width','Assymetry_Coefficient','Kernel_Groove_Length', 'Class']
In [10]:
# read .csv from provided dataset
csv_filename="seeds_dataset.txt"

# df=pd.read_csv(csv_filename,index_col=0)
df=pd.read_csv(csv_filename,delim_whitespace=True,names=cols)
In [12]:
df.head()
Out[12]:
Area Perimeter Compactness Kernel_Length Kernel_Width Assymetry_Coefficient Kernel_Groove_Length Class
0 15.26 14.84 0.8710 5.763 3.312 2.221 5.220 1
1 14.88 14.57 0.8811 5.554 3.333 1.018 4.956 1
2 14.29 14.09 0.9050 5.291 3.337 2.699 4.825 1
3 13.84 13.94 0.8955 5.324 3.379 2.259 4.805 1
4 16.14 14.99 0.9034 5.658 3.562 1.355 5.175 1
In [17]:
features = df.columns[:-1]
features
Out[17]:
Index([u'Area', u'Perimeter', u'Compactness', u'Kernel_Length',
       u'Kernel_Width', u'Assymetry_Coefficient', u'Kernel_Groove_Length'],
      dtype='object')
In [108]:
X = df[features]
y = df['Class']
In [109]:
X.head()
Out[109]:
Area Perimeter Compactness Kernel_Length Kernel_Width Assymetry_Coefficient Kernel_Groove_Length
0 15.26 14.84 0.8710 5.763 3.312 2.221 5.220
1 14.88 14.57 0.8811 5.554 3.333 1.018 4.956
2 14.29 14.09 0.9050 5.291 3.337 2.699 4.825
3 13.84 13.94 0.8955 5.324 3.379 2.259 4.805
4 16.14 14.99 0.9034 5.658 3.562 1.355 5.175
In [110]:
# split the dataset into 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
In [111]:
print X_train.shape, y_train.shape
(126, 7) (126L,)

Unsupervised Learning

PCA

In [23]:
y.unique()
Out[23]:
array([1, 2, 3], dtype=int64)
In [43]:
len(features)
Out[43]:
7
In [44]:
# Apply PCA with the same number of dimensions as variables in the dataset
from sklearn.decomposition import PCA
pca = PCA(n_components=7)
pca.fit(X)

# Print the components and the amount of variance in the data contained in each dimension
print(pca.components_)
print(pca.explained_variance_ratio_)
[[-0.8842285  -0.39540542 -0.00431132 -0.12854448 -0.11105914  0.12761562
  -0.1289665 ]
 [ 0.10080577  0.05648963 -0.00289474  0.03062173  0.00237229  0.98941048
   0.08223339]
 [ 0.26453354 -0.28251995  0.05903584 -0.40014946  0.31923869  0.06429754
  -0.76193973]
 [ 0.19944949 -0.57881686  0.05776023 -0.43610024  0.23416358 -0.02514736
   0.61335659]
 [-0.13717297  0.57475603 -0.05310454 -0.78699776 -0.1448029  -0.00157564
   0.08765361]
 [ 0.28063956 -0.30155864 -0.04522905 -0.11343761 -0.89626785  0.003288
  -0.10992364]
 [-0.02539824  0.0658399   0.99412565  0.00143143 -0.0815499   0.00114269
   0.00897193]]
[  8.29385197e-01   1.63632452e-01   5.65790880e-03   9.90306086e-04
   2.11180347e-04   1.20677139e-04   2.27879552e-06]
In [45]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(list(pca.explained_variance_ratio_),'-o')
plt.title('Explained variance ratio as function of PCA components')
plt.ylabel('Explained variance ratio')
plt.xlabel('Component')
plt.show()
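
The first two components account for roughly 83% and 16% of the variance, so two dimensions retain almost all of the structure. A short sketch printing the cumulative ratio, reusing the fitted pca object from the cell above:

cumulative = np.cumsum(pca.explained_variance_ratio_)
for k in range(len(cumulative)):
    print("first %d component(s): %.4f of the variance" % (k + 1, cumulative[k]))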

Applying PCA to visualize high-dimensional data

In [93]:
X = df[features].values
y = df['Class'].values
pca = PCA(n_components=2)
reduced_X = pca.fit_transform(X)
In [95]:
red_x, red_y = [], []
blue_x, blue_y = [], []
green_x, green_y = [], []

for i in range(len(reduced_X)):
    if y[i] == 1:
        red_x.append(reduced_X[i][0])
        red_y.append(reduced_X[i][1])
    elif y[i] == 2:
        blue_x.append(reduced_X[i][0])
        blue_y.append(reduced_X[i][1])
    else:
        green_x.append(reduced_X[i][0])
        green_y.append(reduced_X[i][1])

plt.scatter(red_x, red_y, c='r', marker='x')
plt.scatter(blue_x, blue_y, c='b', marker='D')
plt.scatter(green_x, green_y, c='g', marker='.')
plt.show()
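
The per-class lists above make the colour assignment explicit; an equivalent, more compact version passes the class labels directly as the colour argument (a sketch reusing reduced_X and y from the cells above):

plt.scatter(reduced_X[:, 0], reduced_X[:, 1], c=y, cmap=plt.cm.brg, marker='o')
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.show()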

Clustering

In [50]:
# Import clustering modules
from sklearn.cluster import KMeans
from sklearn.mixture import GMM
In [51]:
# First we reduce the data to two dimensions using PCA to capture variation
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(X)
print(reduced_data[:10])  # print the first 10 rows
[[-0.66344838 -1.41732098]
 [-0.31566651 -2.68922915]
 [ 0.6604993  -1.13150635]
 [ 1.0552759  -1.62119002]
 [-1.61999921 -2.18338442]
 [ 0.47693801 -1.33649437]
 [ 0.18483472 -0.15036441]
 [ 0.78062962 -1.12979883]
 [-2.2821081  -1.3600169 ]
 [-1.97854147 -1.49468793]]
In [52]:
kmeans = KMeans(n_clusters=3)
clusters = kmeans.fit(reduced_data)
print(clusters)
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
In [53]:
# Plot the decision boundary by building a mesh grid to populate a graph.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
hx = (x_max-x_min)/1000.
hy = (y_max-y_min)/1000.
xx, yy = np.meshgrid(np.arange(x_min, x_max, hx), np.arange(y_min, y_max, hy))

# Obtain labels for each point in mesh. Use last trained model.
Z = clusters.predict(np.c_[xx.ravel(), yy.ravel()])
In [54]:
# Find the centroids for KMeans or the cluster means for GMM 

centroids = kmeans.cluster_centers_
print('*** K MEANS CENTROIDS ***')
print(centroids)

# Transform the centroids back to the original feature space
print('*** CENTROIDS TRANSFORMED TO ORIGINAL SPACE ***')
print(pca.inverse_transform(centroids))
*** K MEANS CENTROIDS ***
[[ 0.11491258 -1.08548863]
 [-4.33639856  0.46609094]
 [ 3.32787801  0.64576148]]
*** CENTROIDS TRANSFORMED TO ORIGINAL SPACE ***
[[ 14.63649131  14.45252981   0.87364536   5.58052241   3.24326757
    2.64087177   5.30398814]
 [ 18.72887568  16.3002505    0.88834498   6.20022594   3.74130716
    3.607964     6.00564981]
 [ 11.9700157   13.27990355   0.8547817    5.22052733   2.89054543
    4.76381335   5.03198981]]
In [56]:
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('Clustering on the seeds dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

Elbow Method

In [60]:
distortions = []
for i in range(1, 11):
    km = KMeans(n_clusters=i, 
                init='k-means++', 
                n_init=10, 
                max_iter=300, 
                random_state=0)
    km.fit(X)
    distortions.append(km.inertia_)
plt.plot(range(1, 11), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.tight_layout()
#plt.savefig('./figures/elbow.png', dpi=300)
plt.show()
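
To read the elbow off numerically rather than by eye, the relative drop in distortion between successive values of k can be printed (a sketch reusing the distortions list from the cell above); if the drop becomes small after k = 3, that agrees with the three known varieties.

for k in range(1, 10):
    drop = (distortions[k - 1] - distortions[k]) / distortions[k - 1]
    print("k=%d -> k=%d: %.1f%% drop in distortion" % (k, k + 1, 100 * drop))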

Quantifying the quality of clustering via silhouette plots

In [61]:
import numpy as np
from matplotlib import cm
from sklearn.metrics import silhouette_samples

km = KMeans(n_clusters=3, 
            init='k-means++', 
            n_init=10, 
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(float(i) / n_clusters)  # float division so each cluster gets a distinct colour
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, 
            edgecolor='none', color=color)

    yticks.append((y_ax_lower + y_ax_upper) / 2)
    y_ax_lower += len(c_silhouette_vals)
    
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--") 

plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')

plt.tight_layout()
# plt.savefig('./figures/silhouette.png', dpi=300)
plt.show()

Judging by the silhouette plot, the clustering with 3 centroids looks reasonable.
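
To back that up numerically, the average silhouette coefficient can be compared across candidate cluster counts (a sketch reusing X and KMeans from above); a higher average indicates better-separated clusters.

from sklearn.metrics import silhouette_score

for k in range(2, 7):
    labels_k = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=0).fit_predict(X)
    print("k=%d: average silhouette = %.3f" % (k, silhouette_score(X, labels_k, metric='euclidean')))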

Applying agglomerative clustering via scikit-learn

In [76]:
from sklearn.cluster import AgglomerativeClustering

ac = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='complete')
labels = ac.fit_predict(X)
print('Cluster labels: %s' % labels)
Cluster labels: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 0 2 2 2 0 2 2 0 0 2 2 0 2 2 2 2 2 2
 2 2 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 2 2 2 0 2 2 2 1
 2 2 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1
 1 1 1 1 1 1 1 1 1 1 2 2 1 2 1 1 1 1 2 1 1 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
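
Since the true varieties are known, the agglomerative labels can be scored against them just like the other models (a sketch reusing labels from the cell above and y from the PCA section; the adjusted Rand index ignores the arbitrary cluster numbering):

print("Adjusted rand score: %.2f" % metrics.adjusted_rand_score(y, labels))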

In [78]:
from sklearn.cross_validation import train_test_split
X = df[features]
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.25, random_state=42)

K Means

In [79]:
from sklearn import cluster
clf = cluster.KMeans(init='k-means++', n_clusters=3, random_state=5)
clf.fit(X_train)
print clf.labels_.shape
print clf.labels_
(157L,)
[0 1 0 0 0 0 0 1 0 2 2 2 1 1 0 0 0 0 0 2 2 1 0 2 1 2 0 1 2 0 0 1 1 0 0 2 0
 2 2 2 2 1 0 2 1 0 0 2 0 0 1 1 0 0 1 0 1 0 0 2 1 2 1 2 0 1 1 1 1 0 1 1 2 1
 0 0 0 0 2 0 1 1 2 1 1 0 2 1 0 1 0 2 0 0 1 1 1 0 2 0 0 2 2 2 1 1 2 0 0 0 1
 2 0 1 2 1 0 1 1 1 0 2 0 1 1 1 0 0 2 0 0 1 1 2 2 1 1 0 0 1 2 1 2 2 2 2 2 2
 1 0 1 2 2 0 2 0 2]
In [80]:
# Predict clusters on testing data
y_pred = clf.predict(X_test)
In [81]:
from sklearn import metrics
print "Addjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred))
print "Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)) 
print "Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred))
print "Confusion matrix"
print metrics.confusion_matrix(y_test, y_pred)
Adjusted rand score:0.71
Homogeneity score:0.69 
Completeness score: 0.69 
Confusion matrix
[[ 0  0  0  0]
 [11  3  0  0]
 [ 2  0 16  0]
 [ 1 20  0  0]]
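
Note that the matrix is 4x4 because the true classes are numbered 1-3 while KMeans numbers its clusters 0-2, so four distinct label values appear; only the adjusted Rand, homogeneity and completeness scores are permutation-invariant. A sketch that first maps each cluster to its majority class in the training data gives a more readable matrix (the mapping step is one possible way to align the labels, not part of the original workflow):

# Map each KMeans cluster id to the majority true class among training points
train_pred = clf.predict(X_train)
yt = np.asarray(y_train)
mapping = dict((c, np.bincount(yt[train_pred == c]).argmax()) for c in np.unique(train_pred))

y_pred_mapped = np.array([mapping[c] for c in y_pred])
print(metrics.confusion_matrix(y_test, y_pred_mapped))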

Affinity Propagation

In [82]:
# Affinity propagation
aff = cluster.AffinityPropagation()
aff.fit(X_train)
print aff.cluster_centers_indices_.shape
(10L,)
In [83]:
y_pred = aff.predict(X_test)
In [84]:
from sklearn import metrics
print "Addjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred))
print "Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)) 
print "Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred))
print "Confusion matrix"
print metrics.confusion_matrix(y_test, y_pred)
Adjusted rand score:0.31
Homogeneity score:0.84 
Completeness score: 0.42 
Confusion matrix
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 2 4 0 0 1 0 7 0]
 [5 7 1 0 0 1 0 0 0 4]
 [0 0 0 2 3 0 9 7 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]
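
Affinity propagation found 10 exemplars, far more than the three varieties, which is why homogeneity is high but completeness is low. It does not take the number of clusters directly; the damping and preference parameters control how many exemplars emerge, as in this sketch (the preference value is only an illustrative guess):

aff2 = cluster.AffinityPropagation(damping=0.9, preference=-50)
aff2.fit(X_train)
print("clusters found: %d" % len(aff2.cluster_centers_indices_))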

MeanShift

In [86]:
ms = cluster.MeanShift()
ms.fit(X_train)
print ms.cluster_centers_
[[ 12.90285714  13.6938961    0.86236364   5.35018182   3.01996104
    3.46636364   5.0844026 ]
 [ 17.32053571  15.67982143   0.88386429   5.98578571   3.57646429
    3.34401786   5.801875  ]]
In [87]:
y_pred = ms.predict(X_test)
In [89]:
from sklearn import metrics
print "Addjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred))
print "Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)) 
print "Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred))
print "Confusion matrix"
print metrics.confusion_matrix(y_test, y_pred)
Adjusted rand score:0.55
Homogeneity score:0.51 
Completeness score: 0.84 
Confusion matrix
[[ 0  0  0  0]
 [12  2  0  0]
 [ 0 18  0  0]
 [21  0  0  0]]
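
MeanShift found only two modes, so two of the varieties end up merged. Its cluster count is governed by the kernel bandwidth, which can be tuned via estimate_bandwidth (a sketch; the quantile value is an assumption chosen only to illustrate the knob):

from sklearn.cluster import estimate_bandwidth

bw = estimate_bandwidth(np.asarray(X_train), quantile=0.2)
ms2 = cluster.MeanShift(bandwidth=bw)
ms2.fit(X_train)
print("bandwidth: %.3f, clusters found: %d" % (bw, len(ms2.cluster_centers_)))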

Mixture of Gaussian Models

In [112]:
from sklearn import mixture

# Hold out part of the training data to choose the best covariance type
X_train_heldout, X_test_heldout, y_train_heldout, y_test_heldout = train_test_split(
        X_train, y_train,test_size=0.25, random_state=42)
for covariance_type in ['spherical','tied','diag','full']:
    gm=mixture.GMM(n_components=3, covariance_type=covariance_type, random_state=42, n_init=5)
    gm.fit(X_train_heldout)
    y_pred=gm.predict(X_test_heldout)
    print "Adjusted rand score for covariance={}:{:.2}".format(covariance_type, 
                                                               metrics.adjusted_rand_score(y_test_heldout, y_pred))
Adjusted rand score for covariance=spherical:0.89
Adjusted rand score for covariance=tied:0.71
Adjusted rand score for covariance=diag:0.64
Adjusted rand score for covariance=full:0.8
In [113]:
gm = mixture.GMM(n_components=3, covariance_type='tied', random_state=42)
gm.fit(X_train)
Out[113]:
GMM(covariance_type='tied', init_params='wmc', min_covar=0.001,
  n_components=3, n_init=1, n_iter=100, params='wmc', random_state=42,
  thresh=None, tol=0.001, verbose=0)
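
In scikit-learn 0.18 and later, GMM was replaced by GaussianMixture; the equivalent fit would look roughly like this (a sketch for newer versions, so exact scores may differ from the outputs below):

from sklearn.mixture import GaussianMixture

gm_new = GaussianMixture(n_components=3, covariance_type='tied', random_state=42)
gm_new.fit(X_train)
print("Adjusted rand score: %.2f" % metrics.adjusted_rand_score(y_test, gm_new.predict(X_test)))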
In [114]:
# Evaluate the clustering on the test data and print the confusion matrix
y_pred = gm.predict(X_test)
print "Adjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred))
print "Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)) 
print "Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred))

print "Confusion matrix"
print metrics.confusion_matrix(y_test, y_pred)
Adjusted rand score:0.86
Homogeneity score:0.83 
Completeness score: 0.83 
Confusion matrix
[[ 0  0  0  0]
 [24  1  2  0]
 [ 1 32  0  0]
 [ 0  0 24  0]]
In [115]:
pl = plt
from sklearn import decomposition

# Reduce the training data to two dimensions so the GMM decision regions can be plotted

pca = decomposition.PCA(n_components=2).fit(X_train)
reduced_X_train = pca.transform(X_train)

# Step size of the mesh. Decrease it for a finer plot of the decision regions.
h = .01     # mesh covers [x_min, x_max] x [y_min, y_max]

# Plot the decision boundary. For that, we will assign a color to each point in the mesh.
x_min, x_max = reduced_X_train[:, 0].min() - 1, reduced_X_train[:, 0].max() + 1
y_min, y_max = reduced_X_train[:, 1].min() - 1, reduced_X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

gm.fit(reduced_X_train)
#print np.c_[xx.ravel(),yy.ravel()]
Z = gm.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
pl.figure(1)
pl.clf()
pl.imshow(Z, interpolation='nearest',
          extent=(xx.min(), xx.max(), yy.min(), yy.max()),
          cmap=pl.cm.Paired,
          aspect='auto', origin='lower')
#print reduced_X_train.shape

pl.plot(reduced_X_train[:, 0], reduced_X_train[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = gm.means_

pl.scatter(centroids[:, 0], centroids[:, 1],
           marker='.', s=169, linewidths=3,
           color='w', zorder=10)

pl.title('Mixture of Gaussian models on the seeds dataset (PCA-reduced data)\n'
         'Means are marked with white dots')
pl.xlim(x_min, x_max)
pl.ylim(y_min, y_max)
pl.xticks(())
pl.yticks(())
pl.show()