Lenses Data Classification¶

https://archive.ics.uci.edu/ml/datasets/lenses

import os
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from time import time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score , classification_report
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report,log_loss

cols = []
for r in range(11):
    cols.append('c'+str(r))

# read .csv from provided dataset
csv_filename="lenses.data"

# df=pd.read_csv(csv_filename,index_col=0)
df=pd.read_csv(csv_filename,sep=' ',names=cols)

df

df.fillna(0 ,inplace=True)

df.head()

df['Age'] = df['c1'] + df['c2']

df['Specs'] = df['c3'] + df['c4']

df['Astigmatic'] = df['c5'] + df['c6']

df['Tear-Production-Rate'] = df['c7'] + df['c8']

df['Target-Lenses'] = df['c9'] + df['c10']

df.drop(cols,axis=1,inplace=True)

df

df.columns

Index([u'Age', u'Specs', u'Astigmatic', u'Tear-Production-Rate',
       u'Target-Lenses'],
      dtype='object')

features = ['Age', 'Specs', 'Astigmatic', 'Tear-Production-Rate']
X = df[features]
y = df['Target-Lenses']

X.head()

# split dataset to 60% training and 40% testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)

print X_train.shape, y_train.shape

(14, 4) (14L,)

Feature importances with forests of trees¶

This examples shows the use of forests of trees to evaluate the importance of features on an artificial classification task. The red bars are the feature importances of the forest, along with their inter-trees variability.

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier

# Build a classification task using 3 informative features

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d - %s (%f) " % (f + 1, indices[f], features[indices[f]], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure(num=None, figsize=(14, 10), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

Feature ranking:
1. feature 3 - Milk (0.138606) 
2. feature 1 - Feathers (0.138508) 
3. feature 2 - Eggs (0.113904) 
4. feature 7 - Toothed (0.083274) 
5. feature 0 - Hair (0.083251) 
6. feature 8 - Backbone (0.082373) 
7. feature 9 - Breathes (0.076449) 
8. feature 11 - Fins (0.063457) 
9. feature 13 - Tail (0.050104) 
10. feature 12 - Legs (0.048997) 
11. feature 5 - Aquatic (0.035794) 
12. feature 4 - Airborne (0.035151) 
13. feature 16 - animals (0.016738) 
14. feature 15 - Catsize (0.011835) 
15. feature 6 - Predator (0.011787) 
16. feature 10 - Venomous (0.009257) 
17. feature 14 - Domestic (0.000515)

importances[indices[:5]]

array([ 0.13860564,  0.13850845,  0.11390376,  0.083274  ,  0.08325078])

for f in range(5):
    print("%d. feature %d - %s (%f)" % (f + 1, indices[f], features[indices[f]] ,importances[indices[f]]))

1. feature 3 - Milk (0.138606)
2. feature 1 - Feathers (0.138508)
3. feature 2 - Eggs (0.113904)
4. feature 7 - Toothed (0.083274)
5. feature 0 - Hair (0.083251)

best_features = []
for i in indices[:5]:
    best_features.append(features[i])

# Plot the top 5 feature importances of the forest
plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(range(5), importances[indices][:5], 
       color="r",  yerr=std[indices][:5], align="center")
plt.xticks(range(5), best_features)
plt.xlim([-1, 5])
plt.show()

Decision Tree accuracy and time elapsed caculation¶

t0=time()
print "DecisionTree"

dt = DecisionTreeClassifier(min_samples_split=20,random_state=99)
# dt = DecisionTreeClassifier(min_samples_split=20,max_depth=5,random_state=99)

clf_dt=dt.fit(X_train,y_train)

print "Acurracy: ", clf_dt.score(X_test,y_test)
t1=time()
print "time elapsed: ", t1-t0

DecisionTree
Acurracy:  0.7
time elapsed:  0.0350000858307

cross validation for DT¶

tt0=time()
print "cross result========"
scores = cross_validation.cross_val_score(dt, X, y, cv=3)
print scores
print scores.mean()
tt1=time()
print "time elapsed: ", tt1-tt0
print "\n"

cross result========
[ 0.55555556  0.625       0.71428571]
0.631613756614
time elapsed:  0.055999994278

Tuning our hyperparameters using GridSearch¶

from sklearn.metrics import classification_report

pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy'))
])

parameters = {
    'clf__max_depth': (5, 25 , 50),
    'clf__min_samples_split': (1, 5, 10),
    'clf__min_samples_leaf': (1, 2, 3)
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'

best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])

predictions = grid_search.predict(X_test)

print classification_report(y_test, predictions)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   18.1s finished

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best score: 0.719
Best parameters set:
	clf__max_depth: 5
	clf__min_samples_leaf: 1
	clf__min_samples_split: 5
             precision    recall  f1-score   support

        1.0       0.50      1.00      0.67         1
        2.0       1.00      1.00      1.00         2
        3.0       1.00      0.86      0.92         7

avg / total       0.95      0.90      0.91        10

Random Forest accuracy and time elapsed caculation¶

t2=time()
print "RandomForest"
rf = RandomForestClassifier(n_estimators=100,n_jobs=-1)
clf_rf = rf.fit(X_train,y_train)
print "Acurracy: ", clf_rf.score(X_test,y_test)
t3=time()
print "time elapsed: ", t3-t2

RandomForest
Acurracy:  1.0
time elapsed:  1.02699995041

cross validation for RF¶

tt2=time()
print "cross result========"
scores = cross_validation.cross_val_score(rf, X, y, cv=3)
print scores
print scores.mean()
tt3=time()
print "time elapsed: ", tt3-tt2
print "\n"

cross result========
[ 0.88888889  0.875       0.85714286]
0.873677248677
time elapsed:  3.34599995613

Tuning Models using GridSearch¶

pipeline2 = Pipeline([
('clf', RandomForestClassifier(criterion='entropy'))
])

parameters = {
    'clf__n_estimators': (5, 25, 50, 100),
    'clf__max_depth': (5, 25 , 50),
    'clf__min_samples_split': (1, 5, 10),
    'clf__min_samples_leaf': (1, 2, 3)
}

grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)

grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_

print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])

predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print classification_report(y_test, predictions)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:   34.0s finished

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best score: 0.786
Best parameters set:
	clf__max_depth: 5
	clf__min_samples_leaf: 1
	clf__min_samples_split: 1
	clf__n_estimators: 5
Accuracy: 0.9
             precision    recall  f1-score   support

        1.0       0.50      1.00      0.67         1
        2.0       1.00      1.00      1.00         2
        3.0       1.00      0.86      0.92         7

avg / total       0.95      0.90      0.91        10

Naive Bayes accuracy and time elapsed caculation¶

t4=time()
print "NaiveBayes"
nb = BernoulliNB()
clf_nb=nb.fit(X_train,y_train)
print "Acurracy: ", clf_nb.score(X_test,y_test)
t5=time()
print "time elapsed: ", t5-t4

NaiveBayes
Acurracy:  0.7
time elapsed:  0.296000003815

cross-validation for NB¶

tt4=time()
print "cross result========"
scores = cross_validation.cross_val_score(nb, X,y, cv=3)
print scores
print scores.mean()
tt5=time()
print "time elapsed: ", tt5-tt4
print "\n"

cross result========
[ 0.55555556  0.625       0.71428571]
0.631613756614
time elapsed:  0.256999969482

KNN accuracy and time elapsed caculation¶

t6=time()
print "KNN"
# knn = KNeighborsClassifier(n_neighbors=3)
knn = KNeighborsClassifier(n_neighbors=3)
clf_knn=knn.fit(X_train, y_train)
print "Acurracy: ", clf_knn.score(X_test,y_test) 
t7=time()
print "time elapsed: ", t7-t6

KNN
Acurracy:  0.8
time elapsed:  0.0480000972748

cross validation for KNN¶

tt6=time()
print "cross result========"
scores = cross_validation.cross_val_score(knn, X,y, cv=5)
print scores
print scores.mean()
tt7=time()
print "time elapsed: ", tt7-tt6
print "\n"

cross result========
[ 1.   1.   0.8  0.6  0.5]
0.78
time elapsed:  0.0869998931885

C:\Miniconda2\lib\site-packages\sklearn\cross_validation.py:516: Warning: The least populated class in y has only 4 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=5.
  % (min_labels, self.n_folds)), Warning)

Fine tuning the model using GridSearch¶

from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search

knn = KNeighborsClassifier()

parameters = {'n_neighbors':[1,]}

grid = grid_search.GridSearchCV(knn, parameters, n_jobs=-1, verbose=1, scoring='accuracy')


grid.fit(X_train, y_train)

print 'Best score: %0.3f' % grid.best_score_

print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)

SVM accuracy and time elapsed caculation¶

t7=time()
print "SVM"

svc = SVC()
clf_svc=svc.fit(X_train, y_train)
print "Acurracy: ", clf_svc.score(X_test,y_test) 
t8=time()
print "time elapsed: ", t8-t7

SVM
Acurracy:  0.7
time elapsed:  0.0380001068115

cross validation for SVM¶

tt7=time()
print "cross result========"
scores = cross_validation.cross_val_score(svc,X,y, cv=5)
print scores
print scores.mean()
tt8=time()
print "time elapsed: ", tt7-tt6
print "\n"

cross result========
[ 0.6   0.6   0.6   0.6   0.75]
0.63
time elapsed:  54.5379998684

from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import grid_search

svc = SVC()

parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

grid = grid_search.GridSearchCV(svc, parameters, n_jobs=-1, verbose=1, scoring='accuracy')


grid.fit(X_train, y_train)

print 'Best score: %0.3f' % grid.best_score_

print 'Best parameters set:'
best_parameters = grid.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid.predict(X_test)
print classification_report(y_test, predictions)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best score: 0.571

[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   17.5s finished
C:\Miniconda2\lib\site-packages\sklearn\metrics\classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

Best parameters set:
	C: 1
	kernel: 'linear'
             precision    recall  f1-score   support

        1.0       1.00      1.00      1.00         1
        2.0       0.00      0.00      0.00         2
        3.0       0.78      1.00      0.88         7

avg / total       0.64      0.80      0.71        10

pipeline = Pipeline([
    ('clf', SVC(kernel='linear', gamma=0.01, C=1))
])

parameters = {
    'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1),
    'clf__C': (0.1, 0.3, 1, 3, 10, 30),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')

grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_

print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
predictions = grid_search.predict(X_test)
print classification_report(y_test, predictions)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   18.4s finished

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best score: 0.571
Best parameters set:
	clf__C: 0.1
	clf__gamma: 0.01
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00         1
        2.0       0.00      0.00      0.00         2
        3.0       0.70      1.00      0.82         7

avg / total       0.49      0.70      0.58        10

Unsupervised Learning¶

features = ['Age', 'Specs', 'Astigmatic', 'Tear-Production-Rate']

df1 = df[features]

df1.head()

PCA¶

# Apply PCA with the same number of dimensions as variables in the dataset
from sklearn.decomposition import PCA
pca = PCA(n_components=4) # 6 components for 6 variables
pca.fit(df1)

# Print the components and the amount of variance in the data contained in each dimension
print(pca.components_)
print(pca.explained_variance_ratio_)

[[ 1.          0.          0.          0.        ]
 [ 0.         -0.57735027 -0.57735027 -0.57735027]
 [-0.         -0.25819889  0.79991984 -0.54172095]
 [-0.         -0.77459667  0.16369154  0.61090513]]
[ 0.47058824  0.17647059  0.17647059  0.17647059]

%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(list(pca.explained_variance_ratio_),'-o')
plt.title('Explained variance ratio as function of PCA components')
plt.ylabel('Explained variance ratio')
plt.xlabel('Component')
plt.show()

%pylab inline
import IPython
import sklearn as sk
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

Populating the interactive namespace from numpy and matplotlib

WARNING: pylab import has clobbered these variables: ['yticks']
`%matplotlib` prevents importing * from pylab and numpy

y = df['Target-Lenses']

target = array(y.unique())

target

array([ 3.,  2.,  1.])

def plot_pca_scatter():
    colors = ['blue', 'red', 'green']
    for i in xrange(len(colors)):
        px = X_pca[:, 0][y == i+1]
        py = X_pca[:, 1][y == i+1]
        plt.scatter(px, py, c=colors[i])
    plt.legend(target)
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')

from sklearn.decomposition import PCA

estimator = PCA(n_components=3)
X_pca = estimator.fit_transform(X)
plot_pca_scatter() # Note that we only plot the first and second principal component

C:\Miniconda2\lib\site-packages\ipykernel\__main__.py:4: FutureWarning: in the future, boolean array-likes will be handled as a boolean array index
C:\Miniconda2\lib\site-packages\ipykernel\__main__.py:5: FutureWarning: in the future, boolean array-likes will be handled as a boolean array index

Clustering¶

# Import clustering modules
from sklearn.cluster import KMeans
from sklearn.mixture import GMM

# First we reduce the data to two dimensions using PCA to capture variation
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(df1)
print(reduced_data[:10])  # print upto 10 elements

[[-1.          0.8660254 ]
 [-1.          0.28867513]
 [-1.          0.28867513]
 [-1.         -0.28867513]
 [-1.          0.28867513]
 [-1.         -0.28867513]
 [-1.         -0.28867513]
 [-1.         -0.8660254 ]
 [ 0.          0.8660254 ]
 [ 0.          0.28867513]]

# Implement your clustering algorithm here, and fit it to the reduced data for visualization
# The visualizer below assumes your clustering object is named 'clusters'

# TRIED OUT 2,3,4,5,6 CLUSTERS AND CONCLUDED THAT 3 CLUSTERS ARE A SENSIBLE CHOICE BASED ON VISUAL INSPECTION, SINCE 
# WE OBTAIN ONE CENTRAL CLUSTER AND TWO CLUSTERS THAT SPREAD FAR OUT IN TWO DIRECTIONS.
kmeans = KMeans(n_clusters=3)
clusters = kmeans.fit(reduced_data)
print(clusters)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

# Plot the decision boundary by building a mesh grid to populate a graph.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
hx = (x_max-x_min)/1000.
hy = (y_max-y_min)/1000.
xx, yy = np.meshgrid(np.arange(x_min, x_max, hx), np.arange(y_min, y_max, hy))

# Obtain labels for each point in mesh. Use last trained model.
Z = clusters.predict(np.c_[xx.ravel(), yy.ravel()])

# Find the centroids for KMeans or the cluster means for GMM 

centroids = kmeans.cluster_centers_
print('*** K MEANS CENTROIDS ***')
print(centroids)

# TRANSFORM DATA BACK TO ORIGINAL SPACE FOR ANSWERING 7
print('*** CENTROIDS TRANSFERED TO ORIGINAL SPACE ***')
print(pca.inverse_transform(centroids))

*** K MEANS CENTROIDS ***
[[ -1.00000000e+00   1.34151949e-16]
 [  0.00000000e+00  -4.62592927e-18]
 [  1.00000000e+00  -7.40148683e-17]]
*** CENTROIDS TRANSFERED TO ORIGINAL SPACE ***
[[ 1.   1.5  1.5  1.5]
 [ 2.   1.5  1.5  1.5]
 [ 3.   1.5  1.5  1.5]]

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('Clustering on the lenses dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

Elbow Method¶

distortions = []
for i in range(1, 11):
    km = KMeans(n_clusters=i, 
                init='k-means++', 
                n_init=10, 
                max_iter=300, 
                random_state=0)
    km.fit(X)
    distortions .append(km.inertia_)
plt.plot(range(1,11), distortions , marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.tight_layout()
#plt.savefig('./figures/elbow.png', dpi=300)
plt.show()

Quantifying the quality of clustering via silhouette plots¶

import numpy as np
from matplotlib import cm
from sklearn.metrics import silhouette_samples

km = KMeans(n_clusters=3, 
            init='k-means++', 
            n_init=10, 
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(i / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, 
            edgecolor='none', color=color)

    yticks.append((y_ax_lower + y_ax_upper) / 2)
    y_ax_lower += len(c_silhouette_vals)
    
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--") 

plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')

plt.tight_layout()
# plt.savefig('./figures/silhouette.png', dpi=300)
plt.show()

Our clustering with 3 centroids is good.

Bad Clustering:¶

km = KMeans(n_clusters=4, 
            init='k-means++', 
            n_init=10, 
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(i / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, 
            edgecolor='none', color=color)

    yticks.append((y_ax_lower + y_ax_upper) / 2)
    y_ax_lower += len(c_silhouette_vals)
    
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--") 

plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')

plt.tight_layout()
# plt.savefig('./figures/silhouette_bad.png', dpi=300)
plt.show()

Organizing clusters as a hierarchical tree¶

Performing hierarchical clustering on a distance matrix¶

To calculate the distance matrix as input for the hierarchical clustering algorithm, we will use the pdist function from SciPy's spatial.distance submodule:

labels = []
for i in range(df1.shape[0]):
    str = 'ID_{}'.format(i)
    labels.append(str)

from scipy.spatial.distance import pdist,squareform

row_dist = pd.DataFrame(squareform(pdist(df1, metric='euclidean')), columns=labels, index=labels)
row_dist[:5]

# 1. incorrect approach: Squareform distance matrix

from scipy.cluster.hierarchy import linkage

row_clusters = linkage(row_dist, method='complete', metric='euclidean')
pd.DataFrame(row_clusters, 
             columns=['row label 1', 'row label 2', 'distance', 'no. of items in clust.'],
             index=['cluster %d' %(i+1) for i in range(row_clusters.shape[0])])

# 2. correct approach: Condensed distance matrix

row_clusters = linkage(pdist(df, metric='euclidean'), method='complete')
pd.DataFrame(row_clusters, 
             columns=['row label 1', 'row label 2', 'distance', 'no. of items in clust.'],
             index=['cluster %d' %(i+1) for i in range(row_clusters.shape[0])])

# 3. correct approach: Input sample matrix

row_clusters = linkage(df.values, method='complete', metric='euclidean')
pd.DataFrame(row_clusters, 
             columns=['row label 1', 'row label 2', 'distance', 'no. of items in clust.'],
             index=['cluster %d' %(i+1) for i in range(row_clusters.shape[0])])

As shown in the following table, the linkage matrix consists of several rows where each row represents one merge. The first and second columns denote the most dissimilar members in each cluster, and the third row reports the distance between those members. The last column returns the count of the members in each cluster.

Now that we have computed the linkage matrix, we can visualize the results in the form of a dendrogram:

from scipy.cluster.hierarchy import dendrogram

# make dendrogram black (part 1/2)
# from scipy.cluster.hierarchy import set_link_color_palette
# set_link_color_palette(['black'])

row_dendr = dendrogram(row_clusters, 
                       labels=labels,
                       # make dendrogram black (part 2/2)
                       # color_threshold=np.inf
                       )
plt.tight_layout()
plt.ylabel('Euclidean distance')
#plt.savefig('./figures/dendrogram.png', dpi=300, 
#            bbox_inches='tight')
plt.show()

# plot row dendrogram
fig = plt.figure(figsize=(8,8))
axd = fig.add_axes([0.09,0.1,0.2,0.6])
row_dendr = dendrogram(row_clusters, orientation='right')

# reorder data with respect to clustering
df_rowclust = df.ix[row_dendr['leaves'][::-1]]

axd.set_xticks([])
axd.set_yticks([])

# remove axes spines from dendrogram
for i in axd.spines.values():
        i.set_visible(False)


        
# plot heatmap
axm = fig.add_axes([0.23,0.1,0.6,0.6]) # x-pos, y-pos, width, height
cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r')
fig.colorbar(cax)
axm.set_xticklabels([''] + list(df_rowclust.columns))
axm.set_yticklabels([''] + list(df_rowclust.index))

# plt.savefig('./figures/heatmap.png', dpi=300)
plt.show()

Applying agglomerative clustering via scikit-learn¶

from sklearn.cluster import AgglomerativeClustering

ac = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='complete')
labels = ac.fit_predict(X)
print('Cluster labels: %s' % labels)

Cluster labels: [2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0]

from sklearn.cross_validation import train_test_split
X = df[features]
y = df['Target-Lenses']
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values ,test_size=0.25, random_state=42)

from sklearn import cluster
clf = cluster.KMeans(init='k-means++', n_clusters=3, random_state=5)
clf.fit(X_train)
print clf.labels_.shape
print clf.labels_

(18L,)
[2 1 2 1 1 2 0 1 1 0 2 2 0 1 0 0 0 1]

# Predict clusters on testing data
y_pred = clf.predict(X_test)

from sklearn import metrics
print "Addjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred))
print "Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)) 
print "Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred))
print "Confusion matrix"
print metrics.confusion_matrix(y_test, y_pred)

Addjusted rand score:-0.18
Homogeneity score:0.37 
Completeness score: 0.31 
Confusion matrix
[[0 0 0 0]
 [1 0 0 0]
 [0 0 1 0]
 [1 1 2 0]]

Affinity Propogation¶

# Affinity propagation
aff = cluster.AffinityPropagation()
aff.fit(X_train)
print aff.cluster_centers_indices_.shape

(4L,)

y_pred = aff.predict(X_test)

from sklearn import metrics
print "Addjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred))
print "Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)) 
print "Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred))
print "Confusion matrix"
print metrics.confusion_matrix(y_test, y_pred)

Addjusted rand score:0.062
Homogeneity score:0.73 
Completeness score: 0.48 
Confusion matrix
[[0 0 0 0]
 [0 1 0 0]
 [1 0 0 0]
 [0 1 2 1]]

#MeanShift
ms = cluster.MeanShift()
ms.fit(X_train)
print ms.cluster_centers_

[[ 1.41666667  1.66666667  1.58333333  1.5       ]
 [ 3.          1.5         1.5         2.        ]]

y_pred = ms.predict(X_test)

from sklearn import metrics
print "Addjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred))
print "Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)) 
print "Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred))
print "Confusion matrix"
print metrics.confusion_matrix(y_test, y_pred)

Addjusted rand score:-0.22
Homogeneity score:0.2 
Completeness score: 0.27 
Confusion matrix
[[0 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [2 2 0 0]]

Mixture of Guassian Models¶

from sklearn import mixture

# Define a heldout dataset to estimate covariance type
X_train_heldout, X_test_heldout, y_train_heldout, y_test_heldout = train_test_split(
        X_train, y_train,test_size=0.25, random_state=42)
for covariance_type in ['spherical','tied','diag','full']:
    gm=mixture.GMM(n_components=3, covariance_type=covariance_type, random_state=42, n_init=5)
    gm.fit(X_train_heldout)
    y_pred=gm.predict(X_test_heldout)
    print "Adjusted rand score for covariance={}:{:.2}".format(covariance_type, metrics.adjusted_rand_score(y_test_heldout, y_pred))

Adjusted rand score for covariance=spherical:-0.25
Adjusted rand score for covariance=tied:0.0
Adjusted rand score for covariance=diag:-0.15
Adjusted rand score for covariance=full:-0.25

gm = mixture.GMM(n_components=3, covariance_type='tied', random_state=42)
gm.fit(X_train)

GMM(covariance_type='tied', init_params='wmc', min_covar=0.001,
  n_components=3, n_init=1, n_iter=100, params='wmc', random_state=42,
  thresh=None, tol=0.001, verbose=0)

# Print train clustering and confusion matrix
y_pred = gm.predict(X_test)
print "Addjusted rand score:{:.2}".format(metrics.adjusted_rand_score(y_test, y_pred))
print "Homogeneity score:{:.2} ".format(metrics.homogeneity_score(y_test, y_pred)) 
print "Completeness score: {:.2} ".format(metrics.completeness_score(y_test, y_pred))

print "Confusion matrix"
print metrics.confusion_matrix(y_test, y_pred)

Addjusted rand score:-0.18
Homogeneity score:0.37 
Completeness score: 0.31 
Confusion matrix
[[0 0 0 0]
 [0 1 0 0]
 [0 1 0 0]
 [1 1 2 0]]

	ID_0	ID_1	ID_2	ID_3	ID_4	ID_5	ID_6	ID_7	ID_8	ID_9	...	ID_14	ID_15	ID_16	ID_17	ID_18	ID_19	ID_20	ID_21	ID_22	ID_23
ID_0	0.000000	1.000000	1.000000	1.414214	1.000000	1.414214	1.414214	1.732051	1.000000	1.414214	...	1.732051	2.000000	2.000000	2.236068	2.236068	2.449490	2.236068	2.449490	2.449490	2.645751
ID_1	1.000000	0.000000	1.414214	1.000000	1.414214	1.000000	1.732051	1.414214	1.414214	1.000000	...	2.000000	1.732051	2.236068	2.000000	2.449490	2.236068	2.449490	2.236068	2.645751	2.449490
ID_2	1.000000	1.414214	0.000000	1.000000	1.414214	1.732051	1.000000	1.414214	1.414214	1.732051	...	1.414214	1.732051	2.236068	2.449490	2.000000	2.236068	2.449490	2.645751	2.236068	2.449490
ID_3	1.414214	1.000000	1.000000	0.000000	1.732051	1.414214	1.414214	1.000000	1.732051	1.414214	...	1.732051	1.414214	2.449490	2.236068	2.236068	2.000000	2.645751	2.449490	2.449490	2.236068
ID_4	1.000000	1.414214	1.414214	1.732051	0.000000	1.000000	1.000000	1.414214	1.414214	1.732051	...	1.414214	1.732051	2.236068	2.449490	2.449490	2.645751	2.000000	2.236068	2.236068	2.449490

	row label 1	row label 2	distance	no. of items in clust.
cluster 1	0.0	1.0	2.037116	2.0
cluster 2	2.0	3.0	2.037116	2.0
cluster 3	16.0	17.0	2.037116	2.0
cluster 4	18.0	19.0	2.037116	2.0
cluster 5	20.0	21.0	2.037116	2.0
cluster 6	22.0	23.0	2.037116	2.0
cluster 7	4.0	5.0	2.037116	2.0
cluster 8	6.0	7.0	2.037116	2.0
cluster 9	14.0	15.0	2.161013	2.0
cluster 10	8.0	9.0	2.161013	2.0
cluster 11	12.0	13.0	2.161013	2.0
cluster 12	10.0	11.0	2.161013	2.0
cluster 13	25.0	31.0	2.751299	4.0
cluster 14	27.0	29.0	2.751299	4.0
cluster 15	24.0	30.0	2.751299	4.0
cluster 16	26.0	28.0	2.751299	4.0
cluster 17	32.0	34.0	2.930524	4.0
cluster 18	33.0	35.0	2.930524	4.0
cluster 19	36.0	38.0	3.277628	8.0
cluster 20	37.0	39.0	3.277628	8.0
cluster 21	40.0	41.0	3.498812	8.0
cluster 22	42.0	44.0	4.355821	16.0
cluster 23	43.0	45.0	5.780009	24.0

	row label 1	row label 2	distance	no. of items in clust.
cluster 1	0.0	2.0	1.000000	2.0
cluster 2	1.0	5.0	1.000000	2.0
cluster 3	3.0	7.0	1.000000	2.0
cluster 4	4.0	6.0	1.000000	2.0
cluster 5	8.0	10.0	1.000000	2.0
cluster 6	9.0	13.0	1.000000	2.0
cluster 7	11.0	19.0	1.000000	2.0
cluster 8	12.0	14.0	1.000000	2.0
cluster 9	15.0	23.0	1.000000	2.0
cluster 10	16.0	17.0	1.000000	2.0
cluster 11	18.0	22.0	1.000000	2.0
cluster 12	24.0	27.0	1.414214	4.0
cluster 13	25.0	29.0	1.414214	4.0
cluster 14	28.0	31.0	1.414214	4.0
cluster 15	20.0	33.0	1.414214	3.0
cluster 16	35.0	37.0	1.732051	8.0
cluster 17	21.0	38.0	1.732051	4.0
cluster 18	32.0	34.0	1.732051	4.0
cluster 19	26.0	36.0	2.000000	6.0
cluster 20	40.0	41.0	2.000000	8.0
cluster 21	30.0	42.0	2.645751	8.0
cluster 22	39.0	43.0	2.828427	16.0
cluster 23	44.0	45.0	3.316625	24.0

	row label 1	row label 2	distance	no. of items in clust.
cluster 1	0.0	2.0	1.000000	2.0
cluster 2	1.0	5.0	1.000000	2.0
cluster 3	3.0	7.0	1.000000	2.0
cluster 4	4.0	6.0	1.000000	2.0
cluster 5	8.0	10.0	1.000000	2.0
cluster 6	9.0	13.0	1.000000	2.0
cluster 7	11.0	19.0	1.000000	2.0
cluster 8	12.0	14.0	1.000000	2.0
cluster 9	15.0	23.0	1.000000	2.0
cluster 10	16.0	17.0	1.000000	2.0
cluster 11	18.0	22.0	1.000000	2.0
cluster 12	24.0	27.0	1.414214	4.0
cluster 13	25.0	29.0	1.414214	4.0
cluster 14	28.0	31.0	1.414214	4.0
cluster 15	20.0	33.0	1.414214	3.0
cluster 16	35.0	37.0	1.732051	8.0
cluster 17	21.0	38.0	1.732051	4.0
cluster 18	32.0	34.0	1.732051	4.0
cluster 19	26.0	36.0	2.000000	6.0
cluster 20	40.0	41.0	2.000000	8.0
cluster 21	30.0	42.0	2.645751	8.0
cluster 22	39.0	43.0	2.828427	16.0
cluster 23	44.0	45.0	3.316625	24.0

	c0	c1	c2	c3	c4	c5	c6	c7	c8	c9	c10
0	1	NaN	1.0	NaN	1.0	NaN	1.0	NaN	1.0	NaN	3.0
1	2	NaN	1.0	NaN	1.0	NaN	1.0	NaN	2.0	NaN	2.0
2	3	NaN	1.0	NaN	1.0	NaN	2.0	NaN	1.0	NaN	3.0
3	4	NaN	1.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN	1.0
4	5	NaN	1.0	NaN	2.0	NaN	1.0	NaN	1.0	NaN	3.0
5	6	NaN	1.0	NaN	2.0	NaN	1.0	NaN	2.0	NaN	2.0
6	7	NaN	1.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN	3.0
7	8	NaN	1.0	NaN	2.0	NaN	2.0	NaN	2.0	NaN	1.0
8	9	NaN	2.0	NaN	1.0	NaN	1.0	NaN	1.0	NaN	3.0
9	10	2.0	NaN	1.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN
10	11	2.0	NaN	1.0	NaN	2.0	NaN	1.0	NaN	3.0	NaN
11	12	2.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN
12	13	2.0	NaN	2.0	NaN	1.0	NaN	1.0	NaN	3.0	NaN
13	14	2.0	NaN	2.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN
14	15	2.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN	3.0	NaN
15	16	2.0	NaN	2.0	NaN	2.0	NaN	2.0	NaN	3.0	NaN
16	17	3.0	NaN	1.0	NaN	1.0	NaN	1.0	NaN	3.0	NaN
17	18	3.0	NaN	1.0	NaN	1.0	NaN	2.0	NaN	3.0	NaN
18	19	3.0	NaN	1.0	NaN	2.0	NaN	1.0	NaN	3.0	NaN
19	20	3.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN
20	21	3.0	NaN	2.0	NaN	1.0	NaN	1.0	NaN	3.0	NaN
21	22	3.0	NaN	2.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN
22	23	3.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN	3.0	NaN
23	24	3.0	NaN	2.0	NaN	2.0	NaN	2.0	NaN	3.0	NaN

	c0	c2	c4	c6	c8	c10
0	1	1.0	1.0	1.0	1.0	3.0
1	2	1.0	1.0	1.0	2.0	2.0
2	3	1.0	1.0	2.0	1.0	3.0
3	4	1.0	1.0	2.0	2.0	1.0
4	5	1.0	2.0	1.0	1.0	3.0

	Age	Specs	Astigmatic	Tear-Production-Rate	Target-Lenses
0	1.0	1.0	1.0	1.0	3.0
1	1.0	1.0	1.0	2.0	2.0
2	1.0	1.0	2.0	1.0	3.0
3	1.0	1.0	2.0	2.0	1.0
4	1.0	2.0	1.0	1.0	3.0
5	1.0	2.0	1.0	2.0	2.0
6	1.0	2.0	2.0	1.0	3.0
7	1.0	2.0	2.0	2.0	1.0
8	2.0	1.0	1.0	1.0	3.0
9	2.0	1.0	1.0	2.0	2.0
10	2.0	1.0	2.0	1.0	3.0
11	2.0	1.0	2.0	2.0	1.0
12	2.0	2.0	1.0	1.0	3.0
13	2.0	2.0	1.0	2.0	2.0
14	2.0	2.0	2.0	1.0	3.0
15	2.0	2.0	2.0	2.0	3.0
16	3.0	1.0	1.0	1.0	3.0
17	3.0	1.0	1.0	2.0	3.0
18	3.0	1.0	2.0	1.0	3.0
19	3.0	1.0	2.0	2.0	1.0
20	3.0	2.0	1.0	1.0	3.0
21	3.0	2.0	1.0	2.0	2.0
22	3.0	2.0	2.0	1.0	3.0
23	3.0	2.0	2.0	2.0	3.0

	Age	Specs	Astigmatic	Tear-Production-Rate
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	2.0
2	1.0	1.0	2.0	1.0
3	1.0	1.0	2.0	2.0
4	1.0	2.0	1.0	1.0

	Age	Specs	Astigmatic	Tear-Production-Rate
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	2.0
2	1.0	1.0	2.0	1.0
3	1.0	1.0	2.0	2.0
4	1.0	2.0	1.0	1.0

	c0	c1	c2	c3	c4	c5	c6	c7	c8	c9	c10
0	1	NaN	1.0	NaN	1.0	NaN	1.0	NaN	1.0	NaN	3.0
1	2	NaN	1.0	NaN	1.0	NaN	1.0	NaN	2.0	NaN	2.0
2	3	NaN	1.0	NaN	1.0	NaN	2.0	NaN	1.0	NaN	3.0
3	4	NaN	1.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN	1.0
4	5	NaN	1.0	NaN	2.0	NaN	1.0	NaN	1.0	NaN	3.0
5	6	NaN	1.0	NaN	2.0	NaN	1.0	NaN	2.0	NaN	2.0
6	7	NaN	1.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN	3.0
7	8	NaN	1.0	NaN	2.0	NaN	2.0	NaN	2.0	NaN	1.0
8	9	NaN	2.0	NaN	1.0	NaN	1.0	NaN	1.0	NaN	3.0
9	10	2.0	NaN	1.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN
10	11	2.0	NaN	1.0	NaN	2.0	NaN	1.0	NaN	3.0	NaN
11	12	2.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN
12	13	2.0	NaN	2.0	NaN	1.0	NaN	1.0	NaN	3.0	NaN
13	14	2.0	NaN	2.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN
14	15	2.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN	3.0	NaN
15	16	2.0	NaN	2.0	NaN	2.0	NaN	2.0	NaN	3.0	NaN
16	17	3.0	NaN	1.0	NaN	1.0	NaN	1.0	NaN	3.0	NaN
17	18	3.0	NaN	1.0	NaN	1.0	NaN	2.0	NaN	3.0	NaN
18	19	3.0	NaN	1.0	NaN	2.0	NaN	1.0	NaN	3.0	NaN
19	20	3.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN
20	21	3.0	NaN	2.0	NaN	1.0	NaN	1.0	NaN	3.0	NaN
21	22	3.0	NaN	2.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN
22	23	3.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN	3.0	NaN
23	24	3.0	NaN	2.0	NaN	2.0	NaN	2.0	NaN	3.0	NaN

	c0	c2	c4	c6	c8	c10
0	1	1.0	1.0	1.0	1.0	3.0
1	2	1.0	1.0	1.0	2.0	2.0
2	3	1.0	1.0	2.0	1.0	3.0
3	4	1.0	1.0	2.0	2.0	1.0
4	5	1.0	2.0	1.0	1.0	3.0

	Age	Specs	Astigmatic	Tear-Production-Rate	Target-Lenses
0	1.0	1.0	1.0	1.0	3.0
1	1.0	1.0	1.0	2.0	2.0
2	1.0	1.0	2.0	1.0	3.0
3	1.0	1.0	2.0	2.0	1.0
4	1.0	2.0	1.0	1.0	3.0
5	1.0	2.0	1.0	2.0	2.0
6	1.0	2.0	2.0	1.0	3.0
7	1.0	2.0	2.0	2.0	1.0
8	2.0	1.0	1.0	1.0	3.0
9	2.0	1.0	1.0	2.0	2.0
10	2.0	1.0	2.0	1.0	3.0
11	2.0	1.0	2.0	2.0	1.0
12	2.0	2.0	1.0	1.0	3.0
13	2.0	2.0	1.0	2.0	2.0
14	2.0	2.0	2.0	1.0	3.0
15	2.0	2.0	2.0	2.0	3.0
16	3.0	1.0	1.0	1.0	3.0
17	3.0	1.0	1.0	2.0	3.0
18	3.0	1.0	2.0	1.0	3.0
19	3.0	1.0	2.0	2.0	1.0
20	3.0	2.0	1.0	1.0	3.0
21	3.0	2.0	1.0	2.0	2.0
22	3.0	2.0	2.0	1.0	3.0
23	3.0	2.0	2.0	2.0	3.0

	Age	Specs	Astigmatic	Tear-Production-Rate
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	2.0
2	1.0	1.0	2.0	1.0
3	1.0	1.0	2.0	2.0
4	1.0	2.0	1.0	1.0

	Age	Specs	Astigmatic	Tear-Production-Rate
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	2.0
2	1.0	1.0	2.0	1.0
3	1.0	1.0	2.0	2.0
4	1.0	2.0	1.0	1.0

	c0	c1	c2	c3	c4	c5	c6	c7	c8	c9	c10
0	1	NaN	1.0	NaN	1.0	NaN	1.0	NaN	1.0	NaN	3.0
1	2	NaN	1.0	NaN	1.0	NaN	1.0	NaN	2.0	NaN	2.0
2	3	NaN	1.0	NaN	1.0	NaN	2.0	NaN	1.0	NaN	3.0
3	4	NaN	1.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN	1.0
4	5	NaN	1.0	NaN	2.0	NaN	1.0	NaN	1.0	NaN	3.0
5	6	NaN	1.0	NaN	2.0	NaN	1.0	NaN	2.0	NaN	2.0
6	7	NaN	1.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN	3.0
7	8	NaN	1.0	NaN	2.0	NaN	2.0	NaN	2.0	NaN	1.0
8	9	NaN	2.0	NaN	1.0	NaN	1.0	NaN	1.0	NaN	3.0
9	10	2.0	NaN	1.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN
10	11	2.0	NaN	1.0	NaN	2.0	NaN	1.0	NaN	3.0	NaN
11	12	2.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN
12	13	2.0	NaN	2.0	NaN	1.0	NaN	1.0	NaN	3.0	NaN
13	14	2.0	NaN	2.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN
14	15	2.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN	3.0	NaN
15	16	2.0	NaN	2.0	NaN	2.0	NaN	2.0	NaN	3.0	NaN
16	17	3.0	NaN	1.0	NaN	1.0	NaN	1.0	NaN	3.0	NaN
17	18	3.0	NaN	1.0	NaN	1.0	NaN	2.0	NaN	3.0	NaN
18	19	3.0	NaN	1.0	NaN	2.0	NaN	1.0	NaN	3.0	NaN
19	20	3.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN
20	21	3.0	NaN	2.0	NaN	1.0	NaN	1.0	NaN	3.0	NaN
21	22	3.0	NaN	2.0	NaN	1.0	NaN	2.0	NaN	2.0	NaN
22	23	3.0	NaN	2.0	NaN	2.0	NaN	1.0	NaN	3.0	NaN
23	24	3.0	NaN	2.0	NaN	2.0	NaN	2.0	NaN	3.0	NaN

	c0	c2	c4	c6	c8	c10
0	1	1.0	1.0	1.0	1.0	3.0
1	2	1.0	1.0	1.0	2.0	2.0
2	3	1.0	1.0	2.0	1.0	3.0
3	4	1.0	1.0	2.0	2.0	1.0
4	5	1.0	2.0	1.0	1.0	3.0

	Age	Specs	Astigmatic	Tear-Production-Rate	Target-Lenses
0	1.0	1.0	1.0	1.0	3.0
1	1.0	1.0	1.0	2.0	2.0
2	1.0	1.0	2.0	1.0	3.0
3	1.0	1.0	2.0	2.0	1.0
4	1.0	2.0	1.0	1.0	3.0
5	1.0	2.0	1.0	2.0	2.0
6	1.0	2.0	2.0	1.0	3.0
7	1.0	2.0	2.0	2.0	1.0
8	2.0	1.0	1.0	1.0	3.0
9	2.0	1.0	1.0	2.0	2.0
10	2.0	1.0	2.0	1.0	3.0
11	2.0	1.0	2.0	2.0	1.0
12	2.0	2.0	1.0	1.0	3.0
13	2.0	2.0	1.0	2.0	2.0
14	2.0	2.0	2.0	1.0	3.0
15	2.0	2.0	2.0	2.0	3.0
16	3.0	1.0	1.0	1.0	3.0
17	3.0	1.0	1.0	2.0	3.0
18	3.0	1.0	2.0	1.0	3.0
19	3.0	1.0	2.0	2.0	1.0
20	3.0	2.0	1.0	1.0	3.0
21	3.0	2.0	1.0	2.0	2.0
22	3.0	2.0	2.0	1.0	3.0
23	3.0	2.0	2.0	2.0	3.0

	Age	Specs	Astigmatic	Tear-Production-Rate
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	2.0
2	1.0	1.0	2.0	1.0
3	1.0	1.0	2.0	2.0
4	1.0	2.0	1.0	1.0

	Age	Specs	Astigmatic	Tear-Production-Rate
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	2.0
2	1.0	1.0	2.0	1.0
3	1.0	1.0	2.0	2.0
4	1.0	2.0	1.0	1.0