MP3: Eigenfaces

In this lab, you'll find the principal components transformation of a face dataset, then test a nearest-neighbor classifier.

To make sure everything works, you might want to go to the command line and run

pip install -r requirements.txt

This will install the modules that are used on the autograder, including numpy, h5py, and the gradescope utilities.


Part 0: Loading the image files

First, let's load the images. The images are examples drawn from the "Labeled Faces in the Wild" dataset: http://vis-www.cs.umass.edu/lfw/. We'll start out by defining some utility functions.

Now let's load the images, and then plot one from each person, with the person names as titles.
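
Here is a minimal sketch of what such a loading step might look like, assuming a hypothetical directory layout of data/<person_name>/<image>.jpg. The actual MP distributes its own utility functions, so treat the names here (load_images, data) as illustrative only.

```python
import os
from matplotlib import pyplot as plt
from matplotlib import image as mpimg

def load_images(datadir="data"):
    """Load every image under datadir into a dict: person name -> list of arrays."""
    images = {}
    for person in sorted(os.listdir(datadir)):
        persondir = os.path.join(datadir, person)
        images[person] = [mpimg.imread(os.path.join(persondir, f))
                          for f in sorted(os.listdir(persondir))]
    return images

images = load_images()

# Plot the first image of each person, with the person's name as the title.
fig, axes = plt.subplots(1, len(images), figsize=(3 * len(images), 3))
for ax, (person, imgs) in zip(axes, images.items()):
    ax.imshow(imgs[0])
    ax.set_title(person)
    ax.axis("off")
plt.show()
```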

Now let's view them as vectors. We know that they're 187,500-dimensional vectors, but let's just plot scatter plots of the first few dimensions (the first few pixels, in the red color plane).
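
A sketch of such a scatter plot, assuming the raveled images are stacked into a numpy array Xtrain with one 187,500-dimensional row per training token (the variable name is an assumption, not the MP's required API):

```python
from matplotlib import pyplot as plt

fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for k, ax in enumerate(axes):
    # Scatter plot of pixel k versus pixel k+1 (the first few red-plane pixels)
    ax.scatter(Xtrain[:, k], Xtrain[:, k + 1], s=4)
    ax.set_xlabel('pixel %d' % k)
    ax.set_ylabel('pixel %d' % (k + 1))
plt.show()
```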

It looks like the pixels are strongly correlated. What the scatter plots don't show is the fraction of all pixels that are zero (black): that seems to be the majority, but it's hard to tell from the scatter plots. To check, let's plot histograms.
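
For example, histograms of the same first few pixel dimensions would make the spike at zero visible (again assuming the illustrative Xtrain array from the sketch above):

```python
from matplotlib import pyplot as plt

fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for k, ax in enumerate(axes):
    ax.hist(Xtrain[:, k], bins=50)       # most of the mass should pile up at 0
    ax.set_xlabel('pixel %d value' % k)
    ax.set_ylabel('count')
plt.show()
```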


Part 1: Compute a Mean Vector

Principal Components Analysis can be computed either with or without mean subtraction. Subtracting the mean is usually a little more useful, so let's do it that way.
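
A minimal sketch, assuming Xtrain is a numpy array with one raveled image per row (the variable names are illustrative):

```python
import numpy as np

mu = np.mean(Xtrain, axis=0)    # 187,500-dimensional mean of the training images
```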


Part 2: Center all three datasets (train, dev, and test)

We will "center" the datasets by subtracting the training dataset mean.

Notice that we subtract the same $\vec\mu$ from all three datasets. That's because only the training dataset is used to estimate model parameters, like $\vec\mu$. The dev dataset is used to choose between different trained models, and the test dataset is used to find out how well our chosen model will perform on data from outside the training set.
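
A sketch of the centering step, reusing the illustrative names from above, with mu estimated from the training set only:

```python
ctrain = Xtrain - mu    # centered training data
cdev = Xdev - mu        # centered dev data (same training mean!)
ctest = Xtest - mu      # centered test data (same training mean!)
```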

Notice that these plots are exactly the same as before, but shifted downward slightly: the minimum pixel values are now around -50, while the maximum is around 200. The shift is small because so many of the pixels were originally close to 0. The data now has zero mean.


Part 3: Compute the Principal Component Axes

The Principal Component Axes, $V$, are the eigenvectors of the data covariance matrix. Since the vector dimension (187500) is much, much larger than the number of training tokens (165), you should instead compute the eigenvectors, $U$, of the gram matrix, and then form $V=X^TU$, where $X$ is the centered training matrix. Then be sure to normalize the columns of $V$ so that each column has unit Euclidean norm and a non-negative first element.

The vector $D$ lists the eigenvalues of the gram matrix, which are also the eigenvalues of the sum-of-squares matrix (the scaled covariance). If you add them, you get the total variance of all samples in the dataset.
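
Here is one way this computation might look, as a sketch rather than the required implementation: ctrain is assumed to be the centered (165 x 187500) training matrix, and the function name is illustrative.

```python
import numpy as np

def find_principal_components(ctrain):
    gram = ctrain @ ctrain.T                          # (165 x 165) gram matrix
    eigvals, U = np.linalg.eigh(gram)                 # eigh returns ascending order
    order = np.argsort(eigvals)[::-1]                 # re-sort, largest eigenvalue first
    D, U = eigvals[order], U[:, order]
    V = ctrain.T @ U                                  # (187500 x 165) principal component axes
    V = V / np.linalg.norm(V, axis=0, keepdims=True)  # unit Euclidean norm per column
    V = V * np.where(V[0, :] < 0, -1.0, 1.0)          # make each first element non-negative
    return V, D
```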


Part 4: Transform all three centered datasets

The purpose of using PCA is that we can usually get better accuracy by

  1. transforming the dataset into its principal components, so that each of the components is uncorrelated with the others,
  2. keeping only the most important principal components.

Since we computed the first 165 principal components, the transformed datasets will only have 165 dimensions, instead of 187500.
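
A sketch of the transform, continuing with the illustrative names used above:

```python
ttrain = ctrain @ V    # (165 x 165): one 165-dimensional feature vector per training token
tdev = cdev @ V        # dev and test tokens are projected onto the same axes
ttest = ctest @ V
```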


Part 5: Find the nearest neighbors

A nearest-neighbor classifier first calculates the distance between each dev vector, $\vec{x}_{\mbox{dev},i}$, and each training vector, $\vec{x}_{\mbox{train},j}$. Then it figures out which of the training vectors is closest to each dev vector:

$$j^* = \arg\min_j\Vert\vec{x}_{\mbox{dev},i}-\vec{x}_{\mbox{train},j}\Vert$$

Once we know which training vector is closest, we just copy its label to the dev vector:

$$f(\vec{x}_{\mbox{dev},i})=y_{\mbox{train},j^*}$$

The confusion matrix is a way of showing the following information:

$$C[i,j]=\mbox{Number of tokens from class}~i~\mbox{that were classified as class}~j$$

The accuracy is the fraction of tokens correctly classified, i.e.,

$$A = \frac{\sum_{i}C[i,i]}{\sum_i\sum_j C[i,j]}$$
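
Here is a sketch of how these steps might be implemented; ttrain, tdev, ytrain, and ydev are assumed to be numpy arrays of transformed features and integer labels, and the function names are illustrative rather than the MP's required API.

```python
import numpy as np

def nearest_neighbor_classify(ttrain, ytrain, tdev):
    # Euclidean distance from every dev token (rows) to every training token (columns)
    dists = np.linalg.norm(tdev[:, None, :] - ttrain[None, :, :], axis=2)
    return ytrain[np.argmin(dists, axis=1)]   # copy the label of the nearest training token

def confusion_and_accuracy(yref, yhyp, nclasses):
    C = np.zeros((nclasses, nclasses), dtype=int)
    for i, j in zip(yref, yhyp):
        C[i, j] += 1                          # row = true class, column = hypothesized class
    return C, np.trace(C) / np.sum(C)

hyps = nearest_neighbor_classify(ttrain, ytrain, tdev)
C, accuracy = confusion_and_accuracy(ydev, hyps, len(np.unique(ytrain)))
```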

Part 6: Find the best feature size using the dev set

Here's where the difference between the dev and test sets comes in.

So far, we've been keeping all 165 of the principal components. But that doesn't really make sense: if we only have 165 training tokens, then a classifier with 165 feature dimensions is probably overtrained (i.e., it probably gets much better accuracy on the training set than on the test set).

We can use the dev set to decide how many of the principal components we should keep. This is done by exhaustively testing every possible setting: for every possible vector size, we test the resulting nearest-neighbor classifier on the dev data. Whichever vector size gives the best result on the dev data is the one we will then test on the real test data.

The reason for keeping a separate test set is that, by choosing the vector size on the dev data, we might be overtraining again: maybe the classifier that gets the best performance on the dev data doesn't really work all that well in practice. If that's true, then we want to know about it. We can find out by tuning the vector size on the dev data, then measuring performance on the held-out test data.

Rather than testing every possible vector size, we're only going to test vector sizes, $K$, that are reasonable according to the percent variance explained criterion. Remember that the percentage of variance explained by the first $K$ principal components is $$PV = 100\times \frac{\sum_{k=1}^K\lambda_k}{\sum_{k=1}^n\lambda_k}$$ where $n$ is the total number of principal components (in our case, 165), and $\lambda_k$ are the eigenvalues. From lots of published experiments, we know that good values of $K$ usually have roughly $$92.5\% < PV < 97.5\%$$ so our "exhaustive test" will actually only test values of $K$ in that range.
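
A sketch of this search, reusing the illustrative functions and arrays from the earlier sketches (D is assumed to hold the eigenvalues in descending order):

```python
import numpy as np

PV = 100 * np.cumsum(D) / np.sum(D)   # PV[K-1] = percent variance explained by the first K components
candidates = [K for K in range(1, len(D) + 1) if 92.5 < PV[K - 1] < 97.5]

nclasses = len(np.unique(ytrain))
best_K, best_accuracy = None, -1.0
for K in candidates:
    hyps = nearest_neighbor_classify(ttrain[:, :K], ytrain, tdev[:, :K])
    _, accuracy = confusion_and_accuracy(ydev, hyps, nclasses)
    if accuracy > best_accuracy:
        best_K, best_accuracy = K, accuracy
```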


Part 7: Show the Test Confusion Matrix

Now that we know we should use 91 principal components, we can use that setting to find the accuracy on the real test data.
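
For example, reusing the sketch functions above with the chosen size K = 91:

```python
hyps = nearest_neighbor_classify(ttrain[:, :91], ytrain, ttest[:, :91])
Ctest, test_accuracy = confusion_and_accuracy(ytest, hyps, len(np.unique(ytrain)))
print('test accuracy:', test_accuracy)
```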


Part 8: How to Debug!!

If you've reached this point in the notebook, then your code is probably working well. But before you run the autograder on the server, you should first run it on your own machine.

You can do that by going to a terminal, and running the following command line:

python grade.py

If you get any error messages, we recommend that you use the provided solutions.hdf5 in order to debug. That can be done as follows:
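
For example, one way to compare your arrays against the reference file with h5py looks roughly like this; the key name 'mu' is an assumption, so list the file's keys first to see what it actually contains.

```python
import h5py
import numpy as np

with h5py.File('solutions.hdf5', 'r') as f:
    print(list(f.keys()))                       # see which reference arrays are provided
    ref_mu = f['mu'][:]                         # load one reference array (key name assumed)
    print(np.max(np.abs(ref_mu - mu)))          # largest absolute difference from your result
```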


Extra Credit

You can earn up to 10% extra credit on this MP by finishing the file called extra.py, and submitting it to the autograder.

When you unpack the file mp2_extra.zip, it will give you the following files:

The extra credit assignment this time is to get the best classification accuracy you can, on both visible and hidden test cases.