# lec03PCAExample.py
# CS 4786, Profs Lillian Lee and Karthik Sridharan
# Jan 29, 2015
'''An example provided in class on Jan 29th for the steps of computing the PCA
and understanding what information you (sometimes) get from the covariance matrix
and the principal components. Real-life data might not work out this nicely,
but we just wanted to give you a flavor of the possibilities. '''
import numpy as np
# Didn't end up using scipy (a bit painful to install), but wanted you to see
# an example of how scipy's imports are a little different
# http://stackoverflow.com/questions/8728732/what-is-wrong-with-importing-modules-in-scipy-is-it-a-bug
import scipy.linalg
# Generate a data set of two types of vectors, one where the odd-indexed
# items tend to be bigger than the even-indexed items, and vice versa.
# The "halfn" and "numreps" are just convenience parameters for specifying
# how many data instances there will be, and how many repetitions of the "big small"
# or "small big" pairs to have in the data instances.
halfn=3
numreps=2
baselength=2
mu1 = np.array([5,7]*numreps)
mu2 = np.array([7,5]*numreps)
cov = np.diagflat([.1]*(baselength*numreps))
odds = np.random.multivariate_normal(mu1, cov, halfn)
evens = np.random.multivariate_normal(mu2, cov, halfn)
data = np.concatenate((odds, evens)) # need the input to be a tuple
assert data.shape == (2*halfn, baselength*numreps), 'unexpected shape for the matrix called data'
print 'data:\n', data, '\n..............'
raw_input("...pause for contemplation")
covmat = np.cov(data.T,bias=1)
# numpy wants the rows to be the features ("variables"),
# and the columns to be the observations ("instances")
# 'bias=1' means the normalization is by n, instead of n-1.
print 'covariance matrix (in pcaEx.covmat):\n', covmat
raw_input("...pause for contemplation")
evals,evecs = np.linalg.eig(covmat)
# Sort the eigenvectors by decreasing eigenvalues.
# One source: http://stackoverflow.com/questions/8092920/sort-eigenvalues-and-associated-eigenvectors-after-using-numpy-linalg-eig-in-pyt
indices = evals.argsort()[::-1]
evals = evals[indices]
evecs = evecs[:,indices]
print 'eigenvalues (in pcaEx.evals):\n', evals
print 'eigenvectors (in pcaEx.evecs):\n', evecs
## uncomment for some debugging info/reminders
#
# print 'length of claimed first eigenvector is:', np.linalg.norm(evecs[:,0])
#
# np.dot(covmat,evecs[:,0])/evals[0] should
# give back the first eigenvector evecs[:,0]