# Cornell CS4786, Profs Lee and Sridharan
# February 3, 2015

# Implementation in R of:
# Penn State STAT 505 lesson 8 on canonical correlation analysis
# https://onlinecourses.science.psu.edu/stat505/node/63
# data from https://onlinecourses.science.psu.edu/stat505/node/63

# With respect to the pedagogical benefits of this example:
# on the downside - math has such high correlation with all the individual sales variables.
# on the upside - the first canonical correlation is still much higher than the max pair-pair correlation (.944, profit and math)

# ggplot2 and GGally are only needed for the optional ggpairs() plot (commented
# out further down), so a missing install merely produces a warning here.
require(ggplot2)
require(GGally)
# CCA supplies cc() and comput(); the analysis cannot run without it, so load
# it with library() to stop right away with a clear error if it is missing.
library(CCA)

# Read the data set and give its seven columns descriptive names.
mm <- read.csv("sales.csv")
names(mm) <- c(
  "growth", "profit", "newacct",
  "creative", "mechanical", "abstract", "math"
)
summary(mm)

# Split the variables into two sets: the first three columns (sales) and the
# last four columns (tests).
sales <- mm[, c("growth", "profit", "newacct")]
tests <- mm[, c("creative", "mechanical", "abstract", "math")]

# Optional closer look at the data (uncomment to use)
#print(summary(mm))
#ggpairs(mm)
readline('data matrix mm is loaded (....Pause....)')
#@ hit ^C here for first part of example


# Run the canonical correlation analysis of the sales variables against the
# test variables. The result gives us the canonical correlations (cc1$cor)
# and the canonical coefficients.
cc1 <- cc(sales, tests)

# Explain how to read the three-column outputs that follow.
print('The three columns below correspond to the three pairs of linear combinations, where each pair of linear combinations maximizes the correlation between the paired combinations subject to being not correlated with the previous combinations.')

# Display the canonical correlations, then wait for the user.
print('The canonical correlations (how much the pairs in each of the three combinations correlate, in order):')
print(cc1$cor)
readline('(....Pause....)')


# Show the raw (unstandardized) canonical coefficients: the two coefficient
# matrices of the cc() result, picked out by name.
print('The canonical coefficients, i.e., the linear-combo coefficients (but remember the numbers had different ranges)')
print(cc1[c("xcoef", "ycoef")])
readline('(....Pause....)')

#@ should prove that the variance of the new vectors is 1.

# Canonical loadings: correlations between each original variable and the
# linear combinations ("scores") of either variable set.
print('Interpreting what the linear combinations mean: how much is each variable correlated with the two combinations in the pair?')
print('"xscores" means the linear combo of the sales, and "yscores" means the linear combo of the tests')
cc2 <- comput(sales, tests, cc1)
# Display only the four loading matrices, selected by name.
print(cc2[c(
  "corr.X.xscores", "corr.Y.xscores",
  "corr.X.yscores", "corr.Y.yscores"
)])

#"When the variables in the model have very different standard deviations, the
#standardized coefficients allow for easier comparisons among the variables."
# http://www.ats.ucla.edu/stat/r/dae/canonical.htm

# Standardized canonical coefficients: each raw coefficient is multiplied by
# the standard deviation of its variable, which makes coefficients of
# variables with very different ranges comparable.
# The results are printed explicitly so they also appear when the script is
# run through source(), and their rows are labelled with the variable names
# regardless of whether the matrix product carries dimnames.

# standardized sales canonical coefficients, using diagonal matrix of sales sd's
s1 <- diag(sqrt(diag(cov(sales))))
std_xcoef <- s1 %*% cc1$xcoef
rownames(std_xcoef) <- colnames(sales)
print(std_xcoef)

# standardized tests canonical coefficients, using diagonal matrix of tests sd's
s2 <- diag(sqrt(diag(cov(tests))))
std_ycoef <- s2 %*% cc1$ycoef
rownames(std_ycoef) <- colnames(tests)
print(std_ycoef)