In [1]:
using LinearAlgebra
using Statistics
using SparseArrays
using MLDatasets
using PyPlot
Test of Johnson-Lindenstrauss transform
In [59]:
d = 100000;
n = 100;
Xs = randn(d,n); # the dataset
In [79]:
epsilon = 0.1;
# target dimension suggested by the JL lemma: r = O(log(n) / epsilon^2), here with constant 1
r = Int64(ceil(1 * log(n) / epsilon^2))
Out[79]:
461
In [80]:
# scale by 1/sqrt(r) so that E[norm(A*x)^2] = norm(x)^2
A = randn(r, d) / sqrt(r);
In [81]:
# let's look at this for a single example
norm(Xs[:,1] - Xs[:,2])
Out[81]:
445.77408091448353
In [82]:
norm(A*Xs[:,1] - A*Xs[:,2])
Out[82]:
439.99821861375597
In [83]:
norm(A*Xs[:,1] - A*Xs[:,2]) / norm(Xs[:,1] - Xs[:,2])
Out[83]:
0.9870430728298992
In [84]:
AXs = A*Xs;
In [85]:
# over the whole dataset
extrema((i == j) ? 1.0 : norm(AXs[:,i] - AXs[:,j]) / norm(Xs[:,i] - Xs[:,j]) for i = 1:n, j = 1:n)
Out[85]:
(0.884736519025868, 1.1603533040975242)
In [86]:
# how much did we decrease the dimension?
d/r
Out[86]:
216.91973969631238
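The extrema above stray slightly outside the (1 ± ε) band, which is expected: we used constant 1 in the formula for r, smaller than the constants in standard proofs of the lemma. As a quick check (reusing Xs, AXs, epsilon, and n from the cells above), we can measure what fraction of pairs actually violate the bound:
In [ ]:
# fraction of pairs whose distance is distorted by more than a (1 ± epsilon) factor
mean(!(1 - epsilon <= norm(AXs[:,i] - AXs[:,j]) / norm(Xs[:,i] - Xs[:,j]) <= 1 + epsilon)
     for i = 1:n for j = (i+1):n)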
Principal Component Analysis
In [87]:
n = 60000;
d = 28*28;
In [88]:
train_x, train_y = MNIST.traindata();
In [89]:
train_x = Float64.(reshape(train_x, (d, n)));
In [90]:
train_x_mean = mean(train_x; dims=2);
In [91]:
train_x_minus_mean = train_x .- train_x_mean;
$$\Sigma = \frac{1}{n} \sum_{i=1}^n \left( x_i - \frac{1}{n} \sum_{j=1}^n x_j \right) \left( x_i - \frac{1}{n} \sum_{j=1}^n x_j \right)^T$$
In [92]:
Sigma = (train_x_minus_mean * train_x_minus_mean')/n;
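This is the uncorrected (divide-by-n) covariance; as a sanity check, it should match the built-in cov (assuming train_x from above):
In [ ]:
# the formula above is the covariance with the 1/n normalization
Sigma ≈ cov(train_x; dims=2, corrected=false)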
In [93]:
# find the eigendecomposition of Sigma
ESigma = eigen(Sigma; sortby = (x -> -x))
Out[93]:
Eigen{Float64, Float64, Matrix{Float64}, Vector{Float64}}
values:
784-element Vector{Float64}:
5.116787728342095
3.7413284788648085
3.2526542396844857
2.8415733372665466
2.5670749588127753
2.273625491620609
1.7251262295281957
1.5205348976536204
1.456280980887692
1.24272937641733
1.1120709732036007
1.066622765935621
0.9046657547344792
⋮
-7.623437305843539e-19
-1.2729150970044885e-18
-1.7668886090777884e-18
-5.950201742648399e-18
-1.0450825653106704e-17
-2.008205752491917e-17
-4.418029991480794e-17
-4.664583884447206e-17
-1.0047562437025719e-16
-1.4704751974041651e-16
-1.48481691022555e-16
-1.877203746228916e-16
vectors:
784×784 Matrix{Float64}:
0.0 0.0 0.0 … 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 … 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 … 0.0 0.0
0.0 0.0 0.0 0.0 0.0
-1.13023e-6 1.79706e-6 7.96314e-7 -8.44767e-5 5.80556e-7
⋮ ⋱
-8.769e-5 0.00143078 0.00113599 1.69282e-13 2.31908e-14
-4.5962e-5 0.000920176 0.000863292 -1.58675e-13 -2.16169e-13
-1.27477e-5 0.000500157 0.000487834 2.18568e-13 2.39267e-13
1.72528e-5 0.000219159 0.000211216 … -7.11618e-13 1.05959e-13
5.51266e-6 0.000109011 9.13385e-5 1.90423e-12 -4.67126e-13
-2.59364e-7 3.54844e-5 3.45235e-5 -3.5538e-12 2.0413e-13
-4.95273e-7 2.12795e-5 1.89427e-5 7.94817e-13 -2.23857e-13
1.37842e-7 2.22338e-6 2.25569e-6 0.0 0.0
0.0 0.0 0.0 … 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
In [94]:
# largest eigenvalue
ESigma.values[1]
Out[94]:
5.116787728342095
In [95]:
# corresponding eigenvector
ESigma.vectors[:, 1]
Out[95]:
784-element Vector{Float64}:
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-1.1302299848127223e-6
⋮
-8.768996662291903e-5
-4.596197162692818e-5
-1.2747685974477327e-5
1.7252770690368013e-5
5.512661031548127e-6
-2.593635884534651e-7
-4.952733257714004e-7
1.3784202612423166e-7
0.0
0.0
0.0
0.0
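As a sanity check, the top value and vector should satisfy the eigenpair equation for Sigma, up to floating-point roundoff:
In [ ]:
# verify Sigma * v = lambda * v for the top eigenpair
v1 = ESigma.vectors[:, 1];
Sigma * v1 ≈ ESigma.values[1] * v1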
In [96]:
imshow(reshape(ESigma.vectors[:, 1], (28,28))');
In [97]:
imshow(reshape(ESigma.vectors[:, 2], (28,28))');
In [98]:
imshow(reshape(ESigma.vectors[:, 3], (28,28))');
In [31]:
imshow(reshape(ESigma.vectors[:, 4], (28,28))');
In [32]:
imshow(reshape(ESigma.vectors[:, 5], (28,28))');
In [115]:
semilogx(cumsum(ESigma.values))
ylabel("cumulative sum of eigenvalues")
xlabel("index of eigenvalue")
title("Eigenvalues of MNIST Covariance Matrix");
PCA can represent objects in a low-dimensional space without losing much information.
In [107]:
D = 20;
In [108]:
A = ESigma.vectors[:, 1:D]';
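Before reconstructing anything, we can ask how much of the total variance the top D components capture (a quick check using ESigma and D from above):
In [ ]:
# fraction of total variance captured by the top D principal components
sum(ESigma.values[1:D]) / sum(ESigma.values)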
In [109]:
# original image
i = 1337;
imshow(reshape(train_x[:,i], (28,28))');
In [110]:
# dimension-reduced image
x_dr = A * train_x_minus_mean[:,i]
x_recovered = A' * x_dr + train_x_mean;
# recovered image
imshow(reshape(x_recovered, (28,28))');
Still enough to classify the image!
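We can also quantify the loss from keeping only D = 20 of 784 dimensions; a rough check of the relative reconstruction error (using x_recovered, train_x, and i from the cells above):
In [ ]:
# relative error between the recovered image and the original
norm(x_recovered - train_x[:,i]) / norm(train_x[:,i])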
Sparsity
In [117]:
n = 4096;
X = sprand(n,n,0.1);
Y = sprand(n,n,0.1);
In [118]:
# visualize part of it
imshow(X[1:64,1:64]);
colorbar();
In [119]:
# get the size of X in bytes
Base.summarysize(X)
Out[119]:
26930056
In [121]:
26930056/(n*n*8)
Out[121]:
0.20064455270767212
In [122]:
# type of X
typeof(X)
Out[122]:
SparseMatrixCSC{Float64, Int64}
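CSC (compressed sparse column) format stores the matrix column by column in three arrays: colptr marks where each column's entries begin, rowval holds the row indices, and nzval holds the values. These are internal fields, but we can peek at their sizes for the X above:
In [ ]:
# colptr has one entry per column plus one; rowval and nzval have one entry per stored nonzero
(length(X.colptr), length(X.rowval), length(X.nzval))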
In [123]:
Xdense = Matrix(X);
Ydense = Matrix(Y);
typeof(Xdense)
Out[123]:
Matrix{Float64} (alias for Array{Float64, 2})
In [124]:
Base.summarysize(Xdense)
Out[124]:
134217776
In [125]:
# what's the ratio?
Base.summarysize(X) / Base.summarysize(Xdense)
Out[125]:
0.20064448095161405
In [126]:
# what fraction of X's entries are nonzero?
nnz(X) / length(X)
Out[126]:
0.1000131368637085
Why are these numbers not equal?
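Each stored entry costs an 8-byte Int64 row index on top of its 8-byte Float64 value, and there is also an (n+1)-entry column-pointer array, so a 10%-dense matrix takes roughly 20% of the dense storage. A back-of-the-envelope estimate:
In [ ]:
# predicted sparse storage as a fraction of dense storage (8 bytes per entry)
(16 * nnz(X) + 8 * (n + 1)) / (8 * n * n)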
In [129]:
@time X * Y;
0.631217 seconds (13 allocations: 280.660 MiB, 1.60% gc time)
In [131]:
dense_mmpy_time = @elapsed Xdense * Ydense
Out[131]:
0.401853167
What happened here???
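One reason: the product of two 10%-dense matrices is almost completely dense, since an output entry is zero only when all n paired terms miss. The sparse multiply pays its indexing overhead without getting a sparse output in return. A quick check (reusing X and Y from above):
In [ ]:
# density of the product: about 1 - (1 - 0.1^2)^n, essentially 1 for n = 4096
nnz(X * Y) / (n * n)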
In [134]:
# make X and Y less dense
Xsparser = sprand(n,n,0.01);
Ysparser = sprand(n,n,0.01);
In [136]:
@time Xsparser * Ysparser;
0.065674 seconds (13 allocations: 123.676 MiB, 0.89% gc time)
In [137]:
densities = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3];
sparse_mm_times = Float64[];
for d in densities
Xsparse = sprand(n,n,d);
Ysparse = sprand(n,n,d);
push!(sparse_mm_times, @elapsed(Xsparse * Ysparse));
end
In [138]:
loglog(densities, sparse_mm_times; label="sparse mmpy");
loglog(densities, densities * 0 .+ dense_mmpy_time; label="dense mmpy");
xlabel("density");
ylabel("matrix multiply runtime");
legend();
In [139]:
spv = X[:,1]
Out[139]:
4096-element SparseVector{Float64, Int64} with 418 stored entries:
[6 ] = 0.965586
[10 ] = 0.770424
[13 ] = 0.607012
[30 ] = 0.821796
[35 ] = 0.743811
[47 ] = 0.173643
[48 ] = 0.333206
[65 ] = 0.75069
[75 ] = 0.631344
[77 ] = 0.840866
⋮
[3972] = 0.584529
[3981] = 0.358314
[3985] = 0.312482
[3986] = 0.0932752
[4005] = 0.317625
[4007] = 0.761526
[4012] = 0.0655958
[4019] = 0.345667
[4022] = 0.185923
[4090] = 0.715475
[4095] = 0.520961
In [56]:
spv.n
Out[56]:
4096
In [57]:
spv.nzind
Out[57]:
429-element Vector{Int64}:
9
17
18
24
28
54
60
67
70
79
83
89
93
⋮
4010
4011
4032
4034
4036
4040
4057
4061
4062
4065
4066
4094
In [58]:
spv.nzval
Out[58]:
429-element Vector{Float64}:
0.3665466206937954
0.9898197098105949
0.8472553677537141
0.4200382543710388
0.575582096772398
0.478161953650292
0.5218372708763237
0.5337721068240063
0.6665316753775424
0.2454782188665694
0.6461326583138556
0.8885971852707479
0.3711005317479982
⋮
0.5222351482640311
0.15576228978743545
0.0989029063487934
0.0017576590602756959
0.5716640778660788
0.2710602459932675
0.8826343694387182
0.9838845254047324
0.8704305124740629
0.9049698008038184
0.6041680082714094
0.24830600332937425
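These index and value arrays are what sparse algorithms iterate over, so the work scales with the number of stored entries rather than with n. As an illustration, here is a sketch of a dot product against a dense vector (sparse_dot is our own illustrative helper, not a library function):
In [ ]:
# dot product that only visits the stored entries of the sparse vector
function sparse_dot(sv::SparseVector, v::Vector{Float64})
    acc = 0.0
    for (k, i) in enumerate(sv.nzind)
        acc += sv.nzval[k] * v[i]
    end
    return acc
end
v = randn(n);
sparse_dot(spv, v) ≈ dot(spv, v)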