In [1]:
using LinearAlgebra
using Statistics
using SparseArrays
using MLDatasets
using PyPlot

Test of Johnson-Lindenstrauss transform

In [59]:
d = 100000;
n = 100;

Xs = randn(d,n); # the dataset
In [79]:
epsilon = 0.1;
r = Int64(ceil(1 * log(n) / epsilon^2))  # target dimension from the JL lemma (leading constant set to 1)
Out[79]:
461
In [80]:
A = randn(r, d) / sqrt(r);
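
The Gaussian matrix is only one valid choice; a matrix of uniformly random ±1 entries scaled by 1/sqrt(r) also has the JL property (a standard alternative, not used above). A minimal, unexecuted sketch:

In [ ]:
# hypothetical alternative: random sign matrix instead of Gaussian entries
A_sign = rand([-1.0, 1.0], r, d) / sqrt(r);
norm(A_sign*Xs[:,1] - A_sign*Xs[:,2]) / norm(Xs[:,1] - Xs[:,2])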
In [81]:
# let's look at the distortion for a single pair of examples
norm(Xs[:,1] - Xs[:,2])
Out[81]:
445.77408091448353
In [82]:
norm(A*Xs[:,1] - A*Xs[:,2])
Out[82]:
439.99821861375597
In [83]:
norm(A*Xs[:,1] - A*Xs[:,2]) / norm(Xs[:,1] - Xs[:,2])
Out[83]:
0.9870430728298992
In [84]:
AXs = A*Xs;
In [85]:
# over the whole dataset; with the leading constant set to 1, some pairs land outside 1 ± ε
extrema((i == j) ? 1.0 : norm(AXs[:,i] - AXs[:,j]) / norm(Xs[:,i] - Xs[:,j]) for i = 1:n, j = 1:n)
Out[85]:
(0.884736519025868, 1.1603533040975242)
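
The extrema only show the worst pairs; to see the whole distribution of distortions, we could histogram the pairwise ratios (a sketch, not executed here):

In [ ]:
# distribution of distortion ratios over all distinct pairs
ratios = [norm(AXs[:,i] - AXs[:,j]) / norm(Xs[:,i] - Xs[:,j]) for i = 1:n for j = 1:n if i != j];
hist(ratios; bins=30);
xlabel("distortion ratio");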
In [86]:
# how much did we decrease the dimension?
d/r
Out[86]:
216.91973969631238
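
The compression factor is driven entirely by epsilon, since r grows like log(n)/epsilon^2. A quick sweep makes the tradeoff concrete (sketch, not executed):

In [ ]:
# target dimension and compression ratio as a function of epsilon
for eps_val in [0.05, 0.1, 0.2, 0.5]
    r_val = Int64(ceil(log(n) / eps_val^2))
    println("epsilon = $eps_val  =>  r = $r_val, compression d/r = $(round(d / r_val; digits=1))")
end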

Principal Component Analysis

In [87]:
n = 60000;
d = 28*28;
In [88]:
train_x, train_y = MNIST.traindata();
In [89]:
train_x = Float64.(reshape(train_x, (d, n)));
In [90]:
train_x_mean = mean(train_x; dims=2);
In [91]:
train_x_minus_mean = train_x .- train_x_mean;
$$\Sigma = \frac{1}{n} \sum_{i=1}^n \left( x_i - \frac{1}{n} \sum_{j=1}^n x_j \right) \left( x_i - \frac{1}{n} \sum_{j=1}^n x_j \right)^T$$
In [92]:
Sigma = (train_x_minus_mean * train_x_minus_mean')/n;
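
Equivalently, the principal components are the left singular vectors of the centered data matrix, and the eigenvalues of Sigma are the squared singular values divided by n. A sketch of that route (unexecuted; the SVD of a 784×60000 matrix takes noticeably longer than the 784×784 eigendecomposition below):

In [ ]:
# PCA via the SVD of the centered data matrix
U, S, V = svd(train_x_minus_mean);
S[1]^2 / n  # should match the largest eigenvalue of Sigma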
In [93]:
# find the eigendecomposition of Sigma (Sigma is PSD, so the tiny negative eigenvalues below are floating-point roundoff)
ESigma = eigen(Sigma; sortby = (x -> -x))
Out[93]:
Eigen{Float64, Float64, Matrix{Float64}, Vector{Float64}}
values:
784-element Vector{Float64}:
  5.116787728342095
  3.7413284788648085
  3.2526542396844857
  2.8415733372665466
  2.5670749588127753
  2.273625491620609
  1.7251262295281957
  1.5205348976536204
  1.456280980887692
  1.24272937641733
  1.1120709732036007
  1.066622765935621
  0.9046657547344792
  ⋮
 -7.623437305843539e-19
 -1.2729150970044885e-18
 -1.7668886090777884e-18
 -5.950201742648399e-18
 -1.0450825653106704e-17
 -2.008205752491917e-17
 -4.418029991480794e-17
 -4.664583884447206e-17
 -1.0047562437025719e-16
 -1.4704751974041651e-16
 -1.48481691022555e-16
 -1.877203746228916e-16
vectors:
784×784 Matrix{Float64}:
  0.0         0.0          0.0          …   0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0          …   0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0          …   0.0           0.0
  0.0         0.0          0.0              0.0           0.0
 -1.13023e-6  1.79706e-6   7.96314e-7      -8.44767e-5    5.80556e-7
  ⋮                                     ⋱                
 -8.769e-5    0.00143078   0.00113599       1.69282e-13   2.31908e-14
 -4.5962e-5   0.000920176  0.000863292     -1.58675e-13  -2.16169e-13
 -1.27477e-5  0.000500157  0.000487834      2.18568e-13   2.39267e-13
  1.72528e-5  0.000219159  0.000211216  …  -7.11618e-13   1.05959e-13
  5.51266e-6  0.000109011  9.13385e-5       1.90423e-12  -4.67126e-13
 -2.59364e-7  3.54844e-5   3.45235e-5      -3.5538e-12    2.0413e-13
 -4.95273e-7  2.12795e-5   1.89427e-5       7.94817e-13  -2.23857e-13
  1.37842e-7  2.22338e-6   2.25569e-6       0.0           0.0
  0.0         0.0          0.0          …   0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
In [94]:
# largest eigenvalue
ESigma.values[1]
Out[94]:
5.116787728342095
In [95]:
# corresponding eigenvector
ESigma.vectors[:, 1]
Out[95]:
784-element Vector{Float64}:
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
 -1.1302299848127223e-6
  ⋮
 -8.768996662291903e-5
 -4.596197162692818e-5
 -1.2747685974477327e-5
  1.7252770690368013e-5
  5.512661031548127e-6
 -2.593635884534651e-7
 -4.952733257714004e-7
  1.3784202612423166e-7
  0.0
  0.0
  0.0
  0.0
In [96]:
imshow(reshape(ESigma.vectors[:, 1], (28,28))');
[figure: eigenvector 1 of Sigma, reshaped to a 28×28 image]
In [97]:
imshow(reshape(ESigma.vectors[:, 2], (28,28))');
[figure: eigenvector 2, reshaped to 28×28]
In [98]:
imshow(reshape(ESigma.vectors[:, 3], (28,28))');
[figure: eigenvector 3, reshaped to 28×28]
In [31]:
imshow(reshape(ESigma.vectors[:, 4], (28,28))');
[figure: eigenvector 4, reshaped to 28×28]
In [32]:
imshow(reshape(ESigma.vectors[:, 5], (28,28))');
[figure: eigenvector 5, reshaped to 28×28]
In [115]:
semilogx(cumsum(ESigma.values))
ylabel("eigenvalue")
xlabel("index of eigenvalue")
title("Eigenvalues of MNIST Covariance Matrix");
[figure: cumulative sum of the covariance eigenvalues, log-scale index axis]
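
Since the eigenvalues decay quickly, a handful of components should carry most of the variance. A quick check (sketch, not executed):

In [ ]:
# fraction of total variance captured by the top 20 components
sum(ESigma.values[1:20]) / sum(ESigma.values)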

PCA can represent objects in a low-dimensional space without losing much information.

In [107]:
D = 20;
In [108]:
A = ESigma.vectors[:, 1:D]';
In [109]:
# original image
i = 1337;
imshow(reshape(train_x[:,i], (28,28))');
[figure: the original 28×28 digit image]
In [110]:
# project onto the top D principal components
x_dr = A * train_x_minus_mean[:,i]
x_recovered = A' * x_dr + train_x_mean;

# reconstructed image
imshow(reshape(x_recovered, (28,28))');
[figure: the image reconstructed from its 20-dimensional representation]

Still enough to classify the image!
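
To put a number on what the compression preserves, we could measure the relative reconstruction error (sketch, not executed):

In [ ]:
# relative reconstruction error of the compressed image
norm(vec(x_recovered) - train_x[:,i]) / norm(train_x[:,i])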

In [ ]:

Sparsity

In [117]:
n = 4096;
X = sprand(n,n,0.1);
Y = sprand(n,n,0.1);
In [118]:
# visualize part of it
imshow(X[1:64,1:64]);
colorbar();
[figure: heatmap of the top-left 64×64 block of X]
In [119]:
# get the size of X in bytes
Base.summarysize(X)
Out[119]:
26930056
In [121]:
# sparse size as a fraction of dense storage (n*n Float64 entries, 8 bytes each)
26930056/(n*n*8)
Out[121]:
0.20064455270767212
In [122]:
# type of X
typeof(X)
Out[122]:
SparseMatrixCSC{Float64, Int64}
In [123]:
Xdense = Matrix(X);
Ydense = Matrix(Y);
typeof(Xdense)
Out[123]:
Matrix{Float64} (alias for Array{Float64, 2})
In [124]:
Base.summarysize(Xdense)
Out[124]:
134217776
In [125]:
# what's the ratio?
Base.summarysize(X) / Base.summarysize(Xdense)
Out[125]:
0.20064448095161405
In [126]:
# what fraction of X's entries are nonzero?
nnz(X) / length(X)
Out[126]:
0.1000131368637085

Why are these numbers not equal? Because the CSC format stores, for every nonzero, an 8-byte Float64 value plus an 8-byte Int64 row index (and a vector of column pointers), so it costs roughly twice the density of the raw values alone.
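
A quick sanity check of that arithmetic (a sketch; Base.summarysize also counts some struct and vector-header overhead, so the match is only approximate):

In [ ]:
# CSC cost: 8 bytes per stored value + 8 bytes per row index + (n+1) column pointers
predicted = 16 * nnz(X) + 8 * (n + 1);
predicted / (n * n * 8)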

In [129]:
@time X * Y;
  0.631217 seconds (13 allocations: 280.660 MiB, 1.60% gc time)
In [131]:
dense_mmpy_time = @elapsed Xdense * Ydense
Out[131]:
0.401853167

What happened here??? At 10% density the sparse multiply is slower than the dense one: the product of two 10%-dense matrices is nearly dense, and the sparse algorithm pays per-entry index bookkeeping, while the dense multiply runs in a highly optimized BLAS kernel.

In [134]:
# make X and Y less dense
Xsparser = sprand(n,n,0.01);
Ysparser = sprand(n,n,0.01);
In [136]:
@time Xsparser * Ysparser;
  0.065674 seconds (13 allocations: 123.676 MiB, 0.89% gc time)
In [137]:
densities = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3];
sparse_mm_times = Float64[];
for d in densities
    Xsparse = sprand(n,n,d);
    Ysparse = sprand(n,n,d);
    push!(sparse_mm_times, @elapsed(Xsparse * Ysparse));
end
In [138]:
loglog(densities, sparse_mm_times; label="sparse mmpy");
loglog(densities, densities * 0 .+ dense_mmpy_time; label="dense mmpy");
xlabel("density");
ylabel("matrix multiply runtime");
legend();
[figure: log-log plot of sparse vs. dense matrix-multiply runtime against density]
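
As a rough model, multiplying two n×n matrices of density ρ takes about n³ρ² scalar multiplications (each nonzero in column k of X pairs with each nonzero in row k of Y), so the sparse runtime should grow roughly quadratically in the density. A reference line could be overlaid like this (sketch, not executed):

In [ ]:
# overlay a ~density^2 reference curve, anchored at the densest measurement
c = sparse_mm_times[end] / densities[end]^2;
loglog(densities, c .* densities.^2; label="~density^2 reference");
legend();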
In [139]:
spv = X[:,1]
Out[139]:
4096-element SparseVector{Float64, Int64} with 418 stored entries:
  [6   ]  =  0.965586
  [10  ]  =  0.770424
  [13  ]  =  0.607012
  [30  ]  =  0.821796
  [35  ]  =  0.743811
  [47  ]  =  0.173643
  [48  ]  =  0.333206
  [65  ]  =  0.75069
  [75  ]  =  0.631344
  [77  ]  =  0.840866
          ⋮
  [3972]  =  0.584529
  [3981]  =  0.358314
  [3985]  =  0.312482
  [3986]  =  0.0932752
  [4005]  =  0.317625
  [4007]  =  0.761526
  [4012]  =  0.0655958
  [4019]  =  0.345667
  [4022]  =  0.185923
  [4090]  =  0.715475
  [4095]  =  0.520961
In [56]:
spv.n
Out[56]:
4096
In [57]:
spv.nzind
Out[57]:
429-element Vector{Int64}:
    9
   17
   18
   24
   28
   54
   60
   67
   70
   79
   83
   89
   93
    ⋮
 4010
 4011
 4032
 4034
 4036
 4040
 4057
 4061
 4062
 4065
 4066
 4094
In [58]:
spv.nzval
Out[58]:
429-element Vector{Float64}:
 0.3665466206937954
 0.9898197098105949
 0.8472553677537141
 0.4200382543710388
 0.575582096772398
 0.478161953650292
 0.5218372708763237
 0.5337721068240063
 0.6665316753775424
 0.2454782188665694
 0.6461326583138556
 0.8885971852707479
 0.3711005317479982
 ⋮
 0.5222351482640311
 0.15576228978743545
 0.0989029063487934
 0.0017576590602756959
 0.5716640778660788
 0.2710602459932675
 0.8826343694387182
 0.9838845254047324
 0.8704305124740629
 0.9049698008038184
 0.6041680082714094
 0.24830600332937425
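
The same field-level view exists for sparse matrices: a SparseMatrixCSC stores a colptr vector marking where each column's entries begin, plus rowval and nzval holding the row indices and values of the stored entries (sketch, not executed):

In [ ]:
# peek at the CSC fields of X
X.colptr[1:5], X.rowval[1:5], X.nzval[1:5]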
In [ ]: