In [22]:
using LinearAlgebra
using Statistics
using SparseArrays
using MLDatasets
using PyPlot
In [2]:
import Pkg; Pkg.add("MLDatasets")
    Updating registry at `~/.julia/registries/General.toml`
   Resolving package versions...
    Updating `~/.julia/environments/v1.8/Project.toml`
  [eb30cadb] + MLDatasets v0.7.13
    Updating `~/.julia/environments/v1.8/Manifest.toml`
  [... dependencies of MLDatasets added ...]
Precompiling project...
  86 dependencies successfully precompiled in 23 seconds. 170 already precompiled.

Test of Johnson-Lindenstrauss transform
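
The Johnson–Lindenstrauss lemma says that a random matrix $A \in \mathbb{R}^{D \times d}$ with i.i.d. $\mathcal{N}(0, 1/D)$ entries and $D = O(\log(n)/\epsilon^2)$ rows preserves, with high probability, all pairwise distances among $n$ points up to a factor of $1 \pm \epsilon$:

$$(1 - \epsilon) \, \|x_i - x_j\| \;\le\; \|A x_i - A x_j\| \;\le\; (1 + \epsilon) \, \|x_i - x_j\| \quad \text{for all } i, j.$$

The cells below use $D = \lceil 8 \log(n) / \epsilon^2 \rceil$, one common choice of the constant, and then check the distortion empirically.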

In [3]:
d = 100000;
n = 100;

Xs = randn(d,n); # the dataset
In [12]:
epsilon = 0.1;
D = Int64(ceil(8 * log(n) / epsilon^2))
Out[12]:
3685
In [13]:
A = randn(D, d) / sqrt(D);
In [14]:
# let's look at this for a single example
norm(Xs[:,1] - Xs[:,2])
Out[14]:
447.22513073742795
In [15]:
norm(A*Xs[:,1] - A*Xs[:,2])
Out[15]:
441.36399897928436
In [16]:
norm(A*Xs[:,1] - A*Xs[:,2]) / norm(Xs[:,1] - Xs[:,2])
Out[16]:
0.9868944490027222
In [17]:
AXs = A*Xs;
In [18]:
# over the whole dataset
extrema((i == j) ? 1.0 : norm(AXs[:,i] - AXs[:,j]) / norm(Xs[:,i] - Xs[:,j]) for i = 1:n, j = 1:n)
Out[18]:
(0.9605492098673976, 1.0413298714811747)
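
Both extremes lie inside $[1 - \epsilon, 1 + \epsilon] = [0.9, 1.1]$, as the lemma promises. A direct check of that claim, written as a sketch (not run above) using the variables already defined:

In [ ]:
# check that every pairwise distance is preserved to within 1 ± epsilon
ratios = [norm(AXs[:,i] - AXs[:,j]) / norm(Xs[:,i] - Xs[:,j]) for i = 1:n for j = 1:n if i != j];
all((1 - epsilon) .<= ratios .<= (1 + epsilon))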
In [19]:
# how much did we decrease the dimension?
D/d
Out[19]:
0.03685

Principal Component Analysis

In [20]:
n = 60000;
d = 28*28;
In [21]:
# load the MNIST training set (MLDatasets v0.7 API; replaces the older MNIST.traindata())
mnist_train = MNIST(split=:train);
train_x, train_y = mnist_train.features, mnist_train.targets;
In [13]:
train_x = Float64.(reshape(train_x, (d, n)));
In [14]:
train_x_mean = mean(train_x; dims=2);
In [15]:
train_x_minus_mean = train_x .- train_x_mean;
$$\Sigma = \frac{1}{n} \sum_{i=1}^n \left( x_i - \frac{1}{n} \sum_{j=1}^n x_j \right) \left( x_i - \frac{1}{n} \sum_{j=1}^n x_j \right)^T$$
In [16]:
Sigma = (train_x_minus_mean * train_x_minus_mean')/n;
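This Σ is just the uncorrected sample covariance of the columns of train_x, so it can be checked against the built-in covariance from Statistics (a sketch, not run above):

In [ ]:
# sanity check: same covariance, treating columns as observations and dividing by n (not n-1)
Sigma ≈ cov(train_x; dims=2, corrected=false)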
In [17]:
# find the eigendecomposition of Sigma
ESigma = eigen(Sigma; sortby = (x -> -x))
Out[17]:
Eigen{Float64,Float64,Array{Float64,2},Array{Float64,1}}
values:
784-element Array{Float64,1}:
  5.116787728342092
  3.7413284788648067
  3.2526542396844844
  2.8415733372665466
  2.5670749588127744
  2.273625491620609
  1.7251262295281955
  1.5205348976536206
  1.4562809808876922
  1.2427293764173304
  1.1120709732036005
  1.0666227659356209
  0.9046657547344794
  ⋮
 -4.341596436079342e-19
 -7.816133441131003e-19
 -8.328205317261318e-19
 -9.06846445474923e-19
 -2.7674683969166128e-18
 -7.155496110387914e-18
 -1.057007120922721e-17
 -2.366763579350205e-17
 -4.458784081324597e-17
 -6.16504989725162e-17
 -6.378346028260054e-17
 -9.992685906775396e-17
vectors:
784×784 Array{Float64,2}:
  0.0         0.0          0.0          …   0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0          …   0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0          …   0.0           0.0
  0.0         0.0          0.0              0.0           0.0
 -1.13023e-6  1.79706e-6   7.96314e-7      -1.82916e-5   -1.69162e-5
  ⋮                                     ⋱                
 -8.769e-5    0.00143078   0.00113599       1.16734e-13   8.06924e-14
 -4.5962e-5   0.000920176  0.000863292      9.46519e-14  -2.08525e-13
 -1.27477e-5  0.000500157  0.000487834     -4.86915e-13   4.10592e-13
  1.72528e-5  0.000219159  0.000211216  …   1.07941e-12  -3.80997e-13
  5.51266e-6  0.000109011  9.13385e-5      -1.36204e-12   9.24027e-14
 -2.59364e-7  3.54844e-5   3.45235e-5       2.02711e-12  -1.66652e-13
 -4.95273e-7  2.12795e-5   1.89427e-5      -5.01332e-13   1.7726e-13
  1.37842e-7  2.22338e-6   2.25569e-6       0.0           0.0
  0.0         0.0          0.0          …   0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
  0.0         0.0          0.0              0.0           0.0
In [18]:
# largest eigenvalue
ESigma.values[1]
Out[18]:
5.116787728342092
In [19]:
# corresponding eigenvector
ESigma.vectors[:, 1]
Out[19]:
784-element Array{Float64,1}:
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
 -1.1302299848127289e-6
  ⋮
 -8.768996662290797e-5
 -4.596197162656801e-5
 -1.2747685973350732e-5
  1.7252770690812103e-5
  5.512661031686905e-6
 -2.593635886755097e-7
 -4.952733260038533e-7
  1.3784202612423542e-7
  0.0
  0.0
  0.0
  0.0
In [20]:
imshow(reshape(ESigma.vectors[:, 1], (28,28))');
In [21]:
imshow(reshape(ESigma.vectors[:, 2], (28,28))');
In [22]:
imshow(reshape(ESigma.vectors[:, 3], (28,28))');
In [23]:
imshow(reshape(ESigma.vectors[:, 4], (28,28))');
In [24]:
imshow(reshape(ESigma.vectors[:, 5], (28,28))');
In [25]:
plot(ESigma.values)
ylabel("eigenvalue")
xlabel("index of eigenvalue")
title("Eigenvalues of MNIST Covariance Matrix");

PCA can represent objects in low dimension without losing much information.

In [26]:
D = 5;
In [27]:
A = ESigma.vectors[:, 1:D]';
In [28]:
# original image
i = 1337;
imshow(reshape(train_x[:,i], (28,28))');
In [29]:
# dimension-reduced image
x_dr = A * train_x_minus_mean[:,i]
x_recovered = A' * x_dr + train_x_mean;

# recovered (reconstructed) image
imshow(reshape(x_recovered, (28,28))');

Still enough to classify the image!
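
One way to quantify how much the D-dimensional representation preserves is the relative reconstruction error (a sketch, not run above):

In [ ]:
# relative error of the rank-D reconstruction of image i
norm(x_recovered - train_x[:,i]) / norm(train_x[:,i])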

Sparsity

In [23]:
n = 4096;
X = sprand(n,n,0.1);
Y = sprand(n,n,0.1);
In [32]:
# visualize part of it
imshow(X[1:64,1:64]);
colorbar();
In [33]:
# get the size of X in bytes
Base.summarysize(X)
Out[33]:
26896696
In [26]:
26896696/(n*n*8)
Out[26]:
0.20039600133895874
In [34]:
# type of X
typeof(X)
Out[34]:
SparseMatrixCSC{Float64,Int64}
In [35]:
Xdense = Matrix(X);
Ydense = Matrix(Y);
typeof(Xdense)
Out[35]:
Array{Float64,2}
In [36]:
Base.summarysize(Xdense)
Out[36]:
134217768
In [37]:
# what's the ratio?
Base.summarysize(X) / Base.summarysize(Xdense)
Out[37]:
0.20039594161631416
In [38]:
# what fraction of X's entries are nonzero?
nnz(X) / length(X)
Out[38]:
0.10007530450820923

Why are these numbers not equal?
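
Each stored entry of a SparseMatrixCSC costs 16 bytes (an 8-byte Float64 value in nzval plus an 8-byte Int64 row index in rowval), and there is one extra 8-byte column pointer per column, so the footprint is roughly twice the nonzero fraction of a dense Float64 matrix. A rough estimate of that overhead (a sketch, not run above):

In [ ]:
# CSC storage: 8 bytes per value + 8 bytes per row index + 8 bytes per column pointer
csc_bytes = 16 * nnz(X) + 8 * (size(X, 2) + 1);
csc_bytes / (n * n * 8)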

In [39]:
@time X * Y;
  1.312289 seconds (110.34 k allocations: 287.229 MiB, 0.31% gc time)
In [40]:
dense_mmpy_time = @elapsed Xdense * Ydense
Out[40]:
1.174083337

What happened here???
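
At 10% density the product is no longer sparse: each entry of X * Y sums over 4096 products, and the chance that all of them vanish is about (1 - 0.01)^4096 ≈ 1e-18, so the result is essentially fully dense. The sparse algorithm then does about as much arithmetic as the dense one, with worse memory locality. A quick check of the product's density (a sketch, not run above):

In [ ]:
# the product of two 10%-dense matrices is almost completely dense
Z = X * Y;
nnz(Z) / length(Z)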

In [41]:
# make X and Y less dense
Xsparser = sprand(n,n,0.01);
Ysparser = sprand(n,n,0.01);
@time Xsparser * Ysparser;
  0.121884 seconds (8 allocations: 94.578 MiB)
In [42]:
densities = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3];
sparse_mm_times = Float64[];
for d in densities
    Xsparse = sprand(n,n,d);
    Ysparse = sprand(n,n,d);
    push!(sparse_mm_times, @elapsed(Xsparse * Ysparse));
end
In [43]:
loglog(densities, sparse_mm_times; label="sparse mmpy");
loglog(densities, densities * 0 .+ dense_mmpy_time; label="dense mmpy");
xlabel("density");
ylabel("matrix multiply runtime");
legend();
In [46]:
spv = X[:,1]
Out[46]:
4096-element SparseVector{Float64,Int64} with 427 stored entries:
  [16  ]  =  0.494497
  [18  ]  =  0.826087
  [34  ]  =  0.687849
  [59  ]  =  0.318373
  [63  ]  =  0.543537
  [73  ]  =  0.460325
  [75  ]  =  0.0852525
  [90  ]  =  0.141923
  [108 ]  =  0.0701148
  [116 ]  =  0.994332
          ⋮
  [4029]  =  0.570289
  [4035]  =  0.54624
  [4040]  =  0.202171
  [4042]  =  0.403688
  [4062]  =  0.420507
  [4066]  =  0.538902
  [4068]  =  0.265724
  [4072]  =  0.191174
  [4080]  =  0.67434
  [4084]  =  0.384076
  [4089]  =  0.188717
In [47]:
spv.n
Out[47]:
4096
In [48]:
spv.nzind
Out[48]:
427-element Array{Int64,1}:
   16
   18
   34
   59
   63
   73
   75
   90
  108
  116
  125
  127
  134
    ⋮
 4021
 4029
 4035
 4040
 4042
 4062
 4066
 4068
 4072
 4080
 4084
 4089
In [49]:
spv.nzval
Out[49]:
427-element Array{Float64,1}:
 0.4944967485040994
 0.8260867791524058
 0.6878492561841358
 0.31837265760532185
 0.5435373410214652
 0.4603252124070474
 0.08525252863099708
 0.14192308123381947
 0.07011482669433211
 0.994332364738097
 0.07054927165346192
 0.3335951909204913
 0.9303865975954675
 ⋮
 0.2442214611309792
 0.5702885838128535
 0.5462397434084996
 0.20217057750630207
 0.40368809673279826
 0.420507447700027
 0.5389015609303036
 0.2657240323430008
 0.19117367805893304
 0.674340219210197
 0.3840757164610593
 0.18871747643515846
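
The full matrix X is stored in the same compressed sparse column (CSC) layout: colptr marks where each column's entries start in the rowval (row indices) and nzval (values) arrays. A sketch (not run above) that recovers the first column directly from those fields:

In [ ]:
# entries of column j live in positions colptr[j]:(colptr[j+1] - 1) of rowval and nzval
rng = X.colptr[1]:(X.colptr[2] - 1);
(X.rowval[rng] == spv.nzind, X.nzval[rng] == spv.nzval)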
In [ ]: