using LinearAlgebra
using Statistics
using SparseArrays
using MLDatasets
using PyPlot
ArgumentError: Package PyPlot not found in current path.
- Run `import Pkg; Pkg.add("PyPlot")` to install the PyPlot package.
Stacktrace:
[1] macro expansion
@ ./loading.jl:1163 [inlined]
[2] macro expansion
@ ./lock.jl:223 [inlined]
[3] require(into::Module, mod::Symbol)
@ Base ./loading.jl:1144
import Pkg; Pkg.add("MLDatasets")
Updating registry at `~/.julia/registries/General.toml` Resolving package versions... Installed ShowCases ─────────────────── v0.1.0 Installed ZipFile ───────────────────── v0.10.1 Installed InitialValues ─────────────── v0.3.1 Installed OffsetArrays ──────────────── v1.12.10 Installed InlineStrings ─────────────── v1.4.0 Installed ContextVariablesX ─────────── v0.1.3 Installed Unitful ───────────────────── v1.17.0 Installed NNlib ─────────────────────── v0.8.21 Installed InvertedIndices ───────────── v1.3.0 Installed FileIO ────────────────────── v1.16.1 Installed DataFrames ────────────────── v1.5.0 Installed BFloat16s ─────────────────── v0.4.2 Installed PrettyPrint ───────────────── v0.2.0 Installed TupleTools ────────────────── v1.4.3 Installed MLUtils ───────────────────── v0.4.3 Installed MAT ───────────────────────── v0.10.6 Installed JLD2 ──────────────────────── v0.4.35 Installed LLVM ──────────────────────── v6.3.0 Installed GZip ──────────────────────── v0.5.2 Installed Chemfiles ─────────────────── v0.10.41 Installed FLoopsBase ────────────────── v0.1.1 Installed AtomsBase ─────────────────── v0.3.5 Installed SentinelArrays ────────────── v1.4.0 Installed StringEncodings ───────────── v0.3.7 Installed DefineSingletons ──────────── v0.1.2 Installed NameResolution ────────────── v0.1.5 Installed MicroCollections ──────────── v0.1.4 Installed WorkerUtilities ───────────── v1.6.1 Installed UnsafeAtomicsLLVM ─────────── v0.1.3 Installed MPIPreferences ────────────── v0.1.9 Installed Graphics ──────────────────── v1.1.2 Installed WeakRefStrings ────────────── v1.4.2 Installed MappedArrays ──────────────── v0.4.2 Installed BufferedStreams ───────────── v1.2.1 Installed PaddedViews ───────────────── v0.5.12 Installed Glob ──────────────────────── v1.3.1 Installed HDF5_jll ──────────────────── v1.12.2+2 Installed CEnum ─────────────────────── v0.4.2 Installed BangBang ──────────────────── v0.3.39 Installed LazyModules ───────────────── v0.3.1 Installed MLStyle ───────────────────── v0.4.17 Installed FLoops ────────────────────── v0.2.1 Installed PeriodicTable ─────────────── v1.1.4 Installed MosaicViews ───────────────── v0.3.4 Installed StructTypes ───────────────── v1.10.0 Installed ArgCheck ──────────────────── v2.3.0 Installed ImageCore ─────────────────── v0.9.4 Installed JSON3 ─────────────────────── v1.13.2 Installed ImageShow ─────────────────── v0.3.8 Installed TableTraits ───────────────── v1.0.1 Installed DataDeps ──────────────────── v0.7.11 Installed Setfield ──────────────────── v1.1.1 Installed PrettyTables ──────────────── v2.2.7 Installed PooledArrays ──────────────── v1.4.3 Installed AbstractFFTs ──────────────── v1.5.0 Installed DataValueInterfaces ───────── v1.0.0 Installed LLVMExtra_jll ─────────────── v0.0.26+0 Installed ConstructionBase ──────────── v1.5.4 Installed StackViews ────────────────── v0.1.1 Installed CompositionsBase ──────────── v0.1.2 Installed Strided ───────────────────── v1.2.3 Installed UnsafeAtomics ─────────────── v0.2.1 Installed KernelAbstractions ────────── v0.9.8 Installed MLDatasets ────────────────── v0.7.13 Installed HDF5 ──────────────────────── v0.17.1 Installed JuliaVariables ────────────── v0.2.4 Installed GPUArraysCore ─────────────── v0.1.5 Installed SimpleTraits ──────────────── v0.9.4 Installed Atomix ────────────────────── v0.1.0 Installed Crayons ───────────────────── v4.1.1 Installed Chemfiles_jll ─────────────── v0.10.4+0 Installed InternedStrings ───────────── v0.7.0 Installed Adapt ─────────────────────── v3.6.2 Installed ImageBase ─────────────────── v0.1.5 Installed NPZ ───────────────────────── v0.4.3 Installed Transducers ───────────────── v0.4.78 Installed CSV ───────────────────────── v0.10.11 Installed Tables ────────────────────── v1.11.0 Installed UnitfulAtomic ─────────────── v1.0.0 Installed Baselet ───────────────────── v0.1.1 Installed IteratorInterfaceExtensions ─ v1.0.0 Installed SplittablesBase ───────────── v0.1.15 Installed FilePathsBase ─────────────── v0.9.21 Installed StringManipulation ────────── v0.3.4 Installed PrecompileTools ───────────── v1.2.0 Installed Pickle ────────────────────── v0.3.3 Updating `~/.julia/environments/v1.8/Project.toml` [eb30cadb] + MLDatasets v0.7.13 Updating `~/.julia/environments/v1.8/Manifest.toml` [621f4979] + AbstractFFTs v1.5.0 [79e6a3ab] + Adapt v3.6.2 [dce04be8] + ArgCheck v2.3.0 [a9b6321e] + Atomix v0.1.0 [a963bdd2] + AtomsBase v0.3.5 [ab4f0b2a] + BFloat16s v0.4.2 [198e06fe] + BangBang v0.3.39 [9718e550] + Baselet v0.1.1 [e1450e63] + BufferedStreams v1.2.1 [fa961155] + CEnum v0.4.2 [336ed68f] + CSV v0.10.11 [46823bd8] + Chemfiles v0.10.41 [a33af91c] + CompositionsBase v0.1.2 [187b0558] + ConstructionBase v1.5.4 [6add18c4] + ContextVariablesX v0.1.3 [a8cc5b0e] + Crayons v4.1.1 [124859b0] + DataDeps v0.7.11 ⌃ [a93c6f00] + DataFrames v1.5.0 [e2d170a0] + DataValueInterfaces v1.0.0 [244e2a9f] + DefineSingletons v0.1.2 [cc61a311] + FLoops v0.2.1 [b9860ae5] + FLoopsBase v0.1.1 [5789e2e9] + FileIO v1.16.1 [48062228] + FilePathsBase v0.9.21 [46192b85] + GPUArraysCore v0.1.5 ⌅ [92fee26a] + GZip v0.5.2 [c27321d9] + Glob v1.3.1 [a2bd30eb] + Graphics v1.1.2 [f67ccb44] + HDF5 v0.17.1 ⌃ [c817782e] + ImageBase v0.1.5 ⌅ [a09fc81d] + ImageCore v0.9.4 [4e3cecfd] + ImageShow v0.3.8 [22cec73e] + InitialValues v0.3.1 [842dd82b] + InlineStrings v1.4.0 [7d512f48] + InternedStrings v0.7.0 [41ab1584] + InvertedIndices v1.3.0 [82899510] + IteratorInterfaceExtensions v1.0.0 [033835bb] + JLD2 v0.4.35 [0f8b85d8] + JSON3 v1.13.2 [b14d175d] + JuliaVariables v0.2.4 [63c18a36] + KernelAbstractions v0.9.8 [929cbde3] + LLVM v6.3.0 [8cdb02fc] + LazyModules v0.3.1 [23992714] + MAT v0.10.6 [eb30cadb] + MLDatasets v0.7.13 [d8e11817] + MLStyle v0.4.17 [f1d291b0] + MLUtils v0.4.3 [3da0fdf6] + MPIPreferences v0.1.9 [dbb5928d] + MappedArrays v0.4.2 [128add7d] + MicroCollections v0.1.4 [e94cdb99] + MosaicViews v0.3.4 ⌅ [872c559c] + NNlib v0.8.21 [15e1cf62] + NPZ v0.4.3 [71a1bf82] + NameResolution v0.1.5 [6fe1bfb0] + OffsetArrays v1.12.10 [5432bcbf] + PaddedViews v0.5.12 [7b2266bf] + PeriodicTable v1.1.4 [fbb45041] + Pickle v0.3.3 [2dfb63ee] + PooledArrays v1.4.3 [aea7be01] + PrecompileTools v1.2.0 [8162dcfd] + PrettyPrint v0.2.0 [08abe8d2] + PrettyTables v2.2.7 [91c51154] + SentinelArrays v1.4.0 [efcf1570] + Setfield v1.1.1 [605ecd9f] + ShowCases v0.1.0 [699a6c99] + SimpleTraits v0.9.4 [171d559e] + SplittablesBase v0.1.15 [cae243ae] + StackViews v0.1.1 ⌅ [5e0ebb24] + Strided v1.2.3 [69024149] + StringEncodings v0.3.7 [892a3eda] + StringManipulation v0.3.4 [856f2bd8] + StructTypes v1.10.0 [3783bdb8] + TableTraits v1.0.1 [bd369af6] + Tables v1.11.0 [28d57a85] + Transducers v0.4.78 [9d95972d] + TupleTools v1.4.3 [1986cc42] + Unitful v1.17.0 [a7773ee8] + UnitfulAtomic v1.0.0 [013be700] + UnsafeAtomics v0.2.1 [d80eeb9a] + UnsafeAtomicsLLVM v0.1.3 [ea10d353] + WeakRefStrings v1.4.2 [76eceee3] + WorkerUtilities v1.6.1 [a5390f91] + ZipFile v0.10.1 [78a364fa] + Chemfiles_jll v0.10.4+0 ⌃ [0234f1f7] + HDF5_jll v1.12.2+2 [dad2f222] + LLVMExtra_jll v0.0.26+0 [9fa8497b] + Future [4af54fe1] + LazyArtifacts Info Packages marked with ⌃ and ⌅ have new versions available, but those with ⌅ are restricted by compatibility constraints from upgrading. To see why use `status --outdated -m` Precompiling project... ✓ IteratorInterfaceExtensions ✓ Glob ✓ CEnum ✓ WorkerUtilities ✓ BufferedStreams ✓ InitialValues ✓ PrettyPrint ✓ ArgCheck ✓ InternedStrings ✓ DataValueInterfaces ✓ StructTypes ✓ ShowCases ✓ SentinelArrays ✓ CompositionsBase ✓ DefineSingletons ✓ BFloat16s ✓ MPIPreferences ✓ LazyModules ✓ ContextVariablesX ✓ InvertedIndices ✓ Adapt ✓ ConstructionBase ✓ PrecompileTools ✓ UnsafeAtomics ✓ MappedArrays ✓ TupleTools ✓ GZip ✓ Crayons ✓ ZipFile ✓ SimpleTraits ✓ Baselet ✓ Chemfiles_jll ✓ PooledArrays ✓ LLVMExtra_jll ✓ HDF5_jll ✓ FilePathsBase ✓ TableTraits ✓ AbstractFFTs ✓ StringEncodings ✓ NameResolution ✓ Graphics ✓ FLoopsBase ✓ FileIO ✓ GPUArraysCore ✓ InlineStrings ✓ Atomix ✓ OffsetArrays ✓ Setfield ✓ DataDeps ✓ Strided ✓ StringManipulation ✓ NPZ ✓ Tables ✓ WeakRefStrings ✓ StackViews ✓ PaddedViews ✓ SplittablesBase ✓ BangBang ✓ Pickle ✓ MosaicViews ✓ JSON3 ✓ LLVM ✓ MicroCollections ✓ UnsafeAtomicsLLVM ✓ MLStyle ✓ HDF5 ✓ Transducers ✓ PrettyTables ✓ MAT ✓ KernelAbstractions ✓ JuliaVariables ✓ JLD2 ✓ CSV ✓ FLoops ✓ NNlib ✓ Unitful ✓ PeriodicTable ✓ UnitfulAtomic ✓ AtomsBase ✓ ImageCore ✓ MLUtils ✓ Chemfiles ✓ ImageBase ✓ ImageShow ✓ DataFrames ✓ MLDatasets 86 dependencies successfully precompiled in 23 seconds. 170 already precompiled.
d = 100000;
n = 100;
Xs = randn(d,n); # the dataset
epsilon = 0.1;
r = Int64(ceil(8 * log(n) / epsilon^2))
3685
A = randn(D, d) / sqrt(r);
# let's loook at this for a single example
norm(Xs[:,1] - Xs[:,2])
447.22513073742795
norm(A*Xs[:,1] - A*Xs[:,2])
441.36399897928436
norm(A*Xs[:,1] - A*Xs[:,2]) / norm(Xs[:,1] - Xs[:,2])
0.9868944490027222
AXs = A*Xs;
# over the whole dataset
extrema((i == j) ? 1.0 : norm(AXs[:,i] - AXs[:,j]) / norm(Xs[:,i] - Xs[:,j]) for i = 1:n, j = 1:n)
(0.9605492098673976, 1.0413298714811747)
# how much did we decrease the dimension?
D/d
0.03685
n = 60000;
d = 28*28;
train_x, train_y = MNIST.traindata();
UndefVarError: MNIST not defined Stacktrace: [1] top-level scope @ In[21]:1
train_x = Float64.(reshape(train_x, (d, n)));
train_x_mean = mean(train_x; dims=2);
train_x_minus_mean = train_x .- train_x_mean;
Sigma = (train_x_minus_mean * train_x_minus_mean')/n;
# find the eigendecomposition of Sigma
ESigma = eigen(Sigma; sortby = (x -> -x))
Eigen{Float64,Float64,Array{Float64,2},Array{Float64,1}}
values:
784-element Array{Float64,1}:
5.116787728342092
3.7413284788648067
3.2526542396844844
2.8415733372665466
2.5670749588127744
2.273625491620609
1.7251262295281955
1.5205348976536206
1.4562809808876922
1.2427293764173304
1.1120709732036005
1.0666227659356209
0.9046657547344794
⋮
-4.341596436079342e-19
-7.816133441131003e-19
-8.328205317261318e-19
-9.06846445474923e-19
-2.7674683969166128e-18
-7.155496110387914e-18
-1.057007120922721e-17
-2.366763579350205e-17
-4.458784081324597e-17
-6.16504989725162e-17
-6.378346028260054e-17
-9.992685906775396e-17
vectors:
784×784 Array{Float64,2}:
0.0 0.0 0.0 … 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 … 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 … 0.0 0.0
0.0 0.0 0.0 0.0 0.0
-1.13023e-6 1.79706e-6 7.96314e-7 -1.82916e-5 -1.69162e-5
⋮ ⋱
-8.769e-5 0.00143078 0.00113599 1.16734e-13 8.06924e-14
-4.5962e-5 0.000920176 0.000863292 9.46519e-14 -2.08525e-13
-1.27477e-5 0.000500157 0.000487834 -4.86915e-13 4.10592e-13
1.72528e-5 0.000219159 0.000211216 … 1.07941e-12 -3.80997e-13
5.51266e-6 0.000109011 9.13385e-5 -1.36204e-12 9.24027e-14
-2.59364e-7 3.54844e-5 3.45235e-5 2.02711e-12 -1.66652e-13
-4.95273e-7 2.12795e-5 1.89427e-5 -5.01332e-13 1.7726e-13
1.37842e-7 2.22338e-6 2.25569e-6 0.0 0.0
0.0 0.0 0.0 … 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
# largest eigenvalue
ESigma.values[1]
5.116787728342092
# corresponding eigenvector
ESigma.vectors[:, 1]
784-element Array{Float64,1}:
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-1.1302299848127289e-6
⋮
-8.768996662290797e-5
-4.596197162656801e-5
-1.2747685973350732e-5
1.7252770690812103e-5
5.512661031686905e-6
-2.593635886755097e-7
-4.952733260038533e-7
1.3784202612423542e-7
0.0
0.0
0.0
0.0
imshow(reshape(ESigma.vectors[:, 1], (28,28))');
imshow(reshape(ESigma.vectors[:, 2], (28,28))');
imshow(reshape(ESigma.vectors[:, 3], (28,28))');
imshow(reshape(ESigma.vectors[:, 4], (28,28))');
imshow(reshape(ESigma.vec
tors[:, 5], (28,28))');
syntax: missing comma or ) in argument list Stacktrace: [1] top-level scope at In[24]:2 [2] include_string(::Function, ::Module, ::String, ::String) at ./loading.jl:1091
plot(ESigma.values)
ylabel("eigenvalue")
xlabel("index of eigenvalue")
title("Eigenvalues of MNIST Covariance Matrix");
PCA can represent objects in low dimension without losing information.
D = 5;
A = ESigma.vectors[:, 1:D]';
# original image
i = 1337;
imshow(reshape(train_x[:,i], (28,28))');
# dimension-reduced image
x_dr = A * train_x_minus_mean[:,i]
x_recovered = A' * x_dr + train_x_mean;
# original image
imshow(reshape(x_recovered, (28,28))');
Still enough to classify the image!
Matrix(X)
UndefVarError: X not defined Stacktrace: [1] top-level scope at In[30]:1 [2] include_string(::Function, ::Module, ::String, ::String) at ./loading.jl:1091
n = 4096;
X = sprand(n,n,0.1);
Y = sprand(n,n,0.1);
# visualize part of it
imshow(X[1:64,1:64]);
colorbar();
# get the size of X in bytes
Base.summarysize(X)
26896696
26896696/(n*n*8)
0.20039600133895874
# type of X
typeof(X)
SparseMatrixCSC{Float64,Int64}
Xdense = Matrix(X);
Ydense = Matrix(Y);
typeof(Xdense)
Array{Float64,2}
Base.summarysize(Xdense)
134217768
# what's the ratio?
Base.summarysize(X) / Base.summarysize(Xdense)
0.20039594161631416
# what fraction of X's entries are nonzero?
nnz(X) / length(X)
0.10007530450820923
Why are these numbers not equal?
@time X * Y;
1.312289 seconds (110.34 k allocations: 287.229 MiB, 0.31% gc time)
dense_mmpy_time = @elapsed Xdense * Ydense
1.174083337
What happened here???
# make X and Y less dense
Xsparser = sprand(n,n,0.01);
Ysparser = sprand(n,n,0.01);
@time Xsparser * Ysparser;
0.121884 seconds (8 allocations: 94.578 MiB)
densities = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3];
sparse_mm_times = Float64[];
for d in densities
Xsparse = sprand(n,n,d);
Ysparse = sprand(n,n,d);
push!(sparse_mm_times, @elapsed(Xsparse * Ysparse));
end
loglog(densities, sparse_mm_times; label="sparse mmpy");
loglog(densities, densities * 0 .+ dense_mmpy_time; label="dense mmpy");
xlabel("density");
ylabel("matrix multiply runtime");
legend();
spv = X[:,1]
4096-element SparseVector{Float64,Int64} with 427 stored entries:
[16 ] = 0.494497
[18 ] = 0.826087
[34 ] = 0.687849
[59 ] = 0.318373
[63 ] = 0.543537
[73 ] = 0.460325
[75 ] = 0.0852525
[90 ] = 0.141923
[108 ] = 0.0701148
[116 ] = 0.994332
⋮
[4029] = 0.570289
[4035] = 0.54624
[4040] = 0.202171
[4042] = 0.403688
[4062] = 0.420507
[4066] = 0.538902
[4068] = 0.265724
[4072] = 0.191174
[4080] = 0.67434
[4084] = 0.384076
[4089] = 0.188717
spv.n
4096
spv.nzind
427-element Array{Int64,1}:
16
18
34
59
63
73
75
90
108
116
125
127
134
⋮
4021
4029
4035
4040
4042
4062
4066
4068
4072
4080
4084
4089
spv.nzval
427-element Array{Float64,1}:
0.4944967485040994
0.8260867791524058
0.6878492561841358
0.31837265760532185
0.5435373410214652
0.4603252124070474
0.08525252863099708
0.14192308123381947
0.07011482669433211
0.994332364738097
0.07054927165346192
0.3335951909204913
0.9303865975954675
⋮
0.2442214611309792
0.5702885838128535
0.5462397434084996
0.20217057750630207
0.40368809673279826
0.420507447700027
0.5389015609303036
0.2657240323430008
0.19117367805893304
0.674340219210197
0.3840757164610593
0.18871747643515846