In [1]:
using PyPlot
using LinearAlgebra
using Statistics
using Random
import Base.MathConstants.e

┌ Warning: PyPlot is using tkagg backend, which is known to cause crashes on MacOS (#410); use the MPLBACKEND environment variable to request a different backend.
└ @ PyPlot /Users/cdesa/.julia/packages/PyPlot/XHEG0/src/init.jl:192
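
As the warning suggests, one workaround (illustrative, not part of the original notebook) is to select a different matplotlib backend via the MPLBACKEND environment variable before PyPlot is loaded:

# set before `using PyPlot`; "Agg" is matplotlib's non-interactive backend
ENV["MPLBACKEND"] = "Agg"
using PyPlot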


Let's look at using stochastic gradient descent with various methods to optimize logistic regression. First, we'll generate a training set at random from the generative model associated with logistic regression (the same generative model we used in Notebook 6).

One exception: here, to create some imbalance in the examples, we'll adjust the sampled $X$ to have different variances in different coordinates.
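
Concretely, each label is drawn from the logistic model: writing $x_i$ for the $i$th example and $w^*$ for the true parameter vector,

$$\Pr(y_i = 1 \mid x_i) = \frac{1}{1 + \exp(-x_i^T w^*)}, \qquad y_i \in \{-1, +1\},$$

which is exactly what the comparison of the sigmoid against a uniform random draw implements in the cell below.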

In [2]:
# generate the data
Random.seed!(424242)
d = 50;
N = 10000;
wtrue = randn(d);
wtrue = d^2 * wtrue / norm(wtrue);
X = randn(N, d);
X ./= sqrt.(sum(X.^2; dims=2));
Y = (1 ./ (1 .+ exp.(-X * wtrue)) .>= rand(N)) .* 2 .- 1;
sigma = 1e-4;


Let's do logistic regression with regularization here, just as we did before to study hyperparameter optimization.
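
Concretely, the objective we're minimizing is the $\ell_2$-regularized logistic loss

$$f(w) = \frac{1}{N} \sum_{i=1}^N \log\left(1 + \exp(-y_i x_i^T w)\right) + \frac{\sigma}{2} \|w\|^2,$$

whose per-example stochastic gradient $\sigma w - \dfrac{y_i x_i}{1 + \exp(y_i x_i^T w)}$ is what the update steps below compute.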

In [3]:
w0 = randn(d);

In [4]:
function sgd_logreg(w0, alpha0, gamma, X, Y, sigma, niters, wopt)
    w = w0
    (N, d) = size(X)
    dist_to_optimum = zeros(niters)
    for k = 1:niters
        # decaying step size: alpha_k = alpha0 / (1 + gamma * (k-1))
        alpha = alpha0 / (1 + gamma * (k-1));
        # pick a training example uniformly at random
        i = rand(1:N)
        xi = X[i,:];
        yi = Y[i];
        # SGD step on the regularized logistic loss for example i
        w = (1 - alpha * sigma) * w + alpha * xi * yi / (1 .+ exp.(yi * dot(xi, w)));
        dist_to_optimum[k] = norm(w - wopt);
    end
    return (w, dist_to_optimum);
end

function adagrad_logreg(w0, alpha, X, Y, sigma, niters, wopt)
    w = w0;
    (N, d) = size(X);
    # running sum of squared gradients, one entry per coordinate
    r = zeros(d);
    dist_to_optimum = zeros(niters);
    for k = 1:niters
        i = rand(1:N)
        xi = X[i,:];
        yi = Y[i];
        # stochastic gradient of the regularized logistic loss for example i
        g = sigma * w - xi * yi / (1 .+ exp.(yi * dot(xi, w)));
        r += g.^2;
        # AdaGrad step: per-coordinate step size alpha ./ (sqrt.(r) .+ eps)
        w -= alpha * g ./ (sqrt.(r) .+ 1e-10);
        dist_to_optimum[k] = norm(w - wopt);
    end
    return (w, dist_to_optimum);
end

Out[4]:
adagrad_logreg (generic function with 1 method)
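
The two functions above differ only in how they set the step size. `sgd_logreg` uses a single decaying step size $\alpha_k = \alpha_0 / (1 + \gamma (k-1))$ shared by all coordinates, while `adagrad_logreg` implements AdaGrad, which accumulates the squared stochastic gradients and scales each coordinate's step separately:

$$r_k = r_{k-1} + g_k \odot g_k, \qquad w_{k+1} = w_k - \alpha \, \frac{g_k}{\sqrt{r_k} + \epsilon},$$

where $g_k$ is the stochastic gradient at step $k$, the square root and division act elementwise, and the small constant $\epsilon = 10^{-10}$ in the code prevents division by zero.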
In [5]:
# find the true minimum
function newton_logreg(w0, X, Y, sigma, niters)
    N = size(X, 1);
    d = size(X, 2);
    w = w0;
    for k = 1:niters
        # gradient and Hessian of the full regularized objective
        g = -X' * (Y ./ (1 .+ exp.(Y .* (X * w)))) + N * sigma * w;
        H = X' * ((1 ./ ((1 .+ exp.(Y .* (X * w))) .* (1 .+ exp.(-Y .* (X * w))))) .* X) + N * sigma * I;
        # Newton step: solve H * delta = g, then update w
        w = w - H \ g;
println("gradient norm: $(norm(g))") end return w end  Out[5]: newton_logreg (generic function with 1 method) In [6]: wopt = newton_logreg(wtrue, X, Y, sigma, 20);  gradient norm: 2500.0057447159948 gradient norm: 584.5684610195854 gradient norm: 163.16540824030486 gradient norm: 59.78553169886596 gradient norm: 16.463182017766883 gradient norm: 1.9509453070277536 gradient norm: 0.032480195324245585 gradient norm: 9.212772534985492e-6 gradient norm: 7.404073915397605e-13 gradient norm: 2.878113339454559e-14 gradient norm: 2.1990094859321224e-14 gradient norm: 2.3981962744066243e-14 gradient norm: 2.3901258515491778e-14 gradient norm: 2.3373288946484282e-14 gradient norm: 2.4621050501427697e-14 gradient norm: 2.5759429400022938e-14 gradient norm: 2.4068845573686763e-14 gradient norm: 2.2087846486100198e-14 gradient norm: 2.327145812582982e-14 gradient norm: 2.4489895866246467e-14  In [ ]:   In [7]: # do some simple hyperparameter optimization best_distance_sgd = 1e8; alpha_best_sgd = 0.0; for alpha in [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0] (w, dto) = sgd_logreg(w0, alpha, 0.0, X, Y, sigma, 100000, wopt); if dto[length(dto)] < best_distance_sgd best_distance_sgd = dto[length(dto)]; alpha_best_sgd = alpha; end end println("best sgd: (alpha =$alpha_best_sgd)")

for alpha in [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0]
end
end


best sgd: (alpha = 0.1)
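
For comparison, the same grid search can be run for AdaGrad. Here's a minimal sketch, assuming we sweep the same step sizes through the `adagrad_logreg` function defined above (the variable names `best_distance_adagrad` and `alpha_best_adagrad` are mine, not from the original notebook):

best_distance_adagrad = 1e8;
alpha_best_adagrad = 0.0;
for alpha in [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0]
    # run AdaGrad with this step size and keep the best final distance to the optimum
    (w, dto) = adagrad_logreg(w0, alpha, X, Y, sigma, 100000, wopt);
    if dto[length(dto)] < best_distance_adagrad
        best_distance_adagrad = dto[length(dto)];
        alpha_best_adagrad = alpha;
    end
end
println("best adagrad: (alpha = $alpha_best_adagrad)")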

In [8]:
Random.seed!(123456);
(w, dto) = sgd_logreg(w0, alpha_best_sgd, 0.0, X, Y, sigma, 100000, wopt);

In [9]:
semilogy(dto; label="SGD");
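
Note that the `label` keyword only shows up once a legend is drawn. If no further curves are added to this figure in later cells (my assumption), the legend and axis labels can be drawn with standard PyPlot calls:

legend();
xlabel("iteration");
ylabel("distance to optimum");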