In [1]:
using Plots
using LinearAlgebra
using Interact
using Random

gr(size=(500,500));

Random.seed!(8765309);

Unable to load WebIO. Please make sure WebIO works for your Jupyter client. For troubleshooting, please see the WebIO/IJulia documentation.

In [2]:
function quadratic_roots(alpha::Float64, beta::Float64, lambda::Float64)
    root1 = ((1.0 + 0.0im + beta - alpha * lambda) + sqrt((1.0 + 0.0im + beta - alpha * lambda)^2 - 4*beta))/2;
    root2 = ((1.0 + 0.0im + beta - alpha * lambda) - sqrt((1.0 + 0.0im + beta - alpha * lambda)^2 - 4*beta))/2;
    return (root1, root2);
end
Out[2]:
quadratic_roots (generic function with 1 method)
In [3]:
s_lambda = slider(0.1:0.1:5, label = "lambda", value = 1.0);
s_alpha = slider(0.0:0.1:5, label = "alpha", value = 1.0);
s_beta = slider([0.5,0.9,0.99,0.999], label = "beta", value = 0.9);
theta = collect(0:0.01:1) .* (2 * pi);
plt = Interact.@map begin
    (root1, root2) = quadratic_roots(&s_alpha, &s_beta, &s_lambda);
    scatter([real(root1), real(root2)], [imag(root1), imag(root2)]; label="eigenvalues", xlims=(-1.2,1.2), ylims=(-1.2,1.2));
    plot!(sqrt(&s_beta) .* cos.(theta), sqrt(&s_beta) .* sin.(theta); color=:black, linestyle = :dot, label="");
end
wdg = Widget(["s_lambda" => s_lambda, "s_alpha" => s_alpha, "s_beta" => s_beta]);
@layout! wdg hbox(plt, vbox(:s_lambda, :s_alpha, :s_beta))
Out[3]:

Analysis of Momentum

In [4]:
function gradient_descent(w0, A, alpha, niters)
    w = w0
    rv = zeros(niters)
    for i = 1:niters
        w = w - alpha * A * w
        rv[i] = norm(w)
    end
    return rv
end

function momentum_gd(w0, A, alpha, beta, niters)
    w = w0
    wprev = w0
    rv = zeros(niters)
    for i = 1:niters
        (w, wprev) = (w - alpha * A * w + beta * (w - wprev), w)
        rv[i] = norm(w)
    end
    return rv
end
Out[4]:
momentum_gd (generic function with 1 method)
In [5]:
# quadratic objective with many different eigenvalues and kappa = 10
A = diagm(0=>[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]);
beta = ((sqrt(10) - 1)/(sqrt(10) + 1))^2;
alpha_mom = (2 + 2*beta) / (1 + 0.1);
alpha_gd = 2 / (1 + 0.1);
In [6]:
w0 = randn(10);
In [7]:
dists_gd = gradient_descent(w0, A, alpha_gd, 100);
dists_gd_other_step_sizes = [gradient_descent(w0, A, c*alpha_gd, 100) for c in [0.1,0.2,0.5,0.8]];
dists_mom = momentum_gd(w0, A, alpha_mom, beta, 100);
In [8]:
plot(dists_mom; label="momentum", yscale=:log10, xlabel="iteration", ylabel="distance to optimum",legend=:bottomleft);
plot!(dists_gd; label="optimal gd", yscale=:log10);
[plot!(dg; label="non-optimal gd", c="green", linestyle=:dot, yscale=:log10) for dg in dists_gd_other_step_sizes];
plot!()
Out[8]:
0 25 50 75 100 10 - 25 10 - 20 10 - 15 10 - 10 10 - 5 10 0 iteration distance to optimum momentum optimal gd non-optimal gd non-optimal gd non-optimal gd non-optimal gd
In [9]:
function nesterov_gd(w0, A, alpha, beta, niters)
    w = w0
    v = w0
    rv = zeros(niters)
    for i = 1:niters
        wprev = w
        w = v - alpha * A * v
        v = w + beta * (w - wprev)
        rv[i] = norm(w)
    end
    return rv
end
Out[9]:
nesterov_gd (generic function with 1 method)
In [10]:
beta_nest = (sqrt(10) - 1)/(sqrt(10) + 1);
alpha_nest = 1;
dists_nest = momentum_gd(w0, A, alpha_nest, beta_nest, 100);
In [11]:
plot(dists_mom; label="polyak", yscale=:log10, xlabel="iteration", ylabel="distance to optimum");
plot!(dists_gd; label="optimal gd", yscale=:log10);
plot!(dists_nest; label="nesterov", yscale=:log10)
Out[11]:
0 25 50 75 100 10 - 25 10 - 20 10 - 15 10 - 10 10 - 5 10 0 iteration distance to optimum polyak optimal gd nesterov
In [ ]:

In [ ]: