In [1]:
using PyPlot
using LinearAlgebra
using Statistics
┌ Warning: PyPlot is using tkagg backend, which is known to cause crashes on MacOS (#410); use the MPLBACKEND environment variable to request a different backend.
└ @ PyPlot /Users/cdesa/.julia/packages/PyPlot/4wzW1/src/init.jl:192

Let's consider a simple linear regression optimization problem over $\mathbb{R}^d$, with per-example objective function

$$ f(w; x_i, y_i) = \frac{1}{2} (x_i^T w - y_i)^2 $$

Here, the gradient is

$$ \nabla f(w; x_i, y_i) = x_i (x_i^T w - y_i).$$

We suppose that we have a dataset of $(x_1, y_1), (x_2, y_2), \ldots, (x_n, y_n)$, giving us a total empirical risk

$$ f(w) = \frac{1}{n} \sum_{i=1}^n f(w; x_i, y_i) = \frac{1}{2n} \sum_{i=1}^n (x_i^T w - y_i)^2. $$

The gradient of this is

\begin{align*} \nabla f(w) &= \frac{1}{n} \sum_{i=1}^n \nabla f(w; x_i, y_i) \\&= \frac{1}{n} \sum_{i=1}^n x_i (x_i^T w - y_i) \\&= \left( \frac{1}{n} \sum_{i=1}^n x_i x_i^T \right) w - \frac{1}{n} \sum_{i=1}^n x_i y_i. \end{align*}

Of course, the exact minimum occurs when the gradient is $0$, at $$ w = \left( \frac{1}{n} \sum_{i=1}^n x_i x_i^T \right)^{-1} \left( \frac{1}{n} \sum_{i=1}^n x_i y_i \right). $$
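Before running anything, it can be reassuring to check the gradient formula above against a finite-difference approximation. The following is a small illustrative sketch (not part of the original notebook); the helper `finite_diff_grad` and the test vectors are made up just for this check.

# finite-difference check of the gradient formula (illustrative sketch)
function finite_diff_grad(f, w; eps=1e-6)
    g = zeros(length(w))
    for j = 1:length(w)
        e = zeros(length(w)); e[j] = eps
        g[j] = (f(w + e) - f(w - e)) / (2 * eps)
    end
    return g
end

x_test = randn(3); y_test = randn(); w_test = randn(3);
analytic = x_test * (dot(x_test, w_test) - y_test);
numeric  = finite_diff_grad(v -> (dot(x_test, v) - y_test)^2 / 2, w_test);
println(norm(analytic - numeric));   # should be very small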

In [2]:
# let's generate a synthetic dataset, first for the simple case of d = 1.
d = 1;
n = 1024;
X = randn(d, n);
Y = randn(n);
In [3]:
# find the true optimum
A = mean(X[:,i]*X[:,i]' for i = 1:n);
b = mean(X[:,i]*Y[i] for i = 1:n);
w_opt = inv(A)*b;
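As a side note (an addition for illustration, not in the original notebook), `inv(A)*b` works fine here, but solving the linear system with Julia's backslash operator avoids forming the explicit inverse and is generally the more stable and efficient idiom:

# equivalent to inv(A)*b, but solves the normal equations directly
w_opt_alt = A \ b;
println(norm(w_opt - w_opt_alt));   # should be essentially zero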
In [4]:
# component loss function
function loss_fi(w, xi, yi)
    return (dot(xi,w) - yi)^2/2;
end

# component gradient function
function grad_fi(w, xi, yi)
    return xi*(dot(xi,w) - yi);
end

# total loss function
function loss_f(w, X, Y)
    (d, n) = size(X);
    return mean(loss_fi(w, X[:,i], Y[i]) for i = 1:n);
end

# total gradient function
function grad_f(w, X, Y)
    (d,n) = size(X);
    return mean(grad_fi(w, X[:,i], Y[i]) for i = 1:n);
end

# total loss function, computed with matrix operations
function matrix_loss_f(w, X, Y)
    (d, n) = size(X);
    return sum((X'*w - Y).^2)/(2*n);
end

# total gradient function, computed with matrix operations
function matrix_grad_f(w, X, Y)
    (d, n) = size(X);
    return X*(X'*w - Y)/n;
end
Out[4]:
matrix_grad_f (generic function with 1 method)

Observe that we can write both the loss function and the gradient function in terms of matrix operations, which is mathematically equivalent but much faster to run. (This is one of the first examples of a systems trick we can use to make our learning algorithm more scalable, and something you've probably already seen.)

In [5]:
w = randn(d);

println(loss_f(w, X, Y));
println(matrix_loss_f(w, X, Y));

println(grad_f(w, X, Y));
println(matrix_grad_f(w, X, Y));
0.6134462388020057
0.6134462388020059
[0.5045917486042185]
[0.504591748604219]
In [6]:
@time grad_f(w, X, Y);
@time matrix_grad_f(w, X, Y);
  0.000144 seconds (3.07 k allocations: 288.062 KiB)
  0.000009 seconds (4 allocations: 16.438 KiB)

Now let's look at what GD and SGD do. Here, we'll measure the distance to the (known) optimum at each step.

In [7]:
function gradient_descent(w0::Vector{Float64}, X::Matrix{Float64}, Y::Vector{Float64},
        w_opt::Vector{Float64}, alpha::Float64, num_iters::Int64)
    dist_to_optimum = zeros(num_iters)
    loss = zeros(num_iters)
    w = w0
    for t = 1:num_iters
        w = w - alpha * matrix_grad_f(w, X, Y);
        dist_to_optimum[t] = norm(w - w_opt);
        loss[t] = matrix_loss_f(w, X, Y);
    end
    return (dist_to_optimum, loss)
end

function stochastic_gradient_descent(w0::Vector{Float64}, X::Matrix{Float64}, Y::Vector{Float64},
        w_opt::Vector{Float64}, alpha::Float64, num_iters::Int64)
    (d,n) = size(X);
    dist_to_optimum = zeros(num_iters)
    loss = zeros(num_iters)
    w = w0
    for t = 1:num_iters
        i = rand(1:n)   # sample one training example uniformly at random
        w = w - alpha * grad_fi(w, X[:,i], Y[i]);
        dist_to_optimum[t] = norm(w - w_opt);
        loss[t] = matrix_loss_f(w, X, Y);   # full loss, computed only for monitoring
    end
    end
    return (dist_to_optimum, loss)
end
Out[7]:
stochastic_gradient_descent (generic function with 1 method)
In [8]:
w0 = [5.0];
alpha = 0.1;
num_iters = 1000;
(gd_dist, gd_loss) = gradient_descent(w0, X, Y, w_opt, alpha, num_iters);
(sgd_dist, sgd_loss) = stochastic_gradient_descent(w0, X, Y, w_opt, alpha, num_iters);
In [9]:
plot(1:num_iters, gd_dist; label="GD");
plot(1:num_iters, sgd_dist; label="SGD");
xlabel("iterations");
ylabel("distance to optimum");
legend();
In [10]:
semilogy(1:num_iters, gd_dist; label="GD");
semilogy(1:num_iters, sgd_dist; label="SGD");
xlabel("iterations");
ylabel("distance to optimum");
legend();
In [11]:
plot(1:num_iters, gd_loss; label="GD");
plot(1:num_iters, sgd_loss; label="SGD");
xlabel("iterations");
ylabel("loss");
legend();
ylim((0,1));

Let's try something with a higher dimension!

In [12]:
d = 100;
n = 1024;
X = randn(d, n);
Y = randn(n);

A = mean(X[:,i]*X[:,i]' for i = 1:n);
b = mean(X[:,i]*Y[i] for i = 1:n);
w_opt = inv(A)*b;
In [13]:
w0 = 2 * randn(d);
alpha = 0.01;
num_iters = 5000;
(gd_dist, gd_loss) = gradient_descent(w0, X, Y, w_opt, alpha, num_iters);
(sgd_dist, sgd_loss) = stochastic_gradient_descent(w0, X, Y, w_opt, alpha, num_iters);
In [14]:
plot(1:num_iters, gd_dist; label="GD");
plot(1:num_iters, sgd_dist; label="SGD");
xlabel("iterations");
ylabel("distance to optimum");
legend();
In [15]:
plot(1:num_iters, gd_dist; label="GD");
plot(1:num_iters, sgd_dist; label="SGD");
xlabel("iterations");
ylabel("distance to optimum");
legend();
ylim([-0.1,2.1]);
In [16]:
semilogy(1:num_iters, gd_dist; label="GD");
semilogy(1:num_iters, sgd_dist; label="SGD");
xlabel("iterations");
ylabel("distance to optimum");
legend();
In [17]:
plot(1:num_iters, gd_loss; label="GD");
plot(1:num_iters, sgd_loss; label="SGD");
xlabel("iterations");
ylabel("loss");
legend();
ylim((0,2));

What does this tell us about the convergence of SGD?
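One way to probe this empirically (a sketch added for illustration; the step size 0.001 is an arbitrary choice) is to rerun SGD with a smaller step size and compare where the distance to the optimum plateaus:

# rerun SGD with a smaller step size and compare the plateau (illustrative sketch)
(sgd_dist_small, sgd_loss_small) = stochastic_gradient_descent(w0, X, Y, w_opt, 0.001, num_iters);
semilogy(1:num_iters, sgd_dist; label="SGD, alpha = 0.01");
semilogy(1:num_iters, sgd_dist_small; label="SGD, alpha = 0.001");
xlabel("iterations");
ylabel("distance to optimum");
legend();

If the plateau drops as the step size shrinks, that is consistent with SGD making rapid progress early on but then bouncing around in a "noise ball" around the optimum whose size scales with the step size, whereas GD keeps converging, at a higher per-iteration cost.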
