In [1]:
using PyPlot
using LinearAlgebra
using Statistics


# Part 1: Regularization Demo¶

Suppose that we use regularization such that for a polynomial of the form $$f_w(x) = \sum_{k = 0}^{10} \frac{w_k x^k}{k!}$$ the loss function is $$h(w) = \sum_{i=1}^N (f_w(x_i) - y_i)^2 + \sigma^2 \| w \|^2.$$ If $A$ is the matrix such that $$A_{i,j} = \frac{x_i^j}{j!}$$ then equivalently $$h(w) = \left\| \left[\begin{array}{c} A \\ \sigma I \end{array} \right] w - \left[\begin{array}{c} y \\ 0 \end{array} \right] \right\|^2$$ so we can still solve this easily with linear regression.

In [17]:
x = randn(10); y = sqrt(0.95) * x + sqrt(0.05) * randn(10);
test_x = randn(20); test_y = sqrt(0.9) * test_x + sqrt(0.1) * randn(20);

In [20]:
# fit with regularized degree 10 polynomial

sigma = 0.01

f = hcat([x.^k / factorial(k) for k = 0:10]...);
f_reg = vcat(f, sigma * Matrix{Float64}(I, 11, 11))
y_reg = vcat(y, zeros(11))

w = y_reg' / f_reg';

xplot = collect(-1.5:0.01:1.5);
fplot = hcat([xplot.^k / factorial(k) for k = 0:10]...)

plot(xplot, fplot * w'; color="red");
scatter(x, y);
scatter(test_x, test_y; color="orange");

xlim([-1.5, 1.5]);
ylim([-1.5, 1.5]);

training_error = round(mean((y - f * w').^2); digits=3)
println("training error: $(training_error)") test_f = hcat([test_x.^k / factorial(k) for k = 0:10]...) test_error = round(mean((test_y - test_f * w').^2); digits=3) println("test error:$(test_error)")

training error: 0.011
test error: 0.355

In [21]:
function regularized_fit_error(sigma)
test_x = randn(10000); test_y = sqrt(0.9) * test_x + sqrt(0.1) * randn(10000);

f = hcat([x.^k / factorial(k) for k = 0:10]...);
f_reg = vcat(f, sigma * Matrix{Float64}(I, 11, 11))
y_reg = vcat(y, zeros(11))

w = y_reg' / f_reg';

test_f = hcat([test_x.^k / factorial(k) for k = 0:10]...)

training_error = round(mean((y - f * w').^2); digits=3)
test_error = round(mean((test_y - test_f * w').^2); digits=3)

return (training_error, test_error)
end

Out[21]:
regularized_fit_error (generic function with 1 method)
In [22]:
sigmas = 10 .^ (-3:0.1:2);
training_errors = zeros(length(sigmas));
test_errors = zeros(length(sigmas));

for i = 1:length(sigmas)
(training_errors[i], test_errors[i]) = regularized_fit_error(sigmas[i]);
end

figure();
loglog(sigmas, training_errors, label="training error");
loglog(sigmas, test_errors, label="test error");
xlabel("regularization coefficient");
ylabel("error");
legend();


# Part 2: Momentum Demo¶

In [5]:
function gradient_descent(x0, A, alpha, niters)
x = x0
rv = zeros(niters)
for i = 1:niters
x = x - alpha * A * x
rv[i] = norm(x)
end
return rv
end

function momentum_gd(x0, A, alpha, beta, niters)
x = x0
xprev = x0
rv = zeros(niters)
for i = 1:niters
(x, xprev) = (x - alpha * A * x + beta * (x - xprev), x)
rv[i] = norm(x)
end
return rv
end

Out[5]:
momentum_gd (generic function with 1 method)
In [6]:
A = diagm(0 => [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]);
beta = ((sqrt(10) - 1)/(sqrt(10) + 1))^2;
alpha_mom = (2 + 2*beta) / (1 + 0.1);
alpha_gd = 2 / (1 + 0.1);

In [7]:
x0 = randn(10);

In [8]:
dists_gd = gradient_descent(x0, A, alpha_gd, 100);
dists_gd2 = gradient_descent(x0, A, 0.5*alpha_gd, 100);
dists_mom = momentum_gd(x0, A, alpha_mom, beta, 100);

In [9]:
semilogy(dists_mom; label="momentum");
semilogy(dists_gd; label="optimal gd");
semilogy(dists_gd2; label="non-optimal gd");
legend();
xlabel("iteration");
ylabel("distance to optimum");


# Part 3: Nesterov Momentum¶

In [10]:
function nesterov_gd(x0, A, alpha, beta, niters)
x = x0
y = x0
rv = zeros(niters)
for i = 1:niters
xprev = x
x = y - alpha * A * y
y = x + beta * (x - xprev)
rv[i] = norm(x)
end
return rv
end

Out[10]:
nesterov_gd (generic function with 1 method)
In [11]:
beta_nest = (sqrt(10) - 1)/(sqrt(10) + 1);
alpha_nest = 1;
dists_nest = momentum_gd(x0, A, alpha_nest, beta_nest, 100);

In [12]:
semilogy(dists_mom; label="polyak");
semilogy(dists_gd; label="optimal gd");
semilogy(dists_nest; label="nesterov");
legend();
xlabel("iteration");
ylabel("distance to optimum");

In [ ]: