{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "using PyPlot\n", "using LinearAlgebra\n", "using Statistics\n", "using Random\n", "import Base.MathConstants.e" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Adaptive Learning Rates\n", "\n", "Let's look at using stochastic gradient descent with various methods to optimize logistic regression.\n", "First, we'll generate a training set at random from the generative model associated with logistic regression.\n", "(The same generative model we used for Notebook 6.)\n", "\n", "Except: here to create some imbalance in the examples, we'll adjust the sampled $X$ to have different variances in different coordinates." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# generate the data\n", "Random.seed!(424242)\n", "d = 50;\n", "N = 10000;\n", "wtrue = randn(d);\n", "wtrue = d^2 * wtrue / norm(wtrue);\n", "X = randn(N, d);\n", "X ./= sqrt.(sum(X.^2; dims=2));\n", "Y = (1 ./ (1 .+ exp.(-X * wtrue)) .>= rand(N)) .* 2 .- 1;\n", "sigma = 1e-4;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's do logistic regression with regularization here, just as we did before to study hyperparameter optimization." 
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "w0 = randn(d);" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "adagrad_logreg (generic function with 1 method)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Stochastic gradient descent on the per-example objective\n",
"#     sigma/2 * norm(w)^2 + log(1 + exp(-yi * dot(xi, w)))\n",
"# Arguments:\n",
"#   w0      starting point (never mutated; w is rebound on every update)\n",
"#   alpha0  initial step size\n",
"#   gamma   step-size decay; iteration k uses alpha0 / (1 + gamma * (k - 1))\n",
"#   X, Y    examples (one per row of X) and their labels\n",
"#   sigma   regularization coefficient\n",
"#   niters  number of single-example updates\n",
"#   wopt    reference point, used only to record the distance trace\n",
"# Returns (w, dist_to_optimum) with dist_to_optimum[k] = norm(w - wopt) after update k.\n",
"function sgd_logreg(w0, alpha0, gamma, X, Y, sigma, niters, wopt)\n",
"    w = w0\n",
"    N = size(X, 1)\n",
"    dist_to_optimum = zeros(niters)\n",
"    for k = 1:niters\n",
"        alpha = alpha0 / (1 + gamma * (k-1));\n",
"        i = rand(1:N)   # one example drawn uniformly at random, with replacement\n",
"        xi = X[i,:];\n",
"        yi = Y[i];\n",
"        # dot(xi, w) is a scalar, so the denominator needs no broadcasting\n",
"        w = (1 - alpha * sigma) * w + alpha * xi * yi / (1 + exp(yi * dot(xi, w)));\n",
"        dist_to_optimum[k] = norm(w - wopt);\n",
"    end\n",
"    return (w, dist_to_optimum);\n",
"end\n",
"\n",
"# AdaGrad on the same per-example objective as sgd_logreg: every coordinate j is\n",
"# stepped by alpha / (sqrt(r[j]) + epsilon), where r[j] accumulates the squared\n",
"# j-th gradient entries seen so far.\n",
"# Arguments:\n",
"#   w0       starting point (never mutated)\n",
"#   alpha    global step size\n",
"#   X, Y     examples (one per row of X) and their labels\n",
"#   sigma    regularization coefficient\n",
"#   niters   number of single-example updates\n",
"#   wopt     reference point, used only to record the distance trace\n",
"#   epsilon  keyword; added to sqrt.(r) so the division is defined while r[j] == 0\n",
"# Returns (w, dist_to_optimum) with dist_to_optimum[k] = norm(w - wopt) after update k.\n",
"function adagrad_logreg(w0, alpha, X, Y, sigma, niters, wopt; epsilon=1e-10)\n",
"    w = w0;\n",
"    (N, d) = size(X);\n",
"    r = zeros(d);   # running per-coordinate sum of squared gradient entries\n",
"    dist_to_optimum = zeros(niters);\n",
"    for k = 1:niters\n",
"        i = rand(1:N)\n",
"        xi = X[i,:];\n",
"        yi = Y[i];\n",
"        g = sigma * w - xi * yi / (1 + exp(yi * dot(xi, w)));\n",
"        r += g.^2;\n",
"        w -= alpha * g ./ (sqrt.(r) .+ epsilon);\n",
"        dist_to_optimum[k] = norm(w - wopt);\n",
"    end\n",
"    return (w, dist_to_optimum);\n",
"end" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "newton_logreg (generic function with 1 method)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# find the true minimum\n",
"# Newton's method on the full objective\n",
"#     N * sigma/2 * norm(w)^2 + sum_i log(1 + exp(-Y[i] * dot(X[i,:], w)))\n",
"# Runs exactly niters iterations starting from w0 (never mutated), prints the norm of\n",
"# the gradient evaluated before each step, and returns the final iterate.\n",
"function newton_logreg(w0, X, Y, sigma, niters)\n",
"    N = size(X, 1);\n",
"    w = w0;\n",
"    for k = 1:niters\n",
"        # margins Y[i] * dot(X[i,:], w): computed once, shared by gradient and Hessian\n",
"        margins = Y .* (X * w);\n",
"        g = -X' * (Y ./ (1 .+ exp.(margins))) + N * sigma * w;\n",
"        H = X' * ((1 ./ ((1 .+ exp.(margins)) .* (1 .+ exp.(-margins)))) .* X) + N * sigma * I;\n",
"        w = w - H \\ g;\n",
"        println(\"gradient norm: $(norm(g))\")\n",
"    end\n",
"    return w\n",
"end" ] }, { "cell_type": "code",
"execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "gradient norm: 2500.000416184221\n", "gradient norm: 568.5321463634123\n", "gradient norm: 162.2081388505692\n", "gradient norm: 59.13057294638611\n", "gradient norm: 16.10911785249323\n", "gradient norm: 1.864245478061334\n", "gradient norm: 0.02947084402265636\n", "gradient norm: 7.507787778566566e-6\n", "gradient norm: 4.872576446772484e-13\n", "gradient norm: 2.613891556799585e-14\n", "gradient norm: 2.3055729862370177e-14\n", "gradient norm: 2.4584296439553133e-14\n", "gradient norm: 2.184594791303772e-14\n", "gradient norm: 2.5875997830454982e-14\n", "gradient norm: 2.391654643720554e-14\n", "gradient norm: 2.1902789798638662e-14\n", "gradient norm: 2.138310873593704e-14\n", "gradient norm: 2.6659125626953916e-14\n", "gradient norm: 2.3794672214487743e-14\n", "gradient norm: 2.3505105453973415e-14\n" ] } ], "source": [ "wopt = newton_logreg(wtrue, X, Y, sigma, 20);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "best sgd: (alpha = 0.1)\n", "best adagrad: (alpha = 1.0)\n" ] } ], "source": [ "# do some simple hyperparameter optimization\n", "best_distance_sgd = 1e8;\n", "alpha_best_sgd = 0.0;\n", "for alpha in [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0]\n", " (w, dto) = sgd_logreg(w0, alpha, 0.0, X, Y, sigma, 100000, wopt);\n", " if dto[length(dto)] < best_distance_sgd\n", " best_distance_sgd = dto[length(dto)];\n", " alpha_best_sgd = alpha;\n", " end\n", "end\n", "\n", "println(\"best sgd: (alpha =$alpha_best_sgd)\")\n", "\n", "best_distance_adagrad = 1e8;\n", "alpha_best_adagrad = 0.0;\n", "for alpha in [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0]\n", " (w, dto) = adagrad_logreg(w0, alpha, X, Y, sigma, 
100000, wopt);\n", " if dto[length(dto)] < best_distance_adagrad\n", " best_distance_adagrad = dto[length(dto)];\n", " alpha_best_adagrad = alpha;\n", " end\n", "end\n", "\n", "println(\"best adagrad: (alpha = \$alpha_best_adagrad)\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "Random.seed!(123456);\n", "(w, dto) = sgd_logreg(w0, alpha_best_sgd, 0.0, X, Y, sigma, 100000, wopt);\n", "(w2, dto2) = adagrad_logreg(w0, 0.5*alpha_best_adagrad, X, Y, sigma, 100000, wopt);" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "Figure(PyObject