In [1]:
using PyPlot
using LinearAlgebra
using Statistics
using Random

Random.seed!(12345);
┌ Warning: PyPlot is using tkagg backend, which is known to cause crashes on MacOS (#410); use the MPLBACKEND environment variable to request a different backend.
└ @ PyPlot /Users/cdesa/.julia/packages/PyPlot/XHEG0/src/init.jl:192

Exploring Low-Precision Training

Let's look at how our logistic regression model from the previous couple of demos is affected by training in low-precision floating-point arithmetic.

First, define a low-precision numeric type in Julia.

In [2]:
import Base.convert
import Base.length
import Base.zero
import Base.one
import Base.typemin
import Base.typemax
import Base.transpose
import Base.sqrt
import LinearAlgebra.dot
import Base.exp
import Base.expm1
import Base.isless
import Base.log
import Base.+
import Base.-
import Base.*
import Base./


# Marker type for the EBIAS slot: indicates the format carries no dynamically
# adjusted exponent bias (see rescale_bias further down).
abstract type NoDynamicBias end;

# NB = number of mantissa bits
# NE = number of exponent bits
# EBIAS = extra exponent bias; default should be 0 which corresponds to a bias of (1 << (NE-1)) - 1
# RM = rounding mode
# Simulated low-precision float: wraps a Float64 that is kept exactly
# representable in the (NB, NE, EBIAS) format by quantizing every value at
# construction time, so all arithmetic results are rounded like the target type.
struct SimFP{NB,NE,EBIAS,RM} <: Number
  v::Float64;  # the already-quantized value, stored at Float64 width
  SimFP{NB,NE,EBIAS,RM}(x::Float64) where {NB,NE,EBIAS,RM} = new{NB,NE,EBIAS,RM}(quantize(x, SimFP{NB,NE,EBIAS,RM})); # NB, NE, EBIAS, RM));
end

# Extract the stored Float64 payload (already quantized at construction).
function Float64(x::SimFP)
  return x.v
end

# Widening back to Float64 is exact: just return the payload.
function convert(::Type{Float64}, x::SimFP)
  return x.v
end

# Converting between SimFP formats re-quantizes the payload for the target
# format's mantissa/exponent/bias parameters.
function convert(::Type{SimFP{NB,NE,EBIAS,RM}}, x::SimFP) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(x.v)
end

# Float64 -> SimFP: the inner constructor performs the rounding.
function convert(::Type{SimFP{NB,NE,EBIAS,RM}}, x::Float64) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(x)
end

# Catch-all: route any other numeric type through Float64 first.
function convert(::Type{SimFP{NB,NE,EBIAS,RM}}, x) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(convert(Float64, x))
end

# Constructor for arbitrary values (e.g. SimFP{...}(1) on an Int):
# same Float64 round-trip as the catch-all convert above.
function (::Type{SimFP{NB,NE,EBIAS,RM}})(x) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(convert(Float64, x))
end

# function convert(::Type{T}, x::SimFP) where {T}
#   return convert(T, x.v)
# end

# Additive identity of the simulated format (0.0 quantizes to itself).
zero(::Type{SimFP{NB,NE,EBIAS,RM}}) where {NB,NE,EBIAS,RM} = SimFP{NB,NE,EBIAS,RM}(0.0)

zero(::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM} = SimFP{NB,NE,EBIAS,RM}(0.0)

# Multiplicative identity: 1.0 run through the quantizing constructor.
one(::Type{SimFP{NB,NE,EBIAS,RM}}) where {NB,NE,EBIAS,RM} = SimFP{NB,NE,EBIAS,RM}(1.0)

one(::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM} = SimFP{NB,NE,EBIAS,RM}(1.0)

# Most negative representable value: start from the largest finite Float64 and
# let the quantizing constructor clamp it down to the format's extreme.
# Fix: `realmax` was removed in Julia 1.0 (renamed `floatmax`), so the
# original body raised UndefVarError whenever it was called.
# NOTE(review): Base convention is `typemin(::Type)`; these methods dispatch on
# an *instance*, so `typemin(SimFP{...})` on the type itself is not defined.
function typemin(::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(-floatmax(Float64))
end

# Largest representable value, clamped by the quantizer to the format's max.
function typemax(::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(floatmax(Float64))
end

# A scalar is its own transpose (lets SimFP scalars flow through matrix code).
function transpose(x::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
  return x
end

# Elementary functions: evaluate in Float64, then re-quantize via the
# constructor so the result is representable in the simulated format.
function sqrt(x::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(sqrt(x.v))
end

# dot on two SimFP scalars delegates to LinearAlgebra.dot on the Float64
# payloads, then quantizes the product.
function dot(x::SimFP{NB,NE,EBIAS,RM}, y::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(dot(x.v, y.v))
end

function exp(x::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(exp(x.v))
end

function expm1(x::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(expm1(x.v))
end

function log(x::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(log(x.v))
end

# Ordering compares the underlying Float64s; no rounding involved.
function isless(x::SimFP{NB,NE,EBIAS,RM}, y::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
  return isless(x.v, y.v)
end

# Rounding-mode tags used as the RM parameter of SimFP:
#   Nearest - round to nearest (half-ulp add + truncate in quantize)
#   Random  - stochastic rounding
#   *Fast   - same rounding but skipping the exponent-range clamping
abstract type RoundingModeNearest end;
abstract type RoundingModeRandom end;

abstract type RoundingModeNearestFast end;
abstract type RoundingModeRandomFast end;

# Dispatch helper: a NoDynamicBias format quantizes like a fixed extra bias of 0.
# NOTE(review): this forwards to a 5-argument quantize(x, nb, ne, 0, rm) method
# that is not defined anywhere in this file — calling it would raise a
# MethodError; looks like dead code from an older API. Confirm before use.
function quantize(x::Float64, nb::Int64, ne::Int64, ::Type{NoDynamicBias}, rm)
  return quantize(x, nb, ne, 0, rm);
end

# Round-to-nearest quantization of a Float64 to a format with nb mantissa bits,
# ne exponent bits, and extra exponent bias `ebias`, working directly on the
# IEEE-754 bit pattern of x.
function quantize(x::Float64, ::Type{SimFP{nb,ne,ebias,RoundingModeNearest}}) where {nb,ne,ebias}
  if (x == 0.0)
    return x;  # zero is always representable; avoid bit tricks on ±0
  end
  # @assert(ne <= 11);
  xi = reinterpret(Int64, x);
  # Add half an ulp of the target precision (bit 51-nb), then clear the low
  # 52-nb mantissa bits: round-to-nearest with ties away from zero. A carry
  # out of the mantissa correctly bumps the exponent to the next binade.
  # (In Julia, << binds tighter than +, so this is xi + (1 << (51 - nb)).)
  xir = (xi + 1 << (51 - nb)) & (~((1 << (52 - nb)) - 1));
  if (ne < 11)
    # Clamp to the exponent range of the target format.
    xir_pos = xir & (~(1 << 63));  # magnitude bits only
    xir_sign = xir & (1 << 63);    # sign bit
    bias = ebias + (1 << (ne-1)) - 1;
    # Int64 bit patterns of: the format's largest finite magnitude, its
    # smallest positive normal, and half of that smallest normal.
    qf_max = (((bias + 1 + 1023) << nb) - 1) << (52 - nb);
    qf_min = (1023 - bias + 1) << 52;
    qf_min_d2 = (1023 - bias) << 52;
    if (xir_pos > qf_max)
      return reinterpret(Float64, qf_max | xir_sign);  # saturate (no Inf)
    elseif (xir_pos < qf_min_d2)
      return Float64(0.0);  # flush to zero: no subnormal support
    elseif (xir_pos < qf_min)
      return reinterpret(Float64, qf_min | xir_sign);  # round up to min normal
    end
  end
  return reinterpret(Float64, xir);
end

# Stochastic rounding: add a uniform random offset spanning the discarded
# low mantissa bits, then truncate — so the value rounds up or down with
# probability proportional to its proximity to each neighbor.
function quantize(x::Float64, ::Type{SimFP{nb,ne,ebias,RoundingModeRandom}}) where {nb,ne,ebias}
  if (x == 0.0)
    return x;  # zero is exact; skip the bit manipulation
  end
  # @assert(ne <= 11);
  mask = (1 << (52 - nb)) - 1;  # low mantissa bits that will be discarded
  xi = reinterpret(Int64, x);
  xir = (xi + (rand(Int64) & mask)) & (~mask);
  if (ne < 11)
    # Clamp to the exponent range of the target format.
    xir_pos = xir & (~(1 << 63));  # magnitude bits only
    xir_sign = xir & (1 << 63);    # sign bit
    bias = ebias + (1 << (ne-1)) - 1;
    qf_max = (((bias + 1 + 1023) << nb) - 1) << (52 - nb);  # largest finite magnitude
    qf_min = (1023 - bias + 1) << 52;                       # smallest positive normal
    if (xir_pos > qf_max)
      return reinterpret(Float64, qf_max | xir_sign);  # saturate (no Inf)
    elseif (xir_pos < qf_min)
      # Stochastic underflow: round to the minimum normal with probability
      # proportional to the magnitude, otherwise flush to zero.
      xfr_pos = reinterpret(Float64, xir_pos);
      ff_min = reinterpret(Float64, qf_min);
      if (xfr_pos / ff_min > rand())
        return reinterpret(Float64, qf_min | xir_sign);
      else
        return Float64(0.0)
      end
    end
  end
  return reinterpret(Float64, xir);
end

# Fast round-to-nearest: same half-ulp-add-and-truncate as the Nearest mode,
# but with no exponent-range clamping (assumes the value fits the target
# exponent range; saves the branchy range checks).
function quantize(x::Float64, ::Type{SimFP{nb,ne,ebias,RoundingModeNearestFast}}) where {nb,ne,ebias}
  xi = reinterpret(Int64, x);
  xir = (xi + 1 << (51 - nb)) & (~((1 << (52 - nb)) - 1));
  return reinterpret(Float64, xir);
end

# Fast stochastic rounding: random-offset-and-truncate only, with no
# exponent-range clamping (assumes no overflow/underflow of the target range).
function quantize(x::Float64, ::Type{SimFP{nb,ne,ebias,RoundingModeRandomFast}}) where {nb,ne,ebias}
  mask = (1 << (52 - nb)) - 1;  # discarded low mantissa bits
  xi = reinterpret(Int64, x);
  xir = (xi + (rand(Int64) & mask)) & (~mask);
  return reinterpret(Float64, xir);
end

# Arithmetic on SimFP: compute in Float64, then re-quantize through the
# constructor. The EBIAS parameter is combined like an exponent offset:
# added under *, subtracted under /; + and - are only defined between
# operands that share the same bias.

function +(x::SimFP{NB,NE,EBIAS,RM}, y::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(x.v + y.v)
end

function -(x::SimFP{NB,NE,EBIAS,RM}, y::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(x.v - y.v)
end

# Zero-bias specializations: a bias of 0 leaves the other operand's bias
# unchanged. The {0,0} method below also resolves the dispatch ambiguity
# between the two one-sided methods.
function *(x::SimFP{NB,NE,0,RM}, y::SimFP{NB,NE,EBIASY,RM}) where {NB,NE,EBIASY,RM}
  return SimFP{NB,NE,EBIASY,RM}(x.v * y.v)
end

function *(x::SimFP{NB,NE,EBIASX,RM}, y::SimFP{NB,NE,0,RM}) where {NB,NE,EBIASX,RM}
  return SimFP{NB,NE,EBIASX,RM}(x.v * y.v)
end

function *(x::SimFP{NB,NE,0,RM}, y::SimFP{NB,NE,0,RM}) where {NB,NE,RM}
  return SimFP{NB,NE,0,RM}(x.v * y.v)
end

# General case: biases add under multiplication.
function *(x::SimFP{NB,NE,EBIASX,RM}, y::SimFP{NB,NE,EBIASY,RM}) where {NB,NE,EBIASX,EBIASY,RM}
  return SimFP{NB,NE,EBIASX+EBIASY,RM}(x.v * y.v)
end

# Dividing by a zero-bias value keeps the numerator's bias.
function /(x::SimFP{NB,NE,EBIASX,RM}, y::SimFP{NB,NE,0,RM}) where {NB,NE,EBIASX,RM}
  return SimFP{NB,NE,EBIASX,RM}(x.v / y.v)
end

# General case: biases subtract under division.
function /(x::SimFP{NB,NE,EBIASX,RM}, y::SimFP{NB,NE,EBIASY,RM}) where {NB,NE,EBIASX,EBIASY,RM}
  return SimFP{NB,NE,EBIASX-EBIASY,RM}(x.v / y.v)
end

# Dividing by an Int64 (e.g. a sample count) leaves the bias unchanged.
function /(x::SimFP{NB,NE,EBIAS,RM}, y::Int64) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(x.v / Float64(y))
end

# Unary negation (still routed through the quantizing constructor).
function -(x::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
  return SimFP{NB,NE,EBIAS,RM}(-x.v)
end

# Bias algebra for the NoDynamicBias tag: combining two untracked biases
# yields an untracked bias, so the general * and / methods (which compute
# EBIASX+EBIASY / EBIASX-EBIASY in the type domain) still work when EBIAS
# is NoDynamicBias instead of an integer.
function +(::Type{NoDynamicBias}, ::Type{NoDynamicBias})
  return NoDynamicBias;
end

function -(::Type{NoDynamicBias}, ::Type{NoDynamicBias})
  return NoDynamicBias;
end

# Re-center a SimFP format's exponent bias around a magnitude hint:
# eb = -floor(log2(hint)), so values on the order of `hint` land near the
# middle of the representable range. Returns a *new type* with EBIAS = eb.
function rescale_bias(::Type{SimFP{NB,NE,EBIAS,RM}}, hint) where {NB,NE,EBIAS,RM}
  hint64 = Float64(hint);
  eb = -Int64(floor(log2(hint64)));
  println("based on hint ($(hint64)), setting exponent bias to: $(eb)")
  return SimFP{NB,NE,eb,RM};
end

# NoDynamicBias formats opt out of rescaling: return the type unchanged.
function rescale_bias(::Type{SimFP{NB,NE,NoDynamicBias,RM}}, hint) where {NB,NE,RM}
  return SimFP{NB,NE,NoDynamicBias,RM};
end

# Plain IEEE types have no adjustable exponent bias, so rescaling is a no-op:
# the hint is ignored and the type comes back unchanged.
rescale_bias(::Type{Float16}, hint) = Float16

rescale_bias(::Type{Float32}, hint) = Float32

rescale_bias(::Type{Float64}, hint) = Float64
Out[2]:
rescale_bias (generic function with 5 methods)
In [3]:
# generate the data
Random.seed!(424242)
d = 50;      # feature dimension
N = 10000;   # number of examples
wtrue = randn(d);
wtrue = d^2 * wtrue / norm(wtrue);  # rescale true weights to norm d^2
X = randn(N, d);
X ./= sqrt.(sum(X.^2; dims=2));     # normalize each example (row) to unit norm
# sample ±1 labels from the logistic model P(y = 1 | x) = 1/(1 + exp(-x' wtrue))
Y = (1 ./ (1 .+ exp.(-X * wtrue)) .>= rand(N)) .* 2 .- 1;
sigma = 1e-3;  # L2 regularization strength used throughout
In [4]:
w0 = zeros(d);  # initial weight vector: start SGD from the origin
In [5]:
# Run SGD on the L2-regularized logistic-regression objective.
#
# Arguments:
#   w0     - initial weight vector
#   alpha  - step size
#   gamma  - unused here; kept for interface compatibility with callers
#   X, Y   - N×d design matrix and ±1 label vector
#   sigma  - L2 regularization strength
#   niters - number of stochastic steps
#   wopt   - reference optimum; squared distance to it is recorded each step
#
# Returns (final weights, per-iteration squared distances to wopt).
function sgd_logreg(w0::Array{T,1}, alpha::T, gamma::T, X::Array{T,2}, Y::Array{T,1}, sigma::T, niters::Int64, wopt::Array{T,1}) where {T<:Number}
    (N, d) = size(X)
    w = w0
    dist2_to_optimum = zeros(niters)
    for step = 1:niters
        idx = rand(1:N)           # one uniform sample per iteration
        xs = X[idx,:]
        ys = Y[idx]
        # shrinkage from the regularizer plus the stochastic logistic-gradient step
        w += -alpha * sigma * w + alpha * xs * ys / (one(alpha) .+ exp.(ys * dot(xs, w)))
        dist2_to_optimum[step] = sum((w - wopt).^2)
    end
    return (w, dist2_to_optimum)
end

# Convenience wrapper: cast everything to numeric type T, run SGD in that
# precision, and convert the results back to Float64 for plotting.
function sgd_logreg(::Type{T}, w0, alpha, gamma, X, Y, sigma, niters, wopt) where {T<:Number}
    (wT, dT) = sgd_logreg(T.(w0), T(alpha), T(gamma), T.(X), T.(Y), T.(sigma), niters, T.(wopt))
    return (Float64.(wT), Float64.(dT))
end
Out[5]:
sgd_logreg (generic function with 2 methods)
In [6]:
# find the true minimum
# Solve the regularized logistic-regression problem with Newton's method.
# Each step solves H * delta = g for the Newton direction, where g and H are
# the gradient and Hessian of the (unnormalized) objective with an N*sigma
# ridge term.
function newton_logreg(w0, X, Y, sigma, niters)
    N = size(X, 1)
    w = w0
    for iter = 1:niters
        margin = Y .* (X * w)   # per-example signed margins y_i * x_i' w
        # gradient: -X' * (Y .* logistic(-margin)) + N*sigma*w
        g = -X' * (Y ./ (1 .+ exp.(margin))) + N * sigma * w
        # Hessian weights p*(1-p) written as 1 / ((1+e^m)(1+e^-m))
        H = X' * ((1 ./ ((1 .+ exp.(margin)) .* (1 .+ exp.(-margin)))) .* X) + N * sigma * I
        w = w - H \ g
    end
    return w
end

wopt = newton_logreg(wtrue, X, Y, sigma, 20);  # high-accuracy reference optimum for the SGD runs
In [7]:
# define low-precision types

# Half-precision-like formats (10 mantissa bits, 5 exponent bits, default
# bias) differing only in rounding mode.
Float16Rand = SimFP{10,5,0,RoundingModeRandom};
Float16Near = SimFP{10,5,0,RoundingModeNearest};
In [8]:
alpha = 0.1;  # SGD step size

# Reseed identically before each run so all three see the same sample
# sequence and differ only in arithmetic precision. Note the 4th positional
# argument (gamma) receives sigma here; sgd_logreg never reads gamma.
Random.seed!(123456);
(w, dto) = sgd_logreg(Float32, w0, alpha, sigma, X, Y, sigma, 100000, wopt);

Random.seed!(123456);
(w2, dto2) = sgd_logreg(Float16Rand, w0, alpha, sigma, X, Y, sigma, 100000, wopt);

Random.seed!(123456);
(w3, dto3) = sgd_logreg(Float16Near, w0, alpha, sigma, X, Y, sigma, 100000, wopt);
In [9]:
# squared distance to the optimum vs. iteration, for each precision
plot(dto; label="Float32");
plot(dto2; label="Float16 (random)");
plot(dto3; label="Float16 (nearest)");
legend();
ylim([0,10]);
In [10]:
alpha = 0.1;

# 12-bit formats: 6 mantissa bits, 5 exponent bits
Float12Rand = SimFP{6,5,0,RoundingModeRandom};
Float12Near = SimFP{6,5,0,RoundingModeNearest};

# same-seed runs as before: identical sample sequence, different precision
Random.seed!(123456);
(w, dto) = sgd_logreg(Float32, w0, alpha, sigma, X, Y, sigma, 100000, wopt);

Random.seed!(123456);
(w2, dto2) = sgd_logreg(Float12Rand, w0, alpha, sigma, X, Y, sigma, 100000, wopt);

Random.seed!(123456);
(w3, dto3) = sgd_logreg(Float12Near, w0, alpha, sigma, X, Y, sigma, 100000, wopt);
In [11]:
# squared distance to the optimum per iteration, 12-bit formats vs Float32
plot(dto; label="Float32");
plot(dto2; label="Float12 (random)");
plot(dto3; label="Float12 (nearest)");
legend();
ylim([0,10]);
In [12]:
alpha = 1.0;  # note: larger step size than the earlier experiments

# 8-bit formats: 4 mantissa bits, 5 exponent bits
Float8Rand = SimFP{4,5,0,RoundingModeRandom};
Float8Near = SimFP{4,5,0,RoundingModeNearest};

# same-seed runs: identical sample sequence, different precision
Random.seed!(123456);
(w, dto) = sgd_logreg(Float32, w0, alpha, sigma, X, Y, sigma, 100000, wopt);

Random.seed!(123456);
(w2, dto2) = sgd_logreg(Float8Rand, w0, alpha, sigma, X, Y, sigma, 100000, wopt);

Random.seed!(123456);
(w3, dto3) = sgd_logreg(Float8Near, w0, alpha, sigma, X, Y, sigma, 100000, wopt);
In [13]:
# squared distance to the optimum per iteration, 8-bit formats vs Float32
plot(dto; label="Float32");
plot(dto2; label="Float8 (random)");
plot(dto3; label="Float8 (nearest)");
legend();

Effects of Scan Order

Consider a simple linear regression task. Our objective is of the form $$ f(w) = \frac{1}{2n} \sum_{i=1}^n (x_i^T w - y_i)^2. $$ Let's suppose we are working in $w \in \mathbb{R}^2$, and pick a specific set of training examples $$ x_i = \left[ \begin{array}{c} \cos(\pi i / n) \\ \sin(\pi i / n) \end{array} \right], \hspace{2em} y_i = 0. $$ Our stochastic gradient updates will be $$ w_{t+1} = w_t - \alpha x_i (x_i^T w_t - y_i). $$ Let's pick $\alpha = 1$ and see what happens when we vary the scan order.

(This is the normalized tight frame example from here: https://people.eecs.berkeley.edu/~brecht/papers/12.Recht.Re.amgm.pdf).

In [14]:
n = 99;  # number of examples
d = 2;   # dimension
# unit vectors spread over half the circle (the normalized tight frame example)
xs = [[cos(pi * i / n), sin(pi * i / n)] for i = 1:n];
ys = [0.0 for i = 1:n];  # all-zero targets

w0 = rand(d);
alpha = 1.0;

# true solution is just zero
w_opt = zeros(d);
In [15]:
# SGD for the least-squares toy problem using a fixed cyclic (sequential)
# pass over the examples. Records the distance to w_opt after every update.
function sgd_sequential_scan(xs, ys, w0, alpha, w_opt, num_iters)
    m = length(xs)
    w = w0
    dists = Float64[]
    for step = 1:num_iters
        j = mod1(step, m)                  # cycle through examples 1..m in order
        residual = dot(xs[j], w) - ys[j]
        w -= alpha * xs[j] * residual
        push!(dists, norm(w - w_opt))
    end
    return dists
end
Out[15]:
sgd_sequential_scan (generic function with 1 method)
In [16]:
# SGD for the least-squares toy problem, sampling an example uniformly at
# random each step. Records the distance to w_opt after every update.
function sgd_random_scan(xs, ys, w0, alpha, w_opt, num_iters)
    m = length(xs)
    w = w0
    dists = Float64[]
    for step = 1:num_iters
        j = rand(1:m)                      # one uniform draw per iteration
        residual = dot(xs[j], w) - ys[j]
        w -= alpha * xs[j] * residual
        push!(dists, norm(w - w_opt))
    end
    return dists
end
Out[16]:
sgd_random_scan (generic function with 1 method)
In [17]:
num_iters = 500;

# same data and initialization; only the scan order differs
dto_seq_scan = sgd_sequential_scan(xs, ys, w0, alpha, w_opt, num_iters);
dto_rnd_scan = sgd_random_scan(xs, ys, w0, alpha, w_opt, num_iters);
In [18]:
# log-scale distance-to-optimum curves: sequential vs. random scan
semilogy(dto_seq_scan; label="sequential scan");
semilogy(dto_rnd_scan; label="random scan");

legend(loc=3);
xlabel("iteration");
ylabel("distance to optimum");
title("Comparison of Different Scan Orders for SGD");

Question: is sequential scan generally bad for this problem, or was this order just particularly bad?

Let's investigate by looking at random permutations of the input data.

In [19]:
# Rerun sequential scan on five independent random shuffles of the data.
# (ys are all zero here, so shuffling xs alone keeps (x, y) pairs consistent.)
dto_seq_scan_ro = [sgd_sequential_scan(shuffle(xs), ys, w0, alpha, w_opt, num_iters) for i = 1:5];
In [20]:
# compare the original orders against the five shuffled sequential scans
semilogy(dto_seq_scan; label="sequential scan");
semilogy(dto_rnd_scan; label="random scan");
for k = 1:5
    semilogy(dto_seq_scan_ro[k]; label="sequential scan (random order $k)");
end

legend(loc=3);
xlabel("iteration");
ylabel("distance to optimum");
title("Comparison of Different Scan Orders for SGD");

So we can see that while sequential scan can sometimes be bad, randomly chosen sequential scans (i.e. "shuffle once") seem to work fine.

Another Scan-Order Example

Consider another simple linear regression task, this time in just one dimension. Our objective is of the form $$ f(w) = \frac{1}{2n} \sum_{i=1}^n (w - y_i)^2. $$ And let's pick a set of training examples $y_i$ at random such that their sum is $0$, meaning that the optimum is $w = 0$. Our stochastic gradient updates will be $$ w_{t+1} = w_t - \alpha_t (w_t - y_i) = (1 - \alpha_t) w_t + \alpha_t y_i. $$

In [21]:
n = 50;
ys = randn(n); ys .-= mean(ys);  # center the targets so the optimum is exactly w = 0
w0 = 10.0;                       # start far from the optimum
In [22]:
# SGD on the 1-d mean problem with diminishing step size
# alpha_t = alpha0 / (1 + (t-1)*gamma), sampling an example uniformly at
# random each step. Returns |w_t| (distance to the optimum 0) per step.
function sgd_random_scan_diminishing(ys, w0, alpha0, gamma, num_iters)
    m = length(ys)
    w = w0
    dists = Float64[]
    for t = 1:num_iters
        step = alpha0/(1 + (t-1) * gamma)
        pick = ys[rand(1:m)]       # one uniform draw per iteration
        w -= step * (w - pick)
        push!(dists, abs(w))
    end
    return dists
end

# Same diminishing-step SGD, but with a single random permutation chosen up
# front ("shuffle once") and then replayed cyclically every epoch.
function sgd_sequential_scan_diminishing(ys, w0, alpha0, gamma, num_iters)
    m = length(ys)
    w = w0
    dists = Float64[]
    order = randperm(m)            # one fixed shuffle, reused for every epoch
    for t = 1:num_iters
        step = alpha0/(1 + (t-1) * gamma)
        w -= step * (w - ys[order[mod1(t,m)]])
        push!(dists, abs(w))
    end
    return dists
end

# Same diminishing-step SGD with "random reshuffling": a fresh permutation
# is drawn at the start of every epoch (whenever mod1(t, m) wraps to 1).
function sgd_reshuffle_scan_diminishing(ys, w0, alpha0, gamma, num_iters)
    m = length(ys)
    w = w0
    dists = Float64[]
    order = Int64[]
    for t = 1:num_iters
        step = alpha0/(1 + (t-1) * gamma)
        if mod1(t,m) == 1
            order = randperm(m)    # reshuffle at the top of each epoch
        end
        w -= step * (w - ys[order[mod1(t,m)]])
        push!(dists, abs(w))
    end
    return dists
end
Out[22]:
sgd_reshuffle_scan_diminishing (generic function with 1 method)
In [23]:
num_iters = 10000000;
alpha0 = 0.5;
gamma = 0.2;   # step-size decay: alpha_t = alpha0 / (1 + (t-1)*gamma)
nruns = 10;    # average the distance curves over independent runs

dto_seq_scan_2 = mean(sgd_sequential_scan_diminishing(ys, w0, alpha0, gamma, num_iters) for k = 1:nruns);
dto_rnd_scan_2 = mean(sgd_random_scan_diminishing(ys, w0, alpha0, gamma, num_iters) for k = 1:nruns);
dto_rr_scan_2 = mean(sgd_reshuffle_scan_diminishing(ys, w0, alpha0, gamma, num_iters) for k = 1:nruns);
In [24]:
# plot every 50th point (n = 50, so roughly one point per epoch)
loglog(dto_seq_scan_2[1:50:end]; label="sequential scan");
loglog(dto_rnd_scan_2[1:50:end]; label="random scan");
loglog(dto_rr_scan_2[1:50:end]; label="random reshuffling");

legend(loc=3);
xlabel("epoch");
ylabel("distance to optimum");
title("Comparison of Different Scan Orders for SGD");

Observe the asymptotically faster convergence rate with the shuffling methods!

In [ ]:

In [ ]: