In [1]:
using PyPlot
using LinearAlgebra
using Statistics
using Random

Random.seed!(12345);

┌ Warning: PyPlot is using tkagg backend, which is known to cause crashes on MacOS (#410); use the MPLBACKEND environment variable to request a different backend.
└ @ PyPlot /Users/cdesa/.julia/packages/PyPlot/XHEG0/src/init.jl:192


# Exploring Low-Precision Training¶

Let's look at how our logistic regression model from the previous couple of demos is affected by training in low-precision floating-point arithmetic.

First, define a low-precision numeric type in Julia.

In [2]:
import Base.convert
import Base.length
import Base.zero
import Base.one
import Base.typemin
import Base.typemax
import Base.transpose
import Base.sqrt
import LinearAlgebra.dot
import Base.exp
import Base.expm1
import Base.isless
import Base.log
import Base.+
import Base.-
import Base.*
import Base./

abstract type NoDynamicBias end;

# NB = number of mantissa bits
# NE = number of exponent bits
# EBIAS = extra exponent bias; default should be 0 which corresponds to a bias of (1 << (NE-1)) - 1
# RM = rounding mode
struct SimFP{NB,NE,EBIAS,RM} <: Number
    v::Float64;
    SimFP{NB,NE,EBIAS,RM}(x::Float64) where {NB,NE,EBIAS,RM} = new{NB,NE,EBIAS,RM}(quantize(x, SimFP{NB,NE,EBIAS,RM}));
end

function Float64(x::SimFP)
return x.v
end

function convert(::Type{Float64}, x::SimFP)
return x.v
end

function convert(::Type{SimFP{NB,NE,EBIAS,RM}}, x::SimFP) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(x.v)
end

function convert(::Type{SimFP{NB,NE,EBIAS,RM}}, x::Float64) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(x)
end

function convert(::Type{SimFP{NB,NE,EBIAS,RM}}, x) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(convert(Float64, x))
end

function (::Type{SimFP{NB,NE,EBIAS,RM}})(x) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(convert(Float64, x))
end

# function convert(::Type{T}, x::SimFP) where {T}
#   return convert(T, x.v)
# end

function zero(::Type{SimFP{NB,NE,EBIAS,RM}}) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(0.0)
end

function zero(::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(0.0)
end

function one(::Type{SimFP{NB,NE,EBIAS,RM}}) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(1.0)
end

function one(::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(1.0)
end

function typemin(::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(-floatmax(Float64))
end

function typemax(::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(floatmax(Float64))
end

function transpose(x::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
return x
end

function sqrt(x::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(sqrt(x.v))
end

function dot(x::SimFP{NB,NE,EBIAS,RM}, y::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(dot(x.v, y.v))
end

function exp(x::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(exp(x.v))
end

function expm1(x::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(expm1(x.v))
end

function log(x::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(log(x.v))
end

function isless(x::SimFP{NB,NE,EBIAS,RM}, y::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
return isless(x.v, y.v)
end

abstract type RoundingModeNearest end;
abstract type RoundingModeRandom end;

abstract type RoundingModeNearestFast end;
abstract type RoundingModeRandomFast end;

# a SimFP with NoDynamicBias in the exponent-bias slot quantizes like one with ebias = 0
function quantize(x::Float64, ::Type{SimFP{NB,NE,NoDynamicBias,RM}}) where {NB,NE,RM}
    return quantize(x, SimFP{NB,NE,0,RM});
end

function quantize(x::Float64, ::Type{SimFP{nb,ne,ebias,RoundingModeNearest}}) where {nb,ne,ebias}
    if (x == 0.0)
        return x;
    end
    # @assert(ne <= 11);
    # round the mantissa to nb bits: add half a unit in the last kept place, then mask off the dropped bits
    xi = reinterpret(Int64, x);
    xir = (xi + 1 << (51 - nb)) & (~((1 << (52 - nb)) - 1));
    if (ne < 11)
        xir_pos = xir & (~(1 << 63));
        xir_sign = xir & (1 << 63);
        bias = ebias + (1 << (ne-1)) - 1;
        qf_max = (((bias + 1 + 1023) << nb) - 1) << (52 - nb);
        qf_min = (1023 - bias + 1) << 52;
        qf_min_d2 = (1023 - bias) << 52;
        if (xir_pos > qf_max)
            # overflow: saturate at the largest representable magnitude
            return reinterpret(Float64, qf_max | xir_sign);
        elseif (xir_pos < qf_min_d2)
            # underflow past half the smallest representable magnitude: flush to zero
            return Float64(0.0);
        elseif (xir_pos < qf_min)
            # underflow: round up to the smallest representable magnitude
            return reinterpret(Float64, qf_min | xir_sign);
        end
    end
    return reinterpret(Float64, xir);
end

function quantize(x::Float64, ::Type{SimFP{nb,ne,ebias,RoundingModeRandom}}) where {nb,ne,ebias}
    if (x == 0.0)
        return x;
    end
    # @assert(ne <= 11);
    # stochastically round the mantissa to nb bits: add a uniform random offset to the
    # dropped bits, then mask them off
    mask = (1 << (52 - nb)) - 1;
    xi = reinterpret(Int64, x);
    xir = (xi + rand(0:mask)) & (~mask);
    if (ne < 11)
        xir_pos = xir & (~(1 << 63));
        xir_sign = xir & (1 << 63);
        bias = ebias + (1 << (ne-1)) - 1;
        qf_max = (((bias + 1 + 1023) << nb) - 1) << (52 - nb);
        qf_min = (1023 - bias + 1) << 52;
        if (xir_pos > qf_max)
            # overflow: saturate at the largest representable magnitude
            return reinterpret(Float64, qf_max | xir_sign);
        elseif (xir_pos < qf_min)
            # underflow: stochastically round to either the smallest representable magnitude or zero
            xfr_pos = reinterpret(Float64, xir_pos);
            ff_min = reinterpret(Float64, qf_min);
            if (xfr_pos / ff_min > rand())
                return reinterpret(Float64, qf_min | xir_sign);
            else
                return Float64(0.0)
            end
        end
    end
    return reinterpret(Float64, xir);
end

function quantize(x::Float64, ::Type{SimFP{nb,ne,ebias,RoundingModeNearestFast}}) where {nb,ne,ebias}
    # fast path: round the mantissa to nearest only, ignoring the exponent range
    xi = reinterpret(Int64, x);
    xir = (xi + 1 << (51 - nb)) & (~((1 << (52 - nb)) - 1));
    return reinterpret(Float64, xir);
end

function quantize(x::Float64, ::Type{SimFP{nb,ne,ebias,RoundingModeRandomFast}}) where {nb,ne,ebias}
    # fast path: stochastically round the mantissa only, ignoring the exponent range
    mask = (1 << (52 - nb)) - 1;
    xi = reinterpret(Int64, x);
    xir = (xi + rand(0:mask)) & (~mask);
    return reinterpret(Float64, xir);
end

function +(x::SimFP{NB,NE,EBIAS,RM}, y::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(x.v + y.v)
end

function -(x::SimFP{NB,NE,EBIAS,RM}, y::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(x.v - y.v)
end

function *(x::SimFP{NB,NE,0,RM}, y::SimFP{NB,NE,EBIASY,RM}) where {NB,NE,EBIASY,RM}
return SimFP{NB,NE,EBIASY,RM}(x.v * y.v)
end

function *(x::SimFP{NB,NE,EBIASX,RM}, y::SimFP{NB,NE,0,RM}) where {NB,NE,EBIASX,RM}
return SimFP{NB,NE,EBIASX,RM}(x.v * y.v)
end

function *(x::SimFP{NB,NE,0,RM}, y::SimFP{NB,NE,0,RM}) where {NB,NE,RM}
return SimFP{NB,NE,0,RM}(x.v * y.v)
end

function *(x::SimFP{NB,NE,EBIASX,RM}, y::SimFP{NB,NE,EBIASY,RM}) where {NB,NE,EBIASX,EBIASY,RM}
return SimFP{NB,NE,EBIASX+EBIASY,RM}(x.v * y.v)
end

function /(x::SimFP{NB,NE,EBIASX,RM}, y::SimFP{NB,NE,0,RM}) where {NB,NE,EBIASX,RM}
return SimFP{NB,NE,EBIASX,RM}(x.v / y.v)
end

function /(x::SimFP{NB,NE,EBIASX,RM}, y::SimFP{NB,NE,EBIASY,RM}) where {NB,NE,EBIASX,EBIASY,RM}
return SimFP{NB,NE,EBIASX-EBIASY,RM}(x.v / y.v)
end

function /(x::SimFP{NB,NE,EBIAS,RM}, y::Int64) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(x.v / Float64(y))
end

function -(x::SimFP{NB,NE,EBIAS,RM}) where {NB,NE,EBIAS,RM}
return SimFP{NB,NE,EBIAS,RM}(-x.v)
end

function +(::Type{NoDynamicBias}, ::Type{NoDynamicBias})
return NoDynamicBias;
end

function -(::Type{NoDynamicBias}, ::Type{NoDynamicBias})
return NoDynamicBias;
end

function rescale_bias(::Type{SimFP{NB,NE,EBIAS,RM}}, hint) where {NB,NE,EBIAS,RM}
hint64 = Float64(hint);
eb = -Int64(floor(log2(hint64)));
println("based on hint ($(hint64)), setting exponent bias to:$(eb)")
return SimFP{NB,NE,eb,RM};
end

function rescale_bias(::Type{SimFP{NB,NE,NoDynamicBias,RM}}, hint) where {NB,NE,RM}
return SimFP{NB,NE,NoDynamicBias,RM};
end

function rescale_bias(::Type{Float16}, hint)
return Float16
end

function rescale_bias(::Type{Float32}, hint)
return Float32
end

function rescale_bias(::Type{Float64}, hint)
return Float64
end

Out[2]:
rescale_bias (generic function with 5 methods)
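
Before moving on, it is worth sanity-checking what the quantizer does. The snippet below is an added illustration, not one of the original cells; the names F16near and F8near are just local aliases chosen for this example (the stochastic-rounding types would print slightly different values on each run).

F16near = SimFP{10,5,0,RoundingModeNearest};   # same bit layout as IEEE half precision
F8near  = SimFP{4,5,0,RoundingModeNearest};    # 4 mantissa bits, 5 exponent bits
println(Float64(F16near(0.1)));   # close to 0.1, but not exact
println(Float64(F8near(0.1)));    # visibly coarser
println(Float64(F8near(1e6)));    # saturates at the largest representable value
println(Float64(F8near(1e-9)));   # flushes to zero: below the representable range
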
In [3]:
# generate the data
Random.seed!(424242)
d = 50;
N = 10000;
wtrue = randn(d);
wtrue = d^2 * wtrue / norm(wtrue);
X = randn(N, d);
X ./= sqrt.(sum(X.^2; dims=2));
Y = (1 ./ (1 .+ exp.(-X * wtrue)) .>= rand(N)) .* 2 .- 1;
sigma = 1e-3;

In [4]:
w0 = zeros(d);

In [5]:
function sgd_logreg(w0::Array{T,1}, alpha::T, gamma::T, X::Array{T,2}, Y::Array{T,1}, sigma::T, niters::Int64, wopt::Array{T,1}) where {T<:Number}
    w = w0
    (N, d) = size(X)
    dist2_to_optimum = zeros(niters)
    for k = 1:niters
        i = rand(1:N)
        xi = X[i,:];
        yi = Y[i];
        w += -alpha * sigma * w + alpha * xi * yi / (one(alpha) .+ exp.(yi * dot(xi, w)));
        dist2_to_optimum[k] = sum((w - wopt).^2);
    end
    return (w, dist2_to_optimum);
end

function sgd_logreg(::Type{T}, w0, alpha, gamma, X, Y, sigma, niters, wopt) where {T<:Number}
    # run SGD with the data, iterates, and hyperparameters held in (simulated) low-precision type T,
    # then convert the results back to Float64 for plotting
    (w, dist2_to_optimum) = sgd_logreg(T.(w0), T(alpha), T(gamma), T.(X), T.(Y), T.(sigma), niters, T.(wopt))
    return (Float64.(w), Float64.(dist2_to_optimum));
end

Out[5]:
sgd_logreg (generic function with 2 methods)
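
For reference, the update implemented in the inner loop above is the standard SGD step for the $\ell_2$-regularized logistic regression objective from the earlier demos (it matches the full gradient used by the Newton solver in the next cell):

$$w_{t+1} = w_t - \alpha \left( \sigma w_t - \frac{y_i x_i}{1 + \exp(y_i x_i^T w_t)} \right),$$

which is a stochastic gradient step for $f(w) = \frac{1}{N} \sum_{i=1}^N \log\left(1 + \exp(-y_i x_i^T w)\right) + \frac{\sigma}{2} \|w\|^2$.
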
In [6]:
# find the true minimum
function newton_logreg(w0, X, Y, sigma, niters)
    N = size(X, 1);
    d = size(X, 2);
    w = w0;
    for k = 1:niters
        g = -X' * (Y ./ (1 .+ exp.(Y .* (X * w)))) + N * sigma * w;
        H = X' * ((1 ./ ((1 .+ exp.(Y .* (X * w))) .* (1 .+ exp.(-Y .* (X * w))))) .* X) + N * sigma * I;
        w = w - H \ g;
        # println("gradient norm: $(norm(g))")
    end
    return w
end

wopt = newton_logreg(wtrue, X, Y, sigma, 20);

In [7]:
# define low-precision types
Float16Rand = SimFP{10,5,0,RoundingModeRandom};
Float16Near = SimFP{10,5,0,RoundingModeNearest};

In [8]:
alpha = 0.1;
Random.seed!(123456);
(w, dto) = sgd_logreg(Float32, w0, alpha, sigma, X, Y, sigma, 100000, wopt);
Random.seed!(123456);
(w2, dto2) = sgd_logreg(Float16Rand, w0, alpha, sigma, X, Y, sigma, 100000, wopt);
Random.seed!(123456);
(w3, dto3) = sgd_logreg(Float16Near, w0, alpha, sigma, X, Y, sigma, 100000, wopt);

In [9]:
plot(dto; label="Float32");
plot(dto2; label="Float16 (random)");
plot(dto3; label="Float16 (nearest)");
legend();
ylim([0,10]);

In [10]:
alpha = 0.1;
Float12Rand = SimFP{6,5,0,RoundingModeRandom};
Float12Near = SimFP{6,5,0,RoundingModeNearest};
Random.seed!(123456);
(w, dto) = sgd_logreg(Float32, w0, alpha, sigma, X, Y, sigma, 100000, wopt);
Random.seed!(123456);
(w2, dto2) = sgd_logreg(Float12Rand, w0, alpha, sigma, X, Y, sigma, 100000, wopt);
Random.seed!(123456);
(w3, dto3) = sgd_logreg(Float12Near, w0, alpha, sigma, X, Y, sigma, 100000, wopt);

In [11]:
plot(dto; label="Float32");
plot(dto2; label="Float12 (random)");
plot(dto3; label="Float12 (nearest)");
legend();
ylim([0,10]);

In [12]:
alpha = 1.0;
Float8Rand = SimFP{4,5,0,RoundingModeRandom};
Float8Near = SimFP{4,5,0,RoundingModeNearest};
Random.seed!(123456);
(w, dto) = sgd_logreg(Float32, w0, alpha, sigma, X, Y, sigma, 100000, wopt);
Random.seed!(123456);
(w2, dto2) = sgd_logreg(Float8Rand, w0, alpha, sigma, X, Y, sigma, 100000, wopt);
Random.seed!(123456);
(w3, dto3) = sgd_logreg(Float8Near, w0, alpha, sigma, X, Y, sigma, 100000, wopt);

In [13]:
plot(dto; label="Float32");
plot(dto2; label="Float8 (random)");
plot(dto3; label="Float8 (nearest)");
legend();


# Effects of Scan Order¶

Consider a simple linear regression task. Our objective is of the form

$$f(w) = \frac{1}{2n} \sum_{i=1}^n (x_i^T w - y_i)^2.$$

Let's suppose we are working in $w \in \mathbb{R}^2$, and pick a specific set of training examples

$$x_i = \left[ \begin{array}{c} \cos(\pi i / n) \\ \sin(\pi i / n) \end{array} \right], \hspace{2em} y_i = 0.$$

Our stochastic gradient updates will be

$$w_{t+1} = w_t - \alpha x_i (x_i^T w_t - y_i).$$

Let's pick $\alpha = 1$ and see what happens when we vary the scan order. (This is the normalized tight frame example from here: https://people.eecs.berkeley.edu/~brecht/papers/12.Recht.Re.amgm.pdf.)
In [14]:
n = 99;
d = 2;
xs = [[cos(pi * i / n), sin(pi * i / n)] for i = 1:n];
ys = [0.0 for i = 1:n];
w0 = rand(d);
alpha = 1.0;
# true solution is just zero
w_opt = zeros(d);

In [15]:
function sgd_sequential_scan(xs, ys, w0, alpha, w_opt, num_iters)
    n = length(xs);
    w = w0;
    dists_to_opt = Float64[];
    for t = 1:num_iters
        i = mod1(t,n);
        xi = xs[i];
        yi = ys[i];
        w -= alpha * xi * (dot(xi, w) - yi);
        push!(dists_to_opt, norm(w - w_opt));
    end
    return dists_to_opt;
end

Out[15]:
sgd_sequential_scan (generic function with 1 method)
In [16]:
function sgd_random_scan(xs, ys, w0, alpha, w_opt, num_iters)
    n = length(xs);
    w = w0;
    dists_to_opt = Float64[];
    for t = 1:num_iters
        i = rand(1:n);
        xi = xs[i];
        yi = ys[i];
        w -= alpha * xi * (dot(xi, w) - yi);
        push!(dists_to_opt, norm(w - w_opt));
    end
    return dists_to_opt;
end

Out[16]:
sgd_random_scan (generic function with 1 method)
In [17]:
num_iters = 500;
dto_seq_scan = sgd_sequential_scan(xs, ys, w0, alpha, w_opt, num_iters);
dto_rnd_scan = sgd_random_scan(xs, ys, w0, alpha, w_opt, num_iters);

In [18]:
semilogy(dto_seq_scan; label="sequential scan");
semilogy(dto_rnd_scan; label="random scan");
legend(loc=3);
xlabel("iteration");
ylabel("distance to optimum");
title("Comparison of Different Scan Orders for SGD");


### Question: is sequential scan generally bad for this problem, or was this order just particularly bad?¶

Let's investigate by looking at random permutations of the input data.

In [19]:
dto_seq_scan_ro = [sgd_sequential_scan(shuffle(xs), ys, w0, alpha, w_opt, num_iters) for i = 1:5];

In [20]:
semilogy(dto_seq_scan; label="sequential scan");
semilogy(dto_rnd_scan; label="random scan");
for k = 1:5
    semilogy(dto_seq_scan_ro[k]; label="sequential scan (random order $k)");
end

legend(loc=3);
xlabel("iteration");
ylabel("distance to optimum");
title("Comparison of Different Scan Orders for SGD");


## Another Scan-Order Example¶

Consider another simple linear regression task, this time in just one dimension. Our objective is of the form

$$f(w) = \frac{1}{2n} \sum_{i=1}^n (w - y_i)^2.$$

And let's pick a set of training examples $y_i$ at random such that their sum is $0$, meaning that the optimum is $w = 0$. Our stochastic gradient updates will be

$$w_{t+1} = w_t - \alpha_t (w_t - y_i) = (1 - \alpha_t) w_t + \alpha_t y_i.$$
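
As a rough added remark on why scan order can matter here: if the step size is (locally) constant at $\alpha$ over one pass through indices $i_1, \dots, i_n$, iterating the update gives

$$w_{t+n} = (1-\alpha)^n w_t + \alpha \sum_{k=1}^{n} (1-\alpha)^{n-k} y_{i_k} = (1-\alpha)^n w_t + \alpha \sum_{k=1}^{n} y_{i_k} + O(\alpha^2).$$

When every example is visited exactly once per pass (sequential scan or random reshuffling), $\sum_{k} y_{i_k} = \sum_{i} y_i = 0$, so the noise injected per epoch is only $O(\alpha^2)$, whereas sampling with replacement leaves an $O(\alpha)$ noise term.
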

In [21]:
n = 50;
ys = randn(n); ys .-= mean(ys);
w0 = 10.0;

In [22]:
function sgd_random_scan_diminishing(ys, w0, alpha0, gamma, num_iters)
    n = length(ys);
    w = w0;
    dists_to_opt = Float64[];
    for t = 1:num_iters
        alpha = alpha0/(1 + (t-1) * gamma);
        i = rand(1:n);                 # sample an example with replacement
        yi = ys[i];
        w -= alpha * (w - yi);
        push!(dists_to_opt, abs(w));
    end
    return dists_to_opt;
end

function sgd_sequential_scan_diminishing(ys, w0, alpha0, gamma, num_iters)
    n = length(ys);
    w = w0;
    dists_to_opt = Float64[];
    p = randperm(n);                   # fix one permutation and reuse it for every pass
    for t = 1:num_iters
        alpha = alpha0/(1 + (t-1) * gamma);
        i = p[mod1(t,n)];
        yi = ys[i];
        w -= alpha * (w - yi);
        push!(dists_to_opt, abs(w));
    end
    return dists_to_opt;
end

function sgd_reshuffle_scan_diminishing(ys, w0, alpha0, gamma, num_iters)
    n = length(ys);
    w = w0;
    dists_to_opt = Float64[];
    p = Int64[]
    for t = 1:num_iters
        alpha = alpha0/(1 + (t-1) * gamma);
        if mod1(t,n) == 1
            p = randperm(n);           # draw a fresh permutation at the start of each pass
        end
        i = p[mod1(t,n)];
        yi = ys[i];
        w -= alpha * (w - yi);
        push!(dists_to_opt, abs(w));
    end
    return dists_to_opt;
end

Out[22]:
sgd_reshuffle_scan_diminishing (generic function with 1 method)
In [23]:
num_iters = 10000000;
alpha0 = 0.5;
gamma = 0.2;
nruns = 10;

dto_seq_scan_2 = mean(sgd_sequential_scan_diminishing(ys, w0, alpha0, gamma, num_iters) for k = 1:nruns);
dto_rnd_scan_2 = mean(sgd_random_scan_diminishing(ys, w0, alpha0, gamma, num_iters) for k = 1:nruns);
dto_rr_scan_2 = mean(sgd_reshuffle_scan_diminishing(ys, w0, alpha0, gamma, num_iters) for k = 1:nruns);

In [24]:
loglog(dto_seq_scan_2[1:50:end]; label="sequential scan");
loglog(dto_rnd_scan_2[1:50:end]; label="random scan");
loglog(dto_rr_scan_2[1:50:end]; label="random reshuffling");

legend(loc=3);
xlabel("epoch");
ylabel("distance to optimum");
title("Comparison of Different Scan Orders for SGD");


## Observe the asymptotically faster convergence rate with the shuffling methods!¶
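
To make this quantitative, here is a small added sketch (not part of the original notebook) that estimates the empirical rate exponent $p$ in $\mathrm{dist} \approx C \, t^{-p}$ by a least-squares fit in log-log space over the tail of a run; a larger $p$ means faster asymptotic convergence. It assumes the recorded distances stay strictly positive.

function estimate_rate(dists; tail_frac = 0.5)
    # fit log(dist) ≈ log(C) - p * log(t) on the last tail_frac of the iterations
    t0 = max(1, ceil(Int, length(dists) * (1 - tail_frac)));
    ts = log.(t0:length(dists));
    ds = log.(dists[t0:end]);
    A = hcat(ones(length(ts)), ts);
    coeffs = A \ ds;      # least-squares fit: [intercept, slope]
    return -coeffs[2];    # slope is -p
end

# e.g. compare estimate_rate(dto_seq_scan_2), estimate_rate(dto_rnd_scan_2),
# and estimate_rate(dto_rr_scan_2) from the cells above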
