A simple demonstration of the effects of writing code that takes parallelism into account

In [40]:
using LinearAlgebra;

# Problem dimensions: A is m×n and B is n×p, so their product is m×p.
m, n, p = 1024, 1024, 1024;

# Single-precision operands filled with standard-normal random values.
A = randn(Float32, m, n);
B = randn(Float32, n, p);

# Preallocated, zero-initialised output buffer that the multiply routines write into.
C = zeros(Float32, m, p);
In [52]:
# Baseline: time the in-place `mul!` made available by `using LinearAlgebra`;
# the product of A and B is written into the preallocated C.
# Note: `@time` on a first call also includes compilation, so re-run the cell
# to get a steady-state measurement.
@time mul!(C, A, B);
  0.011916 seconds (4 allocations: 160 bytes)
In [53]:
"""
    my_mul!(C, A, B)

Compute the matrix product `A * B` with three plain nested loops, store it in `C`,
and return `C`.

This is the straightforward textbook formulation: every entry `C[i,k]` is the dot
product of row `i` of `A` with column `k` of `B`.

# Arguments
- `C::Matrix{Float32}`: output buffer of size `(m, p)`; every entry is overwritten.
- `A::Matrix{Float32}`: left operand of size `(m, n)`.
- `B::Matrix{Float32}`: right operand of size `(n, p)`.

# Throws
- `DimensionMismatch`: if the number of columns of `A` differs from the number of rows
  of `B`, or if `C` is not of size `(m, p)`. The check happens before `C` is modified.
"""
function my_mul!(C::Matrix{Float32}, A::Matrix{Float32}, B::Matrix{Float32})
    (m, n) = size(A);
    (nB, p) = size(B);

    # Validate shapes up front. Without this, a B with fewer rows than A has columns
    # would silently yield a wrong product, and a wrongly sized C would either be only
    # partially filled or fail with a BoundsError after being partly overwritten.
    if n != nB
        throw(DimensionMismatch(string(
            "A has size (", m, ", ", n, ") but B has size (", nB, ", ", p, ")")));
    end
    if size(C) != (m, p)
        throw(DimensionMismatch(string(
            "C has size ", size(C), " but the product has size (", m, ", ", p, ")")));
    end

    for i = 1:m
        for k = 1:p
            # The literal 0.0 is a Float64, so the running sum is kept in double
            # precision and only converted to Float32 when stored into C below.
            acc = 0.0;
            for j = 1:n
                acc += A[i,j] * B[j,k];
            end
            C[i,k] = acc;
        end
    end
    return C;
end
Out[53]:
my_mul! (generic function with 2 methods)
In [56]:
# Time the hand-written triple-loop `my_mul!` on the same C, A and B, for comparison
# with the `mul!` timing obtained earlier in the notebook.
@time my_mul!(C, A, B);
  2.036659 seconds (4 allocations: 160 bytes)
In [ ]:

In [ ]: