A simple demonstration of the effects of writing code that takes parallelism into account

using LinearAlgebra;

# Problem size: A is m×n, B is n×p, and the product C is m×p.
m, n, p = 1024, 1024, 1024;

# Random single-precision operands (A is drawn before B).
A = randn(Float32, m, n);
B = randn(Float32, n, p);

# Preallocated output buffer that the in-place multiply writes into.
C = fill(zero(Float32), m, p);

# Time the library in-place multiply, which stores A * B in C.
@time mul!(C, A, B);

  0.011916 seconds (4 allocations: 160 bytes)

"""
    my_mul!(C, A, B)

Compute the matrix product `A * B` with a plain triple loop, store it in `C`, and
return `C`.

# Arguments
- `C::Matrix{Float32}`: output buffer of size `(size(A, 1), size(B, 2))`; overwritten.
- `A::Matrix{Float32}`: left operand.
- `B::Matrix{Float32}`: right operand; `size(B, 1)` must equal `size(A, 2)`.

# Throws
- `DimensionMismatch`: if the inner dimensions of `A` and `B` differ, or if `C` does
  not have the shape of the product.

NOTE(review): the row-by-row loop order is left untouched; it appears to be the
intentionally naive baseline that is timed against `mul!` — confirm before optimizing.
"""
function my_mul!(C::Matrix{Float32}, A::Matrix{Float32}, B::Matrix{Float32})
    m, n = size(A)
    nb, p = size(B)

    # Validate shapes up front: without this, a B with fewer rows than A has columns
    # yields a silently wrong product, and a wrongly sized C is only partially written.
    n == nb || throw(DimensionMismatch(
        "A has dimensions ($m,$n) but B has dimensions ($nb,$p)"))
    size(C) == (m, p) || throw(DimensionMismatch(
        "C has dimensions $(size(C)), needs ($m,$p)"))

    for i in 1:m
        for k in 1:p
            # `0.0` is a Float64 literal, so each Float32 product is added to a
            # Float64 accumulator and only converted to Float32 when stored in C.
            acc = 0.0
            for j in 1:n
                acc += A[i, j] * B[j, k]
            end
            C[i, k] = acc
        end
    end
    return C
end

my_mul! (generic function with 2 methods)

# Time the hand-written triple-loop multiply on the same C, A and B that were passed to
# `mul!`, for comparison. NOTE(review): if this is the first call of `my_mul!`, the
# reported time may include compilation — confirm by running it twice.
@time my_mul!(C, A, B);

  2.036659 seconds (4 allocations: 160 bytes)