Let's test the effect of parallelism on some numerical linear-algebra operations in Python.

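Start by telling numpy to use 4 threads. The thread-count environment variables are set before numpy is imported so that the underlying BLAS library picks them up.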

In [1]:
import os

# Cap the thread pools of the common OpenMP / MKL / OpenBLAS backends at
# 4 threads. This happens before numpy is imported so the backend can pick
# the values up when it is loaded.
os.environ.update(
    {
        "OMP_NUM_THREADS": "4",
        "MKL_NUM_THREADS": "4",
        "OPENBLAS_NUM_THREADS": "4",
    }
)

import numpy
import time
In [2]:
# Operands shared by the benchmarks below:
#   A  - square n x n matrix (n*n float64 values, i.e. 2 GiB)
#   X  - n x p block of columns for the matrix-matrix product
#   X1 - single n x 1 column for the matrix-vector product
#   Y, Y1 - preallocated result buffers, filled through numpy.dot(..., out=)
n = 16 * 1024
p = 32
A = numpy.random.rand(n, n)
X = numpy.random.rand(n, p)
X1 = numpy.random.rand(n, 1)
Y = numpy.zeros(X.shape)
Y1 = numpy.zeros(X1.shape)
In [5]:
# Time the matrix-matrix product of A with X (4-thread run), averaged over
# n_repeats calls. The result is written into the preallocated buffer Y so
# no output allocation is included in the measurement.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
n_repeats = 10
t = time.perf_counter()
for i in range(n_repeats):
    numpy.dot(A, X, out=Y)
elapsed_4t_mm = time.perf_counter() - t  # total seconds over all repetitions
print("time elapsed: %f" % (elapsed_4t_mm / n_repeats))
time elapsed: 0.387947
In [6]:
# Time the matrix-vector product of A with X1 (4-thread run), averaged over
# n_repeats calls. The result is written into the preallocated buffer Y1.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
n_repeats = 10
t = time.perf_counter()
for i in range(n_repeats):
    numpy.dot(A, X1, out=Y1)
elapsed_4t_mv = time.perf_counter() - t  # total seconds over all repetitions
print("time elapsed: %f" % (elapsed_4t_mv / n_repeats))
time elapsed: 0.129763

RESTART THE KERNEL, then repeat the same benchmark with numpy limited to 1 thread.

In [1]:
import os

# Restrict the common OpenMP / MKL / OpenBLAS backends to a single thread.
# This happens before numpy is imported so the backend can pick the values
# up when it is loaded.
os.environ.update(
    {
        "OMP_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
        "OPENBLAS_NUM_THREADS": "1",
    }
)

import numpy
import time
In [2]:
# Operands shared by the benchmarks below (same sizes as the 4-thread run):
#   A  - square n x n matrix (n*n float64 values, i.e. 2 GiB)
#   X  - n x p block of columns for the matrix-matrix product
#   X1 - single n x 1 column for the matrix-vector product
#   Y, Y1 - preallocated result buffers, filled through numpy.dot(..., out=)
n = 16 * 1024
p = 32
A = numpy.random.rand(n, n)
X = numpy.random.rand(n, p)
X1 = numpy.random.rand(n, 1)
Y = numpy.zeros(X.shape)
Y1 = numpy.zeros(X1.shape)
In [3]:
# Time the matrix-matrix product of A with X (1-thread run), averaged over
# n_repeats calls. The result is written into the preallocated buffer Y.
# The result variable is named elapsed_1t_mm (not elapsed_4t_mm as in the
# 4-thread run) because this kernel was started with the thread count set to 1.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
n_repeats = 10
t = time.perf_counter()
for i in range(n_repeats):
    numpy.dot(A, X, out=Y)
elapsed_1t_mm = time.perf_counter() - t  # total seconds over all repetitions
print("time elapsed: %f" % (elapsed_1t_mm / n_repeats))
time elapsed: 0.616824
In [4]:
# Time the matrix-vector product of A with X1 (1-thread run), averaged over
# n_repeats calls. The result is written into the preallocated buffer Y1.
# The result variable is named elapsed_1t_mv (not elapsed_4t_mv as in the
# 4-thread run) because this kernel was started with the thread count set to 1.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
n_repeats = 10
t = time.perf_counter()
for i in range(n_repeats):
    numpy.dot(A, X1, out=Y1)
elapsed_1t_mv = time.perf_counter() - t  # total seconds over all repetitions
print("time elapsed: %f" % (elapsed_1t_mv / n_repeats))
time elapsed: 0.131748
In [ ]:
 
In [ ]: