强大的GPU与张量计算：Pytorch、Numpy与Fortran进行矩阵相乘的速度对比

1 年前

ALAN

Z. C. Song

Greetings！

矩阵相乘一直是数值计算中非常核心的一步，众所周知原生的Python环境并不适合这种高通量的数值计算。

作者认为，有必要对这类核心计算进行加速。目前可以使用三种常用的方法： Pytorch 、 Numpy 与 Fortran 。

代码部分：

实现了一个简单的功能：将长度为25000的列向量（25000×1）与其转置得到的行向量（1×25000）做矩阵乘法（即外积，而非叉乘），生成25000×25000的二维矩阵。所有代码统一使用双精度浮点数计算。

Pytorch

array_len = 25000
my_list = list(range(1, array_len + 1))
# Column vector of shape (array_len, 1) in float64. The other snippets read
# `my_array`, and torch.mm needs 2-D operands, so build it here from my_list.
my_array = np.array(my_list, dtype=np.float64).reshape(array_len, 1)
my_tensor = torch.tensor(my_array, dtype=torch.float64)  # 64 bit
use_cuda = torch.cuda.is_available()
if use_cuda:
    # Move the existing tensor instead of constructing a second one.
    my_tensor = my_tensor.cuda()
    # Make sure the host-to-device copy has finished before the clock starts.
    torch.cuda.synchronize()
# ------torch code------
start = time.time()
# out_array = my_tensor @ my_tensor.T
out_array = torch.mm(my_tensor, my_tensor.T)
if use_cuda:
    # CUDA kernels are launched asynchronously; without this barrier the
    # timer would only capture the kernel launch, not the multiplication.
    torch.cuda.synchronize()
end = time.time()
print("TORCH: {}s".format(end - start))
print(np.shape(out_array))
print(out_array.dtype)

Numpy

# Time only the product itself; my_array is assumed to be the 2-D float64
# column vector prepared earlier in the script (TODO confirm).
start = time.time()
# Equivalent spellings: my_array.dot(my_array.T), np.matmul(my_array, my_array.T)
out_array = my_array @ my_array.T
end = time.time()
elapsed = end - start
print("NUMPY: {}s".format(elapsed))
print(out_array.shape)
# Element type of the result (single lookup instead of chained indexing).
print(type(out_array[100, 100]))

Fortran

# `speed` is presumably the compiled Python extension wrapping the Fortran
# subroutine `speed` (e.g. built with f2py) -- confirm against the build step.
start = time.time()
out_array = speed.speed(arr_in=my_array, len_in=array_len, len_out=array_len)
end = time.time()
elapsed = end - start
print("FORTRAN: {}s".format(elapsed))
print(np.shape(out_array))
# Element type of the returned array (single lookup instead of chained indexing).
print(type(out_array[100, 100]))

!> Outer product of a column vector with itself: arr_out = arr_in * transpose(arr_in).
!>
!> len_in  : number of rows of arr_in
!> len_out : extent of both dimensions of arr_out; must equal len_in
!> arr_in  : (len_in, 1) column vector, read only
!> arr_out : (len_out, len_out) result
!>
!> NOTE(review): real(kind=8) is compiler-specific (double precision on the
!> common compilers); it is kept as-is so the type seen by the Python wrapper
!> does not change.
subroutine speed(len_in, len_out, arr_in, arr_out)
    implicit none
    integer, intent(in) :: len_in, len_out
    real(kind=8), dimension(len_in, 1), intent(in) :: arr_in
    real(kind=8), dimension(len_out, len_out), intent(out) :: arr_out

    ! matmul yields a (len_in, len_in) array; assigning it to an arr_out of a
    ! different shape is nonconforming and could write past the buffer, so
    ! fail loudly instead.
    if (len_in /= len_out) error stop "speed: len_out must equal len_in"

    arr_out = matmul(arr_in, transpose(arr_in))
end subroutine speed

结果部分：

# GPU acceleration（Nvidia RTX 2070）。注意：CUDA 核函数是异步执行的，若计时结束前没有调用 torch.cuda.synchronize()，下面的时间主要反映内核启动开销，而不是完整的计算耗时
TORCH: 0.0030999183654785156s
torch.Size([25000, 25000])
torch.float64
# CPU only
TORCH: 0.4056272506713867s
torch.Size([25000, 25000])
torch.float64
# CPU only
FORTRAN: 0.8992853164672852s
(25000, 25000)