Powerful GPUs and Tensor Computation: Comparing Matrix-Multiplication Speed in PyTorch, NumPy, and Fortran
Greetings!
Matrix multiplication is a core step in numerical computing, and it is well known that the native Python environment is poorly suited to this kind of high-throughput numerical work.
The author believes it is worth accelerating this kind of core computation. Three commonly used approaches are compared here: PyTorch, NumPy, and Fortran.
Code:
The benchmark multiplies a column vector of length 25000 by its transposed row vector (an outer product), producing a 25000×25000 two-dimensional matrix. All code uses double-precision floating-point arithmetic throughout.
PyTorch
import time

import numpy as np
import torch

array_len = 25000
my_list = list(range(1, array_len + 1))
# column vector of shape (25000, 1), double precision
my_array = np.array(my_list, dtype=np.float64).reshape(array_len, 1)
my_tensor = torch.DoubleTensor(my_array)  # 64-bit double precision
if torch.cuda.is_available():
    my_tensor = my_tensor.cuda()
# ------torch code------
start = time.time()
# out_array = my_tensor @ my_tensor.T
out_array = torch.mm(my_tensor, my_tensor.T)
end = time.time()
print("TORCH: {}s".format(end - start))
print(np.shape(out_array))
print(out_array.dtype)
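One caveat for anyone reproducing the GPU number (this note is mine, not the original author's): CUDA kernels launch asynchronously, so time.time() can return before the multiplication has actually finished. A synchronized variant of the timing would look like this:
if torch.cuda.is_available():
    torch.cuda.synchronize()  # finish any pending GPU work first
start = time.time()
out_array = torch.mm(my_tensor, my_tensor.T)
if torch.cuda.is_available():
    torch.cuda.synchronize()  # wait for the matmul kernel to complete
end = time.time()
print("TORCH (synchronized): {}s".format(end - start))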
NumPy
start = time.time()
# out_array = my_array.dot(my_array.T)
out_array = np.matmul(my_array, my_array.T)
end = time.time()
print("NUMPY: {}s".format(end - start))
print(np.shape(out_array))
print(type(out_array[100][100]))
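As a side note (mine, not from the original post), NumPy also exposes the outer product directly via np.outer, which flattens its inputs, so no reshaping into a column is needed; the result should match the matmul above:
out_array_outer = np.outer(my_array, my_array)  # also (25000, 25000), float64
print(np.allclose(out_array, out_array_outer))  # expected: True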
Fortran
# 'speed' is the Fortran subroutine below, wrapped as a Python extension module
# (the post does not say how; f2py is the usual route -- see the build note below)
start = time.time()
out_array = speed.speed(len_in=array_len, len_out=array_len, arr_in=my_array)
end = time.time()
print("FORTRAN: {}s".format(end - start))
print(np.shape(out_array))
print(type(out_array[100][100]))
subroutine speed(len_in, len_out, arr_in, arr_out)
    implicit none
    integer, intent(in) :: len_in, len_out
    real(kind=8), dimension(len_in, 1), intent(in) :: arr_in
    real(kind=8), dimension(len_out, len_out), intent(out) :: arr_out
    arr_out = matmul(arr_in, transpose(arr_in))
end subroutine speed
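The post does not show how the Fortran routine becomes importable from Python; the usual route is f2py, which ships with NumPy. A minimal sketch, assuming the subroutine is saved as speed.f90 (the filename is my assumption):
# build the extension module (run in a shell):
#   python -m numpy.f2py -c speed.f90 -m speed
# afterwards it can be imported and called from Python:
import speed
out_array = speed.speed(len_in=array_len, len_out=array_len, arr_in=my_array)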
Results:
# GPU acceleration (NVIDIA RTX 2070)
TORCH: 0.0030999183654785156s
torch.Size([25000, 25000])
torch.float64
# CPU only
TORCH: 0.4056272506713867s
torch.Size([25000, 25000])
torch.float64
# CPU only
FORTRAN: 0.8992853164672852s
(25000, 25000)
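Finally, a sanity check worth running when comparing backends (my addition; the out_array_torch / out_array_numpy / out_array_fortran names are hypothetical, assuming each benchmark stored its result in a separate variable): all three routes should agree up to floating-point rounding.
torch_result = out_array_torch.cpu().numpy()  # move off the GPU if needed
print(np.allclose(torch_result, out_array_numpy))  # expected: True
print(np.allclose(out_array_fortran, out_array_numpy))  # expected: True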