我试图使用 __device__ 变量(而不是用 cudaMalloc 动态分配内存)来编写一个 GEMM(矩阵乘法)程序,但它一直报错:在最后一处调用 cudaDeviceSynchronize() 的那一行出现了 GPUassert:非法内存访问。我已经试过使用 cudaMalloc 的版本,那个版本运行正常。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>

// Wraps a CUDA runtime call: forwards its cudaError_t result, together with
// the call site's file and line, to gpuAssert.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Reports a failed CUDA call on stderr.
//   code  - status returned by the CUDA runtime call
//   file  - source file of the call site (from __FILE__)
//   line  - source line of the call site (from __LINE__)
//   abort - when true (the default), terminate the process with `code`
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Matrix dimensions: A is M x K, B is K x N, C is M x N.
#define M 3
#define N 3
#define K 3

using namespace std;

// Statically declared device-side storage for the three matrices.
__device__ double A_dev[M * K];
__device__ double B_dev[K * N];
__device__ double C_dev[M * N];

// Each thread computes one element of C = A * B.
//   A, B, C - pointers the kernel dereferences for the input and output matrices
//   m, n, k - dimensions; `m` is not referenced in the body
// The thread's x index selects the row and its y index selects the column.
// There is no bounds check, so the launch must supply exactly one thread per
// element of C.
__global__ void gemm(double* A, double* B, double* C, int m, int n, int k)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    int i = x * n + y;  // flat index of C(x, y), rows of length n
    double sum = 0.0;
    for (int j = 0; j < k; j++)
    {
        // A(x, j) with rows of length k, times B(j, y) with rows of length n.
        sum += A[x * k + j] * B[n * j + y];
    }
    C[i] = sum;
    printf("The value is %f", C[i]);
}

// Fills the host matrices, copies them into the __device__ arrays, launches
// gemm on a single 3x3 block, and copies the result back to the host.
int main(void)
{
    double A_h[M * K];
    double B_h[K * N];
    double C_h[M * N];

    // One loop bound (M*K) is used for all three arrays; with M = N = K = 3
    // they all have 9 elements.
    for (int i = 0; i < M*K; i++)
    {
        A_h[i] = (double)i;
        B_h[i] = (double)i;
        C_h[i] = 0.0;
    }

    gpuErrchk(cudaMemcpyToSymbol(A_dev, A_h, M * K * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(B_dev, B_h, K * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(C_dev, C_h, M * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    dim3 dimGrid(1, 1, 1);
    dim3 dimBlock(3, 3, 1);

    // NOTE: this launch is the defect the question is about, kept as posted.
    // A_dev, B_dev and C_dev are __device__ symbols; naming them in host code
    // does not produce a device pointer the kernel can dereference, so the
    // kernel faults and the error surfaces at the cudaDeviceSynchronize()
    // below as an illegal memory access.
    gemm <<<dimGrid, dimBlock >>> (A_dev, B_dev, C_dev, 3, 3, 3);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpyFromSymbol(C_h, C_dev, M * N * sizeof(double), 0, cudaMemcpyDeviceToHost));
    return 0;
}
发布于 2020-12-14 18:00:41
当使用 __device__ 变量时,它们本质上处于全局范围,我们不将这些变量作为内核参数传递。在主机代码中,__device__ 变量的名字并不是可供内核解引用的设备指针,因此把它们当作指针参数传给内核会导致非法内存访问。您可以在内核代码中直接使用这些变量,而不必为它们设置内核参数。
__device__
如果您对代码进行了以下更改,它将无错误地运行:
#include <iostream>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>

// Wraps a CUDA runtime call: forwards its cudaError_t result, together with
// the call site's file and line, to gpuAssert.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Reports a failed CUDA call on stderr.
//   code  - status returned by the CUDA runtime call
//   file  - source file of the call site (from __FILE__)
//   line  - source line of the call site (from __LINE__)
//   abort - when true (the default), terminate the process with `code`
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Matrix dimensions: A is M x K, B is K x N, C is M x N.
#define M 3
#define N 3
#define K 3

using namespace std;

// Statically declared device-side storage for the three matrices. The kernel
// refers to these by name; the host reaches them through cudaMemcpyToSymbol.
__device__ double A_dev[M * K];
__device__ double B_dev[K * N];
__device__ double C_dev[M * N];

// Each thread computes one element of C_dev = A_dev * B_dev, reading and
// writing the __device__ arrays directly instead of taking pointer arguments.
//   m, n, k - dimensions; `m` is not referenced in the body
// The thread's x index selects the row and its y index selects the column.
// There is no bounds check, so the launch must supply exactly one thread per
// element of C_dev.
__global__ void gemm(int m, int n, int k)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    int i = x * n + y;  // flat index of C(x, y), rows of length n
    double sum = 0.0;
    for (int j = 0; j < k; j++)
    {
        // A(x, j) with rows of length k, times B(j, y) with rows of length n.
        sum += A_dev[x * k + j] * B_dev[n * j + y];
    }
    C_dev[i] = sum;
    printf("The value is %f", C_dev[i]);
}

// Fills the host matrices, copies them into the __device__ arrays, and
// launches gemm on a single 3x3 block.
int main(void)
{
    double A_h[M * K];
    double B_h[K * N];
    double C_h[M * N];

    // One loop bound (M*K) is used for all three arrays; with M = N = K = 3
    // they all have 9 elements.
    for (int i = 0; i < M*K; i++)
    {
        A_h[i] = (double)i;
        B_h[i] = (double)i;
        C_h[i] = 0.0;
    }

    gpuErrchk(cudaMemcpyToSymbol(A_dev, A_h, M * K * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(B_dev, B_h, K * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(C_dev, C_h, M * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    dim3 dimGrid(1, 1, 1);
    dim3 dimBlock(3, 3, 1);

    // Only the dimensions are passed; the matrices are reached through the
    // __device__ symbols inside the kernel.
    gemm <<<dimGrid, dimBlock >>> (3, 3, 3);
    gpuErrchk(cudaPeekAtLastError());
    // NOTE(review): the listing stops here in the visible source; the rest of
    // main() (through its closing brace) is not shown and is not reconstructed.