#define PTX_FILE   "matrixMul_kernel64.ptx"
#define CUBIN_FILE "matrixMul_kernel64.cubin"

const bool use_64bit_memory_address = true;

using namespace std;

CUdevice  cuDevice;
CUcontext cuContext;
CUmodule  cuModule;
size_t    totalGlobalMem;

void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i)
        data[i] = val;
}

bool inline findModulePath(const char *module_file, string &module_path, char **argv, string &ptx_source)
{
    char *actual_path = sdkFindFilePath(module_file, argv[0]);  // locate the file based on the command-line arguments (argv[0])

    if (actual_path)
        module_path = actual_path;
    else
    {
        printf("> findModulePath file not found: <%s> \n", module_file);
        return false;
    }

    if (module_path.empty())
    {
        printf("> findModulePath file not found: <%s> \n", module_file);
        return false;
    }
    printf("> findModulePath <%s>\n", module_path.c_str());

    if (module_path.rfind(".ptx") != string::npos)
    {
        FILE *fp = fopen(module_path.c_str(), "rb");
        fseek(fp, 0, SEEK_END);
        int file_size = ftell(fp);
        char *buf = new char[file_size + 1];
        fseek(fp, 0, SEEK_SET);
        fread(buf, sizeof(char), file_size, fp);
        fclose(fp);
        buf[file_size] = '\0';
        ptx_source = buf;
        delete[] buf;
    }
    return true;
}

static CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul)
{
    CUfunction cuFunction = 0;  // will hold the kernel function fetched from the module
    CUresult status;            // status of each call; on failure jump to the end of the function with goto
    int major = 0, minor = 0;
    char deviceName[256];
    string module_path, ptx_source;

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);  // pick the device named on the command line, or the one with the highest compute capability
    cuDeviceComputeCapability(&major, &minor, cuDevice);
    cuDeviceGetName(deviceName, 256, cuDevice);
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

    cuDeviceTotalMem(&totalGlobalMem, cuDevice);  // total amount of device memory
    printf("  Total amount of global memory:     %llu bytes\n", (unsigned long long)totalGlobalMem);
    printf("  64-bit Memory Address:             %s\n",
           (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L) ? "YES" : "NO");

    status = cuCtxCreate(&cuContext, 0, cuDevice);  // create the context
    if (CUDA_SUCCESS != status)
        goto Error;

    if (!findModulePath(PTX_FILE, module_path, argv, ptx_source))        // look for the module matrixMul_kernel64.ptx
    {
        if (!findModulePath(CUBIN_FILE, module_path, argv, ptx_source))  // fall back to matrixMul_kernel64.cubin
        {
            printf("> findModulePath could not find <matrixMul_kernel> ptx or cubin\n");
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
    }
    else
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());

    if (module_path.rfind("ptx") != string::npos)
    {
        // in this branch we use compilation with parameters
        const unsigned int jitNumOptions = 3;
        CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
        void **jitOptVals = new void *[jitNumOptions];

        // set up size of compilation log buffer
        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        int jitLogBufferSize = 1024;
        jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

        // set up pointer to the compilation log buffer
        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
        char *jitLogBuffer = new char[jitLogBufferSize];
        jitOptVals[1] = jitLogBuffer;

        // set up the maximum number of registers for this kernel
        jitOptions[2] = CU_JIT_MAX_REGISTERS;
        int jitRegCount = 32;
        jitOptVals[2] = (void *)(size_t)jitRegCount;

        // JIT-compile the module from the PTX source
        status = cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);

        printf("> PTX JIT log:\n%s\n", jitLogBuffer);
    }
    else
        status = cuModuleLoad(&cuModule, module_path.c_str());

    if (CUDA_SUCCESS != status)
        goto Error;

    // fetch the kernel function from the module
    if (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L)
        status = cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_64bit");
    else
        status = cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_32bit");

    if (CUDA_SUCCESS != status)
        goto Error;
    *pMatrixMul = cuFunction;
    return CUDA_SUCCESS;

Error:
    cuCtxDestroy(cuContext);
    return status;
}

void runTest(int argc, char **argv)
{
    int block_size = 32;

    // obtain the kernel function
    CUfunction matrixMul = NULL;                            // CUDA function handle
    CUresult error_id = initCUDA(argc, argv, &matrixMul);   // load the module and fetch the function

    // prepare the data
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);
    size_t size_C = WC * HC;
    size_t mem_size_C = sizeof(float) * size_C;
    float *h_C = (float *)malloc(mem_size_C);
    constantInit(h_A, size_A, 1.0f);    // all-ones matrix
    constantInit(h_B, size_B, 0.01f);   // all-0.01 matrix

    // on a 64-bit build, allocate four 1 GB blocks of device memory here just to occupy space; they are never used
    CUdeviceptr d_Mem[4];
    if (use_64bit_memory_address)
    {
        unsigned int mem_size = 1024 * 1024 * 1024;
        cuMemAlloc(&d_Mem[0], mem_size);
        cuMemAlloc(&d_Mem[1], mem_size);
        cuMemAlloc(&d_Mem[2], mem_size);
        cuMemAlloc(&d_Mem[3], mem_size);
    }

    CUdeviceptr d_A;
    cuMemAlloc(&d_A, mem_size_A);
    CUdeviceptr d_B;
    cuMemAlloc(&d_B, mem_size_B);
    CUdeviceptr d_C;
    cuMemAlloc(&d_C, mem_size_C);
    cuMemcpyHtoD(d_A, h_A, mem_size_A);
    cuMemcpyHtoD(d_B, h_B, mem_size_B);

    // timing
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    dim3 block(block_size, block_size, 1);
    dim3 grid(WC / block_size, HC / block_size, 1);

    // two ways of launching the kernel through the Driver API
    if (1)
    {
        // with 64-bit memory addressing and more than 4 GB of device memory the width arguments are size_t, otherwise int; the launch call itself is identical
        if (use_64bit_memory_address && (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L))
        {
            size_t Matrix_Width_A = (size_t)WA;
            size_t Matrix_Width_B = (size_t)WB;
            void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
            // CUDA 4.0 Driver API kernel launch, passing the arguments through the second-to-last parameter (kernelParams)
            cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                           2 * block_size * block_size * sizeof(float), NULL, args, NULL);
        }
        else
        {
            int Matrix_Width_A = WA;
            int Matrix_Width_B = WB;
            void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
            cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                           2 * block_size * block_size * sizeof(float), NULL, args, NULL);
        }
    }
    else
    {
        int offset = 0;
        char argBuffer[256];    // pack the arguments in the same order as args above, using offset as the running byte offset

        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);

        if (use_64bit_memory_address && (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L))
        {
            size_t Matrix_Width_A = (size_t)WA;
            size_t Matrix_Width_B = (size_t)WB;
            *((CUdeviceptr *)&argBuffer[offset]) = Matrix_Width_A;
            offset += sizeof(Matrix_Width_A);
            *((CUdeviceptr *)&argBuffer[offset]) = Matrix_Width_B;
            offset += sizeof(Matrix_Width_B);
        }
        else
        {
            int Matrix_Width_A = WA;
            int Matrix_Width_B = WB;
            *((int *)&argBuffer[offset]) = Matrix_Width_A;
            offset += sizeof(Matrix_Width_A);
            *((int *)&argBuffer[offset]) = Matrix_Width_B;
            offset += sizeof(Matrix_Width_B);
        }

        // wrap the packed argument buffer in a void* array together with its size and a terminating macro
        void *kernel_launch_config[5] =
        {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
            CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,
            CU_LAUNCH_PARAM_END
        };

        // CUDA 4.0 Driver API kernel launch, passing the arguments through the last parameter (extra)
        cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                       2 * block_size * block_size * sizeof(float), NULL, NULL, (void **)&kernel_launch_config);
    }

    cuMemcpyDtoH((void *)h_C, d_C, mem_size_C);

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // check the result
    printf("Checking computed result for correctness: ");
    bool correct = true;
    for (int i = 0; i < (int)(WC * HC); i++)
    {
        if (fabs(h_C[i] - (WA * 0.01f)) > 1e-5)
        {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * 0.01f);
            correct = false;
        }
    }
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");

    if (use_64bit_memory_address)
    {
        cuMemFree(d_Mem[0]);
        cuMemFree(d_Mem[1]);
        cuMemFree(d_Mem[2]);
        cuMemFree(d_Mem[3]);
    }
    free(h_A);
    free(h_B);
    free(h_C);
    cuMemFree(d_A);
    cuMemFree(d_B);
    cuMemFree(d_C);
    cuCtxDestroy(cuContext);
}

int main(int argc, char **argv)
{
    printf("[ matrixMulDrv (Driver API) ]\n");
    runTest(argc, argv);

    getchar();
    return 0;
}
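
The post does not show how the host program is built. Roughly (a sketch assuming the usual CUDA Samples layout and file names, not the sample's own project files): the Driver API program needs the helper headers from the Samples' common/inc directory and a link against the driver library (cuda.lib on Windows, libcuda.so on Linux), and the PTX module is produced separately from the kernel source:

nvcc -I<CUDA Samples>/common/inc matrixMulDrv.cpp -lcuda -o matrixMulDrv
nvcc -ptx matrixMul_kernel.cu -o data/matrixMul_kernel64.ptx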

▶ Output:

[ matrixMulDrv (Driver API) ]
> Using CUDA Device [0]: GeForce GTX 1070
> GPU Device has SM 6.1 compute capability
Total amount of global memory:     8589934592 bytes
64-bit Memory Address:             YES
sdkFindFilePath <matrixMul_kernel64.ptx> in ./
sdkFindFilePath <matrixMul_kernel64.ptx> in ./../../bin/win64/Debug/matrixMulDrv_data_files/
sdkFindFilePath <matrixMul_kernel64.ptx> in ./common/
sdkFindFilePath <matrixMul_kernel64.ptx> in ./common/data/
sdkFindFilePath <matrixMul_kernel64.ptx> in ./data/
> findModulePath <./data/matrixMul_kernel64.ptx>
> initCUDA loading module: <./data/matrixMul_kernel64.ptx>
> PTX JIT log:
Processing time: 0.568077 (ms)
Checking computed result for correctness: Result = PASS
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

▶ Notes:

● Contents of the header file matrixMul.h:

#ifndef _MATRIXMUL_H_
#define _MATRIXMUL_H_

// dimensions of the matrices involved in the computation
#define WA (4 * block_size)
#define HA (6 * block_size)
#define WB (4 * block_size)
#define HB WA
#define WC WB
#define HC HA

#endif // _MATRIXMUL_H_
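
With block_size = 32 these macros give WA = 128, HA = 192, WB = 128, HB = 128, WC = 128 and HC = 192, so grid = (WC / 32, HC / 32) = (4, 6). Since A is filled with 1.0 and B with 0.01, every element of C is a dot product of WA terms, i.e. WA × 1.0 × 0.01 = 1.28, which is exactly the reference value (WA * 0.01f) used in the correctness check in runTest.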

● Basic use of the C++ string class (the pattern used in findModulePath above):

using namespace std;

string buf2;
int n = 16;             // size of the buffer, including room for the terminator
char *buf = new char[n];// dynamically allocate a char array, similar to malloc
// ... fill buf[0 .. n-2], e.g. with fread ...
buf[n - 1] = '\0';      // manually null-terminate
buf2 = buf;             // assign the C string straight into a std::string
delete[] buf;           // release the array, similar to free
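
findModulePath above goes through a manually managed char buffer to get the PTX text into a std::string. A minimal sketch of an alternative (my own, not the sample's code) that reads the file straight into a std::string with <fstream>:

#include <fstream>
#include <sstream>
#include <string>

// read the whole file at 'path' into 'out'; returns false if the file cannot be opened
bool readFileToString(const char *path, std::string &out)
{
    std::ifstream in(path, std::ios::binary);
    if (!in)
        return false;
    std::ostringstream ss;
    ss << in.rdbuf();   // stream the entire file into the string buffer
    out = ss.str();
    return true;
}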

● class StopWatchInterface, a timing class defined in helper_timer.h. Only its usage is shown here; its internals are discussed in detail in the post on the helper header files.

StopWatchInterface *timer = NULL;   // pointer to the timer object
sdkCreateTimer(&timer);             // create the timer
sdkStartTimer(&timer);              // start timing
...                                 // kernel launch and execution
sdkStopTimer(&timer);               // stop timing
sdkGetTimerValue(&timer);           // read the elapsed time (a float, in milliseconds)
sdkDeleteTimer(&timer);             // destroy the timer
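
The StopWatchInterface timer measures wall-clock time on the host side. A minimal sketch (an alternative I am adding here, not something the sample does) of timing the same interval with Driver API events, which measure on the GPU itself:

CUevent start, stop;
float ms = 0.0f;
cuEventCreate(&start, CU_EVENT_DEFAULT);
cuEventCreate(&stop, CU_EVENT_DEFAULT);
cuEventRecord(start, 0);                 // 0 = default stream
// ... cuLaunchKernel(...) ...
cuEventRecord(stop, 0);
cuEventSynchronize(stop);                // wait for the kernel to finish
cuEventElapsedTime(&ms, start, stop);    // elapsed GPU time in milliseconds
cuEventDestroy(start);
cuEventDestroy(stop);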

● Definitions in cuda.h (findCudaDeviceDRV comes from the helper header helper_cuda_drvapi.h):

typedef int CUdevice;                      // device handle, an int used as a device ordinal
typedef struct CUfunc_st *CUfunction;      // handle to a CUDA kernel function
typedef struct CUmod_st *CUmodule;         // handle to a CUDA module
typedef struct CUctx_st *CUcontext;        // handle to a CUDA context
typedef enum cudaError_enum {...} CUresult; // enumeration of the CUDA error codes
typedef unsigned long long CUdeviceptr;    // device pointer, an unsigned long long
CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);                // get the device name
CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);   // get the device compute capability
inline CUdevice findCudaDeviceDRV(int argc, const char **argv);     // pick the device named on the command line, otherwise the one with the highest compute capability; calls cuInit(0) internally
#define cuDeviceTotalMem cuDeviceTotalMem_v2                        // get the total amount of device memory
CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
#define cuMemAlloc cuMemAlloc_v2                                    // allocate device memory
CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
#define cuMemFree cuMemFree_v2                                      // free device memory
CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
CUresult CUDAAPI cuInit(unsigned int Flags);                        // initializes the driver API; must be called before creating a context, Flags may be 0
#define cuCtxCreate cuCtxCreate_v2                                  // create a context
CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
#define cuCtxDestroy cuCtxDestroy_v2                                // destroy a context
CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
#define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2)   // driver API counterpart of cudaMemcpy(..., cudaMemcpyHostToDevice)
#define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2)   // driver API counterpart of cudaMemcpy(..., cudaMemcpyDeviceToHost)
#define __CUDA_API_PTDS(api) api
// JIT-compile a module from the PTX image in memory, with numOptions options whose names are in options and whose values are in optionValues
CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
// load a module from the file at path fname
CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);
// fetch the function named name from module hmod and store its handle in hfunc
CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
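
The sample itself normally wraps each of these calls in checkCudaErrors() from helper_cuda_drvapi.h. A minimal sketch of what such a check amounts to (a simplified wrapper of my own, not that header's actual implementation), using cuGetErrorName from cuda.h:

#include <cstdio>
#include <cstdlib>
#include <cuda.h>

// abort with a readable message if a driver API call did not return CUDA_SUCCESS
static void checkDrv(CUresult res, const char *what)
{
    if (res != CUDA_SUCCESS)
    {
        const char *name = NULL;
        cuGetErrorName(res, &name);      // e.g. "CUDA_ERROR_NOT_FOUND"
        fprintf(stderr, "%s failed: %s\n", what, name ? name : "unknown error");
        exit(EXIT_FAILURE);
    }
}

// usage: checkDrv(cuCtxCreate(&cuContext, 0, cuDevice), "cuCtxCreate");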

● The code uses goto for error handling; the basic pattern is shown below. The benefit is that initCUDA has a single error-exit point where the context is cleaned up; the drawback is that jumps are scattered throughout the function.

int function()
{
    CUresult status;

    status = cudaFunction();
    if (status != CUDA_SUCCESS) // cudaFunction did not run correctly
        goto Error;

    ...                         // normal execution continues

    return 0;                   // normal exit, return 0
Error:
    return status;              // abnormal exit, return the first error code
}

● A condensed view of how the Driver API is used. The source code of this post is long, but compressed it boils down to the following, which makes the sequence of API calls easier to see.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <builtin_types.h>
#include <helper_cuda_drvapi.h>
#include <helper_timer.h>

int main()
{
    // handles and state
    CUdevice cuDevice = 0;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction matrixMul = NULL;
    CUresult status;
    char module_path[] = "./data/matrixMul_kernel64.ptx";
    char ptx_source[63894];                 // large enough to hold the PTX file

    // create the context
    cuInit(0);
    status = cuCtxCreate(&cuContext, 0, cuDevice);

    // read the PTX source
    FILE *fp = fopen(module_path, "rb");
    fseek(fp, 0, SEEK_END);
    int file_size = ftell(fp);
    fseek(fp, 0, SEEK_SET);
    fread(ptx_source, sizeof(char), file_size, fp);
    fclose(fp);
    ptx_source[file_size] = '\0';

    // set up the JIT options
    const unsigned int jitNumOptions = 3;
    CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
    void **jitOptVals = new void *[jitNumOptions];

    // size of the compilation log buffer
    jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
    int jitLogBufferSize = 1024;
    jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

    // pointer to the compilation log buffer
    jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
    char *jitLogBuffer = new char[jitLogBufferSize];
    jitOptVals[1] = jitLogBuffer;

    // maximum number of registers per thread for the kernel
    jitOptions[2] = CU_JIT_MAX_REGISTERS;
    int jitRegCount = 32;
    jitOptVals[2] = (void *)(size_t)jitRegCount;

    // JIT-compile the module and fetch the kernel function
    status = cuModuleLoadDataEx(&cuModule, ptx_source, jitNumOptions, jitOptions, (void **)jitOptVals);
    printf("\nPTX JIT log:\n%s\n", jitLogBuffer);
    status = cuModuleGetFunction(&matrixMul, cuModule, "matrixMul_bs32_64bit");

    // prepare the data
    int block_size = 32;
    int wa = 4 * block_size;
    int ha = 6 * block_size;
    int wb = 4 * block_size;
    int hb = wa;
    int wc = wb;
    int hc = ha;

    unsigned int size_A = wa * ha;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = wb * hb;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);
    size_t size_C = wc * hc;
    size_t mem_size_C = sizeof(float) * size_C;
    float *h_C = (float *)malloc(mem_size_C);

    for (int i = 0; i < size_A; ++i)
        h_A[i] = 1.0f;
    for (int i = 0; i < size_B; ++i)
        h_B[i] = 0.01f;

    CUdeviceptr d_A;
    cuMemAlloc(&d_A, mem_size_A);
    CUdeviceptr d_B;
    cuMemAlloc(&d_B, mem_size_B);
    CUdeviceptr d_C;
    cuMemAlloc(&d_C, mem_size_C);
    cuMemcpyHtoD(d_A, h_A, mem_size_A);
    cuMemcpyHtoD(d_B, h_B, mem_size_B);

    dim3 block(block_size, block_size, 1);
    dim3 grid(wc / block_size, hc / block_size, 1);

    // two ways of launching the kernel through the Driver API
    if (1)
    {
        size_t Matrix_Width_A = (size_t)wa;
        size_t Matrix_Width_B = (size_t)wb;
        void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
        // CUDA 4.0 Driver API kernel launch, passing the arguments through the second-to-last parameter (kernelParams)
        cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
            2 * block_size * block_size * sizeof(float), NULL, args, NULL);
    }
    else
    {
        int offset = 0;
        char argBuffer[256];// pack the arguments in the same order as args above, using offset as the running byte offset

        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);
        size_t Matrix_Width_A = (size_t)wa;
        size_t Matrix_Width_B = (size_t)wb;
        *((CUdeviceptr *)&argBuffer[offset]) = Matrix_Width_A;
        offset += sizeof(Matrix_Width_A);
        *((CUdeviceptr *)&argBuffer[offset]) = Matrix_Width_B;
        offset += sizeof(Matrix_Width_B);

        // wrap the packed argument buffer in a void* array together with its size and a terminating macro
        void *kernel_launch_config[5] =
        {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
            CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,
            CU_LAUNCH_PARAM_END
        };

        // CUDA 4.0 Driver API kernel launch, passing the arguments through the last parameter (extra)
        cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
            2 * block_size * block_size * sizeof(float), NULL, NULL, (void **)&kernel_launch_config);
    }

    cuMemcpyDtoH((void *)h_C, d_C, mem_size_C);

    // check the result
    printf("Checking computed result for correctness: ");
    bool correct = true;
    for (int i = 0; i < (int)(wc * hc); i++)
    {
        if (fabs(h_C[i] - (wa * 0.01f)) > 1e-5)
        {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], wa * 0.01f);
            correct = false;
        }
    }
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    free(h_A);
    free(h_B);
    free(h_C);
    cuMemFree(d_A);
    cuMemFree(d_B);
    cuMemFree(d_C);
    cuCtxDestroy(cuContext);

    getchar();
    return 0;
}
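
The host code only ever refers to matrixMul_bs32_64bit by name inside the PTX module; the kernel itself lives in a separate .cu file that is compiled to PTX (e.g. with nvcc -ptx). Below is a rough sketch of the kind of kernel that name could refer to (an assumption for illustration, not the sample's actual kernel source): a shared-memory tiled matrix multiply whose two tiles account for the 2 * block_size * block_size * sizeof(float) bytes of dynamic shared memory passed to cuLaunchKernel.

// sketch only: a 32x32-tiled matrix multiply matching the argument list the host passes
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B,
                                                size_t wA, size_t wB)
{
    const int BS = 32;                  // must match block_size on the host side
    extern __shared__ float smem[];     // dynamic shared memory from cuLaunchKernel
    float *As = smem;                   // BS x BS tile of A
    float *Bs = smem + BS * BS;         // BS x BS tile of B

    size_t bx = blockIdx.x, by = blockIdx.y;
    size_t tx = threadIdx.x, ty = threadIdx.y;

    size_t aBegin = wA * BS * by;       // first tile of A handled by this block
    size_t aEnd   = aBegin + wA - 1;
    size_t aStep  = BS;
    size_t bBegin = BS * bx;            // first tile of B handled by this block
    size_t bStep  = BS * wB;

    float Csub = 0.0f;
    for (size_t a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    {
        As[ty * BS + tx] = A[a + wA * ty + tx];
        Bs[ty * BS + tx] = B[b + wB * ty + tx];
        __syncthreads();
        for (int k = 0; k < BS; ++k)
            Csub += As[ty * BS + k] * Bs[k * BS + tx];
        __syncthreads();
    }
    C[wB * BS * by + BS * bx + wB * ty + tx] = Csub;
}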