高大的卡布奇诺 · 打仗时领导人都去哪儿了?探秘各国地下指挥中心 ...· 3 月前 · |
正直的手电筒 · LaTeX:飘齐雄隐 - 知乎· 4 月前 · |
低调的煎饼 · 30家车企7月销量一览:比亚迪破26万,蔚来 ...· 1 年前 · |
才高八斗的李子 · Rabbitmq的connection连接池 ...· 1 年前 · |
▶ 输出结果:
[ matrixMulDrv (Driver API) ] > Using CUDA Device [0]: GeForce GTX 1070 > GPU Device has SM 6.1 compute capability Total amount of global memory: 8589934592 bytes 64-bit Memory Address: YES sdkFindFilePath <matrixMul_kernel64.ptx> in ./ sdkFindFilePath <matrixMul_kernel64.ptx> in ./../../bin/win64/Debug/matrixMulDrv_data_files/ sdkFindFilePath <matrixMul_kernel64.ptx> in ./common/ sdkFindFilePath <matrixMul_kernel64.ptx> in ./common/data/ sdkFindFilePath <matrixMul_kernel64.ptx> in ./data/ > findModulePath <./data/matrixMul_kernel64.ptx> > initCUDA loading module: <./data/matrixMul_kernel64.ptx> > PTX JIT log: Processing time: 0.568077 (ms) Checking computed result for correctness: Result = PASS NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
▶ 涨姿势:
● 头文件 matrixMul.h 的内容:
1 #ifndef _MATRIXMUL_H_ 2 #define _MATRIXMUL_H_ 4 // 规定了参与计算的矩阵的维数 5 #define WA (4 * block_size) 6 #define HA (6 * block_size) 7 #define WB (4 * block_size) 8 #define HB WA 9 #define WC WB 10 #define HC HA 12 #endif // _MATRIXMUL_H_
● C++ 中 string 类的基本使用方法
1 using namespace std; 3 string buf, buf2; 4 int n; 5 char *buf = new char[n];// 动态创建字符数组大小,类似malloc 6 buf[n - 1] = '\0'; // 手动结尾补零 7 buf2 = buf; // 直接赋值 8 delete[] buf; // 删除该数组,类似 free
● class StopWatchInterface ,定义于 helper_timer.h 中用于计时的一个类,这里只说明其使用方法,其内容在头文件随笔中详细讨论。
1 StopWatchInterface *timer = NULL; // 创建计时类指针 2 sdkCreateTimer(&timer); // 创建计时类 3 sdkStartTimer(&timer); // 开始计时 5 ... // 核函数运行过程 7 sdkStopTimer(&timer); // 停止计时 8 sdkGetTimerValue(&timer); // 获取时间(返回浮点类型的毫秒数) 9 sdkDeleteTimer(&timer); // 删除计时类
● cuda.h 中各种定义
typedef int CUdevice; // CUDA int 类型,用于标志设备号 typedef struct CUfunc_st *CUfunction; // CUDA 函数指针 typedef struct CUmod_st *CUmodule; // CUDA 模块指针 typedef struct CUctx_st *CUcontext; // CUDA 上下文指针 typedef enum cudaError_enum {...}CUresult; // CUDA 各种错误信息标号 typedef unsigned long long CUdeviceptr; // 无符号长长整型 CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);// 获取设备名称 CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); // 获取设备计算能力 inline CUdevice findCudaDeviceDRV(int argc, const char **argv); // 依命令行指定设备,否则选择计算能力最高的设备。内含函数调用 cuInit(0) #define cuDeviceTotalMem cuDeviceTotalMem_v2 // 获取显存大小 CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); #define cuMemAlloc cuMemAlloc_v2 // 申请显存 CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); #define cuMemFree cuMemFree_v2 // 释放显存 CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); CUresult CUDAAPI cuInit(unsigned int Flags); // 重要的初始化设备参数,在创建上下文之前要先调用它,参数可以设为 0 #define cuCtxCreate cuCtxCreate_v2 // 创建上下文 CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); #define cuCtxDestroy cuCtxDestroy_v2 // 销毁上下文 CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); #define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) // cudaMemcpy(cudaMemcpyHostToDevice)的别名 #define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) // cudaMemcpy(cudaMemcpyDeviceToHost)的别名 #define __CUDA_API_PTDS(api) api // 从 ptx 流 image 中编译模块 module,并且包括 numOptions 个参数,参数名列表为 options,参数值列表为 optionValues CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); // 指定路径 fname 中获取模块 module CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); // 从指定模块 hmod 中获取名为 name 的函赋给函数指针 hfunc CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
● 代码中使用了 goto 语句,基本使用过程如下。好处是函数整个函数 initCUDA 中只有一个 return,坏处是到处是跳转。
1 int function() 3 CUresult status; 5 status = cudaFunction(); 6 if (!status == CUDA_SUCCESS)// 函数cudaFunction运行不正常 7 goto Error; 9 ... // 函数运行正常 11 return 0; // 正常结束,返回 0 12 Error: 13 return status; // 非正常结束,返回首个错误编号● Driver API 的简略使用过程。本篇源代码很长,但是压缩后可以变成以下内容,方便看出该接口函数的使用过程。
1 #include <stdio.h> 2 #include <cuda.h> 3 #include <builtin_types.h> 4 #include <helper_cuda_drvapi.h> 5 #include <helper_timer.h> 7 int main() 9 // 常量 10 CUdevice cuDevice = 0; 11 CUcontext cuContext; 12 CUmodule cuModule; 13 CUfunction matrixMul = NULL; 14 CUresult status; 15 char module_path[30] = "./data/matrixMul_kernel64.ptx"; 16 char ptx_source[63894]; 18 // 创建上下文 19 cuInit(0); 20 status = cuCtxCreate(&cuContext, 0, cuDevice); 22 // 获取函数 23 FILE *fp = fopen(module_path, "rb"); 24 fseek(fp, 0, SEEK_END); 25 int file_size = ftell(fp); 26 fseek(fp, 0, SEEK_SET); 27 fread(ptx_source, sizeof(char), file_size, fp); 28 ptx_source[63894 - 1] = '\0'; 30 // 设置编译选项 31 const unsigned int jitNumOptions = 3; 32 CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; 33 void **jitOptVals = new void *[jitNumOptions]; 35 // 编译日志大小 36 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 37 int jitLogBufferSize = 1024; 38 jitOptVals[0] = (void *)(size_t)jitLogBufferSize; 40 // 编译日志的指针 41 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; 42 char *jitLogBuffer = new char[jitLogBufferSize]; 43 jitOptVals[1] = jitLogBuffer; 45 // 单核函数寄存器数量 46 jitOptions[2] = CU_JIT_MAX_REGISTERS; 47 int jitRegCount = 32; 48 jitOptVals[2] = (void *)(size_t)jitRegCount; 50 // 编译模块 51 status = cuModuleLoadDataEx(&cuModule, ptx_source, jitNumOptions, jitOptions, (void **)jitOptVals); 52 printf("\nPTX JIT log:\n%s\n", jitLogBuffer); 53 status = cuModuleGetFunction(&matrixMul, cuModule, "matrixMul_bs32_64bit"); 55 // 数据准备工作 56 int block_size = 32; 57 int wa = 4 * block_size; 58 int ha = 6 * block_size; 59 int wb = 4 * block_size; 60 int hb = wa; 61 int wc = wb; 62 int hc = ha; 64 unsigned int size_A = wa * ha; 65 unsigned int mem_size_A = sizeof(float) * size_A; 66 float *h_A = (float *)malloc(mem_size_A); 67 unsigned int size_B = wb * hb; 68 unsigned int mem_size_B = sizeof(float) * size_B; 69 float *h_B = (float *)malloc(mem_size_B); 70 size_t size_C = wc * hc; 71 size_t mem_size_C = sizeof(float) * size_C; 72 float *h_C = (float *)malloc(mem_size_C); 74 for (int i = 0; i < size_A; ++i) 75 h_A[i] = 1.0f; 76 for (int i = 0; i < size_B; ++i) 77 h_B[i] = 0.01f; 79 CUdeviceptr d_A; 80 cuMemAlloc(&d_A, mem_size_A); 81 CUdeviceptr d_B; 82 cuMemAlloc(&d_B, mem_size_B); 83 CUdeviceptr d_C; 84 cuMemAlloc(&d_C, mem_size_C); 85 cuMemcpyHtoD(d_A, h_A, mem_size_A); 86 cuMemcpyHtoD(d_B, h_B, mem_size_B); 88 dim3 block(block_size, block_size, 1); 89 dim3 grid(wc / block_size, hc / block_size, 1); 91 // 两种方式调用 Driver API 92 if (1) 93 { 94 size_t Matrix_Width_A = (size_t)wa; 95 size_t Matrix_Width_B = (size_t)wb; 96 void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B }; 97 // CUDA 4.0 Driver API 核函数调用,使用倒数第二个指针参数 98 cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z, 99 2 * block_size*block_size * sizeof(float), NULL, args, NULL); 100 } 101 else 102 { 103 int offset = 0; 104 char argBuffer[256];// 与上面 args 相同顺序依次填入所需的指针参数,用 offset 作偏移 106 *((CUdeviceptr *)&argBuffer[offset]) = d_C; 107 offset += sizeof(d_C); 108 *((CUdeviceptr *)&argBuffer[offset]) = d_A; 109 offset += sizeof(d_A); 110 *((CUdeviceptr *)&argBuffer[offset]) = d_B; 111 offset += sizeof(d_B); 112 size_t Matrix_Width_A = (size_t)wa; 113 size_t Matrix_Width_B = (size_t)wb; 114 *((CUdeviceptr *)&argBuffer[offset]) = Matrix_Width_A; 115 offset += sizeof(Matrix_Width_A); 116 *((CUdeviceptr *)&argBuffer[offset]) = Matrix_Width_B; 117 offset += sizeof(Matrix_Width_B); 119 // 用一个 void * 来封装上面5个参数,并加上参数尺寸和一个指明参数结束的结束宏 120 void *kernel_launch_config[5] = 121 { 122 CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, 123 CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, 124 CU_LAUNCH_PARAM_END 125 }; 127 // CUDA 4.0 Driver API 核函数调用,使用最后一个指针参数 128 cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z, 129 2 * block_size*block_size * sizeof(float), NULL, NULL, (void **)&kernel_launch_config); 130 } 132 cuMemcpyDtoH((void *)h_C, d_C, mem_size_C); 134 //检查结果 135 printf("Checking computed result for correctness: "); 136 bool correct = true; 137 for (int i = 0; i < (int)(wc * hc); i++) 138 { 139 if (fabs(h_C[i] - (wa * 0.01f)) > 1e-5) 140 { 141 printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], wa*0.01f); 142 correct = false; 143 } 144 } 145 printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); 147 free(h_A); 148 free(h_B); 149 free(h_C); 150 cuMemFree(d_A); 151 cuMemFree(d_B); 152 cuMemFree(d_C); 153 cuCtxDestroy(cuContext); 155 getchar(); 156 return 0; 157 }
正直的手电筒 · LaTeX:飘齐雄隐 - 知乎 4 月前 |