CUDA共享内存的使用示例
CUDA共享内存的使用示例如下,代码参考教材《GPU高性能编程CUDA实战》第54–65页。
教材下载地址: http://download.csdn.net/download/yizhaoyanbo/10150300 。
如果没有下载积分,可以在评论区留下邮箱,我发给你。
1 #include <cuda.h>
2 #include <cuda_runtime.h>
3 #include <device_launch_parameters.h>
4 #include <device_functions.h>
5 #include <iostream>
6 #include <string>
8 using namespace std;
// Minimum of two values. Each argument use and the whole expansion are
// parenthesized so that compound arguments (e.g. `x & mask`) keep their
// meaning after substitution. Note: the selected argument is evaluated twice.
#define imin(a, b) (((a) < (b)) ? (a) : (b))

// Number of elements in each input vector.
const int N = 33 * 1024;
// Threads launched per block; also the size of the kernel's shared buffer.
const int threadsPerBlock = 256;
// Blocks to launch: enough to cover N at one element per thread, capped at 32.
const int blocksPerGrid = imin(32, (N + threadsPerBlock - 1) / threadsPerBlock);
/**
 * Computes per-block partial sums of the dot product of a and b.
 *
 * Each thread accumulates a[i] * b[i] over a grid-stride walk of [0, N).
 * The block then reduces those per-thread sums in shared memory, and thread 0
 * stores the block's total in c[blockIdx.x]. The caller is expected to add up
 * the gridDim.x entries of c to obtain the final result.
 *
 * Launch layout: 1D grid of 1D blocks. blockDim.x must not exceed
 * threadsPerBlock, because the shared buffer is statically sized to it; any
 * block size within that limit works (it need not be a power of two).
 * Shared memory: threadsPerBlock * sizeof(float) bytes, statically allocated.
 *
 * @param a  first input vector; indices [0, N) are read
 * @param b  second input vector; indices [0, N) are read
 * @param c  output; c[blockIdx.x] receives this block's partial sum
 */
__global__ void dot(float *a, float *b, float *c)
{
    __shared__ float cache[threadsPerBlock];

    const int cacheIndex = threadIdx.x;
    const int stride = blockDim.x * gridDim.x;  // total threads in the grid

    // Each thread sums the products of the elements it owns.
    float temp = 0.0f;
    for (int tid = threadIdx.x + blockDim.x * blockIdx.x; tid < N; tid += stride)
    {
        temp += a[tid] * b[tid];
    }

    // Publish the per-thread sum; every block has its own copy of cache.
    cache[cacheIndex] = temp;
    // All writes to cache must land before any thread reads a neighbour's slot.
    __syncthreads();

    // Reduction: fold the upper part of the live range onto the lower part
    // until only cache[0] remains. `half` is rounded up so that an odd-sized
    // range keeps its middle element for the next round; for power-of-two
    // block sizes this performs exactly the classic halving additions.
    int active = blockDim.x;
    while (active > 1)
    {
        const int half = (active + 1) / 2;
        if (cacheIndex < active / 2)
        {
            cache[cacheIndex] += cache[cacheIndex + half];
        }
        // Outside the branch: every thread in the block must reach the barrier.
        __syncthreads();
        active = half;
    }

    // Thread 0 writes this block's total to the output array.
    if (cacheIndex == 0)
    {
        c[blockIdx.x] = cache[0];
    }
}
52 void main()
54 float *a, *b, c, *partial_c;
55 float *dev_a, *dev_b, *dev_partial_c;
57 //分配CPU内存
58 a = (float*)malloc(N * sizeof(float));
59 b = (float*)malloc(N * sizeof(float));
60 partial_c = (float*)malloc(blocksPerGrid * sizeof(float));
62 //分配GPU内存
63 cudaMalloc(&dev_a, N * sizeof(float));
64 cudaMalloc(&dev_b, N * sizeof(float));
65 cudaMalloc(&dev_partial_c, blocksPerGrid * sizeof(float));
67 float sum = 0;
68 for (int i = 0; i < N; i++)
69 {
70 a[i] = i;
71 b[i] = i * 2;
72 }
74 //将数组上传到GPU
75 cudaMemcpy(dev_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
76 cudaMemcpy(dev_b, b, N * sizeof(float), cudaMemcpyHostToDevice);
78 dot << <blocksPerGrid, threadsPerBlock >> > (dev_a, dev_b, dev_partial_c);
80 cudaMemcpy(partial_c, dev_partial_c, blocksPerGrid * sizeof(float), cudaMemcpyDeviceToHost);
82 //CPU 完成最终求和
83 c = 0;
84 for (int i = 0; i < blocksPerGrid; i++)
85 {
86 c += partial_c[i];
87 }
89 #define sum_squares(x) (x*(x+1)*(2*x+1)/6)
90 printf("does GPU value %.6g = %.6g?\n", c, 2 * sum_squares((float)(N - 1)));
92 cudaFree(dev_a);
93 cudaFree(dev_b);