The program itself is very simple: it adds two three-dimensional matrices, A + B = C.

Because a single matrix is larger than the GPU's memory, each matrix has to be split into slabs and distributed across the GPUs. A small sketch of the slab bookkeeping is shown first; the full program follows.
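(Sketch only; the helper name slab_range is illustrative, but the arithmetic matches the code at the top of matadd() below.)

// Which z-slices [ista, iend] each MPI rank owns. For nz = 500 and
// size = 5 processes, ranks 1-4 own z = 0-99, 100-199, 200-299 and
// 300-399, while rank 0 keeps the last slab (plus any remainder),
// z = 400-499.
void slab_range(int myid, int size, int nz, int *ista, int *iend)
{
    int chunk = nz / size;
    if (myid != 0) {
        *ista = (myid - 1) * chunk;   // ranks 1 .. size-1: fixed-size slabs
        *iend = *ista + chunk - 1;
    } else {
        *ista = (size - 1) * chunk;   // rank 0: last slab + remainder
        *iend = nz - 1;
    }
}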

//
// matadd_kernel.cu
//
#include <stdio.h>
#include <cuda_runtime.h>   // replaces the old SDK's cutil.h, which is not needed here
#include <mpi.h>

#define BLOCK_DIMX 16
#define BLOCK_DIMZ 16       // used as the y-dimension of the thread block

// Pick a CUDA device for this MPI rank. Returns true on success.
int InitCUDA(int myid)
{
    int devCount = 0;
    int dev = 0;

    cudaGetDeviceCount(&devCount);
    if (devCount == 0)
    {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }

    // Make sure at least one device has compute capability >= 1.0.
    for (dev = 0; dev < devCount; ++dev)
    {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, dev) == cudaSuccess)
        {
            if (prop.major >= 1) break;
        }
    }

    if (dev == devCount)
    {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }

    // Map MPI ranks to devices; the modulo keeps the device id valid
    // when a node runs more ranks than it has GPUs.
    cudaSetDevice(myid % devCount);

    return true;
}

// Each thread handles one (ix, iy) column of the local slab and loops
// over its nz z-slices (the caller passes loc_nz here).
__global__ void matadd_kernel(float *matA, float *matB, float *matC, int nx, int ny, int nz)
{
    int ix = blockDim.x * blockIdx.x + threadIdx.x;
    int iy = blockDim.y * blockIdx.y + threadIdx.y;

    if (ix < nx && iy < ny)
    {
        for (int iz = 0; iz < nz; iz++)
        {
            int idx = iz * ny * nx + iy * nx + ix;
            matC[idx] = matA[idx] + matB[idx];
        }
    }
}

// Split the matrices into z-slabs, send each rank its slab, add the
// slabs on the GPUs, and gather the result back on rank 0.
void matadd(float *matA, float *matB, float *matC, int nx, int ny, int nz, int myid, int size)
{
    // Rank i (i > 0) owns z-slices [(i-1)*(nz/size), i*(nz/size)-1];
    // rank 0 keeps the last slab, which also absorbs the remainder.
    int ista, iend;
    if (myid != 0)
    {
        ista = (myid - 1) * (nz / size);
        iend = ista + nz / size - 1;
    } else {
        ista = (size - 1) * (nz / size);
        iend = nz - 1;
    }

    int loc_nz = iend - ista + 1;
    float *loc_matA = (float *) malloc(loc_nz * ny * nx * sizeof(float));
    float *loc_matB = (float *) malloc(loc_nz * ny * nx * sizeof(float));
    float *loc_matC = (float *) malloc(loc_nz * ny * nx * sizeof(float));

    MPI_Status status;
    int count[size];   // z-slices owned by each rank (only filled on rank 0)

    // Gather the slab sizes on rank 0.
    if (myid != 0)
    {
        MPI_Send(&loc_nz, 1, MPI_INT, 0, myid, MPI_COMM_WORLD);
    } else {
        count[0] = loc_nz;
        for (int i = 1; i < size; i++) {
            MPI_Recv(&count[i], 1, MPI_INT, i, i, MPI_COMM_WORLD, &status);
        }
    }

    // Distribute the slabs of A and B.
    if (myid == 0)
    {
        // Rank 0 copies its own (last) slab locally ...
        for (int ix = 0; ix < count[0] * ny * nx; ix++)
        {
            loc_matA[count[0] * ny * nx - 1 - ix] = matA[nz * ny * nx - 1 - ix];
            loc_matB[count[0] * ny * nx - 1 - ix] = matB[nz * ny * nx - 1 - ix];
        }
        // ... and sends the other ranks theirs. idx is the z-offset of
        // rank isz's slab, accumulated over the preceding ranks.
        int idx = 0;
        for (int isz = 1; isz < size; isz++)
        {
            if (isz > 1) idx += count[isz - 1];
            MPI_Send(matA + idx * ny * nx, count[isz] * ny * nx, MPI_FLOAT, isz, isz, MPI_COMM_WORLD);
            MPI_Send(matB + idx * ny * nx, count[isz] * ny * nx, MPI_FLOAT, isz, isz, MPI_COMM_WORLD);
        }
    } else {
        MPI_Recv(loc_matA, loc_nz * ny * nx, MPI_FLOAT, 0, myid, MPI_COMM_WORLD, &status);
        MPI_Recv(loc_matB, loc_nz * ny * nx, MPI_FLOAT, 0, myid, MPI_COMM_WORLD, &status);
    }

    // Copy the local slabs to the GPU.
    float *d_loc_matA;
    cudaMalloc((void **) &d_loc_matA, loc_nz * ny * nx * sizeof(float));
    cudaMemcpy(d_loc_matA, loc_matA, loc_nz * ny * nx * sizeof(float), cudaMemcpyHostToDevice);

    float *d_loc_matB;
    cudaMalloc((void **) &d_loc_matB, loc_nz * ny * nx * sizeof(float));
    cudaMemcpy(d_loc_matB, loc_matB, loc_nz * ny * nx * sizeof(float), cudaMemcpyHostToDevice);

    float *d_loc_matC;
    cudaMalloc((void **) &d_loc_matC, loc_nz * ny * nx * sizeof(float));

    // One thread per (x, y) point; round the grid up so nx and ny need
    // not be multiples of the block size.
    dim3 dimBlock(BLOCK_DIMX, BLOCK_DIMZ);
    dim3 dimGrid((nx + BLOCK_DIMX - 1) / BLOCK_DIMX, (ny + BLOCK_DIMZ - 1) / BLOCK_DIMZ);

    matadd_kernel<<<dimGrid, dimBlock>>>(d_loc_matA, d_loc_matB, d_loc_matC, nx, ny, loc_nz);

    cudaMemcpy(loc_matC, d_loc_matC, loc_nz * ny * nx * sizeof(float), cudaMemcpyDeviceToHost);

    // Gather the result slabs on rank 0.
    if (myid != 0)
    {
        MPI_Send(loc_matC, loc_nz * ny * nx, MPI_FLOAT, 0, myid, MPI_COMM_WORLD);
    } else {
        for (int ix = 0; ix < count[0] * ny * nx; ix++)
            matC[nz * ny * nx - 1 - ix] = loc_matC[loc_nz * ny * nx - 1 - ix];
        int idx = 0;
        for (int isz = 1; isz < size; isz++)
        {
            if (isz > 1) idx += count[isz - 1];
            MPI_Recv(matC + idx * ny * nx, count[isz] * ny * nx, MPI_FLOAT, isz, isz, MPI_COMM_WORLD, &status);
        }
    }

    cudaFree(d_loc_matA);
    cudaFree(d_loc_matB);
    cudaFree(d_loc_matC);

    free(loc_matA);
    free(loc_matB);
    free(loc_matC);
    return;
}


// Main program

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>

int InitCUDA(int myid);
void matadd(float *matA, float *matB, float *matC, int nx, int ny, int nz, int myid, int size);

int main(int argc, char *argv[])
{
    int myid, numprocs;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    InitCUDA(myid);

    // Every rank allocates and initializes the full matrices here for
    // simplicity, although only rank 0's copies are actually used.
    int nx = 1024, ny = 1024, nz = 500;
    float *a = (float *) malloc(nx * ny * nz * sizeof(float));
    float *b = (float *) malloc(nx * ny * nz * sizeof(float));
    float *c = (float *) malloc(nx * ny * nz * sizeof(float));

    for (int iz = 0; iz < nz; iz++)
        for (int iy = 0; iy < ny; iy++)
            for (int ix = 0; ix < nx; ix++)
            {
                a[iz * ny * nx + iy * nx + ix] = 1.0f;
                b[iz * ny * nx + iy * nx + ix] = 2.0f;
                c[iz * ny * nx + iy * nx + ix] = 0.0f;
            }

    // clock() measures CPU time on this rank; MPI_Wtime() would give
    // wall-clock time instead.
    clock_t tstart = clock();
    matadd(a, b, c, nx, ny, nz, myid, numprocs);
    clock_t tend = clock();

    if (myid == 0)
        printf("Time for matrix addition is %.5fs\n", (double)(tend - tstart) / CLOCKS_PER_SEC);

    if (myid == 0)
    {
        printf("c = %f\n", c[nx * ny * nz - 1]);

        // Every element should be 1.0 + 2.0 = 3.0.
        for (int iz = 0; iz < nz; iz++)
            for (int iy = 0; iy < ny; iy++)
                for (int ix = 0; ix < nx; ix++)
                    if (fabs(c[iz * ny * nx + iy * nx + ix] - 3.0) > 1.0e-2)
                    {
                        fprintf(stderr, "Error occurs\n");
                        return EXIT_FAILURE;
                    }
    }

    free(a);
    free(b);
    free(c);

    MPI_Finalize();
    return EXIT_SUCCESS;
}
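One possible way to build the two files (a sketch only: the file name main.c, the executable name test, and the include/library paths are assumptions that depend on the local MPI and CUDA installations):

nvcc -c matadd_kernel.cu -o matadd_kernel.o -I/usr/local/openmpi/include
mpicxx -o test main.c matadd_kernel.o -L/usr/local/cuda/lib64 -lcudart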

Given the matrix dimensions above, each matrix occupies 1024 × 1024 × 500 × 4 bytes ≈ 500M floats × 4 bytes = 2 GB. To add the two matrices on GPUs with 1 GB of memory each, at least 5 GPUs are therefore needed to do it in a single pass (the C matrix also takes up space on each worker node). The results are as follows:
mpirun -np 5 ./test
Time for matrix addition is 45.56000s
mpirun -np 10 ./test
Time for matrix addition is 32.44000s

The run time does not drop much as more processes are added, mainly because passing the data between nodes as MPI messages takes a large share of the total time.
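To see where the time actually goes, one could bracket the communication and the GPU work inside matadd() with MPI_Wtime() (wall-clock time, unlike the clock() call in main). A rough sketch of the pattern, not part of the original program:

double t0 = MPI_Wtime();
/* ... MPI_Send / MPI_Recv of the A and B slabs ... */
double t1 = MPI_Wtime();
/* ... cudaMemcpy to the device, kernel launch, cudaMemcpy back ... */
double t2 = MPI_Wtime();
/* ... MPI_Send / MPI_Recv of the C slabs ... */
double t3 = MPI_Wtime();
if (myid == 0)
    printf("scatter %.3fs  gpu %.3fs  gather %.3fs\n", t1 - t0, t2 - t1, t3 - t2);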

Source: http://blog.sina.com.cn/s/blog_81fcea1601014pmx.html (2010-06-10, reposted from w94w)