Counting 3s in CUDA is very slow

I've written a small program in CUDA that counts how many 3s there are in a C array and prints the count.

    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda.h>
    #include <cuda_runtime.h>

    __global__ void incrementArrayOnDevice(int *a, int N, int *count)
    {
        int id = blockIdx.x * blockDim.x + threadIdx.x;

        //__shared__ int s_a[512]; // one for each thread
        //s_a[threadIdx.x] = a[id];

        if( id < N )
        {
            //if( s_a[threadIdx.x] == 3 )
            if( a[id] == 3 )
            {
                // every matching thread contends on the same global counter
                atomicAdd(count, 1);
            }
        }
    }

    int main(void)
    {
        int *a_h;   // host memory
        int *a_d;   // device memory
        int N = 16777216;

        // allocate array on host
        a_h = (int*)malloc(sizeof(int) * N);
        for(int i = 0; i < N; ++i)
            a_h[i] = (i % 3 == 0 ? 3 : 1);

        // allocate arrays on device
        cudaMalloc(&a_d, sizeof(int) * N);

        // copy data from host to device
        cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);

        // do calculation on device
        int blockSize = 512;
        int nBlocks = N / blockSize + (N % blockSize == 0 ? 0 : 1);
        printf("number of blocks: %d\n", nBlocks);

        int count;
        int *devCount;
        cudaMalloc(&devCount, sizeof(int));
        cudaMemset(devCount, 0, sizeof(int));

        incrementArrayOnDevice<<<nBlocks, blockSize>>>(a_d, N, devCount);

        // retrieve result from device
        cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);

        printf("%d\n", count);

        free(a_h);
        cudaFree(a_d);
        cudaFree(devCount);
    }

The result I get is:

    real    0m3.025s
    user    0m2.989s
    sys     0m0.029s

When I run it on the CPU with 4 threads, I get:

    real    0m0.101s
    user    0m0.100s
    sys     0m0.024s

Note that the GPU is old; I don't know the exact model because I don't have root access, but the OpenGL version it runs under the MESA driver is 1.2.

Am I doing something wrong? What can I do to make it run faster?

Note: I have already tried using a bin for each block (so the atomicAdd()s on each one would be reduced), but I get exactly the same performance. I have also tried copying the 512 integers assigned to each block into a block of shared memory (you can see that in the comments), and the time is the same again. A sketch of the bin idea follows below.
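For reference, a minimal sketch of what I mean by a bin per block (illustrative only; countThreesPerBlock and blockCounts are made-up names, not code from the program above):

    __global__ void countThreesPerBlock(const int *a, int n, int *blockCounts)
    {
        int id = blockIdx.x * blockDim.x + threadIdx.x;
        // each block accumulates into its own slot, so atomics from
        // different blocks never contend on the same word
        if (id < n && a[id] == 3)
            atomicAdd(&blockCounts[blockIdx.x], 1);
    }
    // the host then sums blockCounts[0..nBlocks-1] after copying it back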

This is a response to your question "What can I do to make it run faster?" As I mentioned in the comments, there are (probably) issues with the timing methodology, and my main suggestion for a speed improvement is to use a "classical parallel reduction" algorithm: each thread first accumulates a private count over a grid-stride loop, the threads within a block then combine their counts in shared memory, and only a single atomicAdd per block touches the global counter. The following code implements better (in my opinion) timing measurement, and converts the kernel into a reduction-style kernel:

    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>

    #define N (1<<24)
    #define nTPB 512
    #define NBLOCKS 32

    __global__ void incrementArrayOnDevice(int *a, int n, int *count)
    {
        __shared__ int lcnt[nTPB];
        int id = blockIdx.x * blockDim.x + threadIdx.x;
        int lcount = 0;

        // grid-stride loop: each thread accumulates a private count,
        // with no atomics at all in this phase
        while (id < n) {
            if (a[id] == 3) lcount++;
            id += gridDim.x * blockDim.x;
        }
        lcnt[threadIdx.x] = lcount;
        __syncthreads();

        // shared-memory tree reduction within the block
        int stride = blockDim.x;
        while (stride > 1) { // assume blockDim.x is a power of 2
            stride >>= 1;
            if (threadIdx.x < stride)
                lcnt[threadIdx.x] += lcnt[threadIdx.x + stride];
            __syncthreads();
        }

        // only one atomicAdd per block touches the global counter
        if (threadIdx.x == 0) atomicAdd(count, lcnt[0]);
    }

    int main(void)
    {
        int *a_h;   // host memory
        int *a_d;   // device memory
        cudaEvent_t gstart1, gstart2, gstop1, gstop2, cstart, cstop;
        float etg1, etg2, etc;

        cudaEventCreate(&gstart1);
        cudaEventCreate(&gstart2);
        cudaEventCreate(&gstop1);
        cudaEventCreate(&gstop2);
        cudaEventCreate(&cstart);
        cudaEventCreate(&cstop);

        // allocate array on host
        a_h = (int*)malloc(sizeof(int) * N);
        for (int i = 0; i < N; ++i)
            a_h[i] = (i % 3 == 0 ? 3 : 1);

        // allocate arrays on device
        cudaMalloc(&a_d, sizeof(int) * N);

        int blockSize = nTPB;
        int nBlocks = NBLOCKS;
        printf("number of blocks: %d\n", nBlocks);

        int count;
        int *devCount;
        cudaMalloc(&devCount, sizeof(int));
        cudaMemset(devCount, 0, sizeof(int));

        // copy data from host to device
        cudaEventRecord(gstart1);
        cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
        cudaMemset(devCount, 0, sizeof(int));
        cudaEventRecord(gstart2);

        // do calculation on device
        incrementArrayOnDevice<<<nBlocks, blockSize>>>(a_d, N, devCount);
        cudaEventRecord(gstop2);

        // retrieve result from device
        cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
        cudaEventRecord(gstop1);
        printf("GPU count = %d\n", count);

        int hostCount = 0;
        cudaEventRecord(cstart);
        for (int i = 0; i < N; i++)
            if (a_h[i] == 3) hostCount++;
        cudaEventRecord(cstop);
        printf("CPU count = %d\n", hostCount);

        cudaEventSynchronize(cstop);
        cudaEventElapsedTime(&etg1, gstart1, gstop1);
        cudaEventElapsedTime(&etg2, gstart2, gstop2);
        cudaEventElapsedTime(&etc, cstart, cstop);

        printf("GPU total time = %fs\n", (etg1/(float)1000));
        printf("GPU compute time = %fs\n", (etg2/(float)1000));
        printf("CPU time = %fs\n", (etc/(float)1000));

        free(a_h);
        cudaFree(a_d);
        cudaFree(devCount);
    }
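Assuming the source above is saved as count3.cu (a file name I'm choosing here), it can be compiled and run with nvcc in the usual way:

    nvcc -O2 -o count3 count3.cu
    ./count3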

When I run this on a reasonably fast GPU (a Quadro 5000, a little slower than a Tesla M2050), I get the following:

    number of blocks: 32
    GPU count = 5592406
    CPU count = 5592406
    GPU total time = 0.025714s
    GPU compute time = 0.000793s
    CPU time = 0.017332s

We see that the GPU is substantially faster than this (naive, single-threaded) CPU implementation for the compute portion. When we add in the cost of transferring the data, the GPU version is slower, but not 30x slower.
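As a rough sanity check on the transfer cost (my own back-of-the-envelope arithmetic, not from the measurements above): the array is (1<<24) ints * 4 bytes = 64 MB, and the difference between the total and compute times is about 0.0249 s, which works out to roughly 64 MB / 0.0249 s ≈ 2.5 GB/s, a plausible host-to-device rate for pageable-memory copies over PCIe.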

By comparison, when I timed your original algorithm the same way, I got numbers like this:

    GPU total time = 0.118131s
    GPU compute time = 0.093213s

My system configuration was a Xeon X5560 CPU, RHEL 5.5, CUDA 5.0, and a Quadro 5000 GPU.