Counting 3s in CUDA is very slow

I've written a small program in CUDA that counts how many 3s there are in a C array and prints the count.

    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda.h>
    #include <cuda_runtime.h>

    __global__ void incrementArrayOnDevice(int *a, int N, int *count)
    {
        int id = blockIdx.x * blockDim.x + threadIdx.x;

        //__shared__ int s_a[512]; // one for each thread
        //s_a[threadIdx.x] = a[id];

        if( id < N )
        {
            //if( s_a[threadIdx.x] == 3 )
            if( a[id] == 3 )
            {
                // every matching thread contends on the same global counter
                atomicAdd(count, 1);
            }
        }
    }

    int main(void)
    {
        int *a_h;   // host memory
        int *a_d;   // device memory
        int N = 16777216;

        // allocate array on host
        a_h = (int*)malloc(sizeof(int) * N);
        for(int i = 0; i < N; ++i)
            a_h[i] = (i % 3 == 0 ? 3 : 1);

        // allocate arrays on device
        cudaMalloc(&a_d, sizeof(int) * N);

        // copy data from host to device
        cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);

        // do calculation on device
        int blockSize = 512;
        int nBlocks = N / blockSize + (N % blockSize == 0 ? 0 : 1);
        printf("number of blocks: %d\n", nBlocks);

        int count;
        int *devCount;
        cudaMalloc(&devCount, sizeof(int));
        cudaMemset(devCount, 0, sizeof(int));

        incrementArrayOnDevice<<<nBlocks, blockSize>>>(a_d, N, devCount);

        // retrieve result from device
        cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);

        printf("%d\n", count);

        free(a_h);
        cudaFree(a_d);
        cudaFree(devCount);
    }

The result I get is:

    real    0m3.025s
    user    0m2.989s
    sys     0m0.029s

When I run it on the CPU with 4 threads, I get:

    real    0m0.101s
    user    0m0.100s
    sys     0m0.024s

Note that the GPU is old; I don't know the exact model because I don't have root access, but the OpenGL version it runs under the MESA driver is 1.2.

Am I doing something wrong? What can I do to make it run faster?

Note: I have already tried using a bin for each block (so the atomicAdd()s on each one would be reduced), but I get exactly the same performance. I have also tried copying the 512 integers assigned to each block into a block of shared memory (you can see that in the comments), and the time is the same again. A sketch of the bin idea follows below.
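For reference, a minimal sketch of what I mean by a bin per block (illustrative only; countThreesPerBlock and blockCounts are made-up names, not code from the program above):

    __global__ void countThreesPerBlock(const int *a, int n, int *blockCounts)
    {
        int id = blockIdx.x * blockDim.x + threadIdx.x;
        // each block accumulates into its own slot, so atomics from
        // different blocks never contend on the same word
        if (id < n && a[id] == 3)
            atomicAdd(&blockCounts[blockIdx.x], 1);
    }
    // the host then sums blockCounts[0..nBlocks-1] after copying it back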

This is a response to your question "What can I do to make it run faster?" As I mentioned in the comments, there are (probably) issues with the timing methodology, and my main suggestion for a speed improvement is to use a "classical parallel reduction" algorithm: each thread first accumulates a private count over a grid-stride loop, the threads within a block then combine their counts in shared memory, and only a single atomicAdd per block touches the global counter. The following code implements better (in my opinion) timing measurement, and converts the kernel into a reduction-style kernel:

    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>

    #define N (1<<24)
    #define nTPB 512
    #define NBLOCKS 32

    __global__ void incrementArrayOnDevice(int *a, int n, int *count)
    {
        __shared__ int lcnt[nTPB];
        int id = blockIdx.x * blockDim.x + threadIdx.x;
        int lcount = 0;

        // grid-stride loop: each thread accumulates a private count,
        // with no atomics at all in this phase
        while (id < n) {
            if (a[id] == 3) lcount++;
            id += gridDim.x * blockDim.x;
        }
        lcnt[threadIdx.x] = lcount;
        __syncthreads();

        // shared-memory tree reduction within the block
        int stride = blockDim.x;
        while (stride > 1) { // assume blockDim.x is a power of 2
            stride >>= 1;
            if (threadIdx.x < stride)
                lcnt[threadIdx.x] += lcnt[threadIdx.x + stride];
            __syncthreads();
        }

        // only one atomicAdd per block touches the global counter
        if (threadIdx.x == 0) atomicAdd(count, lcnt[0]);
    }

    int main(void)
    {
        int *a_h;   // host memory
        int *a_d;   // device memory
        cudaEvent_t gstart1, gstart2, gstop1, gstop2, cstart, cstop;
        float etg1, etg2, etc;

        cudaEventCreate(&gstart1);
        cudaEventCreate(&gstart2);
        cudaEventCreate(&gstop1);
        cudaEventCreate(&gstop2);
        cudaEventCreate(&cstart);
        cudaEventCreate(&cstop);

        // allocate array on host
        a_h = (int*)malloc(sizeof(int) * N);
        for (int i = 0; i < N; ++i)
            a_h[i] = (i % 3 == 0 ? 3 : 1);

        // allocate arrays on device
        cudaMalloc(&a_d, sizeof(int) * N);

        int blockSize = nTPB;
        int nBlocks = NBLOCKS;
        printf("number of blocks: %d\n", nBlocks);

        int count;
        int *devCount;
        cudaMalloc(&devCount, sizeof(int));
        cudaMemset(devCount, 0, sizeof(int));

        // copy data from host to device
        cudaEventRecord(gstart1);
        cudaMemcpy(a_d, a_h, sizeof(int) * N, cudaMemcpyHostToDevice);
        cudaMemset(devCount, 0, sizeof(int));
        cudaEventRecord(gstart2);

        // do calculation on device
        incrementArrayOnDevice<<<nBlocks, blockSize>>>(a_d, N, devCount);
        cudaEventRecord(gstop2);

        // retrieve result from device
        cudaMemcpy(&count, devCount, sizeof(int), cudaMemcpyDeviceToHost);
        cudaEventRecord(gstop1);
        printf("GPU count = %d\n", count);

        int hostCount = 0;
        cudaEventRecord(cstart);
        for (int i = 0; i < N; i++)
            if (a_h[i] == 3) hostCount++;
        cudaEventRecord(cstop);
        printf("CPU count = %d\n", hostCount);

        cudaEventSynchronize(cstop);
        cudaEventElapsedTime(&etg1, gstart1, gstop1);
        cudaEventElapsedTime(&etg2, gstart2, gstop2);
        cudaEventElapsedTime(&etc, cstart, cstop);

        printf("GPU total time = %fs\n", (etg1/(float)1000));
        printf("GPU compute time = %fs\n", (etg2/(float)1000));
        printf("CPU time = %fs\n", (etc/(float)1000));

        free(a_h);
        cudaFree(a_d);
        cudaFree(devCount);
    }
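Assuming the source above is saved as count3.cu (a file name I'm choosing here), it can be compiled and run with nvcc in the usual way:

    nvcc -O2 -o count3 count3.cu
    ./count3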

When I run this on a reasonably fast GPU (a Quadro 5000, a little slower than a Tesla M2050), I get the following:

    number of blocks: 32
    GPU count = 5592406
    CPU count = 5592406
    GPU total time = 0.025714s
    GPU compute time = 0.000793s
    CPU time = 0.017332s

We see that the GPU is substantially faster than this (naive, single-threaded) CPU implementation for the compute portion. When we add in the cost of transferring the data, the GPU version is slower, but not 30x slower.
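As a rough sanity check on the transfer cost (my own back-of-the-envelope arithmetic, not from the measurements above): the array is (1<<24) ints * 4 bytes = 64 MB, and the difference between the total and compute times is about 0.0249 s, which works out to roughly 64 MB / 0.0249 s ≈ 2.5 GB/s, a plausible host-to-device rate for pageable-memory copies over PCIe.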

By comparison, when I timed your original algorithm the same way, I got numbers like this:

    GPU total time = 0.118131s
    GPU compute time = 0.093213s

My system configuration was a Xeon X5560 CPU, RHEL 5.5, CUDA 5.0, and a Quadro 5000 GPU.