How to perform a reduction along the row direction of a large 2D matrix using CUDA? (max value and max value index for each row)

I am trying to implement a reduction along the row direction of a 2D matrix. I started from code I found on Stack Overflow (thanks a lot, Robert!):

thrust::max_element slow in comparison cublasIsamax - More efficient implementation?

The link above shows a custom kernel that performs a reduction over a single row. It splits the input row across many blocks, each with 1024 threads, and it works very well.

For the 2D case, everything is the same except that there is now a y grid dimension, so the y dimension of each block is still 1. The problem is that when I try to write the data into shared memory inside each block (in the "max_idx_kernel_reduction_within_block" kernel in the code), it takes very long (more than (# of rows) * (the time needed to perform the reduction on a single row); at that point I might as well run a for loop over the rows). I know I have a lot of elements, but I was expecting something faster than that.

I don't think the memory access pattern is the problem, but I heard that the TOTAL amount of shared memory might be the limitation?: CUDA: Is coalesced global memory access faster than shared memory? Also, does allocating a large shared memory array slow the program down?

Any suggestions to make my code faster (the first kernel is the bottleneck)? Thank you very much, I really appreciate it!!

    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>

    #define NCOLS 163317   // number of columns
    #define NROWS 8        // number of rows
    #define nTPB 1024      // Threads per Block. nTPB should be a power-of-2
    #define MAX_BLOCKS_X ((NCOLS/nTPB)+1) // # of blocks I will launch
    #define MIN(a,b) ((a>b)?b:a)
    #define FLOAT_MIN -1.0f // lowest anticipated number of the data. Values in array will be compared with this and updated with the larger one
    #define IDX2F(i,j,ld) ((j-1) * (ld) + ((i) - 1)) // 1 based indexing
    #define IDX2C(i,j,ld) ((j) * (ld) + i)           // 0 based indexing

    __device__ volatile float blk_vals[NROWS][MAX_BLOCKS_X];
    __device__ volatile int   blk_idxs[NROWS][MAX_BLOCKS_X];
    // blk_vals and blk_idxs are the results obtained from reduction within each block.
    // after 1st reduction, each row will have blk_vals[MAX_BLOCKS_X] array and blk_idxs[MAX_BLOCKS_X]
    // these will be passed to the 2nd kernel

    __global__ void max_idx_kernel_reduction_within_block(const float *data, const int xSize, const int ySize){ // first kernel. Reduction within blocks
      __shared__ volatile float vals[nTPB]; // Total amount of shared memory per block: 49152 bytes (48 KB). 1024 gives ~ 4KB for single.
      __shared__ volatile int   idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have both indices and values

      int idx = threadIdx.x+blockDim.x * blockIdx.x; // idx in the x direction
      float my_val = FLOAT_MIN; // lowest possible number
      int my_idx = -1;          // to check whether you DID perform the kernel. Again, it's the idx in the x dir.
      // sweep from global memory
      while (idx < xSize){
        if (data[IDX2C(blockIdx.y,idx,ySize)] > my_val) {my_val = data[IDX2C(blockIdx.y,idx,ySize)]; my_idx = idx;}
        // compare with my_val, and put the bigger value into my_val for next comparison. my_idx is 0 index based
        idx += blockDim.x*gridDim.x;} // until here takes about 6 ms !! very fast!!

      // populate shared memory: takes ~ 270 ms
      vals[threadIdx.x] = my_val; // put the computed max value for each thread into the shared memory. -> this is the bottleneck!!
      idxs[threadIdx.x] = my_idx; // do this for index as well -> this is also slow!!
      __syncthreads();

      // sweep in shared memory
      for (int i = (nTPB>>1); i > 0; i>>=1){
        if (threadIdx.x < i) // the first half threads of the block
          if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
        // the above is comparing shared memory of threadIdx.x with shared memory of threadIdx.x + i.
        // then puts the larger value into shared memory of threadIdx.x
        __syncthreads();}
      // so now in each block, shared memory's first element (index 0) is the max value and max value index

      // perform block-level reduction
      if (!threadIdx.x){ // at the shared memory, only the first element (index 0) (actually 2 elements in the first index: max value, and max value index) is what we need.
        blk_vals[blockIdx.y][blockIdx.x] = vals[0]; // For each window (single x row), the first elements of the blocks are stored into the blk_vals[windowNumber][:]
                                                    // remember, this is a global variable.
        blk_idxs[blockIdx.y][blockIdx.x] = idxs[0]; // and the max value index
        __syncthreads();
      }
    }

    // originally the following kernel was in the 1st kernel, performed by the last block. So just use one block for this.
    __global__ void max_idx_kernel_final(int *result_maxInd, float *result_maxVal){
      __shared__ volatile float vals[nTPB]; // Total amount of shared memory per block: 49152 bytes (48 KB). 1024 gives ~ 4KB for single.
      __shared__ volatile int   idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have these variables!! (vals and idxs)

      int idx = threadIdx.x;
      float my_val = FLOAT_MIN;
      int my_idx = -1; // remember, these are local variables, so each thread has this variable. This local variable is independent from other thread's local variable
      while (idx < MAX_BLOCKS_X){
        if (blk_vals[blockIdx.y][idx] > my_val) {my_val = blk_vals[blockIdx.y][idx]; my_idx = blk_idxs[blockIdx.y][idx]; }
        idx += blockDim.x;} // all threads in this single block (single in the x dir) are working, so you should loop over blockDim.x.
      // Imagine where gridDim.x (# of blocks) is huge so that you need to loop over to get the max value and index
      // After this, each thread in the block has a local variable (max value and max value index).
      // So far it was sort of a reduction, but instead of pairing values we just looped over the blk_vals and blk_idxs

      // populate shared memory
      vals[threadIdx.x] = my_val; // This is now shared memory. This is because reduction requires comparison between different elements
      idxs[threadIdx.x] = my_idx; // my_idx value is 0 based. This is done for all blocks (in the y direction)
      __syncthreads();
      // Now the final task is to do reduction for all threads in our single block (single block in the x dir, NROWS blocks in the y dir)!

      // sweep in shared memory
      for (int i = (nTPB>>1); i > 0; i>>=1) {
        if (threadIdx.x < i) // the first half threads of the block
          if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
        __syncthreads();}
      // now all the results are in threadIdx.x == 0 for each block (there are NROWS blocks in the y dir)

      // 0th thread. the results are in shared memory, not the local memory, so any thread could do the following. We just selected the 0th thread for no reason. If several threads try to do this, that would be a problem, since we'll have to wait for them
      if(!threadIdx.x){
        result_maxInd[blockIdx.y] = idxs[0]; // the final result for each row goes into the corresponding position (blockIdx.y)
        result_maxVal[blockIdx.y] = vals[0];
      }
    }

    int main(){

      dim3 grids(MAX_BLOCKS_X, NROWS);
      dim3 threads(nTPB,1);
      dim3 grids2(1,NROWS);
      dim3 threads2(nTPB);

      float *d_vector, *h_vector;
      h_vector = (float*)malloc(NROWS * NCOLS * sizeof(float));
      for (int j = 1; j <= NCOLS; j++) {
        for (int i = 1; i <= NROWS; i++) {
          h_vector[IDX2F(i,j,NROWS)] = (float) (rand()/(float)RAND_MAX);
        }
      }
      h_vector[IDX2F(2,5,NROWS)] = 10; // create definite max element

      cudaMalloc(&d_vector, NROWS * NCOLS * sizeof(float));
      cudaMemcpy(d_vector, h_vector, NROWS * NCOLS * sizeof(float), cudaMemcpyHostToDevice);
      // d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.

      int *max_index;
      float *max_val;
      int *d_max_index;
      float *d_max_val;

      max_index = (int*)malloc(NROWS * sizeof(int));
      max_val   = (float*)malloc(NROWS * sizeof(float));
      cudaMalloc((void**)&d_max_index, NROWS * sizeof(int));
      cudaMalloc((void**)&d_max_val,   NROWS * sizeof(float));

      max_idx_kernel_reduction_within_block<<<grids, threads>>>(d_vector, NCOLS, NROWS);
      max_idx_kernel_final<<<grids2, threads2>>>(d_max_index, d_max_val);

      cudaMemcpy(max_index, d_max_index, NROWS * sizeof(int),   cudaMemcpyDeviceToHost);
      cudaMemcpy(max_val,   d_max_val,   NROWS * sizeof(float), cudaMemcpyDeviceToHost);

      for(int z=0;z<20;z++)
        printf("%d ",max_index[z]);
      printf("\n\n\n");
      for(int z=0;z<20;z++)
        printf("%f ",max_val[z]);
      return 0;
    }

Your code has various issues:

  1. You should use proper cuda error checking. That's just a standard boilerplate statement I make; I don't think your code was producing any runtime errors. (One common form of that boilerplate is sketched after this list.)
  2. You should validate your results. I seriously doubt the code was producing sensible results; the reasons will become evident. If you want to prove this to yourself, modify your data initialization to something obvious and easy to verify (as I have shown), without making any other changes, and you will see that your program produces errors. (A minimal host-side check along these lines is sketched after the modified code at the end of this answer.)
  3. In your kernels, you are not indexing through the arrays correctly. Perhaps you don't understand the IDX2C and IDX2F macros. They hurt you in two ways: you don't understand the pattern in which they index through your array, and they are killing your global memory coalescing (because of the way you are using them). Whenever we have a construct that is an indexed function of threadIdx.x and threadIdx.y (or blockIdx.y in this case), then to maintain proper coalescing among adjacent threads, the component based on threadIdx.x should not be multiplied by any scaling factor. But the way you are passing parameters to IDX2C in your kernel breaks this rule (and also breaks your intended row-wise access pattern). So for now, let's get rid of those macros, since they confuse the issue. (A small kernel sketch contrasting the two access patterns follows this list.)
  4. This is an illegal use of __syncthreads():

      if (!threadIdx.x){ // at the shared memory, only the first element (index 0) (actually 2 elements in the first index: max value, and max value index) is what we need.
        blk_vals[blockIdx.y][blockIdx.x] = vals[0]; // For each window (single x row), the first elements of the blocks are stored into the blk_vals[windowNumber][:]
                                                    // remember, this is a global variable.
        blk_idxs[blockIdx.y][blockIdx.x] = idxs[0]; // and the max value index
        __syncthreads();
      }

    It's illegal to use it in conditional code unless the condition evaluates identically for every thread in the block. It's not needed there at all, so we'll just delete it.

  5. Your printouts were indexing up to 20 instead of NROWS.
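Regarding item 1, the error checking being referred to amounts to inspecting the status returned by every CUDA API call and checking for errors after each kernel launch. A minimal sketch of one common way to do that (my own wording, not necessarily the exact boilerplate the answer has in mind):

    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>

    // Check the most recent CUDA error and abort with a message if anything failed.
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                        msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
                exit(1); \
            } \
        } while (0)

    // Usage: call it after memory operations and after each kernel launch, e.g.
    //   max_idx_kernel_reduction_within_block<<<grids, threads>>>(d_vector, NCOLS, NROWS);
    //   cudaCheckErrors("first kernel launch failed");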
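To make the coalescing argument in item 3 concrete, here is an illustrative kernel (not part of either program in this question; the names access_pattern_demo, out_bad, and out_good are made up) that contrasts what IDX2C(blockIdx.y, idx, ySize) effectively does with the fixed row-major indexing:

    // Illustrative only: data is a row-major NROWS x NCOLS matrix (xSize = NCOLS,
    // ySize = NROWS), and blockIdx.y selects the matrix row being reduced.
    __global__ void access_pattern_demo(const float *data, int xSize, int ySize,
                                        float *out_bad, float *out_good)
    {
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx < xSize) {
            // What IDX2C(blockIdx.y, idx, ySize) expands to: idx is multiplied by
            // ySize, so adjacent threads read addresses ySize floats apart -> uncoalesced.
            out_bad[idx]  = data[idx * ySize + blockIdx.y];

            // The fixed pattern: the threadIdx.x-derived term has stride 1, so a warp
            // reads 32 consecutive floats in a single coalesced transaction.
            out_good[idx] = data[blockIdx.y * xSize + idx];
        }
    }

Adjacent threads in a warp differ only in idx, so the second read touches consecutive floats and coalesces, while the first read strides by ySize floats per thread, which is what wrecks the gld_efficiency measured below.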

With the above changes, your code goes from broken (producing incorrect results) to fixed, and the kernel execution time on my system goes from 0.929 ms to 0.656 ms. I attribute essentially all of that to the coalescing fix in item 3 above.

When I profile the before and after cases with nvprof --metrics gld_efficiency ..., it shows 12.5% efficiency for the original code and 53% efficiency with the changes. Here is the modified code:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <cuda_runtime.h>

    #define NCOLS 163317   // number of columns
    #define NROWS 8        // number of rows
    #define nTPB 1024      // Threads per Block. nTPB should be a power-of-2
    #define MAX_BLOCKS_X ((NCOLS/nTPB)+1) // # of blocks I will launch
    #define FLOAT_MIN -1.0f // lowest anticipated number of the data. Values in array will be compared with this and updated with the larger one

    __device__ volatile float blk_vals[NROWS][MAX_BLOCKS_X];
    __device__ volatile int   blk_idxs[NROWS][MAX_BLOCKS_X];
    // blk_vals and blk_idxs are the results obtained from reduction within each block.
    // after 1st reduction, each row will have blk_vals[MAX_BLOCKS_X] array and blk_idxs[MAX_BLOCKS_X]
    // these will be passed to the 2nd kernel

    __global__ void max_idx_kernel_reduction_within_block(const float *data, const int xSize, const int ySize){ // first kernel. Reduction within blocks
      __shared__ volatile float vals[nTPB]; // Total amount of shared memory per block: 49152 bytes (48 KB). 1024 gives ~ 4KB for single.
      __shared__ volatile int   idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have both indices and values

      int idx = threadIdx.x+blockDim.x * blockIdx.x; // idx in the x direction
      int idy = blockIdx.y;
      float my_val = FLOAT_MIN; // lowest possible number
      int my_idx = -1;          // to check whether you DID perform the kernel. Again, it's the idx in the x dir.
      // sweep from global memory
      while (idx < xSize){ // this ensures you don't go out the size of the array's x direction
        float temp = data[idy*xSize+idx];
        if (temp > my_val) {my_val = temp; my_idx = idx;}
        // compare with my_val, and put the bigger value into my_val for next comparison. my_idx is 0 index based
        idx += blockDim.x*gridDim.x;} // until here takes about 6 ms !! very fast!!

      // populate shared memory: takes ~ 270 ms
      vals[threadIdx.x] = my_val; // put the computed max value for each thread into the shared memory. -> this is the bottleneck!!
      idxs[threadIdx.x] = my_idx; // do this for index as well -> this is also slow!!
      __syncthreads();

      // sweep in shared memory
      for (int i = (nTPB>>1); i > 0; i>>=1){
        if (threadIdx.x < i) // the first half threads of the block
          if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
        // the above is comparing shared memory of threadIdx.x with shared memory of threadIdx.x + i.
        // then puts the larger value into shared memory of threadIdx.x
        __syncthreads();}
      // so now in each block, shared memory's first element (index 0) is the max value and max value index

      // perform block-level reduction
      if (!threadIdx.x){ // at the shared memory, only the first element (index 0) (actually 2 elements in the first index: max value, and max value index) is what we need.
        blk_vals[blockIdx.y][blockIdx.x] = vals[0]; // For each window (single x row), the first elements of the blocks are stored into the blk_vals[windowNumber][:]
                                                    // remember, this is a global variable.
        blk_idxs[blockIdx.y][blockIdx.x] = idxs[0]; // and the max value index
      }
    }

    // originally the following kernel was in the 1st kernel, performed by the last block. So just use one block for this.
    __global__ void max_idx_kernel_final(int *result_maxInd, float *result_maxVal){
      __shared__ volatile float vals[nTPB]; // Total amount of shared memory per block: 49152 bytes (48 KB). 1024 gives ~ 4KB for single.
      __shared__ volatile int   idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have these variables!! (vals and idxs)

      int idx = threadIdx.x;
      int idy = blockIdx.y;
      float my_val = FLOAT_MIN;
      int my_idx = -1; // remember, these are local variables, so each thread has this variable. This local variable is independent from other thread's local variable
      while (idx < MAX_BLOCKS_X ){ // ?? confused whether it should be gridDim.x (actual # of blocks launched) or MAX_BLOCKS_X (# of elements in x dir of the global array blk_vals)
        float temp = blk_vals[idy][idx];
        if (temp > my_val) {my_val = temp; my_idx = blk_idxs[idy][idx]; }
        idx += blockDim.x;} // all threads in this single block (single in the x dir) are working, so you should loop over blockDim.x.
      // Imagine where gridDim.x (# of blocks) is huge so that you need to loop over to get the max value and index
      // After this, each thread in the block has a local variable (max value and max value index).
      // So far it was sort of a reduction, but instead of pairing values we just looped over the blk_vals and blk_idxs

      // populate shared memory
      idx = threadIdx.x;
      vals[idx] = my_val; // This is now shared memory. This is because reduction requires comparison between different elements
      idxs[idx] = my_idx; // my_idx value is 0 based. This is done for all blocks (in the y direction)
      __syncthreads();
      // Now the final task is to do reduction for all threads in our single block (single block in the x dir, NROWS blocks in the y dir)!

      // sweep in shared memory
      for (int i = (nTPB>>1); i > 0; i>>=1) {
        if (idx < i) // the first half threads of the block
          if (vals[idx] < vals[idx + i]) {vals[idx] = vals[idx+i]; idxs[idx] = idxs[idx+i]; }
        __syncthreads();}
      // now all the results are in threadIdx.x == 0 for each block (there are NROWS blocks in the y dir)

      // 0th thread. the results are in shared memory, not the local memory, so any thread could do the following. We just selected the 0th thread for no reason. If several threads try to do this, that would be a problem, since we'll have to wait for them
      if(!threadIdx.x){
        result_maxInd[idy] = idxs[0]; // the final result for each row goes into the corresponding position (blockIdx.y)
        result_maxVal[idy] = vals[0];
      }
    }

    int main(){

      dim3 grids(MAX_BLOCKS_X, NROWS);
      dim3 threads(nTPB,1);
      dim3 grids2(1,NROWS);
      dim3 threads2(nTPB);

      float *d_vector, *h_vector;
      h_vector = (float*)malloc(NROWS * NCOLS * sizeof(float));
      memset(h_vector, 0, NROWS*NCOLS*sizeof(float));
      for (int i =  0; i < NROWS; i++)
        h_vector[i*NCOLS + i] = 10.0f;  // create definite max element per row

      cudaMalloc(&d_vector, NROWS * NCOLS * sizeof(float));
      cudaMemcpy(d_vector, h_vector, NROWS * NCOLS * sizeof(float), cudaMemcpyHostToDevice);
      // d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.

      int *max_index;
      float *max_val;
      int *d_max_index;
      float *d_max_val;

      max_index = (int*)malloc(NROWS * sizeof(int));
      max_val   = (float*)malloc(NROWS * sizeof(float));
      cudaMalloc((void**)&d_max_index, NROWS * sizeof(int));
      cudaMalloc((void**)&d_max_val,   NROWS * sizeof(float));

      cudaEvent_t start, stop;
      cudaEventCreate(&start); cudaEventCreate(&stop);
      cudaEventRecord(start);
      max_idx_kernel_reduction_within_block<<<grids, threads>>>(d_vector, NCOLS, NROWS);
      max_idx_kernel_final<<<grids2, threads2>>>(d_max_index, d_max_val);
      cudaEventRecord(stop);
      cudaEventSynchronize(stop);
      float et;
      cudaEventElapsedTime(&et, start, stop);
      printf("elapsed time: %fms\n", et);

      cudaMemcpy(max_index, d_max_index, NROWS * sizeof(int),   cudaMemcpyDeviceToHost);
      cudaMemcpy(max_val,   d_max_val,   NROWS * sizeof(float), cudaMemcpyDeviceToHost);

      for(int z=0;z<NROWS;z++)
        printf("%d ",max_index[z]);
      printf("\n\n\n");
      for(int z=0;z<NROWS;z++)
        printf("%f ",max_val[z]);
      printf("\n");
      return 0;
    }
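As one way to do the validation suggested in item 2, the device results can be compared against a straightforward CPU argmax per row. A minimal host-side sketch, assuming it is pasted into the program above and reuses its row-major h_vector, max_index, and max_val arrays (the helper name is mine):

    // Hypothetical helper: recompute the per-row argmax on the CPU and compare.
    // Exact float comparison is fine here because both sides are copies of the same
    // input element; with duplicated maxima the reported index may legitimately differ.
    int validate_rowwise_argmax(const float *h_vector, const int *max_index,
                                const float *max_val, int nrows, int ncols)
    {
        for (int r = 0; r < nrows; r++) {
            int best_idx = 0;
            float best_val = h_vector[r * ncols];   // row-major: row r starts at r*ncols
            for (int c = 1; c < ncols; c++) {
                float v = h_vector[r * ncols + c];
                if (v > best_val) { best_val = v; best_idx = c; }
            }
            if (best_idx != max_index[r] || best_val != max_val[r]) {
                printf("mismatch at row %d: got (%d, %f), expected (%d, %f)\n",
                       r, max_index[r], max_val[r], best_idx, best_val);
                return 0;
            }
        }
        return 1;   // all rows matched
    }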