使用非常大的数组时，CUDA返回垃圾结果，但没有报告任何错误

我正在编写一个测试程序：它创建大小均为n的设备数组和主机数组，然后启动一个内核，用n个线程把常量值0.95f写入设备数组的每个位置。内核完成后，将设备数组复制到主机数组，对所有元素求和并显示最终总和。

下面的程序在数组大小不超过约6000万个浮点数时工作正常，并且很快返回正确的结果；但当数组大小达到7000万时，程序似乎会挂起一段时间，最终得到的总和是NaN。以6000万个元素运行后检查主机数组，可以看到它已被正确填充为0.95f；而以7000万个元素运行后检查，它里面全是NaN。据我所知，没有任何一个CUDA调用返回错误。

我使用的是2GB GT640m（Compute 3.0），最大块大小为1024，最大网格尺寸为2147483647。

我相信有更好的方法来实现类似的东西，我想听听建议。但我也想了解这里出了什么问题，以便我可以从中学习。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <string.h>

// Reports a failed CUDA runtime call.
// status: the cudaError_t returned by the wrapped call.
// Only prints "Error"; it neither identifies the failing call nor aborts.
void cudaErrorHandler(cudaError_t status)
{
    // Cuda call returned an error, just print error for now
    if (status != cudaSuccess)
    {
        printf("Error");
    }
}

// Writes the constant 0.95f into every element of _Results.
// Launch layout expected: a 1D grid of 2D blocks; each thread handles the
// element whose index is its flattened global thread id.
// _Results:           device array with at least _TotalCombinations floats.
// _TotalCombinations: number of valid elements; threads past it do nothing.
__global__ void addKernel(float* _Results, int _TotalCombinations)
{
    // Get thread Id
    unsigned int Id = (blockDim.x * blockDim.y * blockIdx.x) + (blockDim.x * threadIdx.y) + threadIdx.x;

    // If the Id is within simulation range, log it
    if (Id < _TotalCombinations)
    {
        _Results[Id] = 0.95f;
    }
}

#define BLOCK_DIM_X 32
#define BLOCK_DIM_Y 32
#define BLOCK_SIZE BLOCK_DIM_X * BLOCK_DIM_Y // Static block size of 32*32 (1024)

#define CUDA_CALL(x) cudaErrorHandler(x)

// Fills a device array with 0.95f, copies it back to the host and prints the
// sum of all elements.
int main()
{
    // The number of simulations to run
    unsigned int totalCombinations = 45000000;

    int gridsize = 1;

    // Work out how many blocks of size 1024 are required to perform all of totalCombinations
    for (unsigned int totalsize = gridsize * BLOCK_SIZE; totalsize < totalCombinations;
         gridsize++, totalsize = gridsize * BLOCK_SIZE)
        ;

    // Allocate host memory
    float* host_results = new float[totalCombinations];
    memset(host_results, 0, sizeof(float) * totalCombinations);
    float *dev_results = 0;

    cudaSetDevice(0);

    // Allocate device memory
    CUDA_CALL(cudaMalloc((void**)&dev_results, totalCombinations * sizeof(float)));

    dim3 grid, block;

    block = dim3(BLOCK_DIM_X, BLOCK_DIM_Y);
    grid = dim3(gridsize);

    // Launch kernel
    // NOTE: a kernel launch yields no return status, so CUDA_CALL cannot wrap
    // it. A rejected launch configuration is only visible through
    // cudaGetLastError(), which this program never calls.
    addKernel<<<grid, block>>>(dev_results, totalCombinations);

    // Wait for synchronize
    CUDA_CALL(cudaDeviceSynchronize());

    // Copy device data back to host
    CUDA_CALL(cudaMemcpy(host_results, dev_results, totalCombinations * sizeof(float), cudaMemcpyDeviceToHost));

    double total = 0.0;

    // Total the results in the host array
    for (unsigned int i = 0; i < totalCombinations; i++)
        total += host_results[i];

    // Print results to screen
    printf("Total %f\n", total);

    delete[] host_results;

    return 0;
}

正如您所发现的，您的错误处理方法并不奏效：内核启动本身不返回错误码，启动配置错误必须在启动之后通过cudaGetLastError()才能检查到。下面我粘贴了一个版本的代码，其中使用了我常用的错误检查方法。程序在您所说的规模下失败，原因是您的网格（您启动的是一维网格）超过了X维度上的最大网格大小（默认情况下为65535，即计算能力2.x及以下的限制）。如果要使用更大的网格（计算能力3.0的限制是2^31-1），则需要使用-arch=sm_30开关进行编译。

以下代码仅供参考，它演示了我常用的错误检查方法。

 #include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Checks the CUDA runtime's last recorded error and aborts on failure.
// Place it immediately after a runtime call or a kernel launch: it reads (and
// clears) the error via cudaGetLastError(), so it also catches launch
// configuration errors that the launch statement itself cannot report.
// msg: short description of the step being checked, printed on failure.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// Writes the constant 0.95f into every element of _Results.
// Launch layout expected: a 1D grid of 2D blocks; each thread handles the
// element whose index is its flattened global thread id.
// _Results:           device array with at least _TotalCombinations floats.
// _TotalCombinations: number of valid elements; threads past it do nothing.
__global__ void addKernel(float* _Results, int _TotalCombinations)
{
    // Get thread Id
    unsigned int Id = (blockDim.x * blockDim.y * blockIdx.x) + (blockDim.x * threadIdx.y) + threadIdx.x;

    // If the Id is within simulation range, log it
    if (Id < _TotalCombinations)
    {
        _Results[Id] = 0.95f;
    }
}

#define BLOCK_DIM_X 32
#define BLOCK_DIM_Y 32
// Static block size of 32*32 (1024). Parenthesized so the macro expands
// safely inside larger expressions such as the ceiling division below.
#define BLOCK_SIZE (BLOCK_DIM_X * BLOCK_DIM_Y)

// Fills a device array with 0.95f, copies it back to the host and prints the
// sum of all elements, checking for CUDA errors after every step.
int main()
{
    // The number of simulations to run
    unsigned int totalCombinations = 65000000;

    // Number of 1024-thread blocks required to cover all of totalCombinations
    // (ceiling division).
    int gridsize = (totalCombinations + BLOCK_SIZE - 1) / BLOCK_SIZE;

    printf("gridsize = %d, blocksize = %d\n", gridsize, BLOCK_SIZE);

    // Allocate host memory
    float* host_results = new float[totalCombinations];
    memset(host_results, 0, sizeof(float) * totalCombinations);
    float *dev_results = 0;

    cudaSetDevice(0);
    cudaCheckErrors("cudaSetDevice fail");

    // Allocate device memory
    cudaMalloc((void**)&dev_results, totalCombinations * sizeof(float));
    cudaCheckErrors("cudaMalloc fail");

    dim3 grid, block;

    block = dim3(BLOCK_DIM_X, BLOCK_DIM_Y);
    grid = dim3(gridsize);

    // Launch kernel; the check right after it reports a rejected launch
    // configuration (for example a grid larger than the build target allows).
    addKernel<<<grid, block>>>(dev_results, totalCombinations);
    cudaCheckErrors("kernel fail");

    // Wait for synchronize
    cudaDeviceSynchronize();
    cudaCheckErrors("sync fail");

    // Copy device data back to host
    cudaMemcpy(host_results, dev_results, totalCombinations * sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy 2 fail");

    double total = 0.0;

    // Total the results in the host array
    for (unsigned int i = 0; i < totalCombinations; i++)
        total += host_results[i];

    // Print results to screen
    printf("Total %f\n", total);

    // Release the device buffer as well as the host buffer.
    cudaFree(dev_results);
    cudaCheckErrors("cudaFree fail");

    delete[] host_results;

    return 0;
}

CUDA结果使用非常大的数组返回垃圾，但报告没有错误

从cuda内核打印

Char * p和scanf

读取nsdictionary与nsarray的性能

C Macro保护定义

软件用于Windows操作系统的C中断服务程序

计算未排序数据中唯一对的数量和非唯一对的实例

内存在音频缓冲区代码中不断增长

如何将64位整数打印为hex？

跨平台：选择使用32/64位的数据类型

在Mac上使用gdb在eclipse中调试 – 无法执行MI命令：-exec-run