使用 CUDA 时无法进入 __global__ 函数

我在 Nsight 中编写了一段代码,它可以通过编译并执行,但第一次启动(launch)无法完成。

奇怪的是,当我在调试模式下运行时,它运行得很好,但速度太慢了。

下面是进入 GPU 函数之前的那部分主机端代码(我认为其中有一个我没能找到的错误):

#include <cstdio>  // fprintf / stderr / NULL, used only for error reporting below

// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "parallelAction: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Finds, for each of `range` query descriptors, the index of the closest
 * descriptor in a base set by running FindClosestDescriptor on the GPU.
 *
 * A descriptor is 128 consecutive bytes.
 *
 * @param dataReturned host buffer receiving `range` ints (one winner index per query)
 * @param data         host buffer holding `range` query descriptors (range * 128 bytes)
 * @param descBase     host buffer holding `cardBase` base descriptors (cardBase * 128 bytes)
 * @param range        number of query descriptors
 * @param cardBase     number of base descriptors
 * @param streamIdx    unused; kept so existing callers keep compiling
 *
 * If `range <= 0` or `cardBase <= 0` there is nothing to compare, so the
 * function returns without touching `dataReturned`. If any CUDA call fails,
 * the failure is printed to stderr, the remaining steps are skipped, and the
 * device buffers are still released; `dataReturned` is then left unmodified.
 */
void parallelAction(int *dataReturned, char *data, unsigned char *descBase,
                    int range, int cardBase, int streamIdx)
{
    (void) streamIdx;  // not used: every call below goes to the default stream

    if (range <= 0 || cardBase <= 0) {
        // A zero-block launch is an invalid configuration, and the kernel
        // reads the first base descriptor unconditionally.
        return;
    }

    // Multiple of the warp size (32) so no warp of a block is partially filled.
    const int blockSize = 256;
    const int nBlocks = range / blockSize + (range % blockSize == 0 ? 0 : 1);

    // Widen to size_t BEFORE multiplying so large inputs cannot overflow int.
    const size_t descriptorBytes = 128 * sizeof(unsigned char);
    const size_t launchedThreads = (size_t) nBlocks * (size_t) blockSize;
    const size_t baseBytes = (size_t) cardBase * descriptorBytes;
    const size_t outputBytes = (size_t) range * sizeof(int);

    // FindClosestDescriptor has no bounds check: every launched thread reads
    // one descriptor from `data` and writes one int. The grid is rounded up to
    // whole blocks, so the device buffers are padded to the launched thread
    // count to keep the surplus threads inside the allocations.
    const size_t paddedInputBytes = launchedThreads * descriptorBytes;
    const size_t paddedOutputBytes = launchedThreads * sizeof(int);

    unsigned char *data_d = NULL;
    unsigned char *descBase_d = NULL;
    int *cardBase_d = NULL;
    int *dataReturned_d = NULL;

    bool ok =
        cudaSucceeded(cudaMalloc((void **) &data_d, paddedInputBytes), "cudaMalloc(data_d)") &&
        cudaSucceeded(cudaMalloc((void **) &descBase_d, baseBytes), "cudaMalloc(descBase_d)") &&
        cudaSucceeded(cudaMalloc((void **) &cardBase_d, sizeof(int)), "cudaMalloc(cardBase_d)") &&
        cudaSucceeded(cudaMalloc((void **) &dataReturned_d, paddedOutputBytes), "cudaMalloc(dataReturned_d)");

    ok = ok &&
        // Zero the buffer first so the padding threads read defined bytes.
        cudaSucceeded(cudaMemset(data_d, 0, paddedInputBytes), "cudaMemset(data_d)") &&
        cudaSucceeded(cudaMemcpy(data_d, data, (size_t) range * descriptorBytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy(data_d)") &&
        cudaSucceeded(cudaMemcpy(descBase_d, descBase, baseBytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy(descBase_d)") &&
        cudaSucceeded(cudaMemcpy(cardBase_d, &cardBase, sizeof(int), cudaMemcpyHostToDevice),
                      "cudaMemcpy(cardBase_d)");

    if (ok) {
        FindClosestDescriptor<<<nBlocks, blockSize>>>(dataReturned_d, data_d, descBase_d, cardBase_d);
        // A launch returns no status; configuration errors show up here.
        ok = cudaSucceeded(cudaGetLastError(), "FindClosestDescriptor launch");
    }

    // This blocking copy waits for the kernel, so errors raised while the
    // kernel runs (for example a watchdog launch timeout) are reported here.
    // Only the first `range` results are meaningful; the padding is dropped.
    ok = ok &&
        cudaSucceeded(cudaMemcpy(dataReturned, dataReturned_d, outputBytes, cudaMemcpyDeviceToHost),
                      "cudaMemcpy(dataReturned)");

    // cudaFree(NULL) is a no-op, so this is safe after a partial allocation.
    cudaFree(data_d);
    cudaFree(descBase_d);
    cudaFree(cardBase_d);
    cudaFree(dataReturned_d);
}

在 GPU 上执行的函数(我认为错误不在这里):

 // Squared Euclidean (L2) distance between two 128-byte descriptors.
 // The largest possible result is 128 * 255 * 255 = 8,323,200, which fits in an int.
 static __device__ __forceinline__ int squaredDistance128(const unsigned char *lhs,
                                                          const unsigned char *rhs)
 {
     int sum = 0;
     for (int k = 0; k < 128; ++k) {
         // unsigned char operands promote to int, so a negative difference does not wrap.
         const int diff = lhs[k] - rhs[k];
         sum += diff * diff;
     }
     return sum;
 }

 /**
  * For the descriptor owned by this thread, finds the index of the base
  * descriptor with the smallest squared L2 distance and stores it in
  * dataReturned. Ties keep the lowest index.
  *
  * Launch layout: 1-D grid of 1-D blocks, one thread per query descriptor
  * (thread index = blockDim.x * blockIdx.x + threadIdx.x). No shared memory.
  *
  * Preconditions: the kernel receives no element count and performs no bounds
  * check, so EVERY launched thread reads 128 bytes from `data` and writes one
  * int to `dataReturned`. The caller must size both buffers for
  * gridDim.x * blockDim.x elements, or launch exactly as many threads as
  * there are descriptors.
  *
  * @param dataReturned device buffer; element [thread index] receives the winner index
  * @param data         device buffer of query descriptors, 128 bytes each
  * @param base         device buffer of *cardBase base descriptors, 128 bytes each
  * @param cardBase     device pointer to the number of base descriptors;
  *                     when the count is <= 0, `base` is not read and 0 is written
  *
  * Each thread performs *cardBase * 128 multiply-accumulate steps.
  */
 __global__ void FindClosestDescriptor(int * dataReturned, unsigned char * data, unsigned char * base, int *cardBase)
 {
     // size_t keeps idx * 128 from overflowing a 32-bit int on large inputs.
     const size_t idx = (size_t) blockDim.x * blockIdx.x + threadIdx.x;
     const int candidateCount = *cardBase;

     // Copy this thread's descriptor into a thread-private array once, so the
     // candidate loop below does not re-read it from `data`.
     unsigned char query[128];
     const unsigned char *queryInput = data + idx * 128;
     for (int k = 0; k < 128; ++k) {
         query[k] = queryInput[k];
     }

     int winner = 0;
     if (candidateCount > 0) {
         // Candidate 0 seeds the running minimum; the rest must beat it strictly.
         int minDistance = squaredDistance128(query, base);
         for (int i = 1; i < candidateCount; ++i) {
             const int distance = squaredDistance128(query, base + (size_t) i * 128);
             if (distance < minDistance) {
                 minDistance = distance;
                 winner = i;
             }
         }
     }

     dataReturned[idx] = winner;
 }

如果你能帮助我,先在此表示感谢。

编辑:最后一次 cudaMemcpy 调用返回了错误“启动超时并被终止”(the launch timed out and was terminated)。

Linux 有一个看门狗(watchdog)机制。如果你的内核(kernel)运行时间很长(你说它在调试模式下很慢),就可能触发 Linux 看门狗,从而收到“启动超时并被终止”的错误。

在这种情况下,你可以尝试几种方法。常见的选项包括:减少单次内核启动的工作量(例如把任务拆分成多次启动)、优化内核以缩短运行时间,或者让计算使用一块未连接显示器(未运行 X 服务)的 GPU。