CUDA写入常量内存错误值

我有以下代码从主机变量复制到CUDA中的__constant__变量

 int main(int argc, char **argv){ int exit_code; if (argc < 4) { std::cout << "Usage: \n " << argv[0] << "   " << std::endl; return 1; } Color *h_input; int h_rows, h_cols; timer1.Start(); exit_code = readText2RGB(argv[1], &h_input, &h_rows, &h_cols); timer1.Stop(); std::cout << "Reading: " << timer1.Elapsed() << std::endl; if (exit_code != SUCCESS){ std::cout << "Error trying to read file." << std::endl; return FAILURE; } CpuTimer timer1; GpuTimer timer2; float timeStep2 = 0, timeStep3 = 0; int h_numColors = atoi(argv[3]); int h_change = 0; int *h_pixelGroup = new int[h_rows*h_cols]; Color *h_groupRep = new Color[h_numColors]; Color *h_output = new Color[h_rows*h_cols]; Color *d_input; int *d_pixelGroup; Color *d_groupRep; Color *d_output; dim3 block(B_WIDTH, B_HEIGHT); dim3 grid((h_cols+B_WIDTH-1)/B_WIDTH, (h_rows+B_HEIGHT-1)/B_HEIGHT); checkCudaError(cudaMalloc((void**)&d_input, sizeof(Color)*h_rows*h_cols)); checkCudaError(cudaMalloc((void**)&d_pixelGroup, sizeof(int)*h_rows*h_cols)); checkCudaError(cudaMalloc((void**)&d_groupRep, sizeof(Color)*h_numColors)); checkCudaError(cudaMalloc((void**)&d_output, sizeof(Color)*h_rows*h_cols)); // STEP 1 //Evenly distribute all pixels of the image onto the color set timer2.Start(); checkCudaError(cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int))); checkCudaError(cudaMemcpyToSymbol(c_cols, &h_cols, sizeof(int))); checkCudaError(cudaMemcpyToSymbol(c_numColors, &h_numColors, sizeof(int))); checkCudaError(cudaMemcpy(d_input, h_input, sizeof(Color)*h_rows*h_cols, cudaMemcpyHostToDevice)); clut_distributePixels<<>>(d_pixelGroup); checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*h_rows*h_cols, cudaMemcpyDeviceToHost)); timer2.Stop(); std::cout << "Phase 1: " << timer2.Elapsed() << std::endl; std::cout << h_pixelGroup[0] << "," << h_pixelGroup[3] << "," << h_pixelGroup[4] << "," << h_pixelGroup[7] << "," << h_pixelGroup[8] << std::endl; //Do the STEP 2 and STEP 3 as long as there is at least one change of representative in a group do { // STEP 2 //Set the representative value to the average colour of all pixels in the same set timer1.Start(); for (int ng = 0; ng < h_numColors; ng++) { int r = 0, g = 0, b = 0; int elem = 0; for (int i = 0; i < h_rows; i++) { for (int j = 0; j < h_cols; j++) { if (h_pixelGroup[i*h_cols+j] == ng) { r += h_input[i*h_cols+j].r; g += h_input[i*h_cols+j].g; b += h_input[i*h_cols+j].b; elem++; } } } if (elem == 0) { h_groupRep[ng].r = 255; h_groupRep[ng].g = 255; h_groupRep[ng].b = 255; }else{ h_groupRep[ng].r = r/elem; h_groupRep[ng].g = g/elem; h_groupRep[ng].b = b/elem; } } timer1.Stop(); timeStep2 += timer1.Elapsed(); // STEP 3 //For each pixel in the image, compute Euclidean's distance to each representative //and assign it to the set which is closest h_change = 0; timer2.Start(); checkCudaError(cudaMemcpyToSymbol(d_change, &h_change, sizeof(int))); checkCudaError(cudaMemcpy(d_groupRep, h_groupRep, sizeof(Color)*h_numColors, cudaMemcpyHostToDevice)); clut_checkDistances<<>>(d_input, d_pixelGroup, d_groupRep); checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*h_rows*h_cols, cudaMemcpyDeviceToHost)); checkCudaError(cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int))); timer2.Stop(); timeStep3 += timer2.Elapsed(); std::cout << "Chunche" << std::endl; } while (h_change == 1); std::cout << "Phase 2: " << timeStep2 << std::endl; std::cout << "Phase 3: " << timeStep3 << std::endl; // STEP 4 //Create the new image with the resulting color lookup table timer2.Start(); clut_createImage<<>>(d_output, d_pixelGroup, d_groupRep); checkCudaError(cudaMemcpy(h_output, d_output, sizeof(Color)*h_rows*h_cols, cudaMemcpyDeviceToHost)); timer2.Stop(); std::cout << "Phase 4: " << timer2.Elapsed() << std::endl; checkCudaError(cudaFree(d_input)); checkCudaError(cudaFree(d_pixelGroup)); checkCudaError(cudaFree(d_groupRep)); checkCudaError(cudaFree(d_output)); timer1.Start(); exit_code = writeRGB2Text(argv[2], h_input, h_rows, h_cols); timer1.Stop(); std::cout << "Writing: " << timer1.Elapsed() << std::endl; delete[] h_pixelGroup; delete[] h_groupRep; delete[] h_output; return SUCCESS; }

当我从内核中打印时，我得到三个值的零

 __global__ void clut_distributePixels(int *pixelGroup){ int i = blockDim.y * blockIdx.y + threadIdx.y; int j = blockDim.x * blockIdx.x + threadIdx.x; if(i == 0 && j == 0){ printf("a: %d\n", c_rows); printf("b: %d\n", c_cols); printf("c: %d\n", c_numColors); } while (i < c_rows) { while (j < c_cols) { pixelGroup[i*c_cols+j] = (i*c_cols+j)/c_numColors; j += gridDim.x * blockDim.x; } j = blockDim.x * blockIdx.x + threadIdx.x; i += gridDim.y * blockDim.y; } }

要么我没有正确地复制到恒定的记忆中……或者我不知道什么是错的。有任何建议！？我发布了整个主机代码，可能还有其他东西弄乱了常量副本。

UPDATE

Main.cu

 #include "Imageproc.cuh" int main(){ int h_change = 0; int h_rows = 512; cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int)); chunche<<>>(); cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int)); std::cout << "H = " << h_change << std::endl; return 0 }

Imageproc.cuh

 #ifndef _IMAGEPROC_CUH_ #define _IMAGEPROC_CUH_ #include "Utilities.cuh" #define B_WIDTH 16 #define B_HEIGHT 16 __constant__ int c_rows; __constant__ int c_cols; __constant__ int c_numColors; __device__ int d_change; #ifdef __cplusplus extern "C" { #endif __global__ void chunche(); __global__ void clut_distributePixels(int *pixelGroup); __global__ void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep); __global__ void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep); #ifdef __cplusplus } #endif #endif

Imageproc.cu

 #include "Imageproc.cuh" __global__ void chunche(){ d_change = c_rows + 1; } __global__ void clut_distributePixels(int *pixelGroup){ int i = blockDim.y * blockIdx.y + threadIdx.y; int j = blockDim.x * blockIdx.x + threadIdx.x; while (i < c_rows) { while (j < c_cols) { pixelGroup[i*c_cols+j] = (i*c_cols+j)/c_numColors; j += gridDim.x * blockDim.x; } j = blockDim.x * blockIdx.x + threadIdx.x; i += gridDim.y * blockDim.y; } } __global__ void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep){ int i = blockDim.y * blockIdx.y + threadIdx.y; int j = blockDim.x * blockIdx.x + threadIdx.x; int newGroup; while (i < c_rows) { while (j < c_cols) { newGroup = 0; for (int ng = 1; ng < c_numColors; ng++) { if ( /*If distance from color to group ng is less than distance from color to group idx then color should belong to ng*/ (groupRep[ng].r-input[i*c_cols+j].r)*(groupRep[ng].r-input[i*c_cols+j].r) + (groupRep[ng].g-input[i*c_cols+j].g)*(groupRep[ng].g-input[i*c_cols+j].g) + (groupRep[ng].b-input[i*c_cols+j].b)*(groupRep[ng].b-input[i*c_cols+j].b) < (groupRep[newGroup].r-input[i*c_cols+j].r)*(groupRep[newGroup].r-input[i*c_cols+j].r)+ (groupRep[newGroup].g-input[i*c_cols+j].g)*(groupRep[newGroup].g-input[i*c_cols+j].g)+ (groupRep[newGroup].b-input[i*c_cols+j].b)*(groupRep[newGroup].b-input[i*c_cols+j].b) ) { newGroup = ng; } } if (pixelGroup[i*c_cols+j] != newGroup) { pixelGroup[i*c_cols+j] = newGroup; d_change = 1; } j += gridDim.x * blockDim.x; } j = blockDim.x * blockIdx.x + threadIdx.x; i += gridDim.y * blockDim.y; } } __global__ void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep){ int i = blockDim.y * blockIdx.y + threadIdx.y; int j = blockDim.x * blockIdx.x + threadIdx.x; while (i < c_rows) { while (j < c_cols) { clutImage[i*c_cols+j].r = groupRep[pixelGroup[i*c_cols+j]].r; clutImage[i*c_cols+j].g = groupRep[pixelGroup[i*c_cols+j]].g; clutImage[i*c_cols+j].b = groupRep[pixelGroup[i*c_cols+j]].b; j += gridDim.x * blockDim.x; } j = blockDim.x * blockIdx.x + threadIdx.x; i += gridDim.y * blockDim.y; } }

Utilities.cuh

 #ifndef _UTILITIES_CUH_ #define _UTILITIES_CUH_ #include  #include  #include  #define SUCCESS 1 #define FAILURE 0 #define checkCudaError(val) check( (val), #val, __FILE__, __LINE__) typedef struct { int r; int g; int b; } vec3u; typedef vec3u Color; typedef unsigned char uchar; typedef uchar Grayscale; struct GpuTimer{ cudaEvent_t start; cudaEvent_t stop; GpuTimer(){ cudaEventCreate(&start); cudaEventCreate(&stop); } ~GpuTimer(){ cudaEventDestroy(start); cudaEventDestroy(stop); } void Start(){ cudaEventRecord(start, 0); } void Stop(){ cudaEventRecord(stop, 0); } float Elapsed(){ float elapsed; cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsed, start, stop); return elapsed; } }; template void check(T err, const char* const func, const char* const file, const int line) { if (err != cudaSuccess) { std::cerr << "CUDA error at: " << file << ":" << line << std::endl; std::cerr << cudaGetErrorString(err) << " " << func << std::endl; exit(1); } } int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols); int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols); int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols); int readText2RGB(const std::string filename, Color **image, int *rows, int *cols); struct CpuTimer{ clock_t start; clock_t stop; void Start(){ start = clock(); } void Stop(){ stop = clock(); } float Elapsed(){ return ((float)stop-start)/CLOCKS_PER_SEC * 1000.0f; } }; #endif

Utilities.cu

 #include "Utilities.cuh" int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols){ std::ofstream fileWriter(filename.c_str()); if (!fileWriter.is_open()) { std::cerr << "** writeGrayscale2Text() ** : Unable to open file." << std::endl; return FAILURE; } fileWriter << rows << "\n"; fileWriter << cols << "\n"; for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { fileWriter << (int)image[i*cols+j] << "\n"; } } fileWriter.close(); return SUCCESS; } int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols){ std::ifstream fileReader(filename.c_str()); if (!fileReader.is_open()) { std::cerr << "** readText2Grayscale() ** : Unable to open file." <> *rows; fileReader >> *cols; *image = new Grayscale[(*rows)*(*cols)]; int value; for (int i = 0; i < *rows; i++) { for (int j = 0; j > value; (*image)[i*(*cols)+j] = (Grayscale)value; } } fileReader.close(); return SUCCESS; } int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols){ std::ofstream fileWriter(filename.c_str()); if (!fileWriter.is_open()) { std::cerr << "** writeRGB2Text() ** : Unable to open file." << std::endl; return FAILURE; } fileWriter << rows << "\n"; fileWriter << cols << "\n"; for (int k = 0; k < 3; k++) { for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { switch (k) { case 0: fileWriter << image[i*cols+j].r << "\n"; break; case 1: fileWriter << image[i*cols+j].g << "\n"; break; case 2: fileWriter << image[i*cols+j].b << "\n"; break; } } } } fileWriter.close(); return SUCCESS; } int readText2RGB(const std::string filename, Color **image, int *rows, int *cols){ std::ifstream fileReader(filename.c_str()); if (!fileReader.is_open()) { std::cerr << "** readText2Grayscale() ** : Unable to open file." <> *rows; fileReader >> *cols; *image = new Color[(*rows)*(*cols)]; for (int k = 0; k < 3; k++) { for (int i = 0; i < *rows; i++) { for (int j = 0; j > (*image)[i*(*cols)+j].r; break; case 1: fileReader >> (*image)[i*(*cols)+j].g; break; case 2: fileReader >> (*image)[i*(*cols)+j].b; break; } } } } fileReader.close(); return SUCCESS; }

常量内存具有隐式本地范围链接 – 在堆栈溢出时回答此问题。这意味着cudaMemcpyToSymbol必须位于要使用它的内核的同一生成的.obj文件中。你在Main.cu进行了memcopy，但是你使用canstant内存的内核是在Imageproc.cu中。因此，内核chunche的常量值是未知的。

解决问题的一个选择是，实现一个包装器。只需在Main.cu添加一个函数，您可以在其中执行cudaMemcpyToSymbol并在Main.cu调用包装器并在Main.cu传递所需的常量内存值。

CUDA写入常量内存错误值

arrays30 x 30的程序失败

CUDA动态并行MakeFile

使用CUDA在本地内存中的数组上定义变量大小

是否可以将批量FFT与CUDA的cuFFT库和cufftPlanMany重叠？

CUDA：平铺矩阵 – 矩阵乘法，共享内存和矩阵大小，是块大小的非倍数

CURAND和内核，在哪里生成？

forceinline对CUDA C device函数的影响

需要随机访问时改善随机存储器访问

如何在CUDA中使用uint4向量正确转换全局内存数组以增加内存吞吐量？

CUDA – Eratosthenes筛分为部分

CUDA写入常量内存错误值

arrays30 x 30的程序失败

CUDA动态并行MakeFile

使用CUDA在本地内存中的数组上定义变量大小

是否可以将批量FFT与CUDA的cuFFT库和cufftPlanMany重叠？

CUDA：平铺矩阵 – 矩阵乘法，共享内存和矩阵大小，是块大小的非倍数

CURAND和内核，在哪里生成？

__forceinline__对CUDA C __device__函数的影响

需要随机访问时改善随机存储器访问

如何在CUDA中使用uint4向量正确转换全局内存数组以增加内存吞吐量？

CUDA – Eratosthenes筛分为部分

forceinline对CUDA C device函数的影响