CUDA:从设备内存创建 3D 纹理和 cudaArray(3D)

我试图用设备数组的一部分来创建 3D 纹理。

要做到这一点,这些是我的步骤:

  1. 分配(malloc)设备数组
  2. 向设备数组写入数据
  3. 创建 cudaArray(3D)
  4. 将纹理绑定到 cudaArray

我的做法不会产生编译错误,但在 cuda-memcheck 下运行时,程序会在从纹理中读取数据的那一步失败:

Invalid __global__ read of size 8 … Address 0x10dfaf3a0 is out of bounds(大小为 8 的无效全局内存读取,地址 0x10dfaf3a0 越界)

因此我猜测是我声明纹理数组的方式有问题。我是这样访问纹理的:

tex3D(NoiseTextures[i], x, y, z)

我正在做上述步骤的方式:

1. 分配设备数组

// Step 1: allocate sqrSizeNoise*nNoise floats on the device; d_Noise receives the pointer.
cudaMalloc(reinterpret_cast<void **>(&d_Noise),
           sqrSizeNoise*nNoise*sizeof(float));

2. 写入设备数组

 // Step 2: fill the whole d_Noise buffer with uniformly distributed random floats.
 // The generator is created, seeded with Seed, used once and destroyed again.
 curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
 curandSetPseudoRandomGeneratorSeed(gen, Seed);
 curandGenerateUniform(gen, d_Noise, sqrSizeNoise*nNoise);
 curandDestroyGenerator(gen);

3 + 4. 创建 cudaArray 并将其绑定到纹理(我猜错误就出在这里)

 // Steps 3 + 4 of the question: create a 3D cudaArray, copy each noise volume into it and
 // bind it to a texture reference.
 //
 // NOTE(review): the template arguments in this snippet were stripped when the page was
 // extracted. <float> and <float, 3, cudaReadModeElementType> are restored here on the
 // assumption that the texels are float (d_Noise is a float buffer) -- confirm against the
 // Pastebin copy. The logic itself is left exactly as the asker posted it.
 cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 // Explicit alternative that was commented out in the original:
 // cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
 cudaArray *d_cuArr;
 cudaMalloc3DArray(&d_cuArr, &channelDesc, make_cudaExtent(SizeNoise, SizeNoise, SizeNoise), 0);
 cudaMemcpy3DParms copyParams = {0};

 // Loop for every separated noise texture (nNoise = 4).
 for (int i = 0; i < nNoise; i++) {
     // Initialize the texture.
     NoiseTextures[i] = texture<float, 3, cudaReadModeElementType>(
         1, cudaFilterModeLinear, cudaAddressModeWrap, channelDesc);

     // Array creation.
     // +(sqrSizeNoise*i) separates the created noise array into smaller parts with the
     // size of SizeNoise^3.
     copyParams.srcPtr = make_cudaPitchedPtr(d_Noise + (sqrSizeNoise*i),
                                             SizeNoise*sizeof(float), SizeNoise, SizeNoise);
     // NOTE(review): every iteration copies into the same d_cuArr, so each pass overwrites
     // the data written by the previous one.
     copyParams.dstArray = d_cuArr;
     copyParams.extent = make_cudaExtent(SizeNoise, SizeNoise, SizeNoise);
     copyParams.kind = cudaMemcpyDeviceToDevice;
     checkCudaErrors(cudaMemcpy3D(&copyParams));
     // Array creation end.

     // Set texture parameters.
     NoiseTextures[i].normalized = true;                    // access with normalized texture coordinates
     NoiseTextures[i].filterMode = cudaFilterModeLinear;    // linear interpolation
     NoiseTextures[i].addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
     NoiseTextures[i].addressMode[1] = cudaAddressModeWrap;
     NoiseTextures[i].addressMode[2] = cudaAddressModeWrap;

     // Bind array to 3D texture.
     checkCudaErrors(cudaBindTextureToArray(NoiseTextures[i], d_cuArr, channelDesc));
 }
 // NOTE(review): the array is released here although the textures were just bound to it.
 cudaFreeArray(d_cuArr);

我也把这段代码贴到了 Pastebin,那里有语法高亮,更便于阅读:http://pastebin.com/SM3dYd38

我希望我清楚地描述了我的问题。 如果没有请评论!

你能帮我解决这个问题吗? 谢谢阅读,

Cery

编辑:这是一个完整的代码,您可以在自己的机器上尝试:

 // NOTE(review): the header names, template arguments and the kernel launch configuration
 // of this listing were stripped during extraction. They are restored below from what the
 // code itself requires (float data, 3D texture references, checkCudaErrors, cuRAND). The
 // original listed six #include lines whose names cannot be recovered; only the headers the
 // code needs are given. The logic is left exactly as posted, including the defects the
 // question is asking about.
 #include <cuda_runtime.h>
 #include <curand.h>
 #include <helper_cuda.h>  // checkCudaErrors; found in the CUDA samples' common/inc directory

 static texture<float, 3, cudaReadModeElementType> NoiseTextures[4];  // texture array
 float *d_NoiseTest;            // device array with random floats
 int SizeNoiseTest = 32;
 int sqrSizeNoiseTest = 32768;  // 32 * 32 * 32 elements per noise volume

 void CreateTexture();

 // Reads one texel from the first texture of the array it is handed.
 // NOTE(review): the caller passes the file-scope NoiseTextures array straight from host
 // code -- presumably not a valid device pointer, which would match the out-of-range read
 // reported by cuda-memcheck. TODO confirm.
 __global__ void AccesTexture(texture<float, 3, cudaReadModeElementType> *NoiseTextures)
 {
     int test = tex3D(NoiseTextures[0], threadIdx.x, threadIdx.y, threadIdx.z);  // by using this the error occurs
 }

 int main(int argc, char **argv)
 {
     CreateTexture();
 }

 // Fills a device buffer with random floats, copies each of the four 32^3 sub-volumes into
 // a 3D cudaArray, binds the array to NoiseTextures[i] and finally launches AccesTexture.
 void CreateTexture()
 {
     // cuRAND random generator (needs linker flag -lcurand).
     curandGenerator_t gen;
     cudaMalloc((void **)&d_NoiseTest, sqrSizeNoiseTest*4*sizeof(float));  // allocation of device array
     curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
     curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);
     curandGenerateUniform(gen, d_NoiseTest, sqrSizeNoiseTest*4);  // writing data to d_NoiseTest
     curandDestroyGenerator(gen);

     // cudaArray descriptor.
     cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

     // CUDA array.
     // NOTE(review): the width passed here is a byte count (SizeNoiseTest*sizeof(float)).
     cudaArray *d_cuArr;
     cudaMalloc3DArray(&d_cuArr, &channelDesc,
                       make_cudaExtent(SizeNoiseTest*sizeof(float), SizeNoiseTest, SizeNoiseTest), 0);
     cudaMemcpy3DParms copyParams = {0};

     // Loop for every separated noise texture (4 = 4).
     for (int i = 0; i < 4; i++) {
         // Initialize the texture.
         NoiseTextures[i] = texture<float, 3, cudaReadModeElementType>(
             1, cudaFilterModeLinear, cudaAddressModeWrap, channelDesc);

         // Array creation.
         // +(sqrSizeNoiseTest*i) separates the created noise array into smaller parts with
         // the size of SizeNoiseTest^3.
         copyParams.srcPtr = make_cudaPitchedPtr(d_NoiseTest + (sqrSizeNoiseTest*i),
                                                 SizeNoiseTest*sizeof(float),
                                                 SizeNoiseTest, SizeNoiseTest);
         copyParams.dstArray = d_cuArr;
         // NOTE(review): the copy width is also given as a byte count, which is wider than
         // the source row described by srcPtr; this cudaMemcpy3D call is the one that
         // reports cudaErrorInvalidValue.
         copyParams.extent = make_cudaExtent(SizeNoiseTest*sizeof(float), SizeNoiseTest, SizeNoiseTest);
         copyParams.kind = cudaMemcpyDeviceToDevice;
         checkCudaErrors(cudaMemcpy3D(&copyParams));
         // Array creation end.

         // Set texture parameters.
         NoiseTextures[i].normalized = true;                    // access with normalized texture coordinates
         NoiseTextures[i].filterMode = cudaFilterModeLinear;    // linear interpolation
         NoiseTextures[i].addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
         NoiseTextures[i].addressMode[1] = cudaAddressModeWrap;
         NoiseTextures[i].addressMode[2] = cudaAddressModeWrap;

         // Bind array to 3D texture.
         checkCudaErrors(cudaBindTextureToArray(NoiseTextures[i], d_cuArr, channelDesc));
     }
     // NOTE(review): the array is freed before the kernel that reads the textures runs.
     cudaFreeArray(d_cuArr);

     // NOTE(review): the launch configuration was lost in extraction; <<<1, dim3(2, 2, 2)>>>
     // is a placeholder -- TODO confirm against the original post.
     AccesTexture<<<1, dim3(2, 2, 2)>>>(NoiseTextures);
 }

编译时需要链接 -lcurand,并把 CUDA-6.0/samples/common/inc 加入头文件搜索路径。

我现在在这段代码中得到了一个不同的错误

code = 11(cudaErrorInvalidValue)“cudaMemcpy3D(&copyParams)”

尽管它与我的原始代码完全相同——我彻底糊涂了。谢谢您的帮助。

下面是一个可以运行的示例,演示如何创建一个纹理对象(texture object)数组,大体上沿用了您所给代码的思路。与我发布在这里的纹理引用(texture reference)版本代码对比可以看到:第一个纹理对象的第一组纹理读取(即第一次内核调用)得到的数值,与纹理引用示例中那一组读取的数值相同(您可能需要调整两份示例代码的网格大小使其一致)。

纹理对象使用需要3.0或更高的计算能力。

例:

 $ cat t507.cu
 // Creates NUM_TEX 3D texture objects, each backed by its own cudaArray filled with
 // cuRAND noise, and reads a 2x2x2 corner of the first three from a kernel.
 // Requires compute capability 3.0 or higher (texture objects); link with -lcurand.
 #include <helper_cuda.h>  // checkCudaErrors (CUDA samples, samples/common/inc)
 #include <curand.h>

 #define NUM_TEX 4

 const int SizeNoiseTest = 32;
 const int cubeSizeNoiseTest = SizeNoiseTest*SizeNoiseTest*SizeNoiseTest;
 static cudaTextureObject_t texNoise[NUM_TEX];

 // Fetches one filtered texel per thread from my_tex at the thread's own
 // (threadIdx.x, threadIdx.y, threadIdx.z) coordinate and prints it.
 // Launched from main as a single block of dim3(2,2,2) threads.
 __global__ void AccesTexture(cudaTextureObject_t my_tex)
 {
     float test = tex3D<float>(my_tex, (float)threadIdx.x, (float)threadIdx.y, (float)threadIdx.z);
     printf("thread: %d,%d,%d, value: %f\n", threadIdx.x, threadIdx.y, threadIdx.z, test);
 }

 // Builds texNoise[0..NUM_TEX-1]. A single device staging buffer is refilled with fresh
 // random data for each texture and then copied into that texture's own cudaArray.
 void CreateTexture()
 {
     float *d_NoiseTest;  // device staging buffer with random floats
     checkCudaErrors(cudaMalloc((void **)&d_NoiseTest, cubeSizeNoiseTest*sizeof(float)));

     for (int i = 0; i < NUM_TEX; i++) {
         // cuRAND random generator, seeded differently for every texture.
         curandGenerator_t gen;
         curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
         curandSetPseudoRandomGeneratorSeed(gen, 1235ULL + i);
         curandGenerateUniform(gen, d_NoiseTest, cubeSizeNoiseTest);  // writing data to d_NoiseTest
         curandDestroyGenerator(gen);

         // cudaArray descriptor: one 32-bit float channel.
         cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

         // One cudaArray per texture. The extent of a cudaArray is given in elements,
         // not bytes, so the width is SizeNoiseTest rather than SizeNoiseTest*sizeof(float).
         cudaArray *d_cuArr;
         checkCudaErrors(cudaMalloc3DArray(&d_cuArr, &channelDesc,
                                           make_cudaExtent(SizeNoiseTest, SizeNoiseTest, SizeNoiseTest), 0));

         // Copy the staging buffer into the array. The source is linear memory, so its
         // pitch is in bytes; the copy extent is in elements because an array is involved.
         cudaMemcpy3DParms copyParams = {0};
         copyParams.srcPtr = make_cudaPitchedPtr(d_NoiseTest, SizeNoiseTest*sizeof(float),
                                                 SizeNoiseTest, SizeNoiseTest);
         copyParams.dstArray = d_cuArr;
         copyParams.extent = make_cudaExtent(SizeNoiseTest, SizeNoiseTest, SizeNoiseTest);
         copyParams.kind = cudaMemcpyDeviceToDevice;
         checkCudaErrors(cudaMemcpy3D(&copyParams));

         // Resource descriptor: the texture reads from the array just filled.
         cudaResourceDesc texRes;
         memset(&texRes, 0, sizeof(cudaResourceDesc));
         texRes.resType = cudaResourceTypeArray;
         texRes.res.array.array = d_cuArr;

         // Texture descriptor: unnormalized coordinates, linear filtering, clamped edges.
         cudaTextureDesc texDescr;
         memset(&texDescr, 0, sizeof(cudaTextureDesc));
         texDescr.normalizedCoords = false;
         texDescr.filterMode = cudaFilterModeLinear;
         texDescr.addressMode[0] = cudaAddressModeClamp;
         texDescr.addressMode[1] = cudaAddressModeClamp;
         texDescr.addressMode[2] = cudaAddressModeClamp;
         texDescr.readMode = cudaReadModeElementType;
         checkCudaErrors(cudaCreateTextureObject(&texNoise[i], &texRes, &texDescr, NULL));
         // d_cuArr is intentionally not freed here: the texture object keeps reading from it.
     }

     // The staging buffer is no longer needed once every array has received its copy.
     checkCudaErrors(cudaFree(d_NoiseTest));
 }

 int main(int argc, char **argv)
 {
     CreateTexture();
     // Same kernel, launched once per texture object.
     AccesTexture<<<1, dim3(2, 2, 2)>>>(texNoise[0]);
     AccesTexture<<<1, dim3(2, 2, 2)>>>(texNoise[1]);
     AccesTexture<<<1, dim3(2, 2, 2)>>>(texNoise[2]);
     checkCudaErrors(cudaPeekAtLastError());    // launch errors
     checkCudaErrors(cudaDeviceSynchronize());  // execution errors, and flushes device printf
     return 0;
 }

编译:

 $ nvcc -arch=sm_30 -I/shared/apps/cuda/CUDA-v6.0.37/samples/common/inc -lcurand -o t507 t507.cu 

输出:

 $ cuda-memcheck ./t507 ========= CUDA-MEMCHECK thread: 0,0,0, value: 0.310691 thread: 1,0,0, value: 0.627906 thread: 0,1,0, value: 0.638900 thread: 1,1,0, value: 0.665186 thread: 0,0,1, value: 0.167465 thread: 1,0,1, value: 0.565227 thread: 0,1,1, value: 0.397606 thread: 1,1,1, value: 0.503013 thread: 0,0,0, value: 0.809163 thread: 1,0,0, value: 0.795669 thread: 0,1,0, value: 0.808565 thread: 1,1,0, value: 0.847564 thread: 0,0,1, value: 0.853998 thread: 1,0,1, value: 0.688446 thread: 0,1,1, value: 0.733255 thread: 1,1,1, value: 0.649379 thread: 0,0,0, value: 0.040824 thread: 1,0,0, value: 0.087417 thread: 0,1,0, value: 0.301392 thread: 1,1,0, value: 0.298669 thread: 0,0,1, value: 0.161962 thread: 1,0,1, value: 0.316443 thread: 0,1,1, value: 0.452077 thread: 1,1,1, value: 0.477722 ========= ERROR SUMMARY: 0 errors 

在这个例子中,我多次调用同一个内核,每次读取不同的纹理对象。把多个纹理对象传给同一个内核应该也是可行的,但只要代码中能够避免,就不建议让同一个 warp 同时从多个纹理中读取。真正的问题出在 quad(相邻四个线程一组)这一层面,这里不展开讨论。最好的做法是合理安排代码,使得在任意给定周期内,一个 warp 只从同一个纹理对象读取。

在调用 cudaMalloc3DArray 时,范围应写成 make_cudaExtent(SizeNoiseTest, SizeNoiseTest, SizeNoiseTest),而不是 make_cudaExtent(SizeNoiseTest * sizeof(float), SizeNoiseTest, SizeNoiseTest):cudaArray 的宽度以元素为单位,而不是以字节为单位。