尝试写入使用cudaMalloc3D分配的2D数组时的“非法内存访问”

我正在尝试使用cudaMalloc3D将扁平2Darrays的内存分配并复制到设备上，以测试cudaMalloc3D的性能。但是当我尝试从内核写入数组时，它会抛出’遇到非法内存访问’exception。如果我只是从数组中读取，但是当我尝试写入它时，程序运行正常，则会出错。任何有关这方面的帮助将不胜感激。下面是我的代码和编译代码的语法。

编译使用

nvcc -O2 -arch sm_20 test.cu

代码：test.cu

 #include  #include  #include  #define PI 3.14159265 #define NX 8192 /* includes boundary points on both end */ #define NY 4096 /* includes boundary points on both end */ #define NZ 1 /* needed for cudaMalloc3D */ #define N_THREADS_X 16 #define N_THREADS_Y 16 #define N_BLOCKS_X NX/N_THREADS_X #define N_BLOCKS_Y NY/N_THREADS_Y #define LX 4.0 /* length of the domain in x-direction */ #define LY 2.0 /* length of the domain in x-direction */ #define dx (REAL) ( LX/( (REAL) (NX) ) ) #define cSqrd 5.0 #define dt (REAL) ( 0.4 * dx / sqrt(cSqrd) ) #define FACTOR ( cSqrd * (dt*dt)/(dx*dx) ) #define IC (i + j*NX) /* (i,j) */ #define IM1 (i + j*NX - 1) /* (i-1,j) */ #define IP1 (i + j*NX + 1) /* (i+1,j) */ #define JM1 (i + (j-1)*NX) /* (i,j-1) */ #define JP1 (i + (j+1)*NX) /* (i,j+1) */ // Macro for checking CUDA errors following a CUDA launch or API call #define cudaCheckError() {\ cudaError_t e = cudaGetLastError();\ if( e != cudaSuccess ) {\ printf("\nCuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e));\ exit(EXIT_FAILURE);\ }\ } typedef double REAL; typedef int INT; void meshGrid ( REAL *x, REAL *y ) { INT i,j; REAL a; for (j=0; j<NY; j++) { a = dx * ( (REAL) j ); for (i=0; i<NX; i++) { x[IC] = dx * ( (REAL) i ); y[IC] = a; } } } void initWave ( REAL *u, REAL *uold, REAL *x, REAL *y ) { INT i,j; for (j=1; j<NY-1; j++) { for (i=1; i<NX-1; i++) { u[IC] = 0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] ); } } for (j=1; j<NY-1; j++) { for (i=1; i0 && i 0 && j < (NY-1) ) { char *unewPtr = (char *) unew.ptr; REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch); REAL tmp = unew_row[j]; // no error on this line unew_row[j] = 1.2; // this is where I get the error } } INT main(INT argc, char *argv[]) { INT nTimeSteps = 10; // pointers for the host side REAL *unew, *u, *uold, *uFinal, *x, *y; // allocate memory on the host unew = (REAL *)calloc(NX*NY,sizeof(REAL)); u = (REAL *)calloc(NX*NY,sizeof(REAL)); uold = (REAL *)calloc(NX*NY,sizeof(REAL)); uFinal = (REAL *)calloc(NX*NY,sizeof(REAL)); x = (REAL *)calloc(NX*NY,sizeof(REAL)); y = (REAL *)calloc(NX*NY,sizeof(REAL)); // pointer for the device side size_t pitch = NX * sizeof(REAL); cudaPitchedPtr d_u, d_uold, d_unew, d_tmp; cudaExtent myExtent = make_cudaExtent(pitch, NY, NZ); // allocate 3D memory on the device cudaMalloc3D( &d_u, myExtent ); cudaCheckError(); cudaMalloc3D( &d_uold, myExtent ); cudaCheckError(); cudaMalloc3D( &d_unew, myExtent ); cudaCheckError(); // initialize grid and wave meshGrid( x, y ); initWave( u, uold, x, y ); // copy host memory to 3D device memory cudaMemcpy3DParms cpy3D = { 0 }; cpy3D.kind = cudaMemcpyHostToDevice; // copying u to d_u cpy3D.srcPtr = make_cudaPitchedPtr(u, pitch, NX, NY); cpy3D.dstPtr = d_u; cpy3D.extent = myExtent; cudaMemcpy3D( &cpy3D ); cudaCheckError(); // copying uold to d_uold cpy3D.srcPtr = make_cudaPitchedPtr(uold, pitch, NX, NY); cpy3D.dstPtr = d_uold; cpy3D.extent = myExtent; cudaMemcpy3D( &cpy3D ); cudaCheckError(); // set up the GPU grid/block model dim3 dimGrid ( N_BLOCKS_X , N_BLOCKS_Y ); dim3 dimBlock ( N_THREADS_X, N_THREADS_Y ); for ( INT n = 1; n < nTimeSteps + 1; n++ ) { solveWaveGPU <<>> ( d_uold, d_u, d_unew ); cudaThreadSynchronize(); cudaCheckError(); d_tmp = d_uold; d_uold = d_u; d_u = d_unew; d_unew = d_tmp; } // copy the memory back to host cpy3D.kind = cudaMemcpyDeviceToHost; // copying d_unew to uFinal cpy3D.srcPtr = d_unew; cpy3D.dstPtr = make_cudaPitchedPtr(uFinal, pitch, NX, NY); cpy3D.extent = myExtent; cudaMemcpy3D( &cpy3D ); cudaCheckError(); free(u); cudaFree(d_u.ptr); free(unew); cudaFree(d_unew.ptr); free(uold); cudaFree(d_uold.ptr); free(uFinal); free(x); free(y); return EXIT_SUCCESS; }

在此行上未发生错误的原因：

 REAL tmp = unew_row[j]; // no error on this line

是因为编译器正在优化该行。它没有做任何有用的事情，因此编译器完全消除了它。编译器警告：

 xxx.cu(87): warning: variable "tmp" was declared but never referenced

是暗示这种效果。

你的代码几乎是正确的。问题在这里：

 REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);

它应该是：

 REAL *unew_row = (REAL *) (unewPtr + j * unew.pitch);

内核中的i变量是宽度（即X）维度。 j变量是高度（即Y）尺寸。

高度是指您所在的行的高度，因此行间距应乘以高度参数，即j ，而不是i 。

同样，虽然它不是特定维度的特定失败的来源，但此代码可能不是您想要的：

 REAL tmp = unew_row[j]; // no error on this line unew_row[j] = 1.2; // this is where I get the error

例如，如果您打算计算行的偏移量然后索引到行（例如，可能要设置位置中的每个元素），那么我认为您希望使用i而不是j作为最终索引：

 REAL tmp = unew_row[i]; // no error on this line unew_row[i] = 1.2; // this is where I get the error

但是，对于此特定示例，这不是非法内存访问的实际来源。

尝试写入使用cudaMalloc3D分配的2D数组时的“非法内存访问”

编译程序中的多个C文件

等待信号，然后继续执行

可以扩展二次方的C程序

指针和该指针的地址导致相同的事情

使用ffmpeg安装Open-CV时出错

无法在二进制文件中找到.dtors和.ctors

在Windows上使用SQLite和C.

在没有break语句的情况下打破循环

奇怪的使用虚空

全局变量，共享库和-fPIC效果