尝试写入使用cudaMalloc3D分配的2D数组时的“非法内存访问”
我正在尝试使用cudaMalloc3D将扁平2Darrays的内存分配并复制到设备上,以测试cudaMalloc3D的性能。 但是当我尝试从内核写入数组时,它会抛出’遇到非法内存访问’exception。 如果我只是从数组中读取,但是当我尝试写入它时,程序运行正常,则会出错。 任何有关这方面的帮助将不胜感激。 下面是我的代码和编译代码的语法。
编译使用
nvcc -O2 -arch sm_20 test.cu
代码:test.cu
#include #include #include #define PI 3.14159265 #define NX 8192 /* includes boundary points on both end */ #define NY 4096 /* includes boundary points on both end */ #define NZ 1 /* needed for cudaMalloc3D */ #define N_THREADS_X 16 #define N_THREADS_Y 16 #define N_BLOCKS_X NX/N_THREADS_X #define N_BLOCKS_Y NY/N_THREADS_Y #define LX 4.0 /* length of the domain in x-direction */ #define LY 2.0 /* length of the domain in x-direction */ #define dx (REAL) ( LX/( (REAL) (NX) ) ) #define cSqrd 5.0 #define dt (REAL) ( 0.4 * dx / sqrt(cSqrd) ) #define FACTOR ( cSqrd * (dt*dt)/(dx*dx) ) #define IC (i + j*NX) /* (i,j) */ #define IM1 (i + j*NX - 1) /* (i-1,j) */ #define IP1 (i + j*NX + 1) /* (i+1,j) */ #define JM1 (i + (j-1)*NX) /* (i,j-1) */ #define JP1 (i + (j+1)*NX) /* (i,j+1) */ // Macro for checking CUDA errors following a CUDA launch or API call #define cudaCheckError() {\ cudaError_t e = cudaGetLastError();\ if( e != cudaSuccess ) {\ printf("\nCuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e));\ exit(EXIT_FAILURE);\ }\ } typedef double REAL; typedef int INT; void meshGrid ( REAL *x, REAL *y ) { INT i,j; REAL a; for (j=0; j<NY; j++) { a = dx * ( (REAL) j ); for (i=0; i<NX; i++) { x[IC] = dx * ( (REAL) i ); y[IC] = a; } } } void initWave ( REAL *u, REAL *uold, REAL *x, REAL *y ) { INT i,j; for (j=1; j<NY-1; j++) { for (i=1; i<NX-1; i++) { u[IC] = 0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] ); } } for (j=1; j<NY-1; j++) { for (i=1; i0 && i 0 && j < (NY-1) ) { char *unewPtr = (char *) unew.ptr; REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch); REAL tmp = unew_row[j]; // no error on this line unew_row[j] = 1.2; // this is where I get the error } } INT main(INT argc, char *argv[]) { INT nTimeSteps = 10; // pointers for the host side REAL *unew, *u, *uold, *uFinal, *x, *y; // allocate memory on the host unew = (REAL *)calloc(NX*NY,sizeof(REAL)); u = (REAL *)calloc(NX*NY,sizeof(REAL)); uold = (REAL *)calloc(NX*NY,sizeof(REAL)); uFinal = (REAL *)calloc(NX*NY,sizeof(REAL)); x = (REAL *)calloc(NX*NY,sizeof(REAL)); y = (REAL *)calloc(NX*NY,sizeof(REAL)); // pointer for the device side size_t pitch = NX * sizeof(REAL); cudaPitchedPtr d_u, d_uold, d_unew, d_tmp; cudaExtent myExtent = make_cudaExtent(pitch, NY, NZ); // allocate 3D memory on the device cudaMalloc3D( &d_u, myExtent ); cudaCheckError(); cudaMalloc3D( &d_uold, myExtent ); cudaCheckError(); cudaMalloc3D( &d_unew, myExtent ); cudaCheckError(); // initialize grid and wave meshGrid( x, y ); initWave( u, uold, x, y ); // copy host memory to 3D device memory cudaMemcpy3DParms cpy3D = { 0 }; cpy3D.kind = cudaMemcpyHostToDevice; // copying u to d_u cpy3D.srcPtr = make_cudaPitchedPtr(u, pitch, NX, NY); cpy3D.dstPtr = d_u; cpy3D.extent = myExtent; cudaMemcpy3D( &cpy3D ); cudaCheckError(); // copying uold to d_uold cpy3D.srcPtr = make_cudaPitchedPtr(uold, pitch, NX, NY); cpy3D.dstPtr = d_uold; cpy3D.extent = myExtent; cudaMemcpy3D( &cpy3D ); cudaCheckError(); // set up the GPU grid/block model dim3 dimGrid ( N_BLOCKS_X , N_BLOCKS_Y ); dim3 dimBlock ( N_THREADS_X, N_THREADS_Y ); for ( INT n = 1; n < nTimeSteps + 1; n++ ) { solveWaveGPU <<>> ( d_uold, d_u, d_unew ); cudaThreadSynchronize(); cudaCheckError(); d_tmp = d_uold; d_uold = d_u; d_u = d_unew; d_unew = d_tmp; } // copy the memory back to host cpy3D.kind = cudaMemcpyDeviceToHost; // copying d_unew to uFinal cpy3D.srcPtr = d_unew; cpy3D.dstPtr = make_cudaPitchedPtr(uFinal, pitch, NX, NY); cpy3D.extent = myExtent; cudaMemcpy3D( &cpy3D ); cudaCheckError(); free(u); cudaFree(d_u.ptr); free(unew); cudaFree(d_unew.ptr); free(uold); cudaFree(d_uold.ptr); free(uFinal); free(x); free(y); return EXIT_SUCCESS; }
在此行上未发生错误的原因:
REAL tmp = unew_row[j]; // no error on this line
是因为编译器正在优化该行。 它没有做任何有用的事情,因此编译器完全消除了它。 编译器警告:
xxx.cu(87): warning: variable "tmp" was declared but never referenced
是暗示这种效果。
你的代码几乎是正确的。 问题在这里:
REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);
它应该是:
REAL *unew_row = (REAL *) (unewPtr + j * unew.pitch);
内核中的i
变量是宽度 (即X)维度。 j
变量是高度 (即Y)尺寸。
高度是指您所在的行的高度,因此行间距应乘以高度参数,即j
,而不是i
。
同样,虽然它不是特定维度的特定失败的来源,但此代码可能不是您想要的:
REAL tmp = unew_row[j]; // no error on this line unew_row[j] = 1.2; // this is where I get the error
例如,如果您打算计算行的偏移量然后索引到行(例如,可能要设置位置中的每个元素),那么我认为您希望使用i
而不是j
作为最终索引:
REAL tmp = unew_row[i]; // no error on this line unew_row[i] = 1.2; // this is where I get the error
但是,对于此特定示例,这不是非法内存访问的实际来源。