PyCuda: Dereferencing Array Elements via Pointer in a CUDA Kernel

I am using PyCuda to pass pairs of arrays to a CUDA kernel via pointers. The arrays are the output of a different kernel, so the data is already resident on the GPU.
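
For context, the pattern I am relying on is that a buffer allocated once stays on the device, and the same pointer object can be handed to successive kernel launches. A minimal sketch of that pattern (the fill/double_it kernels are placeholders for illustration, not my real code):

    import pycuda.autoinit
    import pycuda.driver as cuda
    import numpy as np
    from pycuda.compiler import SourceModule

    mod = SourceModule("""
    __global__ void fill(unsigned int *a, unsigned int n)
    {
        unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;
        if (i < n)
            a[i] = i;              // first kernel writes the buffer
    }

    __global__ void double_it(unsigned int *a, unsigned int n)
    {
        unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;
        if (i < n)
            a[i] *= 2;             // second kernel reuses the same buffer
    }
    """)

    n = np.uint32(16)
    d_a = cuda.mem_alloc(int(n) * 4)            # stays on the GPU throughout

    fill = mod.get_function('fill')
    double_it = mod.get_function('double_it')
    fill(d_a, n, block=(32, 1, 1))
    double_it(d_a, n, block=(32, 1, 1))         # same DeviceAllocation again

    result = np.empty(int(n), dtype=np.uint32)
    cuda.memcpy_dtoh(result, d_a)
    print(result)                               # [0 2 4 ... 30]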

Inside the kernel I am trying to access the elements of each array in order to do a vector subtraction. The values I get for the elements are incorrect (h and p in the code below are wrong).
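
As a point of comparison, here is a stripped-down version of just the subtraction step, with the result checked against NumPy on the host (abs_diff and the array names are illustrative, not from my real code):

    import pycuda.autoinit
    import pycuda.driver as cuda
    import numpy as np
    from pycuda.compiler import SourceModule

    mod = SourceModule("""
    __global__ void abs_diff(unsigned int *h_arr, unsigned int *p_arr,
                             int *out, unsigned int n)
    {
        unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;
        if (i < n) {
            unsigned int h = h_arr[i];
            unsigned int p = p_arr[i];
            out[i] = (int)(h > p ? h - p : p - h);   // |h - p| without underflow
        }
    }
    """)

    n = 12
    h_host = np.arange(n, dtype=np.uint32)
    p_host = h_host[::-1].copy()

    d_h = cuda.mem_alloc(h_host.nbytes)
    d_p = cuda.mem_alloc(p_host.nbytes)
    d_out = cuda.mem_alloc(h_host.nbytes)
    cuda.memcpy_htod(d_h, h_host)
    cuda.memcpy_htod(d_p, p_host)

    abs_diff = mod.get_function('abs_diff')
    abs_diff(d_h, d_p, d_out, np.uint32(n), block=(32, 1, 1))

    out = np.empty(n, dtype=np.int32)
    cuda.memcpy_dtoh(out, d_out)
    expected = np.abs(h_host.astype(np.int64) - p_host.astype(np.int64))
    assert (out == expected).all()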

Can anyone see what I am doing wrong?

My code:

    import pycuda.driver as cuda
    import pycuda.autoinit
    from pycuda.compiler import SourceModule
    import numpy as np
    import time
    import cv2
    from pycuda.tools import DeviceMemoryPool as DMP
    from scipy.spatial import distance
    import os
    import glob

    def get_cuda_hist_kernel():
        # Make the kernel
        histogram_kernel = """
        __global__ void kernel_getHist(unsigned int* array, unsigned int size,
                                       unsigned int* histo, float bucket_size,
                                       unsigned int num_bins)
        {
            unsigned int x = threadIdx.x + blockDim.x * blockIdx.x;
            if (x < size)
            {
                // Bin the pixel value for this thread
                unsigned int bin = (unsigned int)(array[x] / bucket_size);
                if (bin >= num_bins)
                    bin = num_bins - 1;

                // Channel index = x % 3 (interleaved BGR), computed by
                // folding base-4 digit sums: 4 == 1 (mod 3), so every
                // fold preserves the value mod 3
                unsigned int offset = x;
                offset = (offset >> 16) + (offset & 0xFFFF);
                offset = (offset >> 8) + (offset & 0xFF);
                offset = (offset >> 4) + (offset & 0xF);
                offset = (offset >> 2) + (offset & 0x3);
                offset = (offset >> 2) + (offset & 0x3);
                offset = (offset >> 2) + (offset & 0x3);
                if (offset > 2)
                    offset = offset - 3;

                // Each channel owns a num_bins-wide slice of the histogram
                offset = offset * num_bins;
                atomicAdd(&histo[bin + offset], 1);
            }
        }

        __global__ void kernel_chebyshev(unsigned int* histo,
                                         unsigned int* prev_histo,
                                         unsigned int number, int* output)
        {
            const unsigned int size = 12;

            // Get all of the differences
            __shared__ int temp_diffs[size];
            unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;

            if (i == 0)
                output[number] = 0;   // reset once, before any atomicMax

            if (i < size)
            {
                unsigned int h = histo[i];
                unsigned int p = prev_histo[i];
                unsigned int diff;
                if (h > p) {
                    diff = h - p;
                } else {
                    diff = p - h;
                }
                temp_diffs[i] = (int)diff;
            }
            __syncthreads();

            // Chebyshev distance = max of the per-bin absolute differences
            if (i < size)
                atomicMax(&output[number], temp_diffs[i]);
        }
        """
        mod = SourceModule(histogram_kernel)
        return mod

    def cuda_histogram(ims, block_size, kernel):
        start = time.time()
        max_val = 4
        num_bins = np.uint32(4)
        num_channels = np.uint32(3)
        bin_size = np.float32(1 / np.uint32(max_val / num_bins))

        # Memory pool
        pool = DMP()
        print 'Pool Held Blocks: ', pool.held_blocks

        # Compute block & grid dimensions
        bdim = (block_size, 1, 1)
        cols = ims[0].size
        rows = 1
        channels = 1
        dx, mx = divmod(cols, bdim[0])
        dy, my = divmod(rows, bdim[1])
        dz, mz = divmod(channels, bdim[2])
        g_x = (dx + (mx > 0)) * bdim[0]
        g_y = (dy + (my > 0)) * bdim[1]
        g_z = (dz + (mz > 0)) * bdim[2]
        gdim = (g_x, g_y, g_z)

        # Get the functions
        func = kernel.get_function('kernel_getHist')
        func2 = kernel.get_function('kernel_chebyshev')

        # Build the list of host-side histograms
        hists = []
        device_hists = []
        for im in range(len(ims)):
            hists.append(np.zeros([num_channels * num_bins]).astype(np.uint32))

        end = time.time()
        dur = end - start
        print(' '.join(['Prep Time: ', str(dur)]))

        start = time.time()
        # Copy all of the image data to the GPU
        device_images = []
        for im in range(len(ims)):
            # Convert the image to a 1D array of uint32s
            a = ims[im].astype(np.uint32)
            a = a.flatten('C')
            a_size = np.uint32(a.size)

            # Allocate & send image data to the GPU
            device_images.append(pool.allocate(a.nbytes))
            cuda.memcpy_htod(device_images[im], a)

            d_hist = pool.allocate(hists[im].nbytes)
            device_hists.append(d_hist)
            cuda.memcpy_htod(d_hist, hists[im])

        differences = np.zeros(len(ims)).astype(np.int32)
        device_diffs = pool.allocate(differences.nbytes)
        cuda.memcpy_htod(device_diffs, differences)

        for im in range(len(ims)):
            # Run the histogram kernel
            func(device_images[im], a_size, device_hists[im], bin_size,
                 num_bins, block=(block_size, 1, 1), grid=gdim)

        cuda.Context.synchronize()

        for im in range(1, len(ims)):
            number = np.uint32(im - 1)
            func2(device_hists[im], device_hists[im - 1], number,
                  device_diffs, block=(32, 1, 1))

        cuda.memcpy_dtoh(differences, device_diffs)
        print(differences)

        for im in range(len(ims)):
            # Get the histograms back
            cuda.memcpy_dtoh(hists[im], device_hists[im])
            device_hists[im] = 0

        end = time.time()
        dur = end - start
        print(' '.join(['Load, Compute, & Gather Time: ', str(dur)]))
        pool.free_held()
        return differences

    def get_all_files(directory):
        pattern = os.path.join(directory, '*.jpg')
        files = [f for f in glob.glob(pattern)]
        return files

    if __name__ == "__main__":
        RESOURCES_PATH = "../data/ims/"
        MAX_IMS = 1000
        direc = os.path.join(RESOURCES_PATH, '21JumpStreet', 'source_video_frames')
        files = get_all_files(direc)
        ims = [cv2.imread(f) for f in files]
        print 'Shape of my image: ', ims[0].shape
        print 'Number of images to histogram: ', len(ims)
        block_size = 128
        kernel = get_cuda_hist_kernel()

        start = time.time()
        num_diffs = len(ims) // MAX_IMS + 1
        cuda_diffs = []
        for i in range(num_diffs):
            first = i * MAX_IMS
            last = (i + 1) * MAX_IMS
            print(first)
            small_set = ims[first:last]
            print 'Small set size: ', str(len(small_set))
            cuda_diffs.extend(cuda_histogram(small_set, block_size, kernel))
        end = time.time()
        dur = end - start
        print(' '.join(['CUDA version took:', str(dur)]))

        start = time.time()
        cv_hists = []
        for i in range(len(ims)):
            im = ims[i % len(ims)]
            h = []
            for j in range(3):
                hist = cv2.calcHist([im], [j], None, [4], [0, 100])
                h.extend(hist)
            cv_hists.append(h)

        # Run Chebyshev on the CPU
        color_hist_diffs = np.array([distance.chebyshev(cv_hists[i - 1], cv_hists[i])
                                     for i in range(len(cv_hists)) if i != 0])
        print(color_hist_diffs)
        end = time.time()
        dur = end - start
        print(' '.join(['CPU & cv2 version took:', str(dur)]))
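
One part worth calling out: the bit-folding block in kernel_getHist is there to compute x % 3 (the interleaved BGR channel index) without an integer modulo. Since 4 == 1 (mod 3), repeatedly adding base-4 digit groups preserves the value mod 3, and the final conditional subtract maps the result into 0..2. The same sequence can be checked on the host (mod3_by_folding is just an illustrative name):

    def mod3_by_folding(x):
        # Fold base-4 digit sums; each fold preserves x % 3 since 4 == 1 (mod 3)
        x = (x >> 16) + (x & 0xFFFF)
        x = (x >> 8) + (x & 0xFF)
        x = (x >> 4) + (x & 0xF)
        x = (x >> 2) + (x & 0x3)
        x = (x >> 2) + (x & 0x3)
        x = (x >> 2) + (x & 0x3)
        if x > 2:
            x -= 3
        return x

    # Exhaustive check over 20-bit indices
    assert all(mod3_by_folding(x) == x % 3 for x in range(1 << 20))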

This turned out to be a bad question, because the error was elsewhere in my code. Sorry for the confusion.