Cuda内核返回向量

我有一个单词列表,我的目标是在一个非常长的短语中匹配每个单词。 我在匹配每个单词方面没有问题,我唯一的问题是返回包含每个匹配信息的结构向量。

在代码中:

// Sketch from the question; it is intentionally incomplete ("[...]" marks
// elided code) and is not compilable as written.
// Goal: every occurrence found by the kernel should be recorded as a Match
// and handed back to the host.
typedef struct { int A, B, C; } Match;

// Kernel: searches the phrase for each word and tries to append one Match
// per occurrence to _matches.
__global__ void Find(veryLongPhrase * _phrase, Words * _word_list, vector * _matches)
{
    int a, b, c;
    [...] //Parallel search for each word in the phrase
    if(match) //When an occurrence is found
    {
        // This push_back is the open problem the question is asking about.
        _matches.push_back(new Match{ A = a, B = b, C = c }); //Here comes the unknown, what should I do here???
    }
}

// Host side: sets up the phrase and the word list, then launches the kernel.
main()
{
    [...]
    veryLongPhrase * myPhrase = "The quick brown fox jumps over the lazy dog etc etc etc..."
    Words * wordList = {"the", "lazy"};
    vector * matches; //Obviously I can't pass a vector to a kernel
    Find<<>>(myPhrase, wordList, matches);
    [...]
}

我尝试过Thrust库但没有任何成功,你能给我一些解决方案吗?

非常感谢你。

这样的东西应该工作(在浏览器中编码,未经测试):

 #include <vector>

 // N is the maximum number of structs to insert
 #define N 10000

 typedef struct { int A, B, C; } Match;

 // Fixed-capacity result buffer in device memory.
 __device__ Match dev_data[N];
 // Number of append attempts so far. It keeps growing after the buffer is
 // full, so it can end up larger than N.
 __device__ int dev_count = 0;

 // Appends a copy of *mt to dev_data.
 // Returns the slot index that was written, or -1 when the buffer is full.
 __device__ int my_push_back(const Match * mt)
 {
     // atomicAdd returns the previous value, so every caller gets its own slot.
     int insert_pt = atomicAdd(&dev_count, 1);
     if (insert_pt < N)
     {
         dev_data[insert_pt] = *mt;
         return insert_pt;
     }
     return -1;
 }

 // Kernel: records one Match per occurrence found by the (elided) search.
 // The result vector is no longer a parameter; matches go to dev_data.
 __global__ void Find(veryLongPhrase * _phrase, Words * _word_list)
 {
     int a, b, c;
     [...] //Parallel search for each word in the phrase
     if(match) //When an occurrence is found
     {
         // Build the record in a local variable; a device-side `new` would
         // allocate from the device heap and never be freed.
         Match found = { a, b, c };
         my_push_back(&found);
     }
 }

 int main()
 {
     [...]
     // NOTE: the elided setup must make the phrase and word list available in
     // device memory; a kernel cannot dereference host pointers.
     veryLongPhrase * myPhrase = "The quick brown fox jumps over the lazy dog etc etc etc...";
     Words * wordList = {"the", "lazy"};
     Find<<< X, Y >>>(myPhrase, wordList);

     int dsize;
     cudaMemcpyFromSymbol(&dsize, dev_count, sizeof(int));
     // dev_count counts attempts, so clamp it to the number of stored records
     // before using it as a copy size.
     if (dsize > N) dsize = N;

     std::vector<Match> results(dsize);
     if (dsize > 0) // &results[0] is not valid on an empty vector
         cudaMemcpyFromSymbol(&(results[0]), dev_data, dsize*sizeof(Match));
     [...]
 }

这将需要1.1或更高的计算能力用于primefaces操作。

 nvcc -arch=sm_11 ... 

这是一个有效的例子:

 $ cat t347.cu
 #include <cstdio>
 #include <cstdlib>
 #include <iostream>
 #include <vector>

 // N is the maximum number of structs to insert
 #define N 10000

 // Stops the program with a readable message when a CUDA runtime call fails.
 #define CUDA_CHECK(call)                                               \
     do {                                                               \
         cudaError_t err_ = (call);                                     \
         if (err_ != cudaSuccess) {                                     \
             fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,        \
                     __LINE__, cudaGetErrorString(err_));               \
             exit(1);                                                   \
         }                                                              \
     } while (0)

 typedef struct { int A, B, C; } Match;

 // Fixed-capacity result buffer in device memory.
 __device__ Match dev_data[N];
 // Number of append attempts so far; it can exceed N once the buffer is full.
 __device__ int dev_count = 0;

 // Appends a copy of mt to dev_data.
 // Returns the slot index that was written, or -1 when the buffer is full.
 __device__ int my_push_back(const Match & mt)
 {
     // atomicAdd returns the previous value, so every caller gets its own slot.
     int insert_pt = atomicAdd(&dev_count, 1);
     if (insert_pt < N)
     {
         dev_data[insert_pt] = mt;
         return insert_pt;
     }
     return -1;
 }

 // Kernel: the first 10 threads of every block each append one Match.
 // Works with any 1D launch configuration; uses no shared memory.
 __global__ void Find()
 {
     if(threadIdx.x < 10) //Simulate a found occurrence
     {
         // Plain aggregate initialisation: designated initialisers are not
         // standard C++ before C++20.
         Match a = { 1, 2, 3 };
         my_push_back(a);
     }
 }

 int main()
 {
     Find<<< 2, 256 >>>();
     CUDA_CHECK(cudaGetLastError()); // reports an invalid launch configuration

     // The blocking copy below runs after the kernel on the default stream.
     int dsize = 0;
     CUDA_CHECK(cudaMemcpyFromSymbol(&dsize, dev_count, sizeof(int)));
     // Exactly N attempts still fit; only more than N means records were lost.
     if (dsize > N) {printf("overflow error\n"); return 1;}

     std::vector<Match> results(dsize);
     if (dsize > 0) // &results[0] is not valid on an empty vector
         CUDA_CHECK(cudaMemcpyFromSymbol(&(results[0]), dev_data, dsize*sizeof(Match)));

     std::cout << "number of matches = " << dsize << std::endl;
     if (dsize == 0) return 0; // no last element to print

     std::cout << "A = " << results[dsize-1].A << std::endl;
     std::cout << "B = " << results[dsize-1].B << std::endl;
     std::cout << "C = " << results[dsize-1].C << std::endl;
     return 0;
 }
 $ nvcc -arch=sm_11 -o t347 t347.cu
 $ ./t347
 number of matches = 20
 A = 1
 B = 2
 C = 3
 $

请注意,在这种情况下,我的Match结果结构创建是不同的,我通过引用传递,但概念是相同的。