OpenCL命令队列(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)无法正常工作(MacOS)

我正在按照Fixstars的OpenCL教程学习其示例和源代码。 具体来说,我正在尝试第5章最后一段代码(同时计算两条移动平均线,即“黄金交叉”Golden Cross示例):

http://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook/opencl-programming-practice/

代码可在此处获得:

http://www.fixstars.com/en/opencl/book/sample/

我将在下面贴出具体示例。 简而言之,按如下方式创建命令队列:

command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret); 

会得到一个无效的command_queue(clCreateCommandQueue返回错误)。 因此,返回的数据没有被正确处理,也就是说,结果全是零。

但是,如果我设置代码只计算一个移动平均值,而没有CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE ,那么事情就可以了。 相应地,本章之前的Fixstars代码似乎都可以正常工作。

我正在开发一款配备NVIDIA芯片的全新MacBook Pro(视网膜)。 所以我想知道它是否与NVIDIA实现或其他方面有关。

无论如何,它对我来说都是一个阻碍,因为我最终试图做的是触发多个(相同的)过程,每个过程都有不同的参数,这个例子通过同时计算两个股票价格移动平均线试图做同样的事情。

代码片段如下。 我已经在其源代码中添加了用于调试的打印输出函数。 所以我在程序执行时看到以下内容:

从clGetPlatformIDs返回代码:成功!
从clGetDeviceIDs返回代码:成功!
从clCreateContext返回代码:成功!
从clCreateCommandQueue返回代码:无效的值
从clBuildProgram返回代码:成功!
从clCreateKernel返回代码(13):成功!
从clCreateKernel返回代码(26):成功!
从clEnqueueTask(13)返回代码:无效的命令队列
从clEnqueueTask(26)返回代码:无效的命令队列
结果[25]:[0](0.000000,0.000000)[0](0.000000,0.000000)[0](0.000000,0.000000)[0](0.000000,0.000000)
结果[26]:[0](0.000000,0.000000)[0](0.000000,0.000000)[0](0.000000,0.000000)[0](0.000000,0.000000)
结果[27]:[0](0.000000,0.000000)[0](0.000000,0.000000)[0](0.000000,0.000000)[0](0.000000,0.000000)
结果[28]:[0](0.000000,0.000000)[0](0.000000,0.000000)[0](0.000000,0.000000)[0](0.000000,0.000000)

…其余数据也都是零。 我正在编译:

 gcc -O2 -c moving_average_vec4p.c
gcc moving_average_vec4p.o -o moving_average_vec4p -framework opencl

—-(主机代码)moving_average_vec4p.c —-

 #include  #ifdef __APPLE__ #include  #else #include  #endif #include  #define NAME_NUM (4) /* Number of stocks */ #define DATA_NUM (100) /* Number of data to process for each stock */ /* Read Stock data */ int stock_array_4[NAME_NUM*DATA_NUM]= { #include "stock_array_4.txt" }; /* Moving average width */ #define WINDOW_SIZE_13 (13) #define WINDOW_SIZE_26 (26) #define MAX_SOURCE_SIZE (0x100000) /* DT: added to aid in debugging */ void printCLError (int err) { switch (err) { case CL_SUCCESS: printf("Success!\n"); break; case CL_DEVICE_NOT_FOUND: printf("Device not found.\n"); break; case CL_DEVICE_NOT_AVAILABLE: printf("Device not available\n"); break; case CL_COMPILER_NOT_AVAILABLE: printf("Compiler not available\n"); break; case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("Memory object allocation failure\n"); break; case CL_OUT_OF_RESOURCES: printf("Out of resources\n"); break; case CL_OUT_OF_HOST_MEMORY: printf("Out of host memory\n"); break; case CL_PROFILING_INFO_NOT_AVAILABLE: printf("Profiling information not available\n"); break; case CL_MEM_COPY_OVERLAP: printf("Memory copy overlap\n"); break; case CL_IMAGE_FORMAT_MISMATCH: printf("Image format mismatch\n"); break; case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("Image format not supported\n"); break; case CL_BUILD_PROGRAM_FAILURE: printf("Program build failure\n"); break; case CL_MAP_FAILURE: printf("Map failure\n"); break; case CL_INVALID_VALUE: printf("Invalid value\n"); break; case CL_INVALID_DEVICE_TYPE: printf("Invalid device type\n"); break; case CL_INVALID_PLATFORM: printf("Invalid platform\n"); break; case CL_INVALID_DEVICE: printf("Invalid device\n"); break; case CL_INVALID_CONTEXT: printf("Invalid context\n"); break; case CL_INVALID_QUEUE_PROPERTIES: printf("Invalid queue properties\n"); break; case CL_INVALID_COMMAND_QUEUE: printf("Invalid command queue\n"); break; case CL_INVALID_HOST_PTR: printf("Invalid host pointer\n"); break; case CL_INVALID_MEM_OBJECT: printf("Invalid memory object\n"); break; 
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("Invalid image format descriptor\n"); break; case CL_INVALID_IMAGE_SIZE: printf("Invalid image size\n"); break; case CL_INVALID_SAMPLER: printf("Invalid sampler\n"); break; case CL_INVALID_BINARY: printf("Invalid binary\n"); break; case CL_INVALID_BUILD_OPTIONS: printf("Invalid build options\n"); break; case CL_INVALID_PROGRAM: printf("Invalid program\n"); break; case CL_INVALID_PROGRAM_EXECUTABLE: printf("Invalid program executable\n"); break; case CL_INVALID_KERNEL_NAME: printf("Invalid kernel name\n"); break; case CL_INVALID_KERNEL_DEFINITION: printf("Invalid kernel definition\n"); break; case CL_INVALID_KERNEL: printf("Invalid kernel\n"); break; case CL_INVALID_ARG_INDEX: printf("Invalid argument index\n"); break; case CL_INVALID_ARG_VALUE: printf("Invalid argument value\n"); break; case CL_INVALID_ARG_SIZE: printf("Invalid argument size\n"); break; case CL_INVALID_KERNEL_ARGS: printf("Invalid kernel arguments\n"); break; case CL_INVALID_WORK_DIMENSION: printf("Invalid work dimension\n"); break; case CL_INVALID_WORK_GROUP_SIZE: printf("Invalid work group size\n"); break; case CL_INVALID_WORK_ITEM_SIZE: printf("Invalid work item size\n"); break; case CL_INVALID_GLOBAL_OFFSET: printf("Invalid global offset\n"); break; case CL_INVALID_EVENT_WAIT_LIST: printf("Invalid event wait list\n"); break; case CL_INVALID_EVENT: printf("Invalid event\n"); break; case CL_INVALID_OPERATION: printf("Invalid operation\n"); break; case CL_INVALID_GL_OBJECT: printf("Invalid OpenGL object\n"); break; case CL_INVALID_BUFFER_SIZE: printf("Invalid buffer size\n"); break; case CL_INVALID_MIP_LEVEL: printf("Invalid mip-map level\n"); break; default: printf("Unknown\n"); } } int main(void) { cl_platform_id platform_id = NULL; cl_uint ret_num_platforms; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem memobj_in = NULL; cl_mem memobj_out13 = NULL; cl_mem 
memobj_out26 = NULL; cl_program program = NULL; cl_kernel kernel13 = NULL; cl_kernel kernel26 = NULL; cl_event event13, event26; size_t kernel_code_size; char *kernel_src_str; float *result13; float *result26; cl_int ret; FILE *fp; int window_num_13 = (int)WINDOW_SIZE_13; int window_num_26 = (int)WINDOW_SIZE_26; int point_num = (NAME_NUM * DATA_NUM); int data_num = (int)DATA_NUM; int name_num = (int)NAME_NUM; int i, j; /* Allocate space to read in kernel code */ kernel_src_str = (char *)malloc(MAX_SOURCE_SIZE); /* Allocate space for the result on the host side */ result13 = (float *)malloc(point_num*sizeof(float)); /* average over13 weeks */ result26 = (float *)malloc(point_num*sizeof(float)); /* average over26 weeks */ /* Get Platform */ ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); printf("Return code from clGetPlatformIDs: "); printCLError(ret); /* Get Device */ ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); printf("Return code from clGetDeviceIDs: "); printCLError(ret); /* Create Context */ context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); printf("Return code from clCreateContext: "); printCLError(ret); /* Create Command Queue */ // DT: this seems to break it (ie., output is all zeros) command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret); printf("Return code from clCreateCommandQueue: "); printCLError(ret); /* Read kernel source code */ fp = fopen("moving_average_vec4.cl", "r"); kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); /* Create Program Object */ program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str, (const size_t *)&kernel_code_size, &ret); /* Compile kernel */ ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); printf("Return code from clBuildProgram: "); printCLError(ret); /* Create kernel */ kernel13 = clCreateKernel(program, "moving_average_vec4", &ret); /* 
13 weeks */ printf("Return code from clCreateKernel(13): "); printCLError(ret); kernel26 = clCreateKernel(program, "moving_average_vec4", &ret); /* 26 weeks */ printf("Return code from clCreateKernel(26): "); printCLError(ret); /* Create buffer for the input data on the device */ memobj_in = clCreateBuffer(context, CL_MEM_READ_WRITE, point_num * sizeof(int), NULL, &ret); /* Create buffer for the result on the device */ memobj_out13 = clCreateBuffer(context, CL_MEM_READ_WRITE, point_num * sizeof(float), NULL, &ret); /* 13 weeks */ memobj_out26 = clCreateBuffer(context, CL_MEM_READ_WRITE, point_num * sizeof(float), NULL, &ret); /* 26 weeks */ /* Copy input data to the global memory on the device*/ ret = clEnqueueWriteBuffer(command_queue, memobj_in, CL_TRUE, 0, point_num * sizeof(int), stock_array_4, 0, NULL, NULL); /* Set Kernel Arguments (13 weeks) */ ret = clSetKernelArg(kernel13, 0, sizeof(cl_mem), (void *)&memobj_in); ret = clSetKernelArg(kernel13, 1, sizeof(cl_mem), (void *)&memobj_out13); ret = clSetKernelArg(kernel13, 2, sizeof(int), (void *)&data_num); ret = clSetKernelArg(kernel13, 3, sizeof(int), (void *)&window_num_13); /* Submit task to compute the moving average over 13 weeks */ ret = clEnqueueTask(command_queue, kernel13, 0, NULL, NULL); printf("Return code from clEnqueueTask(13): "); printCLError(ret); /* Set Kernel Arguments (26 weeks) */ ret = clSetKernelArg(kernel26, 0, sizeof(cl_mem), (void *)&memobj_in); ret = clSetKernelArg(kernel26, 1, sizeof(cl_mem), (void *)&memobj_out26); ret = clSetKernelArg(kernel26, 2, sizeof(int), (void *)&data_num); ret = clSetKernelArg(kernel26, 3, sizeof(int), (void *)&window_num_26); /* Submit task to compute the moving average over 26 weeks */ ret = clEnqueueTask(command_queue, kernel26, 0, NULL, &event26); printf("Return code from clEnqueueTask(26): "); printCLError(ret); // DT: doesn't seem to help ... 
;-( ret = clFinish(command_queue); /* Copy result for the 13 weeks moving average from device to host */ ret = clEnqueueReadBuffer(command_queue, memobj_out13, CL_TRUE, 0, point_num * sizeof(float), result13, 1, &event13, NULL); /* Copy result for the 26 weeks moving average from device to host */ ret = clEnqueueReadBuffer(command_queue, memobj_out26, CL_TRUE, 0, point_num * sizeof(float), result26, 1, &event26, NULL); /* OpenCL Object Finalization */ ret = clReleaseKernel(kernel13); ret = clReleaseKernel(kernel26); ret = clReleaseProgram(program); ret = clReleaseMemObject(memobj_in); ret = clReleaseMemObject(memobj_out13); ret = clReleaseMemObject(memobj_out26); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); /* Display results */ /* DT: also added printout to see if actual numeric values are passing through */ for (i=window_num_26-1; i < data_num; i++) { printf("result[%d]:", i ); for (j=0; j  result26[i*NAME_NUM+j]),result13[i*NAME_NUM+j],result26[i*NAME_NUM+j] ); } printf("\n"); } /* Deallocate memory on the host */ free(result13); free(result26); free(kernel_src_str); return 0; } 

—-(OPENCL KERNEL CODE)moving_average_vec4.cl —-

 /*
  * moving_average_vec4 - moving average over `width` consecutive samples,
  * computed for four series at once (one series per vector component).
  *
  *   values   input samples, one int4 per time step
  *   average  output, one float4 per time step
  *   length   number of time steps to process
  *   width    size of the averaging window
  *
  * Written as a single sequential pass; it does not consult any work-item id.
  * Entries 0 .. width-2, which have no complete window, are filled with the
  * constant 1.1f.
  */
__kernel void moving_average_vec4(__global int4 *values,
                                  __global float4 *average,
                                  int length,
                                  int width)
{
    int4 window_sum = (int4)0; /* running per-component sum of the window */
    int pos;

    /* Sum of the first `width` samples gives the first complete window */
    pos = 0;
    while (pos < width) {
        window_sum += values[pos];
        ++pos;
    }
    average[width - 1] = convert_float4(window_sum);

    /* Slide the window: drop the oldest sample, add the newest */
    for (pos = width; pos < length; ++pos) {
        window_sum -= values[pos - width];
        window_sum += values[pos];
        average[pos] = convert_float4(window_sum);
    }

    /* Placeholder value for positions before the first complete window */
    for (pos = 0; pos < width - 1; ++pos) {
        average[pos] = (float4)(1.1f);
    }

    /* Turn the stored window sums into averages */
    for (pos = width - 1; pos < length; ++pos) {
        average[pos] /= (float4)width;
    }
}

只有很少的OpenCL实现支持无序(out-of-order)命令队列。 若要让多项工作重叠执行,请使用多个命令队列,并像使用无序队列时一样通过事件(event)进行同步。

我刚搭建了一台配备ATI HD6950的Ubuntu 12.04 PC。 上面的代码在那台机器上按预期工作。 所以我推测这目前是MacOS特有的问题。

通过ssh移动到Aquamacs。