无法为CUDA C程序编写出可用的Makefile

我有一个由3个CUDA源文件和2个头文件组成的简单程序:main.cu、kernel.cu、func.cu、kernel.h、func.h。它们的目标是计算两个向量的和。

// main.cu
//
// Host driver for the vector-addition example: fills two host arrays with
// pseudo-random integers, copies them to the device, launches AddInts so that
// A[i] += B[i], copies A back and prints its leading values before and after.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>
#include "kernel.h"

/* Number of leading elements echoed to stdout ("primi cinque" = first five). */
#define PREVIEW_COUNT 5

/*
 * Terminates the program with a diagnostic on stderr when a CUDA runtime call
 * did not return cudaSuccess.
 *
 *   err  - status returned by the CUDA call being checked
 *   what - short description of the attempted action; it is printed after
 *          the words "Failed to "
 */
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to %s (error code %s)!\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/*
 * Prints `label` followed by the first PREVIEW_COUNT values of `v`
 * (fewer when `n` is smaller than PREVIEW_COUNT).
 */
static void printPreview(const char *label, const int *v, int n)
{
    const int shown = n < PREVIEW_COUNT ? n : PREVIEW_COUNT;

    printf("%s", label);
    for (int i = 0; i < shown; i++) {
        printf("%d ", v[i]);
    }
}

int main()
{
    /* const so that the array bounds below are constant expressions. */
    const int count = 100;
    int A[count], B[count];

    srand((unsigned int)time(NULL));
    for (int i = 0; i < count; i++) {
        A[i] = rand() % count;
        B[i] = rand() % count;
    }

    /* Show the leading values of the input vectors A and B. */
    printPreview("\nPrimi cinque valori di A = ", A, count);
    printPreview("\nPrimi cinque valori di B = ", B, count);

    /*
     * Launch geometry. AddInts takes no element count and performs no bounds
     * check, so every launched thread reads b[idx] and writes a[idx]. The
     * device buffers are therefore sized for numBlocks * numThreads elements
     * (not just `count`) and the padding is zeroed, which keeps the surplus
     * threads inside valid, initialized memory.
     */
    const int numThreads = 256;
    const int numBlocks = (count + numThreads - 1) / numThreads; /* ceil-div */
    const size_t dataBytes = (size_t)count * sizeof(int);
    const size_t paddedBytes = (size_t)numBlocks * numThreads * sizeof(int);

    int *d_A = NULL;
    int *d_B = NULL;
    checkCuda(cudaMalloc((void **)&d_A, paddedBytes), "allocate device vector A");
    checkCuda(cudaMalloc((void **)&d_B, paddedBytes), "allocate device vector B");
    checkCuda(cudaMemset(d_A, 0, paddedBytes), "clear device vector A");
    checkCuda(cudaMemset(d_B, 0, paddedBytes), "clear device vector B");

    checkCuda(cudaMemcpy(d_A, A, dataBytes, cudaMemcpyHostToDevice),
              "copy vector A from host to device");
    checkCuda(cudaMemcpy(d_B, B, dataBytes, cudaMemcpyHostToDevice),
              "copy vector B from host to device");

    AddInts<<<numBlocks, numThreads>>>(d_A, d_B);
    /* A kernel launch returns nothing; configuration errors surface here. */
    checkCuda(cudaGetLastError(), "launch AddInts kernel");

    /* Blocking copy: also reports any error raised while the kernel ran. */
    checkCuda(cudaMemcpy(A, d_A, dataBytes, cudaMemcpyDeviceToHost),
              "copy vector A from device to host");

    checkCuda(cudaFree(d_A), "free device vector A");
    checkCuda(cudaFree(d_B), "free device vector B");

    /* A now holds the element-wise sum A + B. */
    printPreview("\nPrimi cinque valori di A = ", A, count);
    printf("\n");
    return 0;
}

这里有kernel.cu文件:

 // kernel.cu
#include "func.h"
#include "kernel.h"

/*
 * AddInts: in-place element-wise addition, a[idx] += b[idx], one element per
 * thread.
 *
 *   a - device pointer; read and updated at the calling thread's index
 *   b - device pointer; read at the calling thread's index
 *
 * The index is the value get_global_index() returns for the calling thread.
 * No element count is passed and no bounds check is made, so both buffers
 * must be valid for every index the launch configuration produces.
 */
__global__ void AddInts(int *a, int *b)
{
    const int idx = get_global_index();
    a[idx] += b[idx];
}

这是func.cu

 // func.cu
#include "func.h"

/*
 * Returns blockIdx.x * blockDim.x + threadIdx.x for the calling thread.
 * Only the x components are used; the unsigned result is converted to int
 * on return.
 */
__device__ int get_global_index()
{
    const unsigned int blockOffset = blockIdx.x * blockDim.x;
    return blockOffset + threadIdx.x;
}

这是kernel.h

 // kernel.h
#ifndef KERNEL_H
#define KERNEL_H

/*
 * Kernel defined in kernel.cu: adds b into a element-wise, in place, using
 * one thread per element. It takes no element count, so callers must size
 * both device buffers to cover every thread they launch.
 */
__global__ void AddInts(int *a, int *b);

#endif /* KERNEL_H */

这是func.h

 // func.h
#ifndef FUNC_H
#define FUNC_H

/*
 * Device helper defined in func.cu: returns the calling thread's flat index
 * along x (blockIdx.x * blockDim.x + threadIdx.x).
 *
 * Calling it from a different .cu file than the one defining it requires
 * relocatable device code (nvcc -dc, i.e. -rdc=true -c) plus a device link.
 */
__device__ int get_global_index();

#endif /* FUNC_H */

我100%确定main.cu是正确的;我也知道可以把内核直接写在主文件里,但这不是我这次测试的目的;我也知道可以去掉__device__函数,把它的代码直接放进__global__函数里,但这同样不是我的目的。

现在问题出在这里:我写了一个非常简单的makefile,它应该能够编译程序,但不知何故它不起作用; 这是makefile:

 # Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda-6.5

OSUPPER = $(shell uname -s 2>/dev/null | tr "[:lower:]" "[:upper:]")
OSLOWER = $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")

OS_SIZE    = $(shell uname -m | sed -e "s/x86_64/64/" -e "s/armv7l/32/" -e "s/aarch64/64/")
OS_ARCH    = $(shell uname -m)
ARCH_FLAGS =

DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
ifneq ($(DARWIN),)
  XCODE_GE_5 = $(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5)
endif

# Take command line flags that override any of these settings
ifeq ($(x86_64),1)
  OS_SIZE = 64
  OS_ARCH = x86_64
endif
ifeq ($(ARMv7),1)
  OS_SIZE    = 32
  OS_ARCH    = armv7l
  ARCH_FLAGS = -target-cpu-arch ARM
endif
ifeq ($(aarch64),1)
  OS_SIZE    = 64
  OS_ARCH    = aarch64
  ARCH_FLAGS = -target-cpu-arch ARM
endif

# Common binaries
ifneq ($(DARWIN),)
ifeq ($(XCODE_GE_5),1)
  GCC ?= clang
else
  GCC ?= g++
endif
else
ifeq ($(ARMv7),1)
  GCC ?= arm-linux-gnueabihf-g++
else
  GCC ?= g++
endif
endif
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(GCC)

# internal flags
NVCCFLAGS := -m${OS_SIZE} ${ARCH_FLAGS}
CCFLAGS   :=
LDFLAGS   :=

# Extra user flags
EXTRA_NVCCFLAGS ?=
EXTRA_LDFLAGS   ?=
EXTRA_CCFLAGS   ?=

# OS-specific build flags
ifneq ($(DARWIN),)
  LDFLAGS += -rpath $(CUDA_PATH)/lib
  CCFLAGS += -arch $(OS_ARCH)
else
  ifeq ($(OS_ARCH),armv7l)
    ifeq ($(abi),androideabi)
      NVCCFLAGS += -target-os-variant Android
    else
      ifeq ($(abi),gnueabi)
        CCFLAGS += -mfloat-abi=softfp
      else
        # default to gnueabihf
        override abi := gnueabihf
        LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
        CCFLAGS += -mfloat-abi=hard
      endif
    endif
  endif
endif

# NOTE(review): in the posted text everything between the `-dumpversion`
# comparison below and the "no SM architectures" warning was lost (the `<`
# and `>` characters were swallowed as markup). That span is restored here
# from the stock CUDA 6.5 sample Makefile - confirm against the original file.
ifeq ($(ARMv7),1)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(GCC) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-$(abi)
endif
endif

# Debug build flags
ifeq ($(dbg),1)
  NVCCFLAGS += -g -G
  TARGET := debug
else
  TARGET := release
endif

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -I../../common/inc
LIBRARIES :=

################################################################################

SAMPLE_ENABLED := 1

# Gencode arguments
ifeq ($(OS_ARCH),armv7l)
SMS ?= 20 30 32 35 37 50
else
SMS ?= 11 20 30 35 37 50
endif

ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

LIBRARIES += -lcufft

ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif

################################################################################

OBJS = main.o kernel.o func.o
# NOTE(review): CFLAGS is defined here but no rule below references it.
CFLAGS = -rdc=true

# Target rules
all: build

build: eseguibile

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
	@echo "Sample will be waived due to the above missing dependencies"
else
	@echo "Sample is ready - all dependencies have been met"
endif

# NOTE(review): the three object rules compile with plain -c, i.e. without
# relocatable device code; this is the configuration that produces the
# "External calls are not supported" error reported for kernel.cu.
main.o:main.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

kernel.o:kernel.cu kernel.h
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

func.o:func.cu func.h
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

eseguibile: $(OBJS)
	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
	$(EXEC) mkdir -p ../../bin/$(OS_ARCH)/$(OSLOWER)/$(TARGET)$(if $(abi),/$(abi))
	$(EXEC) cp $@ ../../bin/$(OS_ARCH)/$(OSLOWER)/$(TARGET)$(if $(abi),/$(abi))

run: build
	$(EXEC) ./eseguibile

clean:
	rm -f eseguibile $(OBJS)
	rm -rf ../../bin/$(OS_ARCH)/$(OSLOWER)/$(TARGET)$(if $(abi),/$(abi))/eseguibile

clobber: clean

common/inc文件夹是Nvidia提供的头文件所在的文件夹,用于保证CUDA程序能够正确编译运行;至于制表符(tab)的问题,它们在我的原始文件中是100%正确的,只是我没办法把它们原样粘贴到Stack Overflow上;我得到的错误如下:

 ./kernel.cu(6): Error: External calls are not supported (found non-inlined call to _Z16get_global_indexv) make: *** [kernel.o] Error 2 

这个makefile是基于Nvidia在示例(samples)中提供的文件改写的;我真的不知道错误出在哪里:问题只出在makefile上,还是说我不应该像上面那样跨文件调用函数?

你在这里遇到的情况是需要可重定位的设备代码链接(也就是单独的编译/链接 ),但你的Makefile没有正确设置。

有许多情况需要单独编译和链接。你的项目中就存在一个典型例子:一个模块中的设备代码调用了另一个模块中的__device__函数。在这里,kernel.cu中的AddInts内核调用了在func.cu中定义的__device__函数get_global_index。这就需要对设备代码进行单独编译和链接。

在这种情况下,解决方案非常简单:只需把Makefile中所有用到-c编译选项的地方改为-dc。以下3行必须修改:

 # Corrected object rules: each translation unit is compiled with -dc
# (relocatable device code) where the original rules used -c. The -dc flag
# near the end of every recipe line is the only change.
main.o:main.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -dc $<

kernel.o:kernel.cu kernel.h
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -dc $<

func.o:func.cu func.h
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -dc $<

此外,您还需要修改所选的目标体系结构,把原来的这一行:

 SMS ?= 11 20 30 35 37 50 

改为:

 SMS ?= 20 30 35 37 50 

因为sm_11不是支持单独编译和链接的有效体系结构。(如果必须在计算能力低于2.0的设备上运行此代码,就必须重新组织代码,把所有设备函数放在同一个文件中;而您在问题中已明确表示不希望这样做。)

请注意,这不是修改Makefile的唯一方法。 你有这样的定义:

 CFLAGS = -rdc=true 

但是你没有在任何地方使用它。除了像上面那样把-c改为-dc之外,我们也可以在这3行的每一行中加入$(CFLAGS),两种写法的效果是等价的。(即-dc相当于-rdc=true -c)

另外顺便说一句:您发布的代码并不依赖cufft库。因此,您可以把这一行:

 LIBRARIES += -lcufft 

改为:

 LIBRARIES += 

在你的Makefile中。 但是,根据您在此处描述的内容,此更改不是构建正确代码所必需的。 如果您的项目最终将使用cufft库,那么您应该按原样保留此行。