Compiling Code Containing Dynamic Parallelism Fails

Error compiling CUDA dynamic parallelism code with Driver API

Responding to a question posted under my other answer:

is there a way to avoid this "jit linking during runtime" process while still being with Driver API interface

Yes. (I'm providing a separate answer because I ran into the character limit on my previous answer).

In this case, we want to create a fatbin object rather than ptx, during the compilation of the kernel code itself. This fatbin needs to be compiled with -rdc=true as you would expect for the dynamic parallelism, and also needs to be device-linked, together with the CUDA device runtime library.

The host side mechanics in this case are simpler, since we don't need any of the linking steps. The CUDA sample code that seems to be relatively close to this flow is vectorAddDrv so I will start with that code/sample project in order to demonstrate this.

Here is the linux version:

$ cat vectorAdd_kernel.cu
#include <cstdio>
extern "C" __global__ void k(int N)
{
printf("kernel level %d\n", N);
if ((N > 1) && (threadIdx.x == 0)) k<<<1,1>>>(N-1);
}
$ cat vectorAddDrv.cpp
// Includes
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <cuda.h>

// includes, project
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>

// includes, CUDA
#include <builtin_types.h>

using namespace std;

// Variables
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction vecAdd_kernel;

// Functions
bool findModulePath(const char *, string &, char **, string &);

//define input fatbin file
#ifndef FATBIN_FILE
#define FATBIN_FILE "vectorAdd_kernel64.fatbin"
#endif

// Host code
int main(int argc, char **argv)
{
printf("Linked CDP demo (Driver API)\n");
int N = 4, devID = 0;

// Initialize
checkCudaErrors(cuInit(0));

cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// Create context
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

// first search for the module path before we load the results
string module_path;

std::ostringstream fatbin;

if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin))
{
exit(EXIT_FAILURE);
}
else
{
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
}

if (!fatbin.str().size())
{
printf("fatbin file empty. exiting..\n");
exit(EXIT_FAILURE);
}

// Create module from binary file (FATBIN)
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

// Get function handle from module
checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "k"));

// Grid/Block configuration
int threadsPerBlock = 1;
int blocksPerGrid = 1;

void *args[] = { &N };

// Launch the CUDA kernel
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
threadsPerBlock, 1, 1,
0,
NULL, args, NULL));

checkCudaErrors(cuCtxSynchronize());

exit(EXIT_SUCCESS);
}
$ cat Makefile
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda

##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################

# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif

# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif

# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif

# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif

# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)

# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=

# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif

ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif

ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif

# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif

# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)

SAMPLE_ENABLED := 1

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES := -I$(CUDA_PATH)/samples/common/inc
LIBRARIES :=

################################################################################

FATBIN_FILE := vectorAdd_kernel${TARGET_SIZE}.fatbin

#Detect if installed version of GCC supports required C++11
ifeq ($(TARGET_OS),linux)
empty :=
space := $(empty) $(empty)
GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
#Create version number without "."
GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
# Make sure the version number has at least 3 decimals
GCCVERSION += 00
# Remove spaces from the version number
GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
#$(warning $(GCCVERSION))

IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)

ifeq ($(IS_MIN_VERSION), 1)
$(info >>> GCC Version is greater or equal to 4.7.0 <<<)
else
$(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
SAMPLE_ENABLED := 0
endif
endif

# Gencode arguments
SMS ?= 52 60 61 70 75 80 86

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

ifeq ($(SMS),)
# Generate PTX code from SM 35
GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35
endif

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

ifeq ($(TARGET_OS),darwin)
ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
ifeq ($(TARGET_ARCH),x86_64)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
endif

ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
endif

ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
endif

ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
endif

ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
endif

ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
endif

ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
endif

ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
ifdef TARGET_OVERRIDE
CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
endif
endif

ifeq ($(TARGET_ARCH),ppc64le)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
endif

ifeq ($(HOST_ARCH),ppc64le)
CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
endif

CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
ifeq ("$(CUDALIB)","")
$(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<)
SAMPLE_ENABLED := 0
else
CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
LIBRARIES += -L$(CUDALIB) -lcuda
endif
endif

ALL_CCFLAGS += --threads 0 --std=c++11

ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif

################################################################################

# Target rules
all: build

build: vectorAddDrv $(FATBIN_FILE)

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif

$(FATBIN_FILE): vectorAdd_kernel.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -rdc=true -lcudadevrt -dlink -o $@ -fatbin $<

vectorAddDrv.o:vectorAddDrv.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

vectorAddDrv: vectorAddDrv.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)

run: build
$(EXEC) ./vectorAddDrv

clean:
rm -f vectorAddDrv vectorAddDrv.o $(FATBIN_FILE)

clobber: clean
$ make clean
>>> GCC Version is greater or equal to 4.7.0 <<<
rm -f vectorAddDrv vectorAddDrv.o vectorAdd_kernel64.fatbin
$ make
>>> GCC Version is greater or equal to 4.7.0 <<<
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc -m64 --threads 0 --std=c++11 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -o vectorAddDrv.o -c vectorAddDrv.cpp
/usr/local/cuda/bin/nvcc -ccbin g++ -m64 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -o vectorAddDrv vectorAddDrv.o -L/usr/local/cuda/lib64/stubs -lcuda
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc -m64 --threads 0 --std=c++11 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -rdc=true -lcudadevrt -dlink -o vectorAdd_kernel64.fatbin -fatbin vectorAdd_kernel.cu
$ cuda-memcheck ./vectorAddDrv
========= CUDA-MEMCHECK
Linked CDP demo (Driver API)
> Using CUDA Device [0]: Tesla V100-PCIE-32GB
> findModulePath found file at <./vectorAdd_kernel64.fatbin>
> initCUDA loading module: <./vectorAdd_kernel64.fatbin>
kernel level 4
kernel level 3
kernel level 2
kernel level 1
========= ERROR SUMMARY: 0 errors
$

On Windows/VS 2019/CUDA 11.1, I followed these steps:

  1. Open the vectorAddDrv project/solution, on my machine it was in: C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv
  2. Replace the code in the vectorAddDrv.cpp file with the code from the same file above in the linux example.
  3. Replace the code in the vectorAdd_kernel.cu file with the code from the same file above in the linux example.
  4. In the solution explorer pane on the left, right-click on the vectorAdd_kernel.cu file, and open the properties. Then in Configuration Properties...CUDA C/C++...Common change "Generate Relocatable Device Code" from No to Yes. Then in Configuration Properties...CUDA C/C++...Command Line add -dlink. Also make sure that Configuration Properties...CUDA C/C++...Device...Code Generation matches the device(s) you want to run on. Click "OK".
  5. In the same solution explorer pane on the left, right click on the vectorAddDrv project, select Properties, then in Configuration Properties...CUDA Linker...General change "Perform Device Link" from Yes to No. Click "OK".
  6. Select Build...Rebuild Solution.

When I do that I see console build output like this:

1>------ Rebuild All started: Project: vectorAddDrv, Configuration: Debug x64 ------
1>Compiling CUDA source file vectorAdd_kernel.cu...
1>
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\nvcc.exe" -gencode=arch=compute_61,code=sm_61 --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu -rdc=true -I./ -I../../common/inc -I./ -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\/include" -I../../common/inc -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include" -G --keep-dir x64\Debug -maxrregcount=0 --machine 64 -fatbin -cudart static -dlink -Xcompiler "/wd 4819" -o data/vectorAdd_kernel64.fatbin "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv\vectorAdd_kernel.cu"
1>vectorAdd_kernel.cu
1>vectorAddDrv.cpp
1>LINK : ..\..\bin\win64\Debug\\vectorAddDrv.exe not found or not built by the last incremental link; performing full link
1>vectorAddDrv_vs2019.vcxproj -> C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv\../../bin/win64/Debug/vectorAddDrv.exe
========== Rebuild All: 1 succeeded, 0 failed, 0 skipped ==========

If we then open a command prompt and navigate to the indicated location for the executable, and run it, I see:

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>vectorAddDrv
Linked CDP demo (Driver API)
> Using CUDA Device [0]: Quadro P4000
sdkFindFilePath <vectorAdd_kernel64.fatbin> in ./
...
sdkFindFilePath <vectorAdd_kernel64.fatbin> in ../../../0_Simple/vectorAddDrv/data/
> findModulePath found file at <../../../0_Simple/vectorAddDrv/data/vectorAdd_kernel64.fatbin>
> initCUDA loading module: <../../../0_Simple/vectorAddDrv/data/vectorAdd_kernel64.fatbin>
kernel level 4
kernel level 3
kernel level 2
kernel level 1

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>

One of the notes in the other answer applies here as well: The extended sequence of sdkFindFilePath messages printed at runtime can be shortened by copying the fatbin file from its location to the location of the exe file. The final sdkFindFilePath output will tell you where it found the fatbin file.

Trouble compiling/running CUDA code involving dynamic parallelism

I fixed the problem by fully reinstalling CUDA.

I'm now able to compile both the CUDA samples and my own code.

How to compile a .cu with dynamic parallelism?

Try:

g++  -c functions.cpp -std=c++0x 
g++ -c setup.cpp -std=c++0x
nvcc timestep.cu copy.cu continuity.cu discharge.cu flood.cu -arch=sm_35 -lcudadevrt -rdc=true -c
nvcc timestep.o copy.o continuity.o discharge.o flood.o -arch=sm_35 -lcudadevrt -dlink -o dlink.o
g++ functions.o steup.o dlink.o -o a.out -std=c++0x -L/<path>/cuda/lib<64,32> -lcudart -lcudadevrt

Dynamic Parallelism Invalid File Format

If you run your compile command with the --dryrun option:

$ nvcc --dryrun -o cdp -rdc=true -dc -dlink -arch=sm_35 cdp.cu -lcudadevrt
#$ _SPACE_=
#$ _CUDART_=cudart
#$ _HERE_=/opt/cuda-7.5/bin
#$ _THERE_=/opt/cuda-7.5/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_SIZE_=64
#$ TOP=/opt/cuda-7.5/bin/..
#$ NVVMIR_LIBRARY_DIR=/opt/cuda-7.5/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/opt/cuda-7.5/bin/../lib:/opt/cuda-7.5/lib64
#$ PATH=/opt/cuda-7.5/bin/../open64/bin:/opt/cuda-7.5/bin/../nvvm/bin:/opt/cuda-7.5/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/opt/cuda-7.5/bin
#$ INCLUDES="-I/opt/cuda-7.5/bin/..//include"
#$ LIBRARIES= "-L/opt/cuda-7.5/bin/..//lib64/stubs" "-L/opt/cuda-7.5/bin/..//lib64"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -D__CUDA_ARCH__=350 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__ "-I/opt/cuda-7.5/bin/..//include" -D"__CUDACC_VER__=70517" -D"__CUDACC_VER_BUILD__=17" -D"__CUDACC_VER_MINOR__=5" -D"__CUDACC_VER_MAJOR__=7" -include "cuda_runtime.h" -m64 "cdp.cu" > "/tmp/tmpxft_000022ba_00000000-7_cdp.cpp1.ii"
#$ cudafe --allow_managed --m64 --gnu_version=40603 -tused --no_remove_unneeded_entities --device-c --gen_c_file_name "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.c" --stub_file_name "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.gpu" --nv_arch "compute_35" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_000022ba_00000000-3_cdp.module_id" --include_file_name "tmpxft_000022ba_00000000-2_cdp.fatbin.c" "/tmp/tmpxft_000022ba_00000000-7_cdp.cpp1.ii"
#$ gcc -D__CUDA_ARCH__=350 -E -x c -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__ -D__CUDANVVM__ -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/opt/cuda-7.5/bin/..//include" -m64 "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.gpu" > "/tmp/tmpxft_000022ba_00000000-8_cdp.cpp2.i"
#$ cudafe -w --allow_managed --m64 --gnu_version=40603 --c --device-c --gen_c_file_name "/tmp/tmpxft_000022ba_00000000-9_cdp.cudafe2.c" --stub_file_name "/tmp/tmpxft_000022ba_00000000-9_cdp.cudafe2.stub.c" --gen_device_file_name "/tmp/tmpxft_000022ba_00000000-9_cdp.cudafe2.gpu" --nv_arch "compute_35" --module_id_file_name "/tmp/tmpxft_000022ba_00000000-3_cdp.module_id" --include_file_name "tmpxft_000022ba_00000000-2_cdp.fatbin.c" "/tmp/tmpxft_000022ba_00000000-8_cdp.cpp2.i"
#$ gcc -D__CUDA_ARCH__=350 -E -x c -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDABE__ -D__CUDANVVM__ -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/opt/cuda-7.5/bin/..//include" -m64 "/tmp/tmpxft_000022ba_00000000-9_cdp.cudafe2.gpu" > "/tmp/tmpxft_000022ba_00000000-10_cdp.cpp3.i"
#$ filehash -s "--compile-only " "/tmp/tmpxft_000022ba_00000000-10_cdp.cpp3.i" > "/tmp/tmpxft_000022ba_00000000-11_cdp.hash"
#$ gcc -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__ "-I/opt/cuda-7.5/bin/..//include" -D"__CUDACC_VER__=70517" -D"__CUDACC_VER_BUILD__=17" -D"__CUDACC_VER_MINOR__=5" -D"__CUDACC_VER_MAJOR__=7" -include "cuda_runtime.h" -m64 "cdp.cu" > "/tmp/tmpxft_000022ba_00000000-5_cdp.cpp4.ii"
#$ cudafe++ --allow_managed --m64 --gnu_version=40603 --parse_templates --device-c --gen_c_file_name "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.cpp" --stub_file_name "tmpxft_000022ba_00000000-4_cdp.cudafe1.stub.c" --module_id_file_name "/tmp/tmpxft_000022ba_00000000-3_cdp.module_id" "/tmp/tmpxft_000022ba_00000000-5_cdp.cpp4.ii"
#$ cicc -arch compute_35 -m64 -ftz=0 -prec_div=1 -prec_sqrt=1 -fmad=1 -nvvmir-library "/opt/cuda-7.5/bin/../nvvm/libdevice/libdevice.compute_35.10.bc" --device-c --orig_src_file_name "cdp.cu" "/tmp/tmpxft_000022ba_00000000-10_cdp.cpp3.i" -o "/tmp/tmpxft_000022ba_00000000-6_cdp.ptx"
#$ ptxas -arch=sm_35 -m64 --compile-only "/tmp/tmpxft_000022ba_00000000-6_cdp.ptx" -o "/tmp/tmpxft_000022ba_00000000-13_cdp.sm_35.cubin"
#$ fatbinary --create="/tmp/tmpxft_000022ba_00000000-2_cdp.fatbin" -64 --key="xxxxxxxxxx" --cmdline="--compile-only " "--image=profile=sm_35,file=/tmp/tmpxft_000022ba_00000000-13_cdp.sm_35.cubin" "--image=profile=compute_35,file=/tmp/tmpxft_000022ba_00000000-6_cdp.ptx" --embedded-fatbin="/tmp/tmpxft_000022ba_00000000-2_cdp.fatbin.c" --cuda --device-c
#$ rm /tmp/tmpxft_000022ba_00000000-2_cdp.fatbin
#$ gcc -D__CUDA_ARCH__=350 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/opt/cuda-7.5/bin/..//include" -m64 "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.cpp" > "/tmp/tmpxft_000022ba_00000000-14_cdp.ii"
#$ gcc -c -x c++ "-I/opt/cuda-7.5/bin/..//include" -fpreprocessed -m64 -o "cdp" "/tmp/tmpxft_000022ba_00000000-14_cdp.ii"

it becomes obvious that this has only emitted a host object file with an embedded cubin payload. There is no host code compilation or linking to an executable, which is confirmed by the output of objdump posted in an edit to your question.

The complicating factor here is that you must perform device independent compilation to use dynamic parallelism and then link the device code, but you only have a single source file, so the conventional build approach (device compile, device link, host compile) would fail with duplicate symbols.

The solution seems to be this:

$ nvcc --dryrun -o cdp -rdc=true  -arch=sm_35 cdp.cu 
#$ _SPACE_=
#$ _CUDART_=cudart
#$ _HERE_=/opt/cuda-7.5/bin
#$ _THERE_=/opt/cuda-7.5/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_SIZE_=64
#$ TOP=/opt/cuda-7.5/bin/..
#$ NVVMIR_LIBRARY_DIR=/opt/cuda-7.5/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/opt/cuda-7.5/bin/../lib:/opt/cuda-7.5/lib64
#$ PATH=/opt/cuda-7.5/bin/../open64/bin:/opt/cuda-7.5/bin/../nvvm/bin:/opt/cuda-7.5/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/opt/cuda-7.5/bin
#$ INCLUDES="-I/opt/cuda-7.5/bin/..//include"
#$ LIBRARIES= "-L/opt/cuda-7.5/bin/..//lib64/stubs" "-L/opt/cuda-7.5/bin/..//lib64"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -D__CUDA_ARCH__=350 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__ "-I/opt/cuda-7.5/bin/..//include" -D"__CUDACC_VER__=70517" -D"__CUDACC_VER_BUILD__=17" -D"__CUDACC_VER_MINOR__=5" -D"__CUDACC_VER_MAJOR__=7" -include "cuda_runtime.h" -m64 "cdp.cu" > "/tmp/tmpxft_00002454_00000000-9_cdp.cpp1.ii"
#$ cudafe --allow_managed --m64 --gnu_version=40603 -tused --no_remove_unneeded_entities --device-c --gen_c_file_name "/tmp/tmpxft_00002454_00000000-4_cdp.cudafe1.c" --stub_file_name "/tmp/tmpxft_00002454_00000000-4_cdp.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00002454_00000000-4_cdp.cudafe1.gpu" --nv_arch "compute_35" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00002454_00000000-3_cdp.module_id" --include_file_name "tmpxft_00002454_00000000-2_cdp.fatbin.c" "/tmp/tmpxft_00002454_00000000-9_cdp.cpp1.ii"


Related Topics



Leave a reply



Submit