Compiling Code Containing Dynamic Parallelism Fails

Error compiling CUDA dynamic parallelism code with Driver API

Responding to a question posted under my other answer:

is there a way to avoid this "jit linking during runtime" process while still being with Driver API interface

Yes. (I'm providing a separate answer because I ran into the character limit on my previous answer).

In this case, we want to create a fatbin object rather than ptx, during the compilation of the kernel code itself. This fatbin needs to be compiled with -rdc=true as you would expect for the dynamic parallelism, and also needs to be device-linked, together with the CUDA device runtime library.

The host side mechanics in this case are simpler, since we don't need any of the linking steps. The CUDA sample code that seems to be relatively close to this flow is vectorAddDrv so I will start with that code/sample project in order to demonstrate this.

Here is the linux version:

$ cat vectorAdd_kernel.cu
#include <cstdio>
extern "C" __global__ void k(int N)
{
    printf("kernel level %d\n", N);
    if ((N > 1) && (threadIdx.x == 0))  k<<<1,1>>>(N-1);
}
$ cat vectorAddDrv.cpp
// Includes
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <cuda.h>

// includes, project
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>

// includes, CUDA
#include <builtin_types.h>

using namespace std;

// Variables
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction vecAdd_kernel;

// Functions
bool findModulePath(const char *, string &, char **, string &);

//define input fatbin file
#ifndef FATBIN_FILE
#define FATBIN_FILE "vectorAdd_kernel64.fatbin"
#endif

// Host code
int main(int argc, char **argv)
{
    printf("Linked CDP demo (Driver API)\n");
    int N = 4, devID = 0;

    // Initialize
    checkCudaErrors(cuInit(0));

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
    // Create context
    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    string module_path;

    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin))
    {
        exit(EXIT_FAILURE);
    }
    else
    {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size())
    {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // Get function handle from module
    checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "k"));

        // Grid/Block configuration
        int threadsPerBlock = 1;
        int blocksPerGrid   = 1;

        void *args[] = { &N };

        // Launch the CUDA kernel
        checkCudaErrors(cuLaunchKernel(vecAdd_kernel,  blocksPerGrid, 1, 1,
                               threadsPerBlock, 1, 1,
                               0,
                               NULL, args, NULL));

    checkCudaErrors(cuCtxSynchronize());

    exit(EXIT_SUCCESS);
}
$ cat Makefile
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda

##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
    $(info WARNING - x86_64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
    TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
    $(info WARNING - ARMv7 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=armv7l instead)
    TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
    $(info WARNING - aarch64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
    TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
    $(info WARNING - ppc64le variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
    TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
    $(info WARNING - GCC variable has been deprecated)
    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
    HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
    $(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################

# architecture
HOST_ARCH   := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
            TARGET_SIZE := 64
        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
            TARGET_SIZE := 32
        endif
    else
        TARGET_SIZE := $(shell getconf LONG_BIT)
    endif
else
    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif

# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
        HOST_ARCH := sbsa
        TARGET_ARCH := sbsa
    endif
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
    endif
endif

# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
    TARGET_ARCH = armv7l
endif

# operating system
HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif

# host compiler
ifeq ($(TARGET_OS),darwin)
    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
        HOST_COMPILER ?= clang++
    endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
        ifeq ($(TARGET_OS),linux)
            HOST_COMPILER ?= arm-linux-gnueabihf-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
        else ifeq ($(TARGET_OS),android)
            HOST_COMPILER ?= arm-linux-androideabi-g++
        endif
    else ifeq ($(TARGET_ARCH),aarch64)
        ifeq ($(TARGET_OS), linux)
            HOST_COMPILER ?= aarch64-linux-gnu-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
        else ifeq ($(TARGET_OS), android)
            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),sbsa)
        HOST_COMPILER ?= aarch64-linux-gnu-g++
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
    endif
endif
HOST_COMPILER ?= g++
NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)

# internal flags
NVCCFLAGS   := -m${TARGET_SIZE}
CCFLAGS     :=
LDFLAGS     :=

# build flags
ifeq ($(TARGET_OS),darwin)
    LDFLAGS += -rpath $(CUDA_PATH)/lib
    CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
    CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
    LDFLAGS += -pie
    CCFLAGS += -fpie -fpic -fexceptions
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
        NVCCFLAGS += -D_QNX_SOURCE
        NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
        LDFLAGS += -lsocket
        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
        ifdef TARGET_OVERRIDE
            LDFLAGS += -lslog2
        endif

        ifneq ($(TARGET_FS),)
            LDFLAGS += -L$(TARGET_FS)/usr/lib
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
            CCFLAGS += -I$(TARGET_FS)/../include
        endif
    endif
endif

ifdef TARGET_OVERRIDE # cuda toolkit targets override
    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif

# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif

# Debug build flags
ifeq ($(dbg),1)
      NVCCFLAGS += -g -G
      BUILD_TYPE := debug
else
      BUILD_TYPE := release
endif

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)

SAMPLE_ENABLED := 1

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -I$(CUDA_PATH)/samples/common/inc
LIBRARIES :=

################################################################################

FATBIN_FILE := vectorAdd_kernel${TARGET_SIZE}.fatbin

#Detect if installed version of GCC supports required C++11
ifeq ($(TARGET_OS),linux)
    empty :=
    space := $(empty) $(empty)
    GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
#Create version number without "."
    GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
# Make sure the version number has at least 3 decimals
    GCCVERSION += 00
# Remove spaces from the version number
    GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
#$(warning $(GCCVERSION))

    IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)

    ifeq ($(IS_MIN_VERSION), 1)
        $(info >>> GCC Version is greater or equal to 4.7.0 <<<)
    else
        $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
        SAMPLE_ENABLED := 0
    endif
endif

# Gencode arguments
SMS ?= 52 60 61 70 75 80 86

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

ifeq ($(SMS),)
# Generate PTX code from SM 35
GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35
endif

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

ifeq ($(TARGET_OS),darwin)
  ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
  ifeq ($(TARGET_ARCH),x86_64)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
    CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
    ifdef TARGET_OVERRIDE
        CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
    endif
  endif

  ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
  endif

  ifeq ($(HOST_ARCH),ppc64le)
    CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
  endif

  CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
  ifeq ("$(CUDALIB)","")
    $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed.  Please re-install the driver. <<<)
    SAMPLE_ENABLED := 0
  else
    CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
    LIBRARIES += -L$(CUDALIB) -lcuda
  endif
endif

ALL_CCFLAGS += --threads 0 --std=c++11

ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif

################################################################################

# Target rules
all: build

build: vectorAddDrv $(FATBIN_FILE)

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
        @echo "Sample will be waived due to the above missing dependencies"
else
        @echo "Sample is ready - all dependencies have been met"
endif

$(FATBIN_FILE): vectorAdd_kernel.cu
        $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -rdc=true -lcudadevrt -dlink  -o $@ -fatbin $<

vectorAddDrv.o:vectorAddDrv.cpp
        $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

vectorAddDrv: vectorAddDrv.o
        $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)

run: build
        $(EXEC) ./vectorAddDrv

clean:
        rm -f vectorAddDrv vectorAddDrv.o  $(FATBIN_FILE)

clobber: clean
$ make clean
>>> GCC Version is greater or equal to 4.7.0 <<<
rm -f vectorAddDrv vectorAddDrv.o  vectorAdd_kernel64.fatbin
$ make
>>> GCC Version is greater or equal to 4.7.0 <<<
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc  -m64    --threads 0 --std=c++11 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -o vectorAddDrv.o -c vectorAddDrv.cpp
/usr/local/cuda/bin/nvcc -ccbin g++   -m64      -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -o vectorAddDrv vectorAddDrv.o  -L/usr/local/cuda/lib64/stubs -lcuda
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc  -m64    --threads 0 --std=c++11 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -rdc=true -lcudadevrt -dlink  -o vectorAdd_kernel64.fatbin -fatbin vectorAdd_kernel.cu
$ cuda-memcheck ./vectorAddDrv
========= CUDA-MEMCHECK
Linked CDP demo (Driver API)
> Using CUDA Device [0]: Tesla V100-PCIE-32GB
> findModulePath found file at <./vectorAdd_kernel64.fatbin>
> initCUDA loading module: <./vectorAdd_kernel64.fatbin>
kernel level 4
kernel level 3
kernel level 2
kernel level 1
========= ERROR SUMMARY: 0 errors
$

On Windows/VS 2019/CUDA 11.1, I followed these steps:

Open the vectorAddDrv project/solution, on my machine it was in: C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv
Replace the code in the vectorAddDrv.cpp file with the code from the same file above in the linux example.
Replace the code in the vectorAdd_kernel.cu file with the code from the same file above in the linux example.
In the solution explorer pane on the left, right-click on the vectorAdd_kernel.cu file, and open the properties. Then in Configuration Properties...CUDA C/C++...Common change "Generate Relocatable Device Code" from No to Yes. Then in Configuration Properties...CUDA C/C++...Command Line add -dlink. Also make sure that Configuration Properties...CUDA C/C++...Device...Code Generation matches the device(s) you want to run on. Click "OK".
In the same solution explorer pane on the left, right click on the vectorAddDrv project, select Properties, then in Configuration Properties...CUDA Linker...General change "Perform Device Link" from Yes to No. Click "OK".
Select Build...Rebuild Solution.

When I do that I see console build output like this:

1>------ Rebuild All started: Project: vectorAddDrv, Configuration: Debug x64 ------
1>Compiling CUDA source file vectorAdd_kernel.cu...
1>
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\nvcc.exe" -gencode=arch=compute_61,code=sm_61 --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu -rdc=true  -I./ -I../../common/inc -I./ -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\/include" -I../../common/inc -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include"  -G   --keep-dir x64\Debug -maxrregcount=0  --machine 64 -fatbin -cudart static -dlink -Xcompiler "/wd 4819" -o data/vectorAdd_kernel64.fatbin "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv\vectorAdd_kernel.cu"
1>vectorAdd_kernel.cu
1>vectorAddDrv.cpp
1>LINK : ..\..\bin\win64\Debug\\vectorAddDrv.exe not found or not built by the last incremental link; performing full link
1>vectorAddDrv_vs2019.vcxproj -> C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv\../../bin/win64/Debug/vectorAddDrv.exe
========== Rebuild All: 1 succeeded, 0 failed, 0 skipped ==========

If we then open a command prompt and navigate to the indicated location for the executable, and run it, I see:

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>vectorAddDrv
Linked CDP demo (Driver API)
> Using CUDA Device [0]: Quadro P4000
sdkFindFilePath <vectorAdd_kernel64.fatbin> in ./
...
sdkFindFilePath <vectorAdd_kernel64.fatbin> in ../../../0_Simple/vectorAddDrv/data/
> findModulePath found file at <../../../0_Simple/vectorAddDrv/data/vectorAdd_kernel64.fatbin>
> initCUDA loading module: <../../../0_Simple/vectorAddDrv/data/vectorAdd_kernel64.fatbin>
kernel level 4
kernel level 3
kernel level 2
kernel level 1

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>

One of the notes in the other answer applies here as well: The extended sequence of sdkFindFilePath messages printed at runtime can be shortened by copying the fatbin file from its location to the location of the exe file. The final sdkFindFilePath output will tell you where it found the fatbin file.

Trouble compiling/running CUDA code involving dynamic parallelism

I fixed the problem by fully reinstalling CUDA.

I'm now able to compile both the CUDA samples and my own code.

How to compile a .cu with dynamic parallelism?

Try:

g++  -c functions.cpp -std=c++0x 
g++  -c setup.cpp -std=c++0x  
nvcc timestep.cu copy.cu continuity.cu discharge.cu flood.cu -arch=sm_35 -lcudadevrt -rdc=true -c
nvcc timestep.o copy.o continuity.o discharge.o flood.o -arch=sm_35 -lcudadevrt -dlink -o dlink.o 
g++ functions.o steup.o dlink.o -o a.out -std=c++0x -L/<path>/cuda/lib<64,32> -lcudart -lcudadevrt

Dynamic Parallelism Invalid File Format

If you run your compile command with the --dryrun option:

$ nvcc --dryrun -o cdp -rdc=true -dc -dlink -arch=sm_35 cdp.cu -lcudadevrt
#$ _SPACE_= 
#$ _CUDART_=cudart
#$ _HERE_=/opt/cuda-7.5/bin
#$ _THERE_=/opt/cuda-7.5/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_SIZE_=64
#$ TOP=/opt/cuda-7.5/bin/..
#$ NVVMIR_LIBRARY_DIR=/opt/cuda-7.5/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/opt/cuda-7.5/bin/../lib:/opt/cuda-7.5/lib64
#$ PATH=/opt/cuda-7.5/bin/../open64/bin:/opt/cuda-7.5/bin/../nvvm/bin:/opt/cuda-7.5/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/opt/cuda-7.5/bin
#$ INCLUDES="-I/opt/cuda-7.5/bin/..//include"  
#$ LIBRARIES=  "-L/opt/cuda-7.5/bin/..//lib64/stubs" "-L/opt/cuda-7.5/bin/..//lib64"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -D__CUDA_ARCH__=350 -E -x c++     -DCUDA_DOUBLE_MATH_FUNCTIONS  -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__  "-I/opt/cuda-7.5/bin/..//include"   -D"__CUDACC_VER__=70517" -D"__CUDACC_VER_BUILD__=17" -D"__CUDACC_VER_MINOR__=5" -D"__CUDACC_VER_MAJOR__=7" -include "cuda_runtime.h" -m64 "cdp.cu" > "/tmp/tmpxft_000022ba_00000000-7_cdp.cpp1.ii" 
#$ cudafe --allow_managed --m64 --gnu_version=40603 -tused --no_remove_unneeded_entities  --device-c --gen_c_file_name "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.c" --stub_file_name "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.gpu" --nv_arch "compute_35" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_000022ba_00000000-3_cdp.module_id" --include_file_name "tmpxft_000022ba_00000000-2_cdp.fatbin.c" "/tmp/tmpxft_000022ba_00000000-7_cdp.cpp1.ii" 
#$ gcc -D__CUDA_ARCH__=350 -E -x c     -DCUDA_DOUBLE_MATH_FUNCTIONS  -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__ -D__CUDANVVM__  -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/opt/cuda-7.5/bin/..//include"   -m64 "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.gpu" > "/tmp/tmpxft_000022ba_00000000-8_cdp.cpp2.i" 
#$ cudafe -w --allow_managed --m64 --gnu_version=40603 --c  --device-c --gen_c_file_name "/tmp/tmpxft_000022ba_00000000-9_cdp.cudafe2.c" --stub_file_name "/tmp/tmpxft_000022ba_00000000-9_cdp.cudafe2.stub.c" --gen_device_file_name "/tmp/tmpxft_000022ba_00000000-9_cdp.cudafe2.gpu" --nv_arch "compute_35" --module_id_file_name "/tmp/tmpxft_000022ba_00000000-3_cdp.module_id" --include_file_name "tmpxft_000022ba_00000000-2_cdp.fatbin.c" "/tmp/tmpxft_000022ba_00000000-8_cdp.cpp2.i" 
#$ gcc -D__CUDA_ARCH__=350 -E -x c     -DCUDA_DOUBLE_MATH_FUNCTIONS  -D__CUDABE__ -D__CUDANVVM__  -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/opt/cuda-7.5/bin/..//include"   -m64 "/tmp/tmpxft_000022ba_00000000-9_cdp.cudafe2.gpu" > "/tmp/tmpxft_000022ba_00000000-10_cdp.cpp3.i" 
#$ filehash -s "--compile-only  " "/tmp/tmpxft_000022ba_00000000-10_cdp.cpp3.i" > "/tmp/tmpxft_000022ba_00000000-11_cdp.hash"
#$ gcc -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__  "-I/opt/cuda-7.5/bin/..//include"   -D"__CUDACC_VER__=70517" -D"__CUDACC_VER_BUILD__=17" -D"__CUDACC_VER_MINOR__=5" -D"__CUDACC_VER_MAJOR__=7" -include "cuda_runtime.h" -m64 "cdp.cu" > "/tmp/tmpxft_000022ba_00000000-5_cdp.cpp4.ii" 
#$ cudafe++ --allow_managed --m64 --gnu_version=40603 --parse_templates  --device-c --gen_c_file_name "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.cpp" --stub_file_name "tmpxft_000022ba_00000000-4_cdp.cudafe1.stub.c" --module_id_file_name "/tmp/tmpxft_000022ba_00000000-3_cdp.module_id" "/tmp/tmpxft_000022ba_00000000-5_cdp.cpp4.ii" 
#$ cicc  -arch compute_35 -m64 -ftz=0 -prec_div=1 -prec_sqrt=1 -fmad=1 -nvvmir-library "/opt/cuda-7.5/bin/../nvvm/libdevice/libdevice.compute_35.10.bc"  --device-c --orig_src_file_name "cdp.cu"  "/tmp/tmpxft_000022ba_00000000-10_cdp.cpp3.i" -o "/tmp/tmpxft_000022ba_00000000-6_cdp.ptx"
#$ ptxas  -arch=sm_35 -m64 --compile-only  "/tmp/tmpxft_000022ba_00000000-6_cdp.ptx"  -o "/tmp/tmpxft_000022ba_00000000-13_cdp.sm_35.cubin" 
#$ fatbinary --create="/tmp/tmpxft_000022ba_00000000-2_cdp.fatbin" -64 --key="xxxxxxxxxx" --cmdline="--compile-only  " "--image=profile=sm_35,file=/tmp/tmpxft_000022ba_00000000-13_cdp.sm_35.cubin" "--image=profile=compute_35,file=/tmp/tmpxft_000022ba_00000000-6_cdp.ptx" --embedded-fatbin="/tmp/tmpxft_000022ba_00000000-2_cdp.fatbin.c" --cuda --device-c
#$ rm /tmp/tmpxft_000022ba_00000000-2_cdp.fatbin
#$ gcc -D__CUDA_ARCH__=350 -E -x c++     -DCUDA_DOUBLE_MATH_FUNCTIONS   -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/opt/cuda-7.5/bin/..//include"   -m64 "/tmp/tmpxft_000022ba_00000000-4_cdp.cudafe1.cpp" > "/tmp/tmpxft_000022ba_00000000-14_cdp.ii" 
#$ gcc -c -x c++ "-I/opt/cuda-7.5/bin/..//include"   -fpreprocessed -m64 -o "cdp" "/tmp/tmpxft_000022ba_00000000-14_cdp.ii"

it becomes obvious that this has only emitted a host object file with an embedded cubin payload. There is no host code compilation or linking to an executable, which is confirmed by the output of objdump posted in an edit to your question.

The complicating factor here is that you must perform device independent compilation to use dynamic parallelism and then link the device code, but you only have a single source file, so the conventional build approach (device compile, device link, host compile) would fail with duplicate symbols.

The solution seems to be this:

$ nvcc --dryrun -o cdp -rdc=true  -arch=sm_35 cdp.cu 
#$ _SPACE_= 
#$ _CUDART_=cudart
#$ _HERE_=/opt/cuda-7.5/bin
#$ _THERE_=/opt/cuda-7.5/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_SIZE_=64
#$ TOP=/opt/cuda-7.5/bin/..
#$ NVVMIR_LIBRARY_DIR=/opt/cuda-7.5/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/opt/cuda-7.5/bin/../lib:/opt/cuda-7.5/lib64
#$ PATH=/opt/cuda-7.5/bin/../open64/bin:/opt/cuda-7.5/bin/../nvvm/bin:/opt/cuda-7.5/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/opt/cuda-7.5/bin
#$ INCLUDES="-I/opt/cuda-7.5/bin/..//include"  
#$ LIBRARIES=  "-L/opt/cuda-7.5/bin/..//lib64/stubs" "-L/opt/cuda-7.5/bin/..//lib64"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -D__CUDA_ARCH__=350 -E -x c++     -DCUDA_DOUBLE_MATH_FUNCTIONS  -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__  "-I/opt/cuda-7.5/bin/..//include"   -D"__CUDACC_VER__=70517" -D"__CUDACC_VER_BUILD__=17" -D"__CUDACC_VER_MINOR__=5" -D"__CUDACC_VER_MAJOR__=7" -include "cuda_runtime.h" -m64 "cdp.cu" > "/tmp/tmpxft_00002454_00000000-9_cdp.cpp1.ii" 
#$ cudafe --allow_managed --m64 --gnu_version=40603 -tused --no_remove_unneeded_entities  --device-c --gen_c_file_name "/tmp/tmpxft_00002454_00000000-4_cdp.cudafe1.c" --stub_file_name "/tmp/tmpxft_00002454_00000000-4_cdp.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00002454_00000000-4_cdp.cudafe1.gpu" --nv_arch "compute_35" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00002454_00000000-3_cdp.module_id" --include_file_name "tmpxft_00002454_00000000-2_cdp.fatbin.c" "/tmp/tmpxft_00002454_00000000-9_cdp.cpp1.ii"

Compiling Code Containing Dynamic Parallelism Fails

Error compiling CUDA dynamic parallelism code with Driver API

Trouble compiling/running CUDA code involving dynamic parallelism

How to compile a .cu with dynamic parallelism?

Dynamic Parallelism Invalid File Format

Related Topics

Leave a reply