How to asynchronously copy memory from the host to the device using thrust and CUDA streams
As indicated in the comments, I don't think this will be possible directly with thrust::copy
. However we can use cudaMemcpyAsync
in a thrust application to achieve the goal of asynchronous copies and overlap of copy with compute.
Here is a worked example:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>
// DSIZE determines duration of H2D and D2H transfers
#define DSIZE (1048576*8)
// SSIZE,LSIZE determine duration of kernel launched by thrust
#define SSIZE (1024*512)
#define LSIZE 1
// KSIZE determines size of thrust kernels (number of threads per block)
#define KSIZE 64
#define TV1 1
#define TV2 2
typedef int mytype;
typedef thrust::host_vector<mytype, thrust::cuda::experimental::pinned_allocator<mytype> > pinnedVector;
struct sum_functor
{
mytype *dptr;
sum_functor(mytype* _dptr) : dptr(_dptr) {};
__host__ __device__ void operator()(mytype &data) const
{
mytype result = data;
for (int j = 0; j < LSIZE; j++)
for (int i = 0; i < SSIZE; i++)
result += dptr[i];
data = result;
}
};
int main(){
pinnedVector hi1(DSIZE);
pinnedVector hi2(DSIZE);
pinnedVector ho1(DSIZE);
pinnedVector ho2(DSIZE);
thrust::device_vector<mytype> di1(DSIZE);
thrust::device_vector<mytype> di2(DSIZE);
thrust::device_vector<mytype> do1(DSIZE);
thrust::device_vector<mytype> do2(DSIZE);
thrust::device_vector<mytype> dc1(KSIZE);
thrust::device_vector<mytype> dc2(KSIZE);
thrust::fill(hi1.begin(), hi1.end(), TV1);
thrust::fill(hi2.begin(), hi2.end(), TV2);
thrust::sequence(do1.begin(), do1.end());
thrust::sequence(do2.begin(), do2.end());
cudaStream_t s1, s2;
cudaStreamCreate(&s1); cudaStreamCreate(&s2);
cudaMemcpyAsync(thrust::raw_pointer_cast(di1.data()), thrust::raw_pointer_cast(hi1.data()), di1.size()*sizeof(mytype), cudaMemcpyHostToDevice, s1);
cudaMemcpyAsync(thrust::raw_pointer_cast(di2.data()), thrust::raw_pointer_cast(hi2.data()), di2.size()*sizeof(mytype), cudaMemcpyHostToDevice, s2);
thrust::for_each(thrust::cuda::par.on(s1), do1.begin(), do1.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di1.data())));
thrust::for_each(thrust::cuda::par.on(s2), do2.begin(), do2.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di2.data())));
cudaMemcpyAsync(thrust::raw_pointer_cast(ho1.data()), thrust::raw_pointer_cast(do1.data()), do1.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s1);
cudaMemcpyAsync(thrust::raw_pointer_cast(ho2.data()), thrust::raw_pointer_cast(do2.data()), do2.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s2);
cudaDeviceSynchronize();
for (int i=0; i < KSIZE; i++){
if (ho1[i] != ((LSIZE*SSIZE*TV1) + i)) { std::cout << "mismatch on stream 1 at " << i << " was: " << ho1[i] << " should be: " << ((DSIZE*TV1)+i) << std::endl; return 1;}
if (ho2[i] != ((LSIZE*SSIZE*TV2) + i)) { std::cout << "mismatch on stream 2 at " << i << " was: " << ho2[i] << " should be: " << ((DSIZE*TV2)+i) << std::endl; return 1;}
}
std::cout << "Success!" << std::endl;
return 0;
}
For my test case, I used RHEL5.5, Quadro5000, and cuda 6.5RC. This example is designed to have thrust create very small kernels (only a single threadblock, as long as KSIZE
is small, say 32 or 64), so that the kernels that thrust creates from thrust::for_each
are able to run concurrently.
When I profile this code, I see:
This indicates that we are achieving proper overlap both between thrust kernels, and between copy operations and thrust kernels, as well as asynchronous data copying at the completion of the kernels. Note that the cudaDeviceSynchronize()
operation "fills" the timeline, indicating that all the async operations (data copying, thrust functions) were issued asynchronously and control returned to the host thread before any of the operations were underway. All of this is expected, proper behavior for full concurrency between host, GPU, and data copying operations.
How to asynchronously copy a disjoint subset of an array from device to host with CUDA/Thrust?
If the strides between elements of p_arr
are constant, then this is possible in a single operation with cudaMemcpy2DAsync
.
For varying strides its not possible in a single operation. Furthermore, the single operation method (with constant stride) is not necessarily the fastest way (the cudaMemcpy2DAsync
method does not necessarily get close to expected bus transfer speeds). For the fastest method, plus ability to handle varying strides between elements to be copied, the usual recommendation is to to break this into two steps.
- Do something like
thrust::gather
(orthrust::copy
with a permutation iterator) to collect all elements to be copied into a contiguous, temporary device buffer - then use
cudaMemcpyAsync
to copy that buffer to host.
About thrust::execution_policy when copying data from device to host
The Thrust doc on the thrust::device
states the following:
Raw pointers allocated by host APIs should not be mixed with a thrust::device algorithm invocation when the device backend is CUDA
To my understanding, this means that host-device copy with thrust::device
execution policy is invalid, in the first place, unless the host memory is pinned.
We imply that your host allocation is not pinned, BUT: One possibility is that on POWER9 with NVLINK you may be lucky that any host-allocated memory is addressable from within the GPU. Thanks to that, host-device copy with thrust::device
works, though it should not.
On a regular system, host memory is addressable from within a GPU only if this host memory is allocated with cudaMallocHost
(pinned). So, the question is whether your POWER system has automagically upgraded all allocations to be pinned. Is the observed performance bonus due to the implicitly-pinned memory, or would you get an additional speedup, if allocations are also done with cudaMallocHost
explicitly?
Another Thrust design-based evidence is that thrust::device
policy has par.on(stream)
support, while thrust::host
does not. This is pretty much aligned with the fact that asynchronous host-device copies are only possible with the pinned memory.
CUDA thrust: copy from device to device
The canonical way to do this is just to use thrust::copy
. The thrust::device_ptr
has standard pointer semantics and the API will seamlessly understand whether the source and destination pointers are on the host or device, viz:
#include <thrust/device_malloc.h>
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#include <iostream>
int main()
{
// Initial host data
int ivals[4] = { 1, 3, 6, 10 };
// Allocate and copy to first device allocation
thrust::device_ptr<int> dp1 = thrust::device_malloc<int>(4);
thrust::copy(&ivals[0], &ivals[0]+4, dp1);
// Allocate and copy to second device allocation
thrust::device_ptr<int> dp2 = thrust::device_malloc<int>(4);
thrust::copy(dp1, dp1+4, dp2);
// Copy back to host
int ovals[4] = {-1, -1, -1, -1};
thrust::copy(dp2, dp2+4, &ovals[0]);
for(int i=0; i<4; i++)
std::cout << ovals[i] << std::endl;
return 0;
}
which does this:
talonmies@box:~$ nvcc -arch=sm_30 thrust_dtod.cu
talonmies@box:~$ ./a.out
1
3
6
10
Device -> host vs host -> device copy performance in cuda
First off, rather than using CPU timers, keep in mind that it is better to use Cuda Event API for timing measurements. Also you may want to consider a warmup call before the timing (see here for more info). I think @Robert Crovella has already answered your question in his comment by mentioning that the vector instantiation is likely the cause of the time difference. But just to proof it, I did a simple test where I measured device to host (D2H) and host to device (H2D) transfer times for two cases with and without a vector allocation. Consider this code which is basically equal to your code:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
int main(){
int dimension = 1000000;
// Some dummy vector to wake up device
thrust::device_vector<int> dummy_vec (dimension, 1);
// Create a Cuda event
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float elapsed = 0; // time in ms
thrust::host_vector <int> host_Table (dimension);
// H2D:
cudaEventRecord(start);
thrust::device_vector<int> device_Table = host_Table;
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
std::cout<<"H2D elapsed time: " << elapsed << " ms"<< std::endl;
// D2H:
cudaEventRecord(start);
thrust::host_vector<int> host_TableCopiedFromDevice = device_Table;
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
std::cout<<"D2H elapsed time: " << elapsed << " ms"<< std::endl;
}
Running this on a Titan Black (Ubuntu, CUDA 10.1) gives the following time values:
H2D elapsed time: 1.76941 ms
D2H elapsed time: 3.80643 ms
You are right here. The D2H time is almost 2 times larger than the H2D. Now, the same code with vectors allocated prior to the transfers:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
int main(){
int dimension = 1000000;
// Some dummy vector to wake up device
thrust::device_vector<int> dummy_vec (dimension, 1);
// Create a Cuda event
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float elapsed = 0; // time in ms
// initialized vectors
thrust::host_vector <int> h_vec (dimension, 1);
thrust::device_vector <int> d_vec (dimension);
thrust::host_vector <int> h_vec_2 (dimension);
// H2D:
cudaEventRecord(start);
d_vec = h_vec;
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
std::cout<<"H2D elapsed time: " << elapsed << " ms"<< std::endl;
// D2H:
cudaEventRecord(start);
h_vec_2 = d_vec;
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
std::cout<<"D2H elapsed time: " << elapsed << " ms"<< std::endl;
}
which gives:
H2D elapsed time: 1.7777 ms
D2H elapsed time: 1.54707 ms
Which confirms that the H2D and D2H memory transfers are actually about the same if we exclude other factors. Another investigation that could have given you some hints was to change the dimension
to a smaller/larger value and see how that changes the time difference.
Getting CUDA Thrust to use a CUDA stream of your choice
I want to update the answer provided by talonmies following the release of Thrust 1.8 which introduces the possibility of indicating the CUDA execution stream as
thrust::cuda::par.on(stream)
see also
Thrust Release 1.8.0.
In the following, I'm recasting the example in
False dependency issue for the Fermi architecture
in terms of CUDA Thrust APIs.
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <thrust\device_vector.h>
#include <thrust\execution_policy.h>
#include "Utilities.cuh"
using namespace std;
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
struct BinaryOp{ __host__ __device__ int operator()(const int& o1,const int& o2) { return o1 * o2; } };
int main()
{
const int N = 6000000;
// --- Host side input data allocation and initialization. Registering host memory as page-locked (required for asynch cudaMemcpyAsync).
int *h_in = new int[N]; for(int i = 0; i < N; i++) h_in[i] = 5;
gpuErrchk(cudaHostRegister(h_in, N * sizeof(int), cudaHostRegisterPortable));
// --- Host side input data allocation and initialization. Registering host memory as page-locked (required for asynch cudaMemcpyAsync).
int *h_out = new int[N]; for(int i = 0; i < N; i++) h_out[i] = 0;
gpuErrchk(cudaHostRegister(h_out, N * sizeof(int), cudaHostRegisterPortable));
// --- Host side check results vector allocation and initialization
int *h_checkResults = new int[N]; for(int i = 0; i < N; i++) h_checkResults[i] = h_in[i] * h_in[i];
// --- Device side input data allocation.
int *d_in = 0; gpuErrchk(cudaMalloc((void **)&d_in, N * sizeof(int)));
// --- Device side output data allocation.
int *d_out = 0; gpuErrchk( cudaMalloc((void **)&d_out, N * sizeof(int)));
int streamSize = N / NUM_STREAMS;
size_t streamMemSize = N * sizeof(int) / NUM_STREAMS;
// --- Set kernel launch configuration
dim3 nThreads = dim3(NUM_THREADS,1,1);
dim3 nBlocks = dim3(NUM_BLOCKS, 1,1);
dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
// --- Create CUDA streams
cudaStream_t streams[NUM_STREAMS];
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamCreate(&streams[i]));
/**************************/
/* BREADTH-FIRST APPROACH */
/**************************/
for(int i = 0; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
}
for(int i = 0; i < NUM_STREAMS; i++)
{
int offset = i * streamSize;
thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_in[offset]) + streamSize/2,
thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_out[offset]), BinaryOp());
thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset + streamSize/2]), thrust::device_pointer_cast(&d_in[offset + streamSize/2]) + streamSize/2,
thrust::device_pointer_cast(&d_in[offset + streamSize/2]), thrust::device_pointer_cast(&d_out[offset + streamSize/2]), BinaryOp());
}
for(int i = 0; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
}
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamSynchronize(streams[i]));
gpuErrchk(cudaDeviceSynchronize());
// --- Release resources
gpuErrchk(cudaHostUnregister(h_in));
gpuErrchk(cudaHostUnregister(h_out));
gpuErrchk(cudaFree(d_in));
gpuErrchk(cudaFree(d_out));
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamDestroy(streams[i]));
cudaDeviceReset();
// --- GPU output check
int sum = 0;
for(int i = 0; i < N; i++) {
//printf("%i %i\n", h_out[i], h_checkResults[i]);
sum += h_checkResults[i] - h_out[i];
}
cout << "Error between CPU and GPU: " << sum << endl;
delete[] h_in;
delete[] h_out;
delete[] h_checkResults;
return 0;
}
The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page.
The Visual Profiler timeline shows the concurrency of CUDA Thrust operations and memory transfers
How to call a Thrust function in a stream from a kernel?
The following code compiles cleanly for me on CUDA 10.1.243:
$ cat t1518.cu
#include <thrust/scatter.h>
#include <thrust/execution_policy.h>
__global__ void async_scatter_kernel(float* first,
float* last,
int* map,
float* output)
{
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
thrust::scatter(thrust::cuda::par.on(stream), first, last, map, output);
cudaDeviceSynchronize();
cudaStreamDestroy(stream);
}
int main(){
float *first = NULL;
float *last = NULL;
float *output = NULL;
int *map = NULL;
async_scatter_kernel<<<1,1>>>(first, last, map, output);
cudaDeviceSynchronize();
}
$ nvcc -arch=sm_35 -rdc=true t1518.cu -o t1518
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243
$
The -arch=sm_35
(or similar) and -rdc=true
are necessary (but not in all cases sufficient) compile switches for any code that uses CUDA Dynamic Parallelism. If you omit, for example, the -rdc=true
switch, you get an error similar to what you describe:
$ nvcc -arch=sm_35 t1518.cu -o t1518
t1518.cu(11): error: calling a __host__ function("thrust::cuda_cub::par_t::on const") from a __global__ function("async_scatter_kernel") is not allowed
t1518.cu(11): error: identifier "thrust::cuda_cub::par_t::on const" is undefined in device code
2 errors detected in the compilation of "/tmp/tmpxft_00003a80_00000000-8_t1518.cpp1.ii".
$
So, for the example you have shown here, your compilation error can be eliminated either by updating to the latest CUDA version or by specifying the proper command line, or both.
How to effectively copy data from 2D host array (with padding) to 1D device array and remove the original padding in CUDA?
If the use case is only copying a subset of a larger source into a smaller destination which isn't strided (so contiguous), then a conditional copy with a predicate is probably the simplest approach (I guess gather would also work). Something like this:
#include <vector>
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
struct indexer
{
int lda0;
int lda1;
indexer() = default;
__device__ __host__
indexer(int l0, int l1) : lda0(l0), lda1(l1) {};
__device__ __host__
bool operator()(int x) {
int r = x % lda0;
return (r < lda1);
};
};
int main()
{
const int M0 = 5, N=3;
const int M1 = 3;
const int len1 = M1*N;
{
std::vector<int> data{ 1, 2, 3, -1, -1, 4, 5, 6, -1, -1, 7, 8, 9, -1, -1 };
thrust::device_vector<int> ddata = data;
thrust::device_vector<int> doutput(len1);
indexer pred(M0, M1);
thrust::counting_iterator<int> idx(0);
thrust::copy_if(ddata.begin(), ddata.end(), idx, doutput.begin(), pred);
for(int i=0; i<len1; i++) {
int val = doutput[i];
std::cout << i << " " << val << std::endl;
}
}
return 0;
}
Here the predicate will only select a subset of each column and copy them into a continuous output range:
$ nvcc -arch=sm_52 -std=c++11 -o subset subset.cu
$ ./subset
0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
8 9
If you want something more general (so strided input and output) then you could probably use the same idea with scatter_if
. As noted in comments, this is trivially done with cudaMemcpy2D
or a copy kernel.
is there something equivalent to cudaMemcpy from device global memory to host global memory that I can call within a device function?
Is there something equivalent to
cudaMemcpy
from device global memory to host global memory that I can call within a device function?
No.
As pointed out in comments, your only choice here would be to use mapped or managed host memory which can be directly addressed in device code.
Related Topics
Do Stl Maps Initialize Primitive Types on Insert
Which Sorting Algorithm Is Used by Stl's List::Sort()
Is Extern "C" Only Required on the Function Declaration
Std::Unordered_Map::Find Using a Type Different Than the Key Type
Understanding the Different Clocks of Clock_Gettime()
Switch-Case Statement Without Break
Amortized Analysis of Std::Vector Insertion
Why Is Cmake Designed So That It Removes Runtime Path When Installing
Automatically Generate C++ File from Header
Is It a Good Practice to Use Unions in C++
Ensuring C++ Doubles Are 64 Bits
Is Returning a 2-Tuple Less Efficient Than Std::Pair
Why am I Not Provided with a Default Copy Constructor from a Volatile
Understand Op Registration and Kernel Linking in Tensorflow