Cuda kernel returning vectors

something like this should work (coded in browser, not tested):

// N is the maximum number of structs to insert
#define N 10000

typedef struct {
    int A, B, C;
} Match;

__device__ Match dev_data[N];
__device__ int dev_count = 0;

__device__ int my_push_back(Match * mt) {
    int insert_pt = atomicAdd(&dev_count, 1);
    if (insert_pt < N) {
        dev_data[insert_pt] = *mt;
        return insert_pt;
    }
    else return -1;
}

__global__ void Find(veryLongPhrase * _phrase, Words * _word_list)
{
    int a, b, c;

    [...] //Parallel search for each word in the phrase

    if(match) //When an occurrence is found
    {
        Match m = { a, b, c };
        my_push_back(&m);
    }
}

int main()
{
    [...]

    veryLongPhrase * myPhrase = "The quick brown fox jumps over the lazy dog etc etc etc...";

    Words * wordList = {"the", "lazy"};

    Find<<< X, Y >>>(myPhrase, wordList);

    int dsize;
    cudaMemcpyFromSymbol(&dsize, dev_count, sizeof(int));
    std::vector<Match> results(dsize);
    cudaMemcpyFromSymbol(&(results[0]), dev_data, dsize*sizeof(Match));

    [...]
}

This will require compute capability 1.1 or better for the atomic operation.

nvcc -arch=sm_11 ...

Here's a worked example:

$ cat t347.cu
#include <iostream>
#include <vector>
#include <cstdio>

// N is the maximum number of structs to insert
#define N 10000

typedef struct {
int A, B, C; } Match;

__device__ Match dev_data[N];
__device__ int dev_count = 0;

__device__ int my_push_back(Match & mt) {
int insert_pt = atomicAdd(&dev_count, 1);
if (insert_pt < N){
dev_data[insert_pt] = mt;
return insert_pt;}
else return -1;}

__global__ void Find()
{

if(threadIdx.x < 10) //Simulate a found occurrence
{
Match a = { .A = 1, .B = 2, .C = 3 };
my_push_back(a); }
}

int main()
{

Find<<< 2, 256 >>>();

int dsize;
cudaMemcpyFromSymbol(&dsize, dev_count, sizeof(int));
if (dsize >= N) {printf("overflow error\n"); return 1;}
std::vector<Match> results(dsize);
cudaMemcpyFromSymbol(&(results[0]), dev_data, dsize*sizeof(Match));
std::cout << "number of matches = " << dsize << std::endl;
std::cout << "A = " << results[dsize-1].A << std:: endl;
std::cout << "B = " << results[dsize-1].B << std:: endl;
std::cout << "C = " << results[dsize-1].C << std:: endl;

}
$ nvcc -arch=sm_11 -o t347 t347.cu
$ ./t347
number of matches = 20
A = 1
B = 2
C = 3
$

Note that in this case my Match result struct creation is different, and I am passing by reference, but the concept is the same.

Creating a vector in cuda kernel

It is correct that thrust::device_vector is not usable in CUDA device code.

I'm not aware of any in-kernel container-like API that is part of the CUDA distribution itself. If you search around, however, you will probably find dozens of possibly useful/interesting implementations. A lower level library like trove could possibly give improved performance for this kind of use-case.

In your example, it appears that each thread will maintain its own "stack" or "vector" to keep track of tree traversal. (The method I will offer here depends on threads not concurrently accessing the same stack. If you need concurrent access from several threads, the device-wide push_back method shown in the first answer above may be of interest as a starting point.)

If you know the maximum probable size of such a stack, I would suggest allocating for it ahead of time, either as a static (local) per-thread variable defined in the kernel, or as a dynamic allocation made e.g. via cudaMalloc. (I would not suggest in-kernel malloc for this, and I definitely would not suggest allocating/deallocating on the fly, for performance reasons.) Which allocation method gives the best performance may depend on your actual test case: the coalescing rules (i.e. the underlying storage layout) differ somewhat between access through a global pointer and access through a local pointer. If the threads in a warp tend to push and pop roughly in lockstep as your code progresses, then either allocation method should give good performance. You can experiment with either approach.

Here's a fairly simple partially worked example of the "stack" methods you have outlined in your example, assuming the maximum stack size per thread is known a priori. It's by no means fully tested; my purpose is to give you some ideas or a starting point. However if you find errors, please feel free to point them out and I will try to address them.

$ cat t1082.cu

const size_t max_items = 256;

template <typename T>
class cu_st{ // simple implementation of "stack" function

T *my_ptr;
size_t n_items;
size_t my_width;
public:
__host__ __device__
cu_st(T *base, size_t id, size_t width=0){
if (width == 0){ // "local" stack allocated
my_ptr = base;
my_width = 1;}
else{ // "global" stack allocated
my_ptr = base + id;
my_width = width;}
n_items = 0;}

__host__ __device__
int push_back(T &item){
if (n_items < max_items){
*my_ptr = item;
my_ptr += my_width;
n_items++;
return 0;}
return -1;}

__host__ __device__
T pop_back(){
if (n_items > 0){
n_items--;
my_ptr -= my_width;}
return *my_ptr;}

__host__ __device__
T back(){
if (n_items > 0){
return *(my_ptr-my_width);}
return *my_ptr;}

__host__ __device__
bool empty(){
return (n_items == 0);}

__host__ __device__
size_t size(){
return n_items;}

__host__ __device__
size_t max_size(){
return max_items;}

};

const size_t nTPB = 256;
const size_t nBLK = 256;

typedef int Node;

__global__
void kernel(Node **g_stack, size_t n)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx >= n) return;
Node *root = NULL;

//method 1 - global stack
cu_st<Node*> to_visit(g_stack, idx, gridDim.x*blockDim.x);
to_visit.push_back(root);

while(!to_visit.empty())
{
Node* cur = to_visit.back();
to_visit.pop_back();

}

//method 2 - local stack

Node *l_stack[max_items];
cu_st<Node*> l_to_visit(l_stack, idx);
l_to_visit.push_back(root);

while(!l_to_visit.empty())
{
Node* cur = l_to_visit.back();
l_to_visit.pop_back();

}

}

int main(){

Node **d_stack;
cudaMalloc(&d_stack, nTPB*nBLK*max_items*sizeof(Node *));
kernel<<<nBLK, nTPB>>>(d_stack, nTPB*nBLK);
cudaDeviceSynchronize();
}
$ nvcc -o t1082 t1082.cu
$ cuda-memcheck ./t1082
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

Notes:

  1. Other than what you see here, this code is not tested. I'd suggest doing more verification before using it as-is.

  2. As you can see in the code, there is essentially no error checking.

  3. This sort of random access will generally tend to be slow, probably regardless of which allocation method you choose. If possible, minimize your use of such a "stack". If you know that the stack size per thread is very small, you could also try experimenting with using this construct with a __shared__ memory allocation; a short sketch of that variant follows these notes.

  4. Another allocation approach which I have not demonstrated here would be to give each thread a global allocation but have the thread push and pop contiguously rather than in the strided fashion I have shown here (algorithmically a combination of the two methods I have outlined here). Such a method will definitely degrade performance in the "uniform" case, but may give better performance in some "random" access patterns.
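Regarding note 3, here is a minimal sketch (my illustration only, untested) of what a __shared__ backing for the same cu_st construct could look like. It reuses the Node, nTPB and cu_st definitions from t1082.cu above; kernel_shared and s_max_depth are hypothetical names I've introduced, and 16 pointers per thread at 256 threads per block amounts to 32KB of shared memory. Note that cu_st bounds its pushes against max_items (256), so for real use max_items would have to be reduced to match the smaller shared allocation.

// sketch only: per-thread stack backed by __shared__ memory
const size_t s_max_depth = 16;  // assumed small per-thread depth

__global__
void kernel_shared(size_t n)
{
    // one strided slice of this array per thread in the block
    __shared__ Node *s_stack[nTPB * s_max_depth];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx >= n) return;
    Node *root = NULL;

    // same layout as the "global stack" case, but block-local:
    // thread t uses elements t, t+blockDim.x, t+2*blockDim.x, ...
    cu_st<Node*> s_to_visit(s_stack, threadIdx.x, blockDim.x);
    s_to_visit.push_back(root);

    while(!s_to_visit.empty())
    {
        Node* cur = s_to_visit.back();
        s_to_visit.pop_back();
        // ... process cur ...
    }
}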

Dynamically expanding array in cuda kernel

  1. I think your impression 1 could be implemented approximately with what is described in the previous answer above (a per-thread stack, pre-allocated). It has the issues you mention, related to over-allocation. On newer GPUs, several gigabytes of memory (or more) is common, so the over-allocation concern might not be very serious if total memory is not an issue.

  2. I think your impression 2 could be implemented approximately with what is described in the first answer above (a device-wide, thread-safe vector push_back). It has the issues you mention, related to the lack of ordering of the results in the result vector. That could possibly be resolved with a sorting operation after the collection is complete (see the short sorting sketch after this list).

(It sounds like you probably already have an idea of how to do your "worst-case" impression 4.)


  3. That leaves impression 3. We could use an idea that is a combination of impression 1 and impression 2, i.e. create a per-thread vector push_back, but use an on-demand allocation via in-kernel malloc or new. In-kernel memory allocation like this is pretty slow, and not without its own issues (e.g. you may have to reserve additional heap space, in-kernel allocated heap memory cannot participate in a transfer to the host, small allocations may be inefficient in memory usage), but there really is no way to tell which of these approaches might be best, without more information about the dimensions of your problem. If keeping track of the parent nodes is a relatively infrequent operation as you are traversing the graph, the dynamic allocation approach may not be a problem.
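As mentioned in point 2, if the order of the collected results matters, one option is simply to sort them after copying them back to the host. Here is a minimal sketch of that idea (my addition, untested), reusing the Match struct and the dev_data/dev_count collection from t347.cu above; the assumption that the A field holds the position you want to order by is mine, so adjust the comparator to your actual fields.

#include <algorithm>
#include <vector>

// after the kernel has finished pushing matches:
int dsize;
cudaMemcpyFromSymbol(&dsize, dev_count, sizeof(int));
std::vector<Match> results(dsize);
cudaMemcpyFromSymbol(&(results[0]), dev_data, dsize*sizeof(Match));

// restore a deterministic order; the comparison key (A) is just an assumption
std::sort(results.begin(), results.end(),
          [](const Match &m1, const Match &m2){ return m1.A < m2.A; });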

Here's an example of how a simple vector (per-thread) could be created:

$ cat t376.cu
#include <iostream>
#include <cstdio>

#include <assert.h>
template <typename T>
class cu_vec{ // simple implementation of per-thread "vector"
const size_t alloc_block_size = 4096; // tuning parameter
T *my_ptr;
size_t n_items;
size_t alloc_blocks;
public:
__host__ __device__
cu_vec(){
assert(sizeof(T) <= alloc_block_size);
n_items = 0;
my_ptr = (T *)new char[alloc_block_size];
assert(my_ptr != NULL);
alloc_blocks = 1;}

__host__ __device__
cu_vec(size_t sz){
assert(sizeof(T) <= alloc_block_size);
n_items = sz;
alloc_blocks = (n_items*sizeof(T)+alloc_block_size-1)/alloc_block_size;
my_ptr = (T *)new char[alloc_blocks*alloc_block_size];
assert(my_ptr != NULL);
memset(my_ptr, 0, alloc_blocks*alloc_block_size);}

__host__ __device__
~cu_vec(){
if (my_ptr != NULL) delete[] my_ptr;
}

__host__ __device__
void push_back(T const &item){ // first test if we can just store new item
if ((n_items+1)*sizeof(T) > alloc_blocks*alloc_block_size){
T *temp = (T *)new char[(alloc_blocks+1)*alloc_block_size];
assert(temp != NULL);
memcpy(temp, my_ptr, alloc_blocks*alloc_block_size);
delete[] my_ptr;
my_ptr = temp;
alloc_blocks++;}
my_ptr[n_items] = item;
n_items++;}

__host__ __device__
size_t size(){
return n_items;}

__host__ __device__
void clear(){
n_items = 0;}

__host__ __device__
T& operator[](size_t idx){
assert(idx < n_items);
return my_ptr[idx];}

__host__ __device__
T& pop_back(){
if (n_items > 0){
n_items--;}
return my_ptr[n_items];}

__host__ __device__
T* data(){
return my_ptr;}

__host__ __device__
size_t storage_ratio(){
return alloc_block_size/sizeof(T);}
};

struct ss
{
unsigned x;
float y;
};

__global__ void test(){

cu_vec<ss> my_vec;
ss temp = {threadIdx.x, 2.0f};
my_vec.push_back(temp);
assert(my_vec.size() == 1);
assert(my_vec.storage_ratio() >= 1);
ss temp2 = my_vec[0];
printf("threadIdx.x: %u, ss.x: %u, ss.y: %f\n", threadIdx.x, temp2.x, temp2.y);
temp.y = 3.0f;
my_vec[0].x = temp.x;
my_vec[0].y = temp.y;
ss temp3 = my_vec.pop_back();
printf("threadIdx.x: %u, ss.x: %u, ss.y: %f\n", threadIdx.x, temp3.x, temp3.y);
my_vec.clear();
temp.x = 0;
for (int i = 0; i < 10000; i++){
my_vec.push_back(temp);
temp.x++;}
temp.x--;
for (int i = 0; i < 10000; i++) {
assert(my_vec.pop_back().x == temp.x);
temp.x--;}
cu_vec<ss> my_vec2(2);
assert(my_vec2[1].x == 0);
assert(my_vec2[1].y == 0.0f);
}

int main(){

//default heap space is 8MB, if needed reserve more with:
cudaDeviceSetLimit(cudaLimitMallocHeapSize, (1048576*32));
test<<<1, 4>>>();
cudaDeviceSynchronize();
}
$ nvcc -std=c++11 -o t376 t376.cu
$ cuda-memcheck ./t376
========= CUDA-MEMCHECK
threadIdx.x: 0, ss.x: 0, ss.y: 2.000000
threadIdx.x: 1, ss.x: 1, ss.y: 2.000000
threadIdx.x: 2, ss.x: 2, ss.y: 2.000000
threadIdx.x: 3, ss.x: 3, ss.y: 2.000000
threadIdx.x: 0, ss.x: 0, ss.y: 3.000000
threadIdx.x: 1, ss.x: 1, ss.y: 3.000000
threadIdx.x: 2, ss.x: 2, ss.y: 3.000000
threadIdx.x: 3, ss.x: 3, ss.y: 3.000000
========= ERROR SUMMARY: 0 errors
$

The code hasn't been tested any more than what you see here.

Add vector with CUDA C++ : don't get the expected result

As said in the comments, the problem was in the call cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice): the ampersand was not supposed to be there.
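A minimal sketch of why (my illustration only; the names a, d_a and size come from the call above, while the float type and the malloc-based setup are assumptions, since the question's full listing isn't reproduced here):

float *a = (float *)malloc(size);   // host array: 'a' is already a pointer to the data
float *d_a;
cudaMalloc(&d_a, size);

// wrong: &a is the address of the pointer variable itself, not of the data
// cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);

// right: pass the host pointer itself
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);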

Returning references to CUDA-specific vector types

This solution was discussed in the comments, and there I also stated that I find it a bit dirty. However, if the CUDA specification guarantees the alignment of the float4 and float1 values, then this could be a valid option:

__device__ float1& f1(float4& v) {
    return *reinterpret_cast<float1*>(&v);
}

__device__ float& f2(float4& v) {
    return v.x;
}

In this solution, you reinterpret the address of v as a pointer to float1. You can then dereference the result to get a float1& that refers to v.

Be careful with reinterpret_cast between different structs when it comes to alignment and offsets.

In the Compiler Explorer example you can see that both functions produce exactly the same output.
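For illustration, a short usage sketch (my addition, not part of the original answer; the demo kernel name is hypothetical): both helpers alias the .x component of the same float4, so a write through either reference is visible in the original value.

__global__ void demo(float4 *out) {
    float4 v = make_float4(1.0f, 2.0f, 3.0f, 4.0f);
    f1(v).x = 10.0f;  // write through the reinterpreted float1 reference
    f2(v) = 20.0f;    // write through the plain float reference to the same component
    out[0] = v;       // out[0].x is now 20.0f; y, z and w are unchanged
}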

CUDA Device Vector

Is it required that I specifically allocate memory for this vector in host with cudaMalloc and cudaMemcpy?

No. Copy assignment works just fine between thrust containers (host or device) and std::vector.

For example:

$ module load cuda/10.1

$ cat notreallyno.cu

#include <thrust/device_vector.h>
#include <vector>
#include <cstdio>

__global__ void kernel (float* pd_vec, int n)
{
if (threadIdx.x < n)
printf("%d %f \n", threadIdx.x, pd_vec[threadIdx.x]);
}

int main()
{
{
std::vector<float> h_vec = { 1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f, 9.9f, 10.01f };
thrust::device_vector<float> d_vec = h_vec;
float* pd_vec = thrust::raw_pointer_cast(d_vec.data());

int n = h_vec.size();
kernel<<<1, 32>>>(pd_vec, n);
cudaDeviceSynchronize();
}
cudaDeviceReset();

return 0;
}

$ nvcc -std=c++11 -arch=sm_52 -o notreallyno notreallyno.cu

$ ./notreallyno
0 1.100000
1 2.200000
2 3.300000
3 4.400000
4 5.500000
5 6.600000
6 7.700000
7 8.800000
8 9.900000
9 10.010000
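The listing above only shows the host-to-device direction. Going back the other way also needs no explicit cudaMemcpy; here is a minimal sketch (my addition, untested here) using thrust::copy into an ordinary std::vector, where h_back is a name I've introduced:

#include <thrust/copy.h>

std::vector<float> h_back(d_vec.size());
thrust::copy(d_vec.begin(), d_vec.end(), h_back.begin());
// h_back now holds the same values as d_vec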

