A basic demo of how to use CUDA in Python / Numpy using Pybind11.

Download a Git repo with the code here https://github.com/torstem/demo-cuda-pybind11/

Class-less version

test.py

import sys
sys.path.append('./build')

import gpu_library
import numpy

vec = numpy.linspace(0,1,10)

print("before: ", vec)
gpu_library.multiply_with_scalar(vec, 10)
print("after: ", vec)

CMakeLists.txt

cmake_minimum_required(VERSION 2.8)
find_package(CUDA)
find_package(PythonLibs 2.7 REQUIRED)

include_directories(${PYTHON_INCLUDE_DIRS})

cuda_add_library(gpu_library SHARED  gpu_library.cpp gpu_library.cu)
target_link_libraries(gpu_library ${PYTHON_LIBRARIES} ${CUDA_LIBRARIES})

set_target_properties(gpu_library PROPERTIES PREFIX "")

gpu_library.cpp

#include <iostream>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <cuda_runtime.h>

void run_kernel
(double *vec, double scalar, int num_elements);

void multiply_with_scalar(pybind11::array_t<double> vec, double scalar)
{
  int size = 10;
  double *gpu_ptr;
  cudaError_t error = cudaMalloc(&gpu_ptr, size * sizeof(double));

  if (error != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(error));
  }
  auto ha = vec.request();

  if (ha.ndim != 1) {
    std::stringstream strstr;
    strstr << "ha.ndim != 1" << std::endl;
    strstr << "ha.ndim: " << ha.ndim << std::endl;
    throw std::runtime_error(strstr.str());
  }

  double* ptr = reinterpret_cast<double*>(ha.ptr);
  error = cudaMemcpy(gpu_ptr, ptr, size * sizeof(double), cudaMemcpyHostToDevice);
  if (error != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(error));
  }

  run_kernel(gpu_ptr, scalar, size);

  error = cudaMemcpy(ptr, gpu_ptr, size * sizeof(double), cudaMemcpyDeviceToHost);
  if (error != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(error));
  }
}

PYBIND11_PLUGIN(gpu_library)
{
  pybind11::module m("gpu_library", "GPU Library");
  m.def("multiply_with_scalar", multiply_with_scalar);
  return m.ptr();
}

gpu_library.cu

#include <sstream>
#include <iostream>
#include <cuda_runtime.h>

__global__ void kernel
(double *vec, double scalar, int num_elements)
{
  unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num_elements) {
    vec[idx] = vec[idx] * scalar;
  }
}


void run_kernel
(double *vec, double scalar, int num_elements)
{
  dim3 dimBlock(256, 1, 1);
  dim3 dimGrid(ceil((double)num_elements / dimBlock.x));

  kernel<<<dimGrid, dimBlock>>>
    (vec, scalar, num_elements);

  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    std::stringstream strstr;
    strstr << "run_kernel launch failed" << std::endl;
    strstr << "dimBlock: " << dimBlock.x << ", " << dimBlock.y << std::endl;
    strstr << "dimGrid: " << dimGrid.x << ", " << dimGrid.y << std::endl;
    strstr << cudaGetErrorString(error);
    throw strstr.str();
  }
}

Class-based version

Here the gpu array can be persistent over multiple calls. This can be very useful when the array is large and the function is called many times.

test.py

import sys
sys.path.append('./build')

import gpu_library
import numpy

vec = numpy.linspace(0,1,10)

print("before: ", vec)
gl = gpu_library.GPULibrary(10)
gl.set_data(vec)
gl.multiply_with_scalar(vec, 10)
gl.get_data(vec)
print("after: ", vec)

CMakeLists.txt

cmake_minimum_required(VERSION 2.8)
find_package(CUDA)
find_package(PythonLibs 2.7 REQUIRED)

include_directories(${PYTHON_INCLUDE_DIRS})

cuda_add_library(gpu_library SHARED  gpu_library.cpp gpu_library.cu)
target_link_libraries(gpu_library ${PYTHON_LIBRARIES} ${CUDA_LIBRARIES})

set_target_properties(gpu_library PROPERTIES PREFIX "")

gpu_library.cpp

#include <iostream>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <cuda_runtime.h>

void run_kernel
(double *vec, double scalar, int num_elements);

struct gpu_library
{
  unsigned int array_size;
  double *gpu_ptr;

  gpu_library(unsigned int array_size_) : array_size(array_size_) {
    cudaError_t error = cudaMalloc(&gpu_ptr, array_size * sizeof(double));
    if (error != cudaSuccess) {
      throw std::runtime_error(cudaGetErrorString(error));
    }
  }

  ~gpu_library() {
    cudaFree(gpu_ptr);
  }

  void set_data(pybind11::array_t<double> vec) {
    auto ha = vec.request();

    if (ha.ndim != 1) {
      std::stringstream strstr;
      strstr << "ha.ndim != 1" << std::endl;
      strstr << "ha.ndim: " << ha.ndim << std::endl;
      throw std::runtime_error(strstr.str());
    }

    if (ha.shape[0] != array_size) {
    std::stringstream strstr;
    strstr << "ha.shape[0] != array_size" << std::endl;
    strstr << "ha.shape[0] " << vec.shape[0] << std::endl;
    strstr << "array_size: " << array_size << std::endl;
    throw std::runtime_error(strstr.str());
    }

    double* ptr = reinterpret_cast<double*>(ha.ptr);
    cudaError_t error = cudaMemcpy(gpu_ptr, ptr, array_size * sizeof(double), cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
      throw std::runtime_error(cudaGetErrorString(error));
    }    
  }

  void get_data(pybind11::array_t<double> vec) {
    auto ha = vec.request();

    if (ha.ndim != 1) {
      std::stringstream strstr;
      strstr << "ha.ndim != 1" << std::endl;
      strstr << "ha.ndim: " << ha.ndim << std::endl;
      throw std::runtime_error(strstr.str());
    }

    if (ha.shape[0] != array_size) {
    std::stringstream strstr;
    strstr << "vec.shape[0] != array_size" << std::endl;
    strstr << "vec.shape[0] " << vec.shape[0] << std::endl;
    strstr << "array_size: " << array_size << std::endl;
    throw std::runtime_error(strstr.str());
    }

    double* ptr = reinterpret_cast<double*>(ha.ptr);    
    cudaError_t error = cudaMemcpy(ptr, gpu_ptr, array_size * sizeof(double), cudaMemcpyDeviceToHost);
    if (error != cudaSuccess) {
      throw std::runtime_error(cudaGetErrorString(error));
    }
  }

  void multiply_with_scalar(double scalar)
  {
    run_kernel(gpu_ptr, scalar, array_size);
  }  
};


PYBIND11_PLUGIN(gpu_library)
{
  pybind11::module m("gpu_library", "GPU Library");

  pybind11::class_<gpu_library> gl(m, "GPULibrary");
  gl.def(pybind11::init<unsigned int>(),
     pybind11::arg("array_size"));
  gl.def("set_data", &gpu_library::set_data);
  gl.def("get_data", &gpu_library::get_data);  
  gl.def("multiply_with_scalar", &gpu_library::multiply_with_scalar);
  return m.ptr();
}

gpu_library.cu

#include <sstream>
#include <iostream>
#include <cuda_runtime.h>

__global__ void kernel
(double *vec, double scalar, int num_elements)
{
  unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num_elements) {
    vec[idx] = vec[idx] * scalar;
  }
}


void run_kernel
(double *vec, double scalar, int num_elements)
{
  dim3 dimBlock(256, 1, 1);
  dim3 dimGrid(ceil((double)num_elements / dimBlock.x));

  kernel<<<dimGrid, dimBlock>>>
    (vec, scalar, num_elements);

  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    std::stringstream strstr;
    strstr << "run_kernel launch failed" << std::endl;
    strstr << "dimBlock: " << dimBlock.x << ", " << dimBlock.y << std::endl;
    strstr << "dimGrid: " << dimGrid.x << ", " << dimGrid.y << std::endl;
    strstr << cudaGetErrorString(error);
    throw strstr.str();
  }
}