Writing your own CUDA kernel (Part 2)

Posted on Tue 02 October 2018 in Tutorial by Laura Domine

If you followed part 1 of this tutorial, you now know how to write the CUDA kernel itself. However, it would be even better if you could use it directly in your TensorFlow or PyTorch program. In this part of the tutorial we show how to do it in TensorFlow.

In order to integrate this new CUDA kernel into our network, we have to wrap it in a new TensorFlow operation. The official documentation on this point is pretty poor and full of mistakes. We will end up with three new files:

  • crop_op.h: the header file
  • crop_op.cc: registers the new TF operation and its CPU/GPU kernels; defines the CPU implementation and the OpKernel.
  • crop_op.cu.cc: defines the GPU (CUDA) kernel

crop_op.h

This is the header file; it contains two definitions:

#ifndef CROP_OP_H_
#define CROP_OP_H_

#include "tensorflow/core/framework/op_kernel.h"
#include <cuda.h>

using namespace tensorflow;

template <typename Device, typename T>
struct CropFunctor {
  void operator()(
    const Device& d,
    const T* image_ptr,
    const int* crop_centers_ptr,
    int crop_size,
    int image_size,
    int channels,
    int num_crops,
    T* crops_ptr
  );
};

Here we define the generic version of CropFunctor, which will run on the CPU.

#if GOOGLE_CUDA
// Partially specialize functor for GpuDevice.
template <typename T>
struct CropFunctor<Eigen::GpuDevice, T> {
  void operator()(
    const Eigen::GpuDevice& d,
    const T* image_ptr,
    const int* crop_centers_ptr,
    int crop_size,
    int image_size,
    int channels,
    int num_crops,
    T* crops_ptr
  );
};
#endif

#endif // CROP_OP_H_

This partially specialized version of CropFunctor will run on GPU.

crop_op.cc

Moving on to crop_op.cc, we start with the registration of the new TF operation, called Crop, and of all its inputs and outputs:

#include "crop_op.h"

using namespace tensorflow;

// Register TF operation
REGISTER_OP("Crop")
    .Attr("T: {float, int32} = DT_FLOAT")
    .Input("image: T")
    .Input("crop_centers: int32")
    .Input("crop_size: int32")
    .Output("crops: T");

using CPUDevice = Eigen::ThreadPoolDevice;
using GPUDevice = Eigen::GpuDevice;
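
Optionally (this is not part of the original code), you can also attach a shape function to the registration so that TensorFlow can infer the rank of the output at graph-construction time. A minimal sketch of what the registration would look like with .SetShapeFn added:

#include "tensorflow/core/framework/shape_inference.h"

REGISTER_OP("Crop")
    .Attr("T: {float, int32} = DT_FLOAT")
    .Input("image: T")
    .Input("crop_centers: int32")
    .Input("crop_size: int32")
    .Output("crops: T")
    .SetShapeFn([](shape_inference::InferenceContext* c) {
      // The output is (num_crops, ?, ?, ?, ?): the crop size is only known
      // at run time, so we can only fix the rank of the output here.
      c->set_output(0, c->UnknownShapeOfRank(5));
      return Status::OK();
    });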

We then define an (empty) CPU implementation of CropFunctor.

// CPU specialization of actual computation.
template <typename T>
struct CropFunctor<CPUDevice, T> {
  void operator()(
    const CPUDevice& d,
    const T* image_ptr,
    const int* crop_centers_ptr,
    int crop_size,
    int image_size,
    int channels,
    int num_crops,
    T* crops_ptr
  ) {

  }
};

We then define the OpKernel, which will interface between TF and the CropFunctor implementations.

// OpKernel definition.
// template parameter <T> is the datatype of the tensors.
template <typename Device, typename T>
class CropOp : public OpKernel {
 public:
  explicit CropOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {

We can grab the input data, crop centers and crop size through context->input.

// Grab the input tensors
    const Tensor& image = context->input(0);
    const Tensor& crop_centers = context->input(1);
    const Tensor& crop_size_tensor = context->input(2);
    // FIXME
    OP_REQUIRES(context, TensorShapeUtils::IsScalar(crop_size_tensor.shape()), errors::InvalidArgument("crop_size must be scalar, has shape ", crop_size_tensor.shape().DebugString()));
    //const int crop_size = crop_size_tensor.scalar<int>()();
    const int crop_size = 64;

We read the shapes of the inputs and define the output tensor and its shape:

// Get shapes of input tensors
    const TensorShape& image_shape = image.shape();
    const TensorShape& crop_centers_shape = crop_centers.shape();
    int image_size = image_shape.dim_size(1);
    int channels = image_shape.dim_size(3);
    int num_crops = crop_centers_shape.dim_size(0);
    int dim = crop_centers_shape.dim_size(1);

    // Create an output tensor
    Tensor* crops = NULL;
    // create output shape
    TensorShape crops_shape;
    crops_shape.AddDim(num_crops);
    crops_shape.AddDim(crop_size);
    crops_shape.AddDim(crop_size);
    crops_shape.AddDim(crop_size);
    crops_shape.AddDim(channels);
    OP_REQUIRES_OK(context, context->allocate_output(0, crops_shape,
                                                     &crops));

Finally, CropFunctor is called:

// Do the computation.
    CropFunctor<Device, T>()(
        context->eigen_device<Device>(),
        image.flat<T>().data(),
        crop_centers.flat<int>().data(),
        crop_size,
        image_size,
        channels,
        num_crops,
        crops->flat<T>().data()
      );

  }
};

The last thing in crop_op.cc is to register our CPU and GPU kernels with TensorFlow.

// Register the CPU kernels.
#define REGISTER_CPU(T) \
  REGISTER_KERNEL_BUILDER( \
      Name("Crop") \
      .Device(DEVICE_CPU) \
      .TypeConstraint<T>("T"), \
    CropOp<CPUDevice, T>);
REGISTER_CPU(float);
REGISTER_CPU(int32);

// Register the GPU kernels.
#ifdef GOOGLE_CUDA
#define REGISTER_GPU(T) \
  extern template struct CropFunctor<GPUDevice, T>; \
  REGISTER_KERNEL_BUILDER( \
      Name("Crop")      \
      .Device(DEVICE_GPU)   \
      .TypeConstraint<T>("T"),  \
    CropOp<GPUDevice, T>);
REGISTER_GPU(float);
REGISTER_GPU(int32);
#endif // GOOGLE_CUDA

crop_op.cu.cc

This file contains the CUDA kernel definition itself, together with the code that launches it.

#ifdef GOOGLE_CUDA
#define EIGEN_USE_GPU

#include "crop_op.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

using namespace tensorflow;

using GPUDevice = Eigen::GpuDevice;

// Define the CUDA kernel.
// template <typename T>
// __global__ void CropCudaKernel(...)
// ...

Our custom CUDA kernel implementation (the kernel written in part 1) goes here.
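
The kernel itself was covered in part 1, so it is not reproduced in full here. For the sake of a self-contained example, here is a minimal, hypothetical sketch of what CropCudaKernel could look like; it assumes one block per crop (to match the launch configuration below), cubic crops, and zero-padding outside the image, so your own kernel from part 1 may well differ:

// Hypothetical sketch only -- replace with your kernel from part 1.
// One block per crop; the threads of a block stride over the crop's voxels.
template <typename T>
__global__ void CropCudaKernel(
    const T* image_ptr, const int* crop_centers_ptr,
    int image_size, int channels, int crop_size, int num_crops,
    T* crops_ptr) {
  const int crop_id = blockIdx.x;
  if (crop_id >= num_crops) return;
  const int cx = crop_centers_ptr[crop_id * 3 + 0];
  const int cy = crop_centers_ptr[crop_id * 3 + 1];
  const int cz = crop_centers_ptr[crop_id * 3 + 2];
  const int crop_voxels = crop_size * crop_size * crop_size * channels;
  for (int i = threadIdx.x; i < crop_voxels; i += blockDim.x) {
    // Unflatten i into (x, y, z, channel) coordinates inside the crop.
    const int c = i % channels;
    const int z = (i / channels) % crop_size;
    const int y = (i / channels / crop_size) % crop_size;
    const int x = i / channels / crop_size / crop_size;
    // Corresponding source voxel in the full image.
    const int sx = cx - crop_size / 2 + x;
    const int sy = cy - crop_size / 2 + y;
    const int sz = cz - crop_size / 2 + z;
    T value = T(0);  // zero-padding outside the image
    if (sx >= 0 && sx < image_size && sy >= 0 && sy < image_size &&
        sz >= 0 && sz < image_size) {
      value = image_ptr[((sx * image_size + sy) * image_size + sz) * channels + c];
    }
    crops_ptr[crop_id * crop_voxels + i] = value;
  }
}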

// Define the GPU implementation that launches the CUDA kernel.
template <typename T>
void CropFunctor<GPUDevice, T>::operator()(
    const GPUDevice& d,
    const T* image_ptr,
    const int* crop_centers_ptr,
    int crop_size,
    int image_size,
    int channels,
    int num_crops,
    T* crops_ptr
  ) {
  // Launch the cuda kernel.
  int block_count = num_crops;
  int thread_per_block = 1024;
  CropCudaKernel<T>
      <<<block_count, thread_per_block, 0, d.stream()>>>(
        image_ptr,
        crop_centers_ptr,
        image_size,
        channels,
        crop_size,
        num_crops,
        crops_ptr
      );
    cudaDeviceSynchronize();
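  // Optionally, you can check for kernel launch errors here, for instance:
  //   cudaError_t err = cudaGetLastError();
  //   if (err != cudaSuccess) LOG(ERROR) << cudaGetErrorString(err);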
}

// Explicitly instantiate functors for the types of OpKernels registered.
template struct CropFunctor<GPUDevice, float>;
template struct CropFunctor<GPUDevice, int32>;

#endif // GOOGLE_CUDA

This is the function that sets the number of blocks and threads per block before launching the kernel. Note the launch syntax, <<<blocks, threads_per_block, shared_memory_bytes, stream>>>, which is slightly different from a normal function call.

Compiling

You first have to compile the CUDA kernel with the nvcc compiler, then compile and link the C++ wrapper with g++. There are many subtleties in this compilation and linking chain that I do not claim to master, but in short, here is a Makefile that worked for me (and builds a library that runs on both CPU and GPU).

# Get location of TensorFlow headers and library files
TF_INC=$(shell python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')
TF_LIB=$(shell python -c 'import tensorflow as tf; print(tf.sysconfig.get_lib())')

CC        = gcc -O2 -pthread
CXX       = g++
GPUCC     = nvcc
CFLAGS    = -std=c++11 -I$(TF_INC) -D_GLIBCXX_USE_CXX11_ABI=0
GPUCFLAGS = -c
LFLAGS    =  -shared -fPIC -ltensorflow_framework -I$(TF_INC) -I$(TF_INC)/external/nsync/public -L$(TF_LIB) -L/usr/local/cuda/lib64 -I/usr/local/cuda/include
GPULFLAGS = -x cu -shared -Xcompiler -fPIC -ltensorflow_framework -I$(TF_INC) -I$(TF_INC)/external/nsync/public -L$(TF_LIB)
DEBUG = -g -G
GPUDEF    = -D GOOGLE_CUDA=1
CGPUFLAGS = -lcudart


SRC       = crop_op.cc
GPUSRC    = crop_op.cu.cc
PROD      = crop_op.so
GPUPROD = crop_op_cu.so

default: gpu

cpu:
    $(CXX) $(CFLAGS) $(SRC) $(LFLAGS) -o $(PROD)

gpu:
    $(GPUCC) $(CFLAGS) $(GPUCFLAGS) $(GPUSRC) $(GPULFLAGS) -o $(GPUPROD) $(GPUDEF) -I/usr/local/ --expt-relaxed-constexpr -D_MWAITXINTRIN_H_INCLUDED
    $(CXX) $(CFLAGS)  $(SRC) $(GPUPROD) $(LFLAGS) $(CGPUFLAGS) -o $(PROD) $(GPUDEF)

clean:
    rm -f $(PROD) $(GPUPROD)
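
With this Makefile in place, running make (or make gpu) builds the GPU-enabled crop_op.so, while make cpu builds a CPU-only version.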

Testing and using our new TensorFlow operation

We can use the tf.test.TestCase class to test our new operation. Let us first define a CPU (NumPy) reference implementation of the cropping operation:

In [ ]:
import tensorflow as tf
import numpy as np
import time

def crop(patch_centers, N, data):
    """
    Slice patches of size N centered at patch_centers in data.
    Assumes data has shape (1, M, M, M, channels)
    or (1, M, M, channels)
    """
    coords0 = np.floor(patch_centers - N/2.0)  # bottom left corner
    coords1 = np.floor(patch_centers + N/2.0)  # top right corner
    dim = patch_centers.shape[1]
    image_size = data.shape[1]
    coords0 = np.clip(coords0, 0, image_size).astype(int)
    coords1 = np.clip(coords1, 0, image_size).astype(int)
    crops = np.zeros((coords0.shape[0],) + (N,) * dim + (data.shape[-1],))
    crops_labels = np.zeros_like(crops)
    for j in range(len(coords0)):
        padding = []
        for d in range(dim):
            pad = np.maximum(N - (coords1[j, d] - coords0[j, d]), 0)
            if coords0[j, d] == 0.0:
                padding.append((pad, 0))
            else:
                padding.append((0, pad))
        padding.append((0, 0))
        if dim == 2:
            crops[j] = np.pad(data[0,
                                   coords0[j, 0]:coords1[j, 0],
                                   coords0[j, 1]:coords1[j, 1],
                                   :],
                              padding, 'constant')
        else:  # dim == 3
            crops[j] = np.pad(data[0,
                                   coords0[j, 0]:coords1[j, 0],
                                   coords0[j, 1]:coords1[j, 1],
                                   coords0[j, 2]:coords1[j, 2],
                                   :],
                              padding, 'constant')
    return crops

Now we can define our test and compare the timings of the NumPy and CUDA implementations:

In [ ]:
class CropTest(tf.test.TestCase):
    def testCrop(self):
        crop_module = tf.load_op_library('./faster_particles/crop_op/crop_op.so')

        np.random.seed(123)
        tf.set_random_seed(123)
        
        N = 192
        # The more steps here, the more accurate the timings will be
        # Remember that TF's first few calls to sess.run are always slower
        MAX_STEPS = 200
        CROP_SIZE = 64
        
        # Define a dummy image and dummy crop centers
        image_np = (np.random.rand(N, N, N, 1) * N).astype(np.float32)
        crop_centers_np = np.random.randint(50, high=100, size=(100, 3))
        
        # Define TF equivalents
        image = tf.constant(image_np, dtype=tf.float32)
        crop_centers = tf.constant(crop_centers_np, dtype=tf.int32)
        
        # >>> Call our CUDA kernel! <<<
        crops = crop_module.crop(image, crop_centers, CROP_SIZE)
        
        with self.test_session():
            duration = 0
            for i in range(MAX_STEPS):
                start = time.time()
                tf_result = crops.eval()
                end = time.time()
                duration += end - start
            print("TF duration = %f s" % (duration / MAX_STEPS))
            duration = 0
            for i in range(MAX_STEPS):
                start = time.time()
                np_result = crop(crop_centers_np, CROP_SIZE, image_np[np.newaxis, ...])
                end = time.time()
                duration += end - start
            print("NP duration = %f s" % (duration / MAX_STEPS))
            self.assertAllClose(tf_result, np_result)

To run the test, just execute the following line:

In [ ]:
tf.test.main()

In my case, the timings are:

TF duration = 0.011914 s
NP duration = 0.138170 s

So we get a speedup of about 10x for this small toy example. It is not that spectacular, because the number of blocks is relatively low (100 * 64 = 6400 blocks). By tuning the number of blocks, the number of threads per block, etc., the speedup could probably be improved. How much will you be able to achieve with your own use case? Let us know!
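
Finally, once the library is compiled, using the op in an ordinary TensorFlow (1.x) program only requires tf.load_op_library. Here is a minimal usage sketch; the path to crop_op.so and the sizes are placeholders for your own setup:

In [ ]:
import numpy as np
import tensorflow as tf

# Path to the compiled library (placeholder: adjust to where you built it).
crop_module = tf.load_op_library('./crop_op.so')

image = tf.placeholder(tf.float32, shape=(192, 192, 192, 1))
crop_centers = tf.placeholder(tf.int32, shape=(None, 3))
crops = crop_module.crop(image, crop_centers, 64)

with tf.Session() as sess:
    out = sess.run(crops, feed_dict={
        image: np.random.rand(192, 192, 192, 1).astype(np.float32),
        crop_centers: np.random.randint(50, 100, size=(10, 3)).astype(np.int32),
    })
    print(out.shape)  # (10, 64, 64, 64, 1)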