CUDA Conjugate Gradient Solver

CUDA is messy and ugly—but it is also fast! The code below implements the solver for Equation 2 in the paper. The whole operation could probably be expressed in about 20 lines of MATLAB. It is based on listing B2 in Shewchuk's tech report [1994]. This code has been tested with the NVIDIA CUDA Toolkit v5.0 (64 bit).
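
For reference, here is the least-squares problem the kernels below appear to minimize (this is my reading of the gradient code; see Equation 2 in the paper for the authoritative form):

E(x) = w_D \|x - d\|^2 + \|D_x x - g_x\|^2 + \|D_y x - g_y\|^2

Setting \nabla E = 0 gives the linear system

(w_D I + D_x^\top D_x + D_y^\top D_y)\, x = w_D d + D_x^\top g_x + D_y^\top g_y

where d is the data image, g_x and g_y are the target gradients, D_x and D_y are finite-difference operators, and w_D = dataWeight. Conjugate gradients solves this system matrix-free: initGradientKernel evaluates the initial residual, computeGradientKernel applies the matrix to the search direction, and downhillStepKernel updates the solution and the residual.
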
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>
#include <helper_cuda.h>
#include "device_launch_parameters.h"

// define the following to insert "fences" in the rendering code for accurately measuring the timings of the asynchronous operations

#define MEASURE_TIMINGS

const float dataWeight = 0.1f;
const int cgIterations = 10;

// bands per pixel: bTotal is used throughout the solver but its definition is
// not part of this excerpt; the RGBA float textures below imply four floats
// per pixel (the kernels launch with blockDim.z = 3, leaving alpha untouched)
const int bTotal = 4;

void initCuda()
{
    cudaGLSetGLDevice(gpuGetMaxGflopsDeviceId());
}

void integrateCuda()
{
    // init arrays for data, gx, gy, and result images
    cudaGraphicsResource * resData, * resGx, * resGy, * resResult;
    cudaArray * texData = 0, * texGx = 0, * texGy = 0, * texResult = 0;

    checkCudaErrors( cudaGraphicsGLRegisterImage(&resData, outTex[0], GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly) );
    checkCudaErrors( cudaGraphicsGLRegisterImage(&resGx, outTex[1], GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly) );
    checkCudaErrors( cudaGraphicsGLRegisterImage(&resGy, outTex[2], GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly) );
    checkCudaErrors( cudaGraphicsGLRegisterImage(&resResult, outTex[3], GL_TEXTURE_2D, cudaGraphicsRegisterFlagsWriteDiscard) );

    float * dData, * dGx, * dGy, * dX;

    checkCudaErrors( cudaMalloc(&dData, outWidth*outHeight*4*sizeof(float)) );
    checkCudaErrors( cudaMalloc(&dGx, outWidth*outHeight*4*sizeof(float)) );
    checkCudaErrors( cudaMalloc(&dGy, outWidth*outHeight*4*sizeof(float)) );
    checkCudaErrors( cudaMalloc(&dX, outWidth*outHeight*4*sizeof(float)) );

    // render data term into a frame buffer object [0]
    double dataStartTime = getTime();
    glBindFramebuffer(GL_FRAMEBUFFER, outFbo[0]);
    drawDataTerm(renderData);
#ifdef MEASURE_TIMINGS
    glFinish();
#endif
    glBindFramebuffer(GL_FRAMEBUFFER, 0);

    checkCudaErrors( cudaGraphicsMapResources(1, &resData) );
    checkCudaErrors( cudaGraphicsSubResourceGetMappedArray(&texData, resData, 0, 0) );
    checkCudaErrors( cudaMemcpy2DFromArray(dData, outWidth*4*sizeof(float), texData, 0, 0, outWidth*4*sizeof(float), outHeight, cudaMemcpyDeviceToDevice) );
    checkCudaErrors( cudaGraphicsUnmapResources(1, &resData) );

    // initialize the solution "x" to the data term
    checkCudaErrors( cudaMemcpy(dX, dData, outWidth*outHeight*4*sizeof(float), cudaMemcpyDeviceToDevice) );
    double dataEndTime = getTime();

    // render the gx term into a frame buffer object [1]
    glBindFramebuffer(GL_FRAMEBUFFER, outFbo[1]);
    double gxStartTime = getTime();
    drawGradientTerm(renderData, true);
#ifdef MEASURE_TIMINGS
    glFinish();
#endif
    double gxEndTime = getTime();
    glBindFramebuffer(GL_FRAMEBUFFER, 0);

    checkCudaErrors( cudaGraphicsMapResources(1, &resGx) );
    checkCudaErrors( cudaGraphicsSubResourceGetMappedArray(&texGx, resGx, 0, 0) );
    checkCudaErrors( cudaMemcpy2DFromArray(dGx, outWidth*4*sizeof(float), texGx, 0, 0, outWidth*4*sizeof(float), outHeight, cudaMemcpyDeviceToDevice) );
    checkCudaErrors( cudaGraphicsUnmapResources(1, &resGx) );

    // render the gy term into a frame buffer object [2]
    glBindFramebuffer(GL_FRAMEBUFFER, outFbo[2]);
    double gyStartTime = getTime();
    drawGradientTerm(renderData, false);
#ifdef MEASURE_TIMINGS
    glFinish();
#endif
    double gyEndTime = getTime();
    glBindFramebuffer(GL_FRAMEBUFFER, 0);

    checkCudaErrors( cudaGraphicsMapResources(1, &resGy) );
    checkCudaErrors( cudaGraphicsSubResourceGetMappedArray(&texGy, resGy, 0, 0) );
    checkCudaErrors( cudaMemcpy2DFromArray(dGy, outWidth*4*sizeof(float), texGy, 0, 0, outWidth*4*sizeof(float), outHeight, cudaMemcpyDeviceToDevice) );
    checkCudaErrors( cudaGraphicsUnmapResources(1, &resGy) );

#ifdef MEASURE_TIMINGS
    cudaDeviceSynchronize();
#endif

    double cgStartTime = getTime();
    cudaConjugateGradients(dataWeight, outWidth, outHeight, dData, dGx, dGy, dX, cgIterations);
#ifdef MEASURE_TIMINGS
    cudaDeviceSynchronize();
#endif
    double cgEndTime = getTime();

    // copy solution to an OpenGL texture
    checkCudaErrors( cudaGraphicsMapResources(1, &resResult) );
    checkCudaErrors( cudaGraphicsSubResourceGetMappedArray(&texResult, resResult, 0, 0) );
    checkCudaErrors( cudaMemcpy2DToArray(texResult, 0, 0, dX, outWidth*4*sizeof(float), outWidth*4*sizeof(float), outHeight, cudaMemcpyDeviceToDevice) );
    checkCudaErrors( cudaGraphicsUnmapResources(1, &resResult) );

    // free the temporary arrays
    checkCudaErrors( cudaFree(dData) );
    checkCudaErrors( cudaFree(dGx) );
    checkCudaErrors( cudaFree(dGy) );
    checkCudaErrors( cudaFree(dX) );

    checkCudaErrors( cudaGraphicsUnregisterResource(resData) );
    checkCudaErrors( cudaGraphicsUnregisterResource(resGx) );
    checkCudaErrors( cudaGraphicsUnregisterResource(resGy) );
    checkCudaErrors( cudaGraphicsUnregisterResource(resResult) );

    double totalEndTime = getTime();
}

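// computes the initial residual r = b - A*x for the current solution x;
// each block also reduces a partial sum of r.r in shared memory and writes
// one value per block to rMagn2Array (the host adds up the partials)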
__global__ void initGradientKernel(const int stride, const int w, const int h,
                                   const float wD, 
                                   const float *const x, const float *const data,
                                   const float *const gX, const float *const gY,
                                   float *const residual, float *const rMagn2Array)
{
    extern __shared__ float sdata[];

    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;

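    // note: threads outside the image return before the __syncthreads() calls
    // in the reduction below, so full blocks are assumed (w and h should be
    // multiples of the block size)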
    if (j >= w || i >= h)
    {
        return;
    }

    int ch = blockIdx.z * blockDim.z + threadIdx.z;
 
    int idx = i*stride + j*bTotal + ch;

    float c = x[idx];

    float gradD = wD * (c - data[idx]);

    float gradGX;
    if (j == 0)
    {
        gradGX = c - x[idx+bTotal] + gX[idx];
    }
    else if (j < w-1)
    {
        gradGX = 2*c - x[idx-bTotal] - x[idx+bTotal] + gX[idx] - gX[idx-bTotal];
    }
    else
    {
        gradGX = c - x[idx-bTotal] - gX[idx-bTotal];
    }

    float gradGY;
    if (i == 0)
    {
        gradGY = c - x[idx+stride] - gY[idx+stride];
    }
    else if (i < h-1)
    {
        gradGY = 2*c - x[idx+stride] - x[idx-stride] + gY[idx] - gY[idx+stride];
    }
    else
    {
        gradGY = c - x[idx-stride] + gY[idx];
    }

    float g = gradD + gradGX + gradGY;

    residual[idx] = -g;

    // rMagn2 reduction
    int tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y;
    sdata[tid] = g*g;

    __syncthreads();

    int elsPerBand = blockDim.x * blockDim.y;
    int size = 2*elsPerBand;

    if (tid < elsPerBand)
    {
        sdata[tid] += sdata[tid+size];
    }

    __syncthreads();

    for (size = size/2; size > 0; size >>= 1)
    {
        if (tid < size)
        {
            sdata[tid] += sdata[tid+size];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        rMagn2Array[blockIdx.x + blockIdx.y*gridDim.x] = sdata[0];
    }
}

void initGradient(const dim3 gridDim, const dim3 blockDim, const int blockEls,
                  const int stride, const int w, const int h,
                  const float wD,
                  const float *const dX, const float *const dData,
                  const float *const dGx, const float *const dGy,
                  float *const dResidual, float *const dDirection, 
                  float *const dRMagn2Array, float *const pRMagn2Array,
                  float & rMagn2)
{
    initGradientKernel<<<gridDim, blockDim, blockEls*sizeof(float)>>>(
        stride, w, h, wD,
        dX, dData, dGx, dGy,
        dResidual, dRMagn2Array);

    cudaMemcpy(pRMagn2Array, dRMagn2Array, gridDim.x*gridDim.y*sizeof(float), cudaMemcpyDeviceToHost);

    rMagn2 = pRMagn2Array[0];
    for (unsigned int i = 1; i < gridDim.x*gridDim.y; i++)
    {
        rMagn2 += pRMagn2Array[i];
    }

    cudaMemcpy(dDirection, dResidual, w*h*bTotal*sizeof(float), cudaMemcpyDeviceToDevice);
}

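// updates the search direction d = r + beta*d (the Fletcher-Reeves update)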
__global__ void conjugateKernel(const int stride, const int w, const int h,
                                const float beta, 
                                const float *const residual, float *const direction)
{
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;

    if (j >= w || i >= h)
    {
        return;
    }

    int ch = blockIdx.z * blockDim.z + threadIdx.z;

    int idx = i*stride + j*bTotal + ch;

    direction[idx] = residual[idx] + beta * direction[idx];
}

void conjugateGpu(const dim3 gridDim, const dim3 blockDim,
                  const int stride, const int w, const int h,
                  const float beta,
                  const float *const dResidual, float *const dDirection)
{
    conjugateKernel<<<gridDim, blockDim>>>(stride, w, h, beta, dResidual, dDirection);
}

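// applies the system matrix to the search direction, q = A*d, and reduces
// per-block partial sums of d.r and d.q, from which the host computes the
// step size alpha = (d.r) / (d.q)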
__global__ void computeGradientKernel(const int stride, const int w, const int h,
                                      const float wD, 
                                      const float *const direction, const float *const residual,
                                      float *const q,
                                      float *const alphaNumArray, float *const alphaDenArray)
{
    extern __shared__ float sdata[];

    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;

    if (j >= w || i >= h)
    {
        return;
    }

    int ch = blockIdx.z * blockDim.z + threadIdx.z;

    int idx = i*stride + j*bTotal + ch;

    float c = direction[idx];

    float gradD = wD * c;

    float gradGX;

    if (j == 0)
    {
        gradGX = c - direction[idx+bTotal];
    }
    else if (j < w-1)
    {
        gradGX = 2*c - direction[idx-bTotal] - direction[idx+bTotal];
    }
    else
    {
        gradGX = c - direction[idx-bTotal];
    }

    float gradGY;
    if (i == 0)
    {
        gradGY = c - direction[idx+stride];
    }
    else if (i < h-1)
    {
        gradGY = 2*c - direction[idx+stride] - direction[idx-stride];
    }
    else
    {
        gradGY = c - direction[idx-stride];
    }

    float g = gradD + gradGX + gradGY;

    q[idx] = g;

    // alphaNum / alphaDen reduction
    int elsPerBand = blockDim.x * blockDim.y;
    int size = 2*elsPerBand;

    int tidNum = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y;
    int tidDen = tidNum + 3*elsPerBand;
    sdata[tidNum] = c * residual[idx];
    sdata[tidDen] = g * c;

    __syncthreads();

    if (tidNum < elsPerBand)
    {
        sdata[tidNum] += sdata[tidNum+size];
        sdata[tidDen] += sdata[tidDen+size];
    }

    __syncthreads();

    for (size = size/2; size > 0; size >>= 1)
    {
        if (tidNum < size)
        {
            sdata[tidNum] += sdata[tidNum+size];
            sdata[tidDen] += sdata[tidDen+size];
        }
        __syncthreads();
    }

    if (tidNum == 0)
    {
        alphaNumArray[blockIdx.x + blockIdx.y*gridDim.x] = sdata[0];
        alphaDenArray[blockIdx.x + blockIdx.y*gridDim.x] = sdata[3*elsPerBand];
    }
}

void computeGradientGpu(const dim3 gridDim, const dim3 blockDim, const int blockEls,
                        const int stride, const int w, const int h,
                        const float wD,
                        const float *const dDirection, const float *const dResidual,
                        float *const dQ,
                        float *const dAlphaNumArray, float *const pAlphaNumArray,
                        float *const dAlphaDenArray, float *const pAlphaDenArray,
                        float & alpha)
{
    computeGradientKernel<<<gridDim, blockDim, 2*blockEls*sizeof(float)>>>(
        stride, w, h, wD,
        dDirection, dResidual, dQ, dAlphaNumArray, dAlphaDenArray);

    cudaMemcpy(pAlphaNumArray, dAlphaNumArray, gridDim.x*gridDim.y*sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(pAlphaDenArray, dAlphaDenArray, gridDim.x*gridDim.y*sizeof(float), cudaMemcpyDeviceToHost);

    float alphaNum = pAlphaNumArray[0];
    float alphaDen = pAlphaDenArray[0];
    for (unsigned int i = 1; i < gridDim.x*gridDim.y; i++)
    {
        alphaNum += pAlphaNumArray[i];
        alphaDen += pAlphaDenArray[i];
    }

    alpha = alphaNum / alphaDen;
}

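// takes the step x += alpha*d, updates the residual r -= alpha*q, and reduces
// per-block partial sums of the new r.r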
__global__ void downhillStepKernel(const int stride, const int w, const int h,
                                   const float alpha, 
                                   const float *const direction, const float *const q,
                                   float *const x, float *const residual,
                                   float *const rMagn2Array)
{
    extern __shared__ float sdata[];

    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;

    if (j >= w || i >= h)
    {
        return;
    }

    int ch = blockIdx.z * blockDim.z + threadIdx.z;

    int idx = i*stride + j*bTotal + ch;

    x[idx] += alpha * direction[idx];

    float res = residual[idx] - alpha * q[idx];
    residual[idx] = res;

    // rMagn2 reduction
    int tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y;
    sdata[tid] = res*res;

    __syncthreads();

    int elsPerBand = blockDim.x * blockDim.y;
    int size = 2*elsPerBand;

    if (tid < elsPerBand)
    {
        sdata[tid] += sdata[tid+size];
    }

    __syncthreads();

    for (size = size/2; size > 0; size >>= 1)
    {
        if (tid < size)
        {
            sdata[tid] += sdata[tid+size];
        }
        __syncthreads();
    }
 
    if (tid == 0)
    {
        rMagn2Array[blockIdx.x + blockIdx.y*gridDim.x] = sdata[0];
    }
}
 
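// one-dimensional variant of downhillStepKernel that treats the images as
// flat arrays (it assumes the element count is a multiple of the block size);
// it is not called anywhere in this listing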
__global__ void downhillStepKernelLinear(const float alpha, 
                                         const float *const direction, const float *const q,
                                         float *const x, float *const residual,
                                         float *const rMagn2Array)
{
    extern __shared__ float sdata[];
 
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;
 
    x[idx] += alpha * direction[idx];
    
    float res = residual[idx] - alpha * q[idx];
    residual[idx] = res;
 
    // rMagn2 reduction
    sdata[tid] = res*res;
 
    __syncthreads();
 
    for (unsigned int s = blockDim.x/2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
 
        __syncthreads();
    }
 
    // write result for this block to global memory
    if (tid == 0)
    {
        rMagn2Array[blockIdx.x] = sdata[0];
    }
}
 
void downhillStepGpu(const dim3 gridDim, const dim3 blockDim, const int blockEls,
                     const int stride, const int w, const int h,
                     const float alpha,
                     const float *const dDirection, const float *const dQ,
                     float *const dX, float *const dResidual,
                     float *const dRMagn2Array, float *const pRMagn2Array,
                     float & rMagn2)
{
    downhillStepKernel<<<gridDim, blockDim, blockEls*sizeof(float)>>>(
        stride, w, h, alpha, dDirection, dQ, dX, dResidual, dRMagn2Array);
 
    cudaMemcpy(pRMagn2Array, dRMagn2Array, gridDim.x*gridDim.y*sizeof(float), cudaMemcpyDeviceToHost);
 
    rMagn2 = pRMagn2Array[0];
    for (unsigned int i = 1; i < gridDim.x*gridDim.y; i++)
    {
        rMagn2 += pRMagn2Array[i];
    }
}
 
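// conjugate-gradient driver, following listing B2 in Shewchuk [1994]:
//   r = b - A*x, d = r
//   repeat: alpha = (d.r)/(d.q) with q = A*d; x += alpha*d; r -= alpha*q;
//           beta = (r.r)_new/(r.r)_old; d = r + beta*d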
void cudaConjugateGradients(const float wD, const int w, const int h,
                            const float *const dData,
                            const float *const dGx,
                            const float *const dGy,
                            float *const dX,
                            const int nIter)
{
    const int numBytes = w*h*bTotal*sizeof(float);
    const int blockSize = 8;
 
    float rMagn2, rMagn2Old, alpha;
 
    dim3 blockDim;
    blockDim.x = blockDim.y = blockSize;
    blockDim.z = 3;
 
    dim3 gridDim;
    gridDim.x = (w + blockSize - 1) / blockSize;
    gridDim.y = (h + blockSize - 1) / blockSize;
 
    const int blockEls = blockDim.x * blockDim.y * blockDim.z;
 
    // init GPU arrays
    float * dResidual, * dDirection, * dQ;
    checkCudaErrors( cudaMalloc((void**)&dResidual, numBytes) );
    checkCudaErrors( cudaMalloc((void**)&dDirection, numBytes) );
    checkCudaErrors( cudaMalloc((void**)&dQ, numBytes) );
 
    float * dSum1Array, * dSum2Array, * pSum1Array, * pSum2Array;
    pSum1Array = new float[gridDim.x*gridDim.y];
    pSum2Array = new float[gridDim.x*gridDim.y];
    checkCudaErrors( cudaMalloc((void**)&dSum1Array, gridDim.x*gridDim.y*sizeof(float)) );
    checkCudaErrors( cudaMalloc((void**)&dSum2Array, gridDim.x*gridDim.y*sizeof(float)) );
    
    const int stride = w*bTotal;
 
    // initialize gradient
    initGradient(gridDim, blockDim, blockEls, stride, w, h, wD,
                 dX, dData, dGx, dGy, dResidual, dDirection, dSum1Array,
                 pSum1Array, rMagn2);
    rMagn2Old = rMagn2;
 
    for (int iter = 0; iter < nIter; iter++)
    {
        if (iter > 0)
        {
            float beta = rMagn2 / rMagn2Old;
            conjugateGpu(gridDim, blockDim, stride, w, h, beta, dResidual, dDirection);
        }
 
        // compute the step size
        computeGradientGpu(gridDim, blockDim, blockEls, stride, w, h, wD,
                           dDirection, dResidual, dQ,
                           dSum1Array, pSum1Array, dSum2Array, pSum2Array, alpha);
 
        // take a downhill step and update residual
        rMagn2Old = rMagn2;
        downhillStepGpu(gridDim, blockDim, blockEls, stride, w, h, alpha,
                        dDirection, dQ, dX, dResidual, dSum1Array, pSum1Array, rMagn2);
    }
 
    // clean up
    delete[] pSum1Array;
    delete[] pSum2Array;
 
    cudaFree(dResidual);
    cudaFree(dDirection);
    cudaFree(dQ);
    cudaFree(dSum1Array);
    cudaFree(dSum2Array);
}