#include <cuda_runtime.h>
#include <cuda_gl_interop.h>
#include <helper_cuda.h>
#include "device_launch_parameters.h"
// Define the following to insert "fences" (glFinish / cudaDeviceSynchronize)
// into the rendering code so that the timings of the asynchronous operations
// are measured accurately.
#define MEASURE_TIMINGS
const float dataWeight = 0.1f;
const int cgIterations = 10;
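// outTex/outFbo, outWidth/outHeight, renderData, and the draw* functions are
// defined elsewhere in the source, as is bTotal, the number of interleaved
// floating-point bands per pixel; the RGBA buffers below suggest bTotal == 4,
// with only the three color bands processed by the kernels (blockDim.z == 3).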
void initCuda()
{
checkCudaErrors( cudaGLSetGLDevice(gpuGetMaxGflopsDeviceId()) );
}
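// getTime() is used throughout for wall-clock timing but is not defined in
// this listing; a minimal stand-in (an assumption, not the original helper)
// based on the C++11 steady clock, returning seconds:
#include <chrono>
#include <cstdio> // for the hypothetical timing printout further below
static double getTime()
{
return std::chrono::duration<double>(
std::chrono::steady_clock::now().time_since_epoch()).count();
}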
void integrateCuda()
{
// init arrays for data, gx, gy, and result images
cudaGraphicsResource * resData, * resGx, * resGy, * resResult;
cudaArray * texData = 0, * texGx = 0, * texGy = 0, * texResult = 0;
checkCudaErrors( cudaGraphicsGLRegisterImage(&resData, outTex[0], GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly) );
checkCudaErrors( cudaGraphicsGLRegisterImage(&resGx, outTex[1], GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly) );
checkCudaErrors( cudaGraphicsGLRegisterImage(&resGy, outTex[2], GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly) );
// the result texture is written (not read) by CUDA, so it must not be registered read-only
checkCudaErrors( cudaGraphicsGLRegisterImage(&resResult, outTex[3], GL_TEXTURE_2D, cudaGraphicsRegisterFlagsWriteDiscard) );
float * dData, * dGx, * dGy, * dX;
checkCudaErrors( cudaMalloc(&dData, outWidth*outHeight*4*sizeof(float)) );
checkCudaErrors( cudaMalloc(&dGx, outWidth*outHeight*4*sizeof(float)) );
checkCudaErrors( cudaMalloc(&dGy, outWidth*outHeight*4*sizeof(float)) );
checkCudaErrors( cudaMalloc(&dX, outWidth*outHeight*4*sizeof(float)) );
// render the data term into a frame buffer object [0]
double dataStartTime = getTime();
glBindFramebuffer(GL_FRAMEBUFFER, outFbo[0]);
drawDataTerm(renderData);
#ifdef MEASURE_TIMINGS
glFinish();
#endif
glBindFramebuffer(GL_FRAMEBUFFER, 0);
checkCudaErrors( cudaGraphicsMapResources(1, &resData) );
checkCudaErrors( cudaGraphicsSubResourceGetMappedArray(&texData, resData, 0, 0) );
checkCudaErrors( cudaMemcpy2DFromArray(dData, outWidth*4*sizeof(float), texData, 0, 0, outWidth*4*sizeof(float), outHeight, cudaMemcpyDeviceToDevice) );
checkCudaErrors( cudaGraphicsUnmapResources(1, &resData) );
// initialize the solution "x" to the data term
checkCudaErrors( cudaMemcpy(dX, dData, outWidth*outHeight*4*sizeof(float), cudaMemcpyDeviceToDevice) );
double dataEndTime = getTime();
// render the gx term into a frame buffer object [1]
glBindFramebuffer(GL_FRAMEBUFFER, outFbo[1]);
double gxStartTime = getTime();
drawGradientTerm(renderData, true);
#ifdef MEASURE_TIMINGS
glFinish();
#endif
double gxEndTime = getTime();
glBindFramebuffer(GL_FRAMEBUFFER, 0);
checkCudaErrors( cudaGraphicsMapResources(1, &resGx) );
checkCudaErrors( cudaGraphicsSubResourceGetMappedArray(&texGx, resGx, 0, 0) );
checkCudaErrors( cudaMemcpy2DFromArray(dGx, outWidth*4*sizeof(float), texGx, 0, 0, outWidth*4*sizeof(float), outHeight, cudaMemcpyDeviceToDevice) );
checkCudaErrors( cudaGraphicsUnmapResources(1, &resGx) );
// render the gy term into a frame buffer object [2]
glBindFramebuffer(GL_FRAMEBUFFER, outFbo[2]);
double gyStartTime = getTime();
drawGradientTerm(renderData, false);
#ifdef MEASURE_TIMINGS
glFinish();
#endif
double gyEndTime = getTime();
glBindFramebuffer(GL_FRAMEBUFFER, 0);
checkCudaErrors( cudaGraphicsMapResources(1, &resGy) );
checkCudaErrors( cudaGraphicsSubResourceGetMappedArray(&texGy, resGy, 0, 0) );
checkCudaErrors( cudaMemcpy2DFromArray(dGy, outWidth*4*sizeof(float), texGy, 0, 0, outWidth*4*sizeof(float), outHeight, cudaMemcpyDeviceToDevice) );
checkCudaErrors( cudaGraphicsUnmapResources(1, &resGy) );
#ifdef MEASURE_TIMINGS
cudaDeviceSynchronize();
#endif
double cgStartTime = getTime();
cudaConjugateGradients(dataWeight, outWidth, outHeight, dData, dGx, dGy, dX, cgIterations);
#ifdef MEASURE_TIMINGS
cudaDeviceSynchronize();
#endif
double cgEndTime = getTime();
// copy solution to an OpenGL texture
checkCudaErrors( cudaGraphicsMapResources(1, &resResult) );
checkCudaErrors( cudaGraphicsSubResourceGetMappedArray(&texResult, resResult, 0, 0) );
checkCudaErrors( cudaMemcpy2DToArray(texResult, 0, 0, dX, outWidth*4*sizeof(float), outWidth*4*sizeof(float), outHeight, cudaMemcpyDeviceToDevice) );
checkCudaErrors( cudaGraphicsUnmapResources(1, &resResult) );
// free the temporary arrays
checkCudaErrors( cudaFree(dData) );
checkCudaErrors( cudaFree(dGx) );
checkCudaErrors( cudaFree(dGy) );
checkCudaErrors( cudaFree(dX) );
checkCudaErrors( cudaGraphicsUnregisterResource(resData) );
checkCudaErrors( cudaGraphicsUnregisterResource(resGx) );
checkCudaErrors( cudaGraphicsUnregisterResource(resGy) );
checkCudaErrors( cudaGraphicsUnregisterResource(resResult) );
double totalEndTime = getTime();
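// How the timings are reported is not shown in the original listing; a
// hypothetical printout, assuming getTime() returns seconds:
printf("data %.2f ms, gx %.2f ms, gy %.2f ms, cg %.2f ms, total %.2f ms\n",
1e3*(dataEndTime - dataStartTime), 1e3*(gxEndTime - gxStartTime),
1e3*(gyEndTime - gyStartTime), 1e3*(cgEndTime - cgStartTime),
1e3*(totalEndTime - dataStartTime));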
}
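// The functions below implement conjugate gradients for the screened Poisson
// system induced by the compositing energy: a data term weighted by wD plus
// two gradient terms that ask the x- and y-differences of the solution to
// match the rendered gX and gY images. initGradientKernel evaluates the
// negated energy gradient as the initial residual r = b - A*x and reduces
// |r|^2 per thread block; the remaining kernels are the standard CG steps.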
__global__ void initGradientKernel(const int stride, const int w, const int h,
const float wD,
const float *const x, const float *const data,
const float *const gX, const float *const gY,
float *const residual, float *const rMagn2Array)
{
extern __shared__ float sdata[];
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
int ch = blockIdx.z * blockDim.z + threadIdx.z;
int tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y;
// Out-of-range threads contribute 0 to the reduction rather than returning
// early: every thread must reach the __syncthreads() calls below.
float g = 0.0f;
if (j < w && i < h)
{
int idx = i*stride + j*bTotal + ch;
float c = x[idx];
// gradient of the data term
float gradD = wD * (c - data[idx]);
// gradient of the x-gradient term, with explicit boundary cases
float gradGX;
if (j == 0)
{
gradGX = c - x[idx+bTotal] + gX[idx];
}
else if (j < w-1)
{
gradGX = 2*c - x[idx-bTotal] - x[idx+bTotal] + gX[idx] - gX[idx-bTotal];
}
else
{
gradGX = c - x[idx-bTotal] - gX[idx-bTotal];
}
// gradient of the y-gradient term
float gradGY;
if (i == 0)
{
gradGY = c - x[idx+stride] - gY[idx+stride];
}
else if (i < h-1)
{
gradGY = 2*c - x[idx+stride] - x[idx-stride] + gY[idx] - gY[idx+stride];
}
else
{
gradGY = c - x[idx-stride] + gY[idx];
}
g = gradD + gradGX + gradGY;
residual[idx] = -g;
}
// rMagn2 reduction: first fold the third band onto the first, then halve;
// assumes blockDim.x*blockDim.y is a power of two (8*8 = 64 here)
sdata[tid] = g*g;
__syncthreads();
int elsPerBand = blockDim.x * blockDim.y;
int size = 2*elsPerBand;
if (tid < elsPerBand)
{
sdata[tid] += sdata[tid+size];
}
__syncthreads();
for (size = size/2; size > 0; size >>= 1)
{
if (tid < size)
{
sdata[tid] += sdata[tid+size];
}
__syncthreads();
}
if (tid == 0)
{
rMagn2Array[blockIdx.x + blockIdx.y*gridDim.x] = sdata[0];
}
}
void initGradient(const dim3 gridDim, const dim3 blockDim, const int blockEls,
const int stride, const int w, const int h,
const float wD,
const float *const dX, const float *const dData,
const float *const dGx, const float *const dGy,
float *const dResidual, float *const dDirection,
float *const dRMagn2Array, float *const pRMagn2Array,
float & rMagn2)
{
initGradientKernel<<<gridDim, blockDim, blockEls*sizeof(float)>>>(
stride, w, h, wD,
dX, dData, dGx, dGy,
dResidual, dRMagn2Array);
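// finish the reduction on the host: copy the per-block partial sums back
// and add them up (the device-to-host copy also fences the kernel launch)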
cudaMemcpy(pRMagn2Array, dRMagn2Array, gridDim.x*gridDim.y*sizeof(float), cudaMemcpyDeviceToHost);
rMagn2 = pRMagn2Array[0];
for (unsigned int i = 1; i < gridDim.x*gridDim.y; i++)
{
rMagn2 += pRMagn2Array[i];
}
cudaMemcpy(dDirection, dResidual, w*h*bTotal*sizeof(float), cudaMemcpyDeviceToDevice);
}
__global__ void conjugateKernel(const int stride, const int w, const int h,
const float beta,
const float *const residual, float *const direction)
{
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
if (j >= w || i >= h)
{
return;
}
int ch = blockIdx.z * blockDim.z + threadIdx.z;
int idx = i*stride + j*bTotal + ch;
direction[idx] = residual[idx] + beta * direction[idx];
}
void conjugateGpu(const dim3 gridDim, const dim3 blockDim,
const int stride, const int w, const int h,
const float beta,
const float *const dResidual, float *const dDirection)
{
conjugateKernel<<<gridDim, blockDim>>>(stride, w, h, beta, dResidual, dDirection);
}
__global__ void computeGradientKernel(const int stride, const int w, const int h,
const float wD,
const float *const direction, const float *const residual,
float *const q,
float *const alphaNumArray, float *const alphaDenArray)
{
extern __shared__ float sdata[];
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
int ch = blockIdx.z * blockDim.z + threadIdx.z;
int elsPerBand = blockDim.x * blockDim.y;
int tidNum = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y;
int tidDen = tidNum + 3*elsPerBand;
// Out-of-range threads contribute 0 to both reductions rather than
// returning early, so that all threads reach the __syncthreads() calls.
float c = 0.0f, g = 0.0f, r = 0.0f;
if (j < w && i < h)
{
int idx = i*stride + j*bTotal + ch;
c = direction[idx];
// apply the system matrix to the search direction: q = A*d
float gradD = wD * c;
float gradGX;
if (j == 0)
{
gradGX = c - direction[idx+bTotal];
}
else if (j < w-1)
{
gradGX = 2*c - direction[idx-bTotal] - direction[idx+bTotal];
}
else
{
gradGX = c - direction[idx-bTotal];
}
float gradGY;
if (i == 0)
{
gradGY = c - direction[idx+stride];
}
else if (i < h-1)
{
gradGY = 2*c - direction[idx+stride] - direction[idx-stride];
}
else
{
gradGY = c - direction[idx-stride];
}
g = gradD + gradGX + gradGY;
q[idx] = g;
r = residual[idx];
}
// alphaNum / alphaDen reduction: numerator partials in the first half of
// shared memory, denominator partials in the second half
int size = 2*elsPerBand;
sdata[tidNum] = c * r;
sdata[tidDen] = g * c;
__syncthreads();
if (tidNum < elsPerBand)
{
sdata[tidNum] += sdata[tidNum+size];
sdata[tidDen] += sdata[tidDen+size];
}
__syncthreads();
for (size = size/2; size > 0; size >>= 1)
{
if (tidNum < size)
{
sdata[tidNum] += sdata[tidNum+size];
sdata[tidDen] += sdata[tidDen+size];
}
__syncthreads();
}
if (tidNum == 0)
{
alphaNumArray[blockIdx.x + blockIdx.y*gridDim.x] = sdata[0];
alphaDenArray[blockIdx.x + blockIdx.y*gridDim.x] = sdata[3*elsPerBand];
}
}
void computeGradientGpu(const dim3 gridDim, const dim3 blockDim, const int blockEls,
const int stride, const int w, const int h,
const float wD,
const float *const dDirection, const float *const dResidual,
float *const dQ,
float *const dAlphaNumArray, float *const pAlphaNumArray,
float *const dAlphaDenArray, float *const pAlphaDenArray,
float & alpha)
{
computeGradientKernel<<<gridDim, blockDim, 2*blockEls*sizeof(float)>>>(
stride, w, h, wD,
dDirection, dResidual, dQ, dAlphaNumArray, dAlphaDenArray);
cudaMemcpy(pAlphaNumArray, dAlphaNumArray, gridDim.x*gridDim.y*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(pAlphaDenArray, dAlphaDenArray, gridDim.x*gridDim.y*sizeof(float), cudaMemcpyDeviceToHost);
float alphaNum = pAlphaNumArray[0];
float alphaDen = pAlphaDenArray[0];
for (unsigned int i = 1; i < gridDim.x*gridDim.y; i++)
{
alphaNum += pAlphaNumArray[i];
alphaDen += pAlphaDenArray[i];
}
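// optimal CG step length along the search direction: alpha = (r.d)/(d.A d)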
alpha = alphaNum / alphaDen;
}
__global__ void downhillStepKernel(const int stride, const int w, const int h,
const float alpha,
const float *const direction, const float *const q,
float *const x, float *const residual,
float *const rMagn2Array)
{
extern __shared__ float sdata[];
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
int ch = blockIdx.z * blockDim.z + threadIdx.z;
int tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y;
// Out-of-range threads contribute 0 to the reduction rather than returning
// early, so that all threads reach the __syncthreads() calls.
float res = 0.0f;
if (j < w && i < h)
{
int idx = i*stride + j*bTotal + ch;
// x += alpha*d and r -= alpha*q, reusing q = A*d from the previous kernel
x[idx] += alpha * direction[idx];
res = residual[idx] - alpha * q[idx];
residual[idx] = res;
}
// rMagn2 reduction
sdata[tid] = res*res;
__syncthreads();
int elsPerBand = blockDim.x * blockDim.y;
int size = 2*elsPerBand;
if (tid < elsPerBand)
{
sdata[tid] += sdata[tid+size];
}
__syncthreads();
for (size = size/2; size > 0; size >>= 1)
{
if (tid < size)
{
sdata[tid] += sdata[tid+size];
}
__syncthreads();
}
// only thread 0 holds the fully reduced block sum (cf. initGradientKernel)
if (tid == 0)
{
rMagn2Array[blockIdx.x + blockIdx.y*gridDim.x] = sdata[0];
}
}
__global__ void downhillStepKernelLinear(const float alpha,
const float *const direction, const float *const q,
float *const x, float *const residual,
float *const rMagn2Array)
{
extern __shared__ float sdata[];
int tid = threadIdx.x;
int idx = blockIdx.x * blockDim.x + tid;
x[idx] += alpha * direction[idx];
float res = residual[idx] - alpha * q[idx];
residual[idx] = res;
// rMagn2 reduction
sdata[tid] = res*res;
__syncthreads();
for (unsigned int s = blockDim.x/2; s > 0; s >>= 1)
{
if (tid < s)
{
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global memory
if (tid == 0)
{
rMagn2Array[blockIdx.x] = sdata[0];
}
}
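// The linear variant above avoids the per-band folding but assumes the total
// element count is an exact multiple of a power-of-two block size; a
// hypothetical launch (not present in the original listing):
//
// int n = w*h*bTotal;               // total number of band elements
// int threads = 256;                // power of two, must divide n evenly
// downhillStepKernelLinear<<<n/threads, threads, threads*sizeof(float)>>>(
//     alpha, dDirection, dQ, dX, dResidual, dRMagn2Array);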
void downhillStepGpu(const dim3 gridDim, const dim3 blockDim, const int blockEls,
const int stride, const int w, const int h,
const float alpha,
const float *const dDirection, const float *const dQ,
float *const dX, float *const dResidual,
float *const dRMagn2Array, float *const pRMagn2Array,
float & rMagn2)
{
downhillStepKernel<<<gridDim, blockDim, blockEls*sizeof(float)>>>(
stride, w, h, alpha, dDirection, dQ, dX, dResidual, dRMagn2Array);
cudaMemcpy(pRMagn2Array, dRMagn2Array, gridDim.x*gridDim.y*sizeof(float), cudaMemcpyDeviceToHost);
rMagn2 = pRMagn2Array[0];
for (unsigned int i = 1; i < gridDim.x*gridDim.y; i++)
{
rMagn2 += pRMagn2Array[i];
}
}
void cudaConjugateGradients(const float wD, const int w, const int h,
const float *const dData,
const float *const dGx,
const float *const dGy,
float *const dX,
const int nIter)
{
const int numBytes = w*h*bTotal*sizeof(float);
const int blockSize = 8;
float rMagn2, rMagn2Old, alpha;
dim3 blockDim;
blockDim.x = blockDim.y = blockSize;
blockDim.z = 3;
dim3 gridDim;
gridDim.x = (w + blockSize - 1) / blockSize;
gridDim.y = (h + blockSize - 1) / blockSize;
const int blockEls = blockDim.x * blockDim.y * blockDim.z;
// init GPU arrays
float * dResidual, * dDirection, * dQ;
checkCudaErrors( cudaMalloc((void**)&dResidual, numBytes) );
checkCudaErrors( cudaMalloc((void**)&dDirection, numBytes) );
checkCudaErrors( cudaMalloc((void**)&dQ, numBytes) );
float * dSum1Array, * dSum2Array, * pSum1Array, * pSum2Array;
pSum1Array = new float[gridDim.x*gridDim.y];
pSum2Array = new float[gridDim.x*gridDim.y];
checkCudaErrors( cudaMalloc((void**)&dSum1Array, gridDim.x*gridDim.y*sizeof(float)) );
checkCudaErrors( cudaMalloc((void**)&dSum2Array, gridDim.x*gridDim.y*sizeof(float)) );
const int stride = w*bTotal;
// initialize gradient
initGradient(gridDim, blockDim, blockEls, stride, w, h, wD,
dX, dData, dGx, dGy, dResidual, dDirection, dSum1Array,
pSum1Array, rMagn2);
rMagn2Old = rMagn2;
for (int iter = 0; iter < nIter; iter++)
{
if (iter > 0)
{
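// Fletcher-Reeves update: beta = |r_k|^2 / |r_{k-1}|^2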
float beta = rMagn2 / rMagn2Old;
conjugateGpu(gridDim, blockDim, stride, w, h, beta, dResidual, dDirection);
}
// compute the step size
computeGradientGpu(gridDim, blockDim, blockEls, stride, w, h, wD,
dDirection, dResidual, dQ,
dSum1Array, pSum1Array, dSum2Array, pSum2Array, alpha);
// take a downhill step and update residual
rMagn2Old = rMagn2;
downhillStepGpu(gridDim, blockDim, blockEls, stride, w, h, alpha,
dDirection, dQ, dX, dResidual, dSum1Array, pSum1Array, rMagn2);
}
// clean up
delete[] pSum1Array;
delete[] pSum2Array;
cudaFree(dResidual);
cudaFree(dDirection);
cudaFree(dQ);
cudaFree(dSum1Array);
cudaFree(dSum2Array);
}
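// A minimal standalone driver (an illustration, not part of the original
// source): runs the solver on a small image whose target gradients are zero,
// so the result should relax toward a smoothed copy of the data term.
void testConjugateGradients()
{
const int w = 64, h = 64;
const int n = w*h*bTotal;
float * pData = new float[n];
for (int i = 0; i < n; i++)
{
pData[i] = float(i % (w*bTotal)) / (w*bTotal); // horizontal ramp
}
float * dData, * dGx, * dGy, * dX;
checkCudaErrors( cudaMalloc(&dData, n*sizeof(float)) );
checkCudaErrors( cudaMalloc(&dGx, n*sizeof(float)) );
checkCudaErrors( cudaMalloc(&dGy, n*sizeof(float)) );
checkCudaErrors( cudaMalloc(&dX, n*sizeof(float)) );
checkCudaErrors( cudaMemcpy(dData, pData, n*sizeof(float), cudaMemcpyHostToDevice) );
checkCudaErrors( cudaMemset(dGx, 0, n*sizeof(float)) );
checkCudaErrors( cudaMemset(dGy, 0, n*sizeof(float)) );
// start from the data term, as integrateCuda() does
checkCudaErrors( cudaMemcpy(dX, dData, n*sizeof(float), cudaMemcpyDeviceToDevice) );
cudaConjugateGradients(dataWeight, w, h, dData, dGx, dGy, dX, cgIterations);
delete[] pData;
cudaFree(dData);
cudaFree(dGx);
cudaFree(dGy);
cudaFree(dX);
}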