c# with cudafy: image stitching...

John Hauck - Manager of software at LECO, a scientific laboratory instrument company. - Loves the Lord, his wife, 3 kids, and sailing on Lake Michigan

-GOAL: Encourage C# developers to explore CUDA

[email protected]

S4375 C# with CUDAfy: Image Stitching Concepts

// Select the compute device; pass eGPUType.OpenCL instead of eGPUType.Cuda to target OpenCL.
GPGPU Gpu = CudafyHost.GetDevice(eGPUType.Cuda);

// Make the translator emit the language that matches the device that was actually selected.
CudafyTranslator.Language = Gpu is OpenCLDevice ? eLanguage.OpenCL : eLanguage.Cuda;

// Translate the kernels for the sm_11 architecture and load the resulting module onto the device.
var module = CudafyTranslator.Cudafy(eArchitecture.sm_11);
Gpu.LoadModule(module);

Initialize

The “Manufactured” Problem Restore an image that has been divided into 9 tiles, where the tiles have been randomly rearranged. Here is a fragment of a scroll (containing Isaiah 6:10) that has been scrambled.

w8isms.blogspot.com

Scaling the Image Scale the image to a manageable size of 192x192 pixels and remove the color component.

// load the source image from disk into host memory (MyImageReader is declared elsewhere)

var cpuImage = new MyImageReader(sourceFileName);

// allocate GPU memory for the source image: one uint per element of cpuImage.Pixels

var gpuImage = Gpu.Allocate<uint>(cpuImage.Pixels.Length);

// copy the image pixels from the host to the GPU

Gpu.CopyToDevice(cpuImage.Pixels, gpuImage);

// allocate the scaled image on the GPU: 192x192 bytes, i.e. a 3x3 grid of 64x64 tiles

var gpuScaledImage = Gpu.Allocate<byte>(64 * 3, 64 * 3);

// rescale the image using the GPU: 12x12 blocks of 16x16 threads = 192x192 threads, one per scaled pixel

Gpu.Launch(new dim3(12, 12), new dim3(16, 16),

ScaleImageKernel, gpuImage, cpuImage.Width, cpuImage.Height, gpuScaledImage);

[Cudafy]
public static void ScaleImageKernel(GThread gThread, uint[] sourceImage, int sourceWidth, int sourceHeight, byte[,] scaledImage)
{
    // Each thread produces exactly one pixel of the scaled image.
    var column = gThread.blockIdx.x * gThread.blockDim.x + gThread.threadIdx.x;
    var row = gThread.blockIdx.y * gThread.blockDim.y + gThread.threadIdx.y;

    // Map the scaled coordinate back onto the source image (integer division truncates).
    var sourceColumn = column * sourceWidth / scaledImage.GetLength(0);
    var sourceRow = row * sourceHeight / scaledImage.GetLength(1);
    var packed = sourceImage[sourceColumn + sourceRow * sourceWidth];

    // Unpack the three 8-bit colour channels from the packed pixel.
    var red = (packed >> 16) & 0xFF;
    var green = (packed >> 8) & 0xFF;
    var blue = packed & 0xFF;

    // Collapse the colour to a single weighted grey value.
    scaledImage[column, row] = (byte)(red * 0.3f + green * 0.6f + blue * 0.1f);
}

12x12 Blocks 16x16 Threads

// Auto-generated CUDA C for the C# ScaleImageKernel (from CUDAFYSOURCETEMP.CU); shown for reference, not hand-edited.
// Each array parameter arrives as a raw pointer followed by its length argument(s): sourceImage + sourceImageLen0,
// scaledImage + scaledImageLen0/scaledImageLen1. The 2-D element [num, num2] is addressed as
// num * scaledImageLen1 + num2, and the C# locals appear here as num..num8.
extern "C" __global__ void ScaleImageKernel( unsigned int* sourceImage, int sourceImageLen0, int sourceWidth, int sourceHeight, unsigned char* scaledImage, int scaledImageLen0, int scaledImageLen1) { int num = blockIdx.x * blockDim.x + threadIdx.x; int num2 = blockIdx.y * blockDim.y + threadIdx.y; int num3 = num * sourceWidth / scaledImageLen0; int num4 = num2 * sourceHeight / scaledImageLen1; unsigned int num5 = sourceImage[(num3 + num4 * sourceWidth)]; unsigned int num6 = num5 & 255u; unsigned int num7 = num5 >> 8 & 255u; unsigned int num8 = num5 >> 16 & 255u; scaledImage[(num) * scaledImageLen1 + ( num2)] = (unsigned char)(num8 * 0.3f + num7 * 0.6f + num6 * 0.1f); }

Auto-generated CUDAFYSOURCETEMP.CU

[Cudafy]
public static void EnhancedScaleImageKernel(GThread gThread, uint[] sourceImage, int sourceWidth, int sourceHeight, byte[,] scaledImage)
{
    // Derive this thread's scaled-pixel coordinate from its block and thread indices,
    // then let the helper compute and store that pixel.
    EnhancedScaleImagePixel(
        sourceImage,
        sourceWidth,
        sourceHeight,
        scaledImage,
        gThread.blockIdx.x * gThread.blockDim.x + gThread.threadIdx.x,
        gThread.blockIdx.y * gThread.blockDim.y + gThread.threadIdx.y);
}

Scaling the Image The EnhancedScaleImageKernel averages the source pixels, and makes use of a helper function.

/// <summary>
/// Computes one pixel of the scaled greyscale image by averaging the weighted grey value of
/// every source pixel that maps onto it, and stores the result in <paramref name="scaledImage"/>.
/// </summary>
/// <param name="sourceImage">Packed source pixels, indexed as x + y * sourceWidth.</param>
/// <param name="sourceWidth">Width of the source image in pixels.</param>
/// <param name="sourceHeight">Height of the source image in pixels.</param>
/// <param name="scaledImage">Destination image, indexed as [x, y].</param>
/// <param name="scaledX">X coordinate of the destination pixel.</param>
/// <param name="scaledY">Y coordinate of the destination pixel.</param>
/// <returns>Always 0; the computed value is written to <paramref name="scaledImage"/>.</returns>
[Cudafy]
private static float EnhancedScaleImagePixel(uint[] sourceImage, int sourceWidth, int sourceHeight, byte[,] scaledImage, int scaledX, int scaledY)
{
    // Half-open window [start, end) of source pixels covered by this scaled pixel.
    var startX = scaledX * sourceWidth / scaledImage.GetLength(0);
    var startY = scaledY * sourceHeight / scaledImage.GetLength(1);
    var endX = (scaledX + 1) * sourceWidth / scaledImage.GetLength(0);
    var endY = (scaledY + 1) * sourceHeight / scaledImage.GetLength(1);

    // When the source is smaller than the scaled image along an axis, integer division can
    // make the window empty; sample the single nearest pixel so that count is never zero.
    if (endX <= startX)
    {
        endX = startX + 1;
    }

    if (endY <= startY)
    {
        endY = startY + 1;
    }

    var sum = 0f;
    var count = 0;

    for (var sourceX = startX; sourceX < endX; sourceX++)
    {
        for (var sourceY = startY; sourceY < endY; sourceY++)
        {
            var sourcePixel = sourceImage[sourceX + sourceY * sourceWidth];
            var blue = sourcePixel & 0xFF;
            var green = (sourcePixel >> 8) & 0xFF;
            var red = (sourcePixel >> 16) & 0xFF;

            sum += red * 0.3f + green * 0.6f + blue * 0.1f;
            count++;
        }
    }

    scaledImage[scaledX, scaledY] = (byte)(sum / count);

    return 0;
}

// CPU version of the scaling step: visit all 192 * 192 scaled pixels in parallel.
Parallel.For(0, 192 * 192, pixelIndex =>
{
    // Split the flat index into an (x, y) coordinate within the 192-pixel-wide image.
    EnhancedScaleImagePixel(sourceImage, sourceWidth, sourceHeight, scaledImage, pixelIndex % 192, pixelIndex / 192);
});

GPU vs. CPU The GPU performs this task over 100 times faster than the CPU for my configuration: a Dell Precision T3600, 16GB RAM, Intel Xeon E5-2665 0 @ 2.40GHz, NVidia GTX Titan.

Extracting the Edges We have 9 tiles, each with 4 edges, and each edge has 64 pixels. The C# code that runs on the CPU reads as follows:

// allocate edges memory on the GPU: 9 tiles x 4 edges x 64 pixels per edge

var gpuEdges = Gpu.Allocate<byte>(9, 4, 64);

// extract edge information using the GPU: one block per (tile, edge) pair, one thread per edge pixel

Gpu.Launch(new dim3(9, 4), 64, ExtractEdgeKernel, gpuScaledImage, gpuEdges);

public static void ExtractEdgeKernel(GThread gThread, byte[,] image, byte[,,] edges)
{
    // The block index selects the tile (x) and the edge (y); the thread index selects the pixel along that edge.
    var tile = gThread.blockIdx.x;
    var edge = gThread.blockIdx.y;
    var offset = gThread.threadIdx.x;

    // Top-left corner of the tile inside the 3x3 grid of 64x64-pixel tiles.
    var x = (tile % 3) * 64;
    var y = (tile / 3) * 64;

    if (edge == 0)
    {
        // left
        y += offset;
    }
    else if (edge == 1)
    {
        // top
        x += offset;
    }
    else if (edge == 2)
    {
        // right
        x += 63;
        y += offset;
    }
    else if (edge == 3)
    {
        // bottom
        x += offset;
        y += 63;
    }

    edges[tile, edge, offset] = image[x, y];
}

9x4 Blocks

64 Threads

Constant Memory Move the edge data into constant memory to speed up the next step.

// Edge data for all tiles, indexed [tile, edge, pixel]; the kernels read it by this name.
[Cudafy]

public static byte[,,] Edges = new byte[9, 4, 64];

// copy edges to GPU constant memory: first read the GPU buffer back into the host array ...

Gpu.CopyFromDevice(gpuEdges, Edges);

// ... then upload that host array to constant memory

Gpu.CopyToConstantMemory(Edges, Edges);

Computing Fits Compute the fitness between the edges of each tile. We want to know how well the left edge of tile 2 fits the right edge of tile 4, etc.

Computing Fits Compute the fitness between the edges of each tile. We want to know how well the left edge of tile 2 fits the right edge of tile 4, etc.

// Fit scores indexed [tileA, tileB], as written by ComputeFitsKernel; lower means a closer match.
[Cudafy]

public static float[,] LeftRightFit = new float[9, 9];

// allocate fit memory on the GPU: one score per ordered pair of tiles

var gpuFit = Gpu.Allocate<float>(9, 9);

// compare edge fitting using the GPU: edge 2 (right) of tile A against edge 0 (left) of tile B,
// with one block per tile pair and one thread per edge pixel

Gpu.Launch(new dim3(9, 9), 64, ComputeFitsKernel, 2, 0, gpuFit);

// copy the fits back to the host array, then upload them to GPU constant memory

Gpu.CopyFromDevice(gpuFit, LeftRightFit);

Gpu.CopyToConstantMemory(LeftRightFit, LeftRightFit);

public static void ComputeFitsKernel(GThread gThread, int edgeIndexA, int edgeIndexB, float[,] fit)
{
    // Per-block scratch array holding one squared difference per edge pixel.
    var squares = gThread.AllocateShared<float>("sum", 64);

    // The block index picks the pair of tiles; the thread index picks the pixel along the edges.
    var tileA = gThread.blockIdx.x;
    var tileB = gThread.blockIdx.y;
    var lane = gThread.threadIdx.x;

    var delta = Edges[tileA, edgeIndexA, lane] - Edges[tileB, edgeIndexB, lane];
    squares[lane] = delta * delta;
    gThread.SyncThreads();

    // Sum the 64 entries pairwise: each pass folds the upper half of the active range into the
    // lower half, and every thread reaches the barrier before the next pass starts.
    var stride = 32;
    while (stride > 0)
    {
        if (lane < stride)
        {
            squares[lane] += squares[lane + stride];
        }

        gThread.SyncThreads();
        stride /= 2;
    }

    // Thread 0 publishes the block's total as the fit score for this pair of tiles.
    if (lane == 0)
    {
        fit[tileA, tileB] = squares[0];
    }
}

// Auto-generated CUDA C for the C# ComputeFitsKernel (from CUDAFYSOURCETEMP.CU); shown for reference, not hand-edited.
// The C# shared array becomes `__shared__ float array[64]`, the loop start 64 / 2 appears as the constant 32,
// and the multi-dimensional indexers become flattened index expressions built from the *Len arguments.
// Edges, EdgesLen1 and EdgesLen2 are not declared inside this function; their declarations are not shown here.
extern "C" __global__ void ComputeFitsKernel( int edgeIndexA, int edgeIndexB, float* fit, int fitLen0, int fitLen1) { __shared__ float array[64]; int arrayLen0 = 64; int x = blockIdx.x; int y = blockIdx.y; int x2 = threadIdx.x; int num = (int)(Edges[(x) * EdgesLen1 * EdgesLen2 + ( edgeIndexA) * EdgesLen2 + ( x2)] - Edges[(y) * EdgesLen1 * EdgesLen2 + ( edgeIndexB) * EdgesLen2 + ( x2)]); array[(x2)] = (float)(num * num); __syncthreads(); for (int i = 32; i > 0; i /= 2) { if (x2 < i) { array[(x2)] += array[(x2 + i)]; } __syncthreads(); } if (x2 == 0) { fit[(x) * fitLen1 + ( y)] = array[(0)]; } }

Auto-generated CUDAFYSOURCETEMP.CU

V0 V1 V2

V3 V4 V5

V6 V7 V8

V[9] 0,1,2,3,4,5,6,7,8 0,1,2,3,4,5,6,8,7 0,1,2,3,4,5,7,6,8 0,1,2,3,4,5,7,8,6 0,1,2,3,4,5,8,6,7 0,1,2,3,4,5,8,7,6 … 8,7,6,5,4,3,2,1,0

9! = 362,880 permutations

Visiting All Permutations We will pick (arbitrarily) 256 threads per block. We will then create enough blocks (1418) so that each permutation has its own thread. We will write the kernel so that each block returns the best of the 256 permutations that its threads evaluate. We will then transfer the 1418 best candidates from the GPU to the CPU. The CPU will then find the best permutation among the 1418 evaluations. Here is the CPU code:

// evaluate all permutations: 256 threads per block, with the block count rounded up
// so that every permutation gets a thread of its own
const int threads = 256;
const int blocks = (Permutations + threads - 1) / threads;

// one result slot per block, on the host and on the GPU
var cpuEvaluations = new Evaluation[blocks];
var gpuEvaluations = Gpu.Allocate(cpuEvaluations);

Gpu.Launch(blocks, threads, ExplorePermutationsKernel, gpuEvaluations);
Gpu.CopyFromDevice(gpuEvaluations, cpuEvaluations);

// get the best permutation: the lowest metric among the per-block winners
// (on a tie the earliest block is kept)
var bestEvaluation = cpuEvaluations[0];
for (var block = 1; block < blocks; block++)
{
    if (cpuEvaluations[block].Metric < bestEvaluation.Metric)
    {
        bestEvaluation = cpuEvaluations[block];
    }
}

// Kernel: each thread scores one permutation of the 9 tiles, and each block writes its
// lowest-metric evaluation to evaluations[blockIdx.x].
// - The metric is the sum of 6 LeftRightFit terms (horizontally adjacent positions) and 6 TopBottomFit
//   terms (vertically adjacent positions) for the 3x3 layout 0 1 2 / 3 4 5 / 6 7 8.
// - Both shared arrays are sized for 256 threads, and the reduction loop starts at 256 / 2, so the
//   kernel expects to be launched with 256 threads per block.
// - The listing is split across two slides: the text after the "…" repeats the last two statements
//   of the first part before continuing with the reduction.
// NOTE(review): the host rounds the block count up, so if Permutations is 9! = 362,880 the last block
// has threads whose permutation index is >= Permutations. TileOrderFromPermutation is not shown here;
// confirm it tolerates such an index and that those threads cannot win the reduction.
public static void ExplorePermutationsKernel(GThread gThread, Evaluation[] evaluations) { var blockEvaluations = gThread.AllocateShared<Evaluation>("be", 256); var v = gThread.AllocateShared<byte>("v", 256, 9); var t = gThread.threadIdx.x; var permutation = gThread.blockIdx.x * gThread.blockDim.x + gThread.threadIdx.x; // 0 1 2 // 3 4 5 // 6 7 8 TileOrderFromPermutation(Permutations, permutation, 9, v, t); var metric = 0f; metric += LeftRightFit[v[t, 0], v[t, 1]] + LeftRightFit[v[t, 1], v[t, 2]]; metric += LeftRightFit[v[t, 3], v[t, 4]] + LeftRightFit[v[t, 4], v[t, 5]]; metric += LeftRightFit[v[t, 6], v[t, 7]] + LeftRightFit[v[t, 7], v[t, 8]]; metric += TopBottomFit[v[t, 0], v[t, 3]] + TopBottomFit[v[t, 3], v[t, 6]]; metric += TopBottomFit[v[t, 1], v[t, 4]] + TopBottomFit[v[t, 4], v[t, 7]]; metric += TopBottomFit[v[t, 2], v[t, 5]] + TopBottomFit[v[t, 5], v[t, 8]]; blockEvaluations[t].Permutation = permutation; blockEvaluations[t].Metric = metric;

… blockEvaluations[t].Permutation = permutation; blockEvaluations[t].Metric = metric; gThread.SyncThreads(); for (var i = 256 / 2; i > 0; i /= 2) { if (t < i) { if (blockEvaluations[t].Metric > blockEvaluations[t + i].Metric) { blockEvaluations[t] = blockEvaluations[t + i]; } } gThread.SyncThreads(); } if (gThread.threadIdx.x == 0) { evaluations[gThread.blockIdx.x] = blockEvaluations[0]; } }

http://w8isms.blogspot.com/2013/04/gpgpu-papyrus-demo.html

Bonus Material w8isms.blogspot.com

www.assembla.com/code/telecontrol/subversion/nodes

Phase Correlation

w8isms.blogspot.com [email protected]

c# with cudafy: image stitching...

Documents