__global__ proc(float *arr,float *brr){ float v; __shared__ float shared[L]; shared[threadIdx.x] =...

1
__global__ proc(float *arr,float *brr){ float v; __shared__ float shared[L]; shared[threadIdx.x] = brr[threadIdx.x]; __syncthreads(); if(threadIdx.x!=0){ v=arr[threadIdx.x]; v+=shared[threadIdx.x]; … }else{ } … } Modularity for HPC - WootinJ- GPGPU in HPC Many SuperComputers support GPGPU TSUBAME2, Dawning Nebulae, … Many non-functional concerns Optimization Hardware-aware Fail-safe Masayuki Ioki, Shumpei Hozumi, Shigeru Chiba Tokyo Institute of Technology WootinJ Runtime converter from Java to CUDA Generating CUDA code with runtime context Delete some overheads in OOP Devirtualization Flattening the structure of an object to remove field access chains Motivating example GPU has several types of memories. Global Memory Large but slow Shared Memory fast but small __global__ proc(float *arr, float *brr){ float v; if(threadIdx.x!=0) { v=arr[threadIdx.x]; v+= brr[threadIdx.x]; }else{ } … } Global Memory SM SM SP Shared Memory SP Streaming Processors GPU Streaming Multiprocessors Non SharedMemory Ver. __global__ proc(float *arr,float *brr){ float v; __shared__ float shared[L]; shared[threadIdx.x] = arr[threadIdx.x]; __syncthreads(); if(threadIdx.x!=0){ v=shared[threadIdx.x]; v+=brr[threadIdx.x]; … }else{ } … } __global__ proc(float *arr,float *brr){ float v; __shared__ float shared_a[L]; __shared__ float shared_b[L]; shared_a[threadIdx.x] = arr[threadIdx.x]; shared_b[threadIdx.x] = brr[threadIdx.x]; __syncthreads(); if(threadIdx.x!=0){ v=shared_a[threadIdx.x]; v+=shared_b[threadIdx.x]; }else{ } ...} brr -> SharedMemory arr -> SharedMemory both -> SharedMemory HPC Programmers hate OOP. OOP has rich modularities. However, OOP has many overheads.
Dynamic method dispatch Field access chain class Calc{ Memory memA, memB …; @global void proc(float[] arr, float[] brr) { … } } Calc calc = new Calc(); float[256] arr=..., brr= …; Dim3s dim3s = new Dim3s(); dim3s.threadDim=new Dim3(256); CUDAKicker .run(dim3s,calc,"proc", arr,brr); Java bytecode Java AST CUDA code Run on GPUs WootinJ Sample Code Memory memA = new SimpleSharedMem(256); Memory memB = new Memory(); memA.set(arr,threadIdx.x); memB.set(brr,threadIdx.x); void set_memA(float[] arr,int i){ /* SimpleSharedMem method */ } void set_memB(float[] arr,int i){ /* Memory method body */ } set_memA(arr,threadIdx.x); set_memB(brr,threadIdx.x); Devirtualization Dynamic method dispatch to Static find all actual types from given objects Micro Benchmark matrix product WootinJ’s overhead is about 2 sec. JVM start up CUDA code generate and compile TSUBAME2 Super Computer CPUs : Intel Xeon 2.93 GHz * 2 GPUs : NVIDIA Tesla M2050 * 3 Memory : 54GB GPUs GFLOPS Compile-time check for Devirtualization a = b both types must be Strict-final in @global method assignment exp. obj.m(); Return type must be Strict-final return type of the method. Strict-final 1. primitive types are strict-final. 2. The class that is final class and all fields are strict-final, is strict-final. 3. An array that its element type is strict-final, is strict-final. @global is an annotation for CUDA function. My name is Wootin!

Upload: jayson-dorsey

Post on 02-Jan-2016

257 views

Category:

Documents


0 download

TRANSCRIPT

Page 1: __global__ proc(float *arr,float *brr){ float v; __shared__ float shared[L]; shared[threadIdx.x] = brr[threadIdx.x]; __syncthreads(); if(threadIdx.x!=0){

__global__ proc(float *arr,float *brr){ float v; __shared__ float shared[L]; shared[threadIdx.x] = brr[threadIdx.x]; __syncthreads();

if(threadIdx.x!=0){ v=arr[threadIdx.x]; v+=shared[threadIdx.x]; … }else{ … } … }

Modularity for HPC -WootinJ-

GPGPU in HPC — Many SuperComputers support GPGPU: TSUBAME2, Dawning Nebulae, …

Many non-functional concerns: Optimization, Hardware-aware, Fail-safe

Masayuki Ioki, Shumpei Hozumi,

Shigeru Chiba

Tokyo Institute of Technology

WootinJ — Runtime converter from Java to CUDA, generating CUDA code with runtime context

Delete some overheads in OOP: Devirtualization — flattening the structure of an object to remove field access chains

Motivating example: GPU has several types of memories. Global Memory — large but slow

Shared Memory — fast but small

__global__ proc(float *arr, float *brr){ float v; if(threadIdx.x!=0){ v=arr[threadIdx.x]; v+= brr[threadIdx.x]; … }else{ … } … }

Global Memory

SMSM

SP

Shared Memory

SP — Streaming Processors …

GPU

Streaming Multiprocessors

Non SharedMemory Ver.

__global__ proc(float *arr,float *brr){ float v; __shared__ float shared[L]; shared[threadIdx.x] = arr[threadIdx.x]; __syncthreads();

if(threadIdx.x!=0){ v=shared[threadIdx.x]; v+=brr[threadIdx.x]; … }else{ … } … }

__global__ proc(float *arr,float *brr){ float v; __shared__ float shared_a[L]; __shared__ float shared_b[L]; shared_a[threadIdx.x] = arr[threadIdx.x]; shared_b[threadIdx.x] = brr[threadIdx.x]; __syncthreads();

if(threadIdx.x!=0){ v=shared_a[threadIdx.x]; v+=shared_b[threadIdx.x]; }else{ … } ...}

brr -> SharedMemory / arr -> SharedMemory / both -> SharedMemory

HPC Programmers hate OOP. OOP has rich modularities; however, OOP has many overheads: dynamic method dispatch and field access chains.

class Calc{ Memory memA, memB …; @global void proc(float[] arr, float[] brr) { … }}

Calc calc = new Calc();float[256] arr=..., brr= …; Dim3s dim3s = new Dim3s();dim3s.threadDim=new Dim3(256);

CUDAKicker    .run(dim3s,calc,"proc",arr,brr);

Java bytecode

Java AST

CUDA code

Run on GPUs

WootinJ Sample Code

Memory memA = new SimpleSharedMem(256);Memory memB = new Memory();

memA.set(arr,threadIdx.x);memB.set(brr,threadIdx.x);

void set_memA(float[] arr,int i){ /* SimpleSharedMem method */ }void set_memB(float[] arr,int i){ /* Memory method body */ }…set_memA(arr,threadIdx.x);set_memB(brr,threadIdx.x);

Devirtualization — Dynamic method dispatch to static: find all actual types from given objects. Micro Benchmark

matrix product

WootinJ’s overhead is about 2 sec.: JVM start up, CUDA code generation and compilation

TSUBAME2 Super Computer CPUs : Intel Xeon 2.93 GHz * 2 GPUs : NVIDIA Tesla M2050 * 3 Memory : 54GB

GPUs

GFLOPS

Compile-time check for Devirtualization

a = b

both types must be Strict-final

in @global method

assignment exp.

obj.m();

Return type must be Strict-final

return type of the method. Strict-final: 1. Primitive types are strict-final. 2. A class that is a final class, all of whose fields are strict-final, is strict-final. 3. An array whose element type is strict-final, is strict-final.

@global is an annotation for

CUDA function.

My name is Wootin!