Accelerate your Application with CUDA C
|
|
|
- Della Sims
- 10 years ago
- Views:
Transcription
1 Accelerate your Application with CUDA C
2 2D Minimum Algorithm: Consider applying a 2D window to a 2D array of elements. Each output element is the minimum of the input elements within the window. [Figure labels: Window, Input Matrix, Output Matrix] 2D operations like this are found in many fundamental algorithms: interpolation, convolution, filtering. Applications in seismic processing, weather simulation, image processing, etc.
3 2D Minimum: C Version #define WIN_SIZE 16 #define N int main() int size=n*n*sizeof(int); int i, j, x, y, temp; //allocate resources int *cell=(int *)malloc(size); //input int *node=(int *)malloc(size); //output initializearray(cell,n); initializearray(node,n); Allocate Resources Initialize data for(i=0;i<n;i++) for(j=0;j<n;j++) //find minimum in window temp = node[i][j]; for(x=0;x<win_size;x++) for(y=0;y<win_size;y++) if (temp> cell[i+x][j+y]) temp = cell[i+x][j+y]; node[i][j] = temp; Loop over dataset Loop over window Find min //free resources free(cell); free(node); Cleanup
4 2D Minimum: C Version #define WIN_SIZE 16 #define N for(i=0;i<n;i++) for(j=0;j<n;j++) int main() int size=n*n*sizeof(int); //find minimum in window int i, j, x, y, temp; temp = node[i][j]; //allocate resources for(x=0;x<win_size;x++) Algorithm Device Compute Speedup int *cell=(int *)malloc(size); //input for(y=0;y<win_size;y++) int *node=(int *)malloc(size); //output if (temp> cell[i+x][j+y]) C (serial) Xeon X sec -- temp = cell[i+x][j+y]; initializearray(cell,n); node[i][j] = temp; initializearray(node,n); //free resources free(cell); free(node);
5 CPU + GPU = Big Speedup Application Code GPU Compute-Intensive Functions Use CUDA to Parallelize Rest of Sequential CPU Code CPU +
6 Explicit Data Movement CPU (host) GPU (device) CPU Memory in out PCI Bus GPU Memory d_in d_out
7 2D Window: CUDA Main Program C int main() int main() int size=n*n*sizeof(int); int size=n*n*sizeof(int); //allocate resources //allocate resources int *cell=(int*)malloc(size); //input int *cell=(int*)malloc(size); int *node=(int*)malloc(size); //output int *node=(int*)malloc(size); int *d_cell; cudaMalloc(&d_cell,size); initializearray(cell,n); int *d_node; cudaMalloc(&d_node,size); initializearray(node,n); initializearray(cell,n); initializearray(node,n); // nested for loops for cudaMemcpy(d_cell,cell,size,cudaMemcpyHostToDevice); for cudaMemcpy(d_node,node,size,cudaMemcpyHostToDevice); for compute_win2d<<<nblocks, nthreads>>>(d_node,d_cell); for. //free resources //free resources free(cell); free(node); free(cell); free(node); cudaFree(d_cell); cudaFree(d_node); CUDA C
8 2D Window: CUDA Main Program int main() int size=n*n*sizeof(int); //allocate resources int *cell=(int*)malloc(size); //input int *node=(int*)malloc(size); //output initializearray(cell,n); initializearray(node,n); // nested for loops for for for for. //free resources free(cell); free(node); C int main() int size=n*n*sizeof(int); //allocate resources int *cell=(int*)malloc(size); int *node=(int*)malloc(size); int *d_cell; cudaMalloc(&d_cell,size); int *d_node; cudaMalloc(&d_node,size); initializearray(cell,n); initializearray(node,n); cudaMemcpy(d_cell,cell,size,cudaMemcpyHostToDevice); cudaMemcpy(d_node,node,size,cudaMemcpyHostToDevice); compute_win2d<<<nblocks, nthreads>>>(d_node,d_cell); //free resources free(cell); free(node); cudaFree(d_cell); cudaFree(d_node); CUDA C Allocate Device Memory
9 2D Window: CUDA Main Program int main() int size=n*n*sizeof(int); //allocate resources int *cell=(int*)malloc(size); //input int *node=(int*)malloc(size); //output initializearray(cell,n); initializearray(node,n); // nested for loops for for for for. //free resources free(cell); free(node); C int main() int size=n*n*sizeof(int); //allocate resources int *cell=(int*)malloc(size); int *node=(int*)malloc(size); int *d_cell; cudaMalloc(&d_cell,size); int *d_node; cudaMalloc(&d_node,size); initializearray(cell,n); initializearray(node,n); cudaMemcpy(d_cell,cell,size,cudaMemcpyHostToDevice); cudaMemcpy(d_node,node,size,cudaMemcpyHostToDevice); compute_win2d<<<nblocks, nthreads>>>(d_node,d_cell); //free resources free(cell); free(node); cudaFree(d_cell); cudaFree(d_node); CUDA C Copy Data to the Device
10 2D Window: CUDA Main Program int main() int size=n*n*sizeof(int); //allocate resources int *cell=(int*)malloc(size); //input int *node=(int*)malloc(size); //output initializearray(cell,n); initializearray(node,n); // nested for loops for for for for. //free resources free(cell); free(node); C Call CUDA Function int main() int size=n*n*sizeof(int); //allocate resources int *cell=(int*)malloc(size); int *node=(int*)malloc(size); int *d_cell; cudaMalloc(&d_cell,size); int *d_node; cudaMalloc(&d_node,size); initializearray(cell,n); initializearray(node,n); cudaMemcpy(d_cell,cell,size,cudaMemcpyHostToDevice); cudaMemcpy(d_node,node,size,cudaMemcpyHostToDevice); compute_win2d<<<nblocks, nthreads>>>(d_node,d_cell); //free resources free(cell); free(node); Launch cudaFree(d_cell); cudaFree(d_node); Parameters CUDA C Device Pointers
11 2D Window: CUDA Main Program C int main() int size=n*n*sizeof(int); //allocate resources int *cell=(int*)malloc(size); //input int *node=(int*)malloc(size); //output int main() int size=n*n*sizeof(int); //allocate resources int *cell=(int*)malloc(size); int *node=(int*)malloc(size); int *d_cell; cudaMalloc(&d_cell,size); initializearray(cell,n); initializearray(node,n); int *d_node; cudaMalloc(&d_node,size); initializearray(cell,n); initializearray(node,n); // nested for loops for for for for. cudaMemcpy(d_cell,cell,size,cudaMemcpyHostToDevice); cudaMemcpy(d_node,node,size,cudaMemcpyHostToDevice); compute_win2d<<<nblocks, nthreads>>>(d_node,d_cell); //free resources //free resources free(cell); free(node); free(cell); free(node); cudaFree(d_cell); cudaFree(d_node); Cleanup CUDA C
12 Parallel Execution Model: A CUDA C function is executed by many parallel threads. Threads are organized as a grid of independent thread blocks. [Figure labels: Grid; Block 0, Block 1, ..., Block N_B-2, Block N_B-1]
13 2D Window: CUDA Kernel Function for(i=0;i<n;i++) for(j=0;j<n;j++) //find minimum in window temp = node[i][j]; C for(x=0;x<win_size;x++) for(y=0;y<win_size;y++) if (temp> cell[i+x][j+y]) temp = cell[i+x][j+y]; node[i][j] = temp; __global__ void compute_win2d(int knode[][n], int kcell[][n]) int idx=blockIdx.x*blockDim.x+threadIdx.x; int idy=blockIdx.y*blockDim.y+threadIdx.y; int temp, x, y; if((idx<n)&&(idy<n)) //find minimum in window temp = knode[idx][idy]; for(x=0;x<win_size;x++) for(y=0;y<win_size;y++) if (temp> kcell[idx+x][idy+y]) temp = kcell[idx+x][idy+y]; knode[idx][idy] = temp; CUDA C
14 2D Window: CUDA Kernel Function for(i=0;i<n;i++) for(j=0;j<n;j++) //find minimum in window temp = node[i][j]; C for(x=0;x<win_size;x++) for(y=0;y<win_size;y++) if (temp> cell[i+x][j+y]) temp = cell[i+x][j+y]; node[i][j] = temp; Add __global__ Keyword __global__ void compute_win2d(int knode[][n], int kcell[][n]) int idx=blockIdx.x*blockDim.x+threadIdx.x; int idy=blockIdx.y*blockDim.y+threadIdx.y; int temp, x, y; if((idx<n)&&(idy<n)) //find minimum in window temp = knode[idx][idy]; for(x=0;x<win_size;x++) for(y=0;y<win_size;y++) if (temp> kcell[idx+x][idy+y]) temp = kcell[idx+x][idy+y]; knode[idx][idy] = temp; CUDA C
15 2D Window: CUDA Kernel Function for(i=0;i<n;i++) for(j=0;j<n;j++) //find minimum in window blockIdx.x temp = node[i][j]; C for(x=0;x<win_size;x++) for(y=0;y<win_size;y++) if (temp> cell[i+x][j+y]) temp = cell[i+x][j+y]; node[i][j] = temp; Replace Outer Loops With an Index Calculation 0 N_B threadIdx.x threadIdx.x __global__ void compute_win2d(int knode[][n], int kcell[][n]) int idx=blockIdx.x*blockDim.x+threadIdx.x; int idy=blockIdx.y*blockDim.y+threadIdx.y; int temp, x, y; if((idx<n)&&(idy<n)) //find minimum in window temp = knode[idx][idy]; for(x=0;x<win_size;x++) for(y=0;y<win_size;y++) if (temp> kcell[idx+x][idy+y]) temp = kcell[idx+x][idy+y]; knode[idx][idy] = temp; CUDA C
16 2D Window: CUDA Kernel Function C __global__ void compute_win2d(int knode[][n], int kcell[][n]) for(i=0;i<n;i++) int idx=blockIdx.x*blockDim.x+threadIdx.x; for(j=0;j<n;j++) int idy=blockIdx.y*blockDim.y+threadIdx.y; int temp, x, y; //find minimum Algorithm in window Device if((idx<n)&&(idy<n)) Compute Speedup temp = node[i][j]; //find minimum in window for(x=0;x<win_size;x++) C (serial) X5650 temp 96.1sec = knode[idx][idy]; -- for(y=0;y<win_size;y++) for(x=0;x<win_size;x++) CUDA M sec 11.5x if (temp> cell[i+x][j+y]) for(y=0;y<win_size;y++) temp = cell[i+x][j+y]; if (temp> kcell[idx+x][idy+y]) node[i][j] = temp; temp = kcell[idx+x][idy+y]; knode[idx][idy] = temp; CUDA C
17 Observation: Data Reuse Thread(i) Thread(i+1) Global Memory Neighboring threads read the same elements.
18 Optimization: Use Shared Memory Global Memory SMEM COPY BLK_SIZE WSIZE Shared Memory
19 CUDA C: Optimized Kernel __global__ void compute_win2d(int knode[][n], int kcell[][n]) __shared__ int smem[blk_size+wsize][blk_size+wsize]; int idx=blockIdx.x*blockDim.x+threadIdx.x; int idy=blockIdx.y*blockDim.y+threadIdx.y; int temp, x, y; if((idx<n)&&(idy<n)) smem[threadIdx.x][threadIdx.y]=kcell[idx][idy]; if (threadIdx.y > (N-WSIZE)) smem[threadIdx.x][threadIdx.y + WSIZE]=kcell[idx][idy+WSIZE]; if (threadIdx.x >(N-WSIZE)) smem[threadIdx.x + WSIZE][threadIdx.y]=kcell[idx+WSIZE][idy]; if ((threadIdx.x >(N-WSIZE)) && (threadIdx.y > (N-WSIZE))) smem[threadIdx.x+wsize][threadIdx.y+wsize]= kcell[idx+wsize][idy+wsize]; //wait for all threads to finish read __syncthreads(); //find minimum in window temp = knode[idx][idy]; for(x=0;x<wsize;x++) for(y=0;y<wsize;y++) if (temp> smem[threadIdx.x+x][threadIdx.y+y]) temp = smem[threadIdx.x+x][threadIdx.y+y]; knode[idx][idy] = temp;
20 CUDA C: Optimized Kernel __global__ void compute_win2d(int knode[][n], int kcell[][n]) __shared__ int smem[blk_size+wsize][blk_size+wsize]; int idx=blockIdx.x*blockDim.x+threadIdx.x; int idy=blockIdx.y*blockDim.y+threadIdx.y; int temp, x, y; if((idx<n)&&(idy<n)) Allocate Shared Memory for Each Block smem[threadIdx.x][threadIdx.y]=kcell[idx][idy]; if (threadIdx.y > (N-WSIZE)) smem[threadIdx.x][threadIdx.y + WSIZE]=kcell[idx][idy+WSIZE]; if (threadIdx.x >(N-WSIZE)) smem[threadIdx.x + WSIZE][threadIdx.y]=kcell[idx+WSIZE][idy]; if ((threadIdx.x >(N-WSIZE)) && (threadIdx.y > (N-WSIZE))) smem[threadIdx.x+wsize][threadIdx.y+wsize]= kcell[idx+wsize][idy+wsize]; //wait for all threads to finish read __syncthreads(); Shared Memory BLK_SIZE WSIZE
21 CUDA C: Optimized Kernel __global__ void compute_win2d(int knode[][n], int kcell[][n]) __shared__ int smem[blk_size+wsize][blk_size+wsize]; int idx=blockIdx.x*blockDim.x+threadIdx.x; int idy=blockIdx.y*blockDim.y+threadIdx.y; int temp, x, y; if((idx<n)&&(idy<n)) Calculate Global Memory Indices smem[threadIdx.x][threadIdx.y]=kcell[idx][idy]; if (threadIdx.y > (N-WSIZE)) smem[threadIdx.x][threadIdx.y + WSIZE]=kcell[idx][idy+WSIZE]; if (threadIdx.x >(N-WSIZE)) smem[threadIdx.x + WSIZE][threadIdx.y]=kcell[idx+WSIZE][idy]; if ((threadIdx.x >(N-WSIZE)) && (threadIdx.y > (N-WSIZE))) smem[threadIdx.x+wsize][threadIdx.y+wsize]= kcell[idx+wsize][idy+wsize]; //wait for all threads to finish read __syncthreads(); Shared Memory BLK_SIZE WSIZE
22 CUDA C: Optimized Kernel __global__ void compute_win2d(int knode[][n], int kcell[][n]) __shared__ int smem[blk_size+wsize][blk_size+wsize]; int idx=blockIdx.x*blockDim.x+threadIdx.x; int idy=blockIdx.y*blockDim.y+threadIdx.y; int temp, x, y; if((idx<n)&&(idy<n)) smem[threadIdx.x][threadIdx.y]=kcell[idx][idy]; if (threadIdx.y > (N-WSIZE)) smem[threadIdx.x][threadIdx.y + WSIZE]=kcell[idx][idy+WSIZE]; if (threadIdx.x >(N-WSIZE)) smem[threadIdx.x + WSIZE][threadIdx.y]=kcell[idx+WSIZE][idy]; if ((threadIdx.x >(N-WSIZE)) && (threadIdx.y > (N-WSIZE))) smem[threadIdx.x+wsize][threadIdx.y+wsize]= kcell[idx+wsize][idy+wsize]; //wait for all threads to finish read __syncthreads(); Load Interior Region 1 to Shared Memory 1 Shared Memory BLK_SIZE WSIZE
23 CUDA C: Optimized Kernel __global__ void compute_win2d(int knode[][n], int kcell[][n]) __shared__ int smem[blk_size+wsize][blk_size+wsize]; int idx=blockIdx.x*blockDim.x+threadIdx.x; int idy=blockIdx.y*blockDim.y+threadIdx.y; int temp, x, y; if((idx<n)&&(idy<n)) Load Halo Region 2 to Shared Memory smem[threadIdx.x][threadIdx.y]=kcell[idx][idy]; if (threadIdx.y > (N-WSIZE)) smem[threadIdx.x][threadIdx.y + WSIZE]=kcell[idx][idy+WSIZE]; if (threadIdx.x >(N-WSIZE)) smem[threadIdx.x + WSIZE][threadIdx.y]=kcell[idx+WSIZE][idy]; if ((threadIdx.x >(N-WSIZE)) && (threadIdx.y > (N-WSIZE))) smem[threadIdx.x+wsize][threadIdx.y+wsize]= kcell[idx+wsize][idy+wsize]; //wait for all threads to finish read __syncthreads(); Shared Memory 2 BLK_SIZE WSIZE
24 CUDA C: Optimized Kernel __global__ void compute_win2d(int knode[][n], int kcell[][n]) __shared__ int smem[blk_size+wsize][blk_size+wsize]; int idx=blockIdx.x*blockDim.x+threadIdx.x; int idy=blockIdx.y*blockDim.y+threadIdx.y; int temp, x, y; if((idx<n)&&(idy<n)) Load Halo Region 3 to Shared Memory smem[threadIdx.x][threadIdx.y]=kcell[idx][idy]; if (threadIdx.y > (N-WSIZE)) smem[threadIdx.x][threadIdx.y + WSIZE]=kcell[idx][idy+WSIZE]; if (threadIdx.x >(N-WSIZE)) smem[threadIdx.x + WSIZE][threadIdx.y]=kcell[idx+WSIZE][idy]; if ((threadIdx.x >(N-WSIZE)) && (threadIdx.y > (N-WSIZE))) smem[threadIdx.x+wsize][threadIdx.y+wsize]= kcell[idx+wsize][idy+wsize]; //wait for all threads to finish read __syncthreads(); 3 Shared Memory BLK_SIZE WSIZE
25 CUDA C: Optimized Kernel __global__ void compute_win2d(int knode[][n], int kcell[][n]) __shared__ int smem[blk_size+wsize][blk_size+wsize]; int idx=blockIdx.x*blockDim.x+threadIdx.x; int idy=blockIdx.y*blockDim.y+threadIdx.y; int temp, x, y; if((idx<n)&&(idy<n)) Load Halo Region 4 to Shared Memory smem[threadIdx.x][threadIdx.y]=kcell[idx][idy]; if (threadIdx.y > (N-WSIZE)) smem[threadIdx.x][threadIdx.y + WSIZE]=kcell[idx][idy+WSIZE]; if (threadIdx.x >(N-WSIZE)) smem[threadIdx.x + WSIZE][threadIdx.y]=kcell[idx+WSIZE][idy]; if ((threadIdx.x >(N-WSIZE)) && (threadIdx.y > (N-WSIZE))) smem[threadIdx.x+wsize][threadIdx.y+wsize]= kcell[idx+wsize][idy+wsize]; //wait for all threads to finish read __syncthreads(); Shared Memory 4 BLK_SIZE WSIZE
26 CUDA C: Optimized Kernel __global__ void compute_win2d(int knode[][n], int kcell[][n]) __shared__ int smem[blk_size+wsize][blk_size+wsize]; int idx=blockIdx.x*blockDim.x+threadIdx.x; int idy=blockIdx.y*blockDim.y+threadIdx.y; int temp, x, y; if((idx<n)&&(idy<n)) smem[threadIdx.x][threadIdx.y]=kcell[idx][idy]; if (threadIdx.y > (N-WSIZE)) smem[threadIdx.x][threadIdx.y + WSIZE]=kcell[idx][idy+WSIZE]; if (threadIdx.x >(N-WSIZE)) smem[threadIdx.x + WSIZE][threadIdx.y]=kcell[idx+WSIZE][idy]; if ((threadIdx.x >(N-WSIZE)) && (threadIdx.y > (N-WSIZE))) smem[threadIdx.x+wsize][threadIdx.y+wsize]= kcell[idx+wsize][idy+wsize]; //wait for all threads to finish read __syncthreads(); Wait for All Threads to Finish Writing to Shared Memory Shared Memory BLK_SIZE WSIZE
27 CUDA C: Optimized Kernel __global__ void compute_win2d(int knode[][n], int kcell[][n]) __shared__ int smem[blk_size+wsize][blk_size+wsize]; int idx=blockIdx.x*blockDim.x+threadIdx.x; int idy=blockIdx.y*blockDim.y+threadIdx.y; int temp, x, y; if((idx<n)&&(idy<n)) smem[threadIdx.x][threadIdx.y]=kcell[idx][idy]; if (threadIdx.y > (N-WSIZE)) smem[threadIdx.x][threadIdx.y + WSIZE]=kcell[idx][idy+WSIZE]; if (threadIdx.x >(N-WSIZE)) smem[threadIdx.x + WSIZE][threadIdx.y]=kcell[idx+WSIZE][idy]; if ((threadIdx.x >(N-WSIZE)) && (threadIdx.y > (N-WSIZE))) smem[threadIdx.x+wsize][threadIdx.y+wsize]= kcell[idx+wsize][idy+wsize]; //wait for all threads to finish read __syncthreads(); //find minimum in window temp = knode[idx][idy]; for(x=0;x<wsize;x++) for(y=0;y<wsize;y++) if (temp> smem[threadIdx.x+x][threadIdx.y+y]) temp = smem[threadIdx.x+x][threadIdx.y+y]; knode[idx][idy] = temp; Find Minimum: Read Input from Shared Memory, Accumulate into a Register
28 CUDA C: Optimized Kernel __global__ void compute_win2d(int knode[][n], int kcell[][n]) __shared__ int smem[blk_size+wsize][blk_size+wsize]; int idx=blockIdx.x*blockDim.x+threadIdx.x; int idy=blockIdx.y*blockDim.y+threadIdx.y; int temp, x, y; if((idx<n)&&(idy<n)) smem[threadIdx.x][threadIdx.y]=kcell[idx][idy]; if (threadIdx.y > (N-WSIZE)) smem[threadIdx.x][threadIdx.y + WSIZE]=kcell[idx][idy+WSIZE]; if (threadIdx.x >(N-WSIZE)) smem[threadIdx.x + WSIZE][threadIdx.y]=kcell[idx+WSIZE][idy]; if ((threadIdx.x >(N-WSIZE)) && (threadIdx.y > (N-WSIZE))) smem[threadIdx.x+wsize][threadIdx.y+wsize]= kcell[idx+wsize][idy+wsize]; //wait for all threads to finish read __syncthreads(); //find minimum in window temp = knode[idx][idy]; for(x=0;x<wsize;x++) for(y=0;y<wsize;y++) if (temp> smem[threadIdx.x+x][threadIdx.y+y]) temp = smem[threadIdx.x+x][threadIdx.y+y]; knode[idx][idy] = temp; Write Minimum to Global Memory
29 Questions?
GPU Computing with CUDA Lecture 3 - Efficient Shared Memory Use. Christopher Cooper Boston University August, 2011 UTFSM, Valparaíso, Chile
GPU Computing with CUDA Lecture 3 - Efficient Shared Memory Use Christopher Cooper Boston University August, 2011 UTFSM, Valparaíso, Chile 1 Outline of lecture Recap of Lecture 2 Shared memory in detail
CUDA Programming. Week 4. Shared memory and register
CUDA Programming Week 4. Shared memory and register Outline Shared memory and bank confliction Memory padding Register allocation Example of matrix-matrix multiplication Homework SHARED MEMORY AND BANK
Hands-on CUDA exercises
Hands-on CUDA exercises CUDA Exercises We have provided skeletons and solutions for 6 hands-on CUDA exercises In each exercise (except for #5), you have to implement the missing portions of the code Finished
Learn CUDA in an Afternoon: Hands-on Practical Exercises
Learn CUDA in an Afternoon: Hands-on Practical Exercises Alan Gray and James Perry, EPCC, The University of Edinburgh Introduction This document forms the hands-on practical component of the Learn CUDA
Introduction to CUDA C
Introduction to CUDA C What is CUDA? CUDA Architecture Expose general-purpose GPU computing as first-class capability Retain traditional DirectX/OpenGL graphics performance CUDA C Based on industry-standard
HPC with Multicore and GPUs
HPC with Multicore and GPUs Stan Tomov Electrical Engineering and Computer Science Department University of Tennessee, Knoxville CS 594 Lecture Notes March 4, 2015 1/18 Outline! Introduction - Hardware
GPU Computing with CUDA Lecture 4 - Optimizations. Christopher Cooper Boston University August, 2011 UTFSM, Valparaíso, Chile
GPU Computing with CUDA Lecture 4 - Optimizations Christopher Cooper Boston University August, 2011 UTFSM, Valparaíso, Chile 1 Outline of lecture Recap of Lecture 3 Control flow Coalescing Latency hiding
1. If we need to use each thread to calculate one output element of a vector addition, what would
Quiz questions Lecture 2: 1. If we need to use each thread to calculate one output element of a vector addition, what would be the expression for mapping the thread/block indices to data index: (A) i=threadidx.x
Optimizing Parallel Reduction in CUDA. Mark Harris NVIDIA Developer Technology
Optimizing Parallel Reduction in CUDA Mark Harris NVIDIA Developer Technology Parallel Reduction Common and important data parallel primitive Easy to implement in CUDA Harder to get it right Serves as
GPU Hardware and Programming Models. Jeremy Appleyard, September 2015
GPU Hardware and Programming Models Jeremy Appleyard, September 2015 A brief history of GPUs In this talk Hardware Overview Programming Models Ask questions at any point! 2 A Brief History of GPUs 3 Once
MONTE-CARLO SIMULATION OF AMERICAN OPTIONS WITH GPUS. Julien Demouth, NVIDIA
MONTE-CARLO SIMULATION OF AMERICAN OPTIONS WITH GPUS Julien Demouth, NVIDIA STAC-A2 BENCHMARK STAC-A2 Benchmark Developed by banks Macro and micro, performance and accuracy Pricing and Greeks for American
Parallel Prefix Sum (Scan) with CUDA. Mark Harris [email protected]
Parallel Prefix Sum (Scan) with CUDA Mark Harris [email protected] April 2007 Document Change History Version Date Responsible Reason for Change February 14, 2007 Mark Harris Initial release April 2007
Accelerating Intensity Layer Based Pencil Filter Algorithm using CUDA
Accelerating Intensity Layer Based Pencil Filter Algorithm using CUDA Dissertation submitted in partial fulfillment of the requirements for the degree of Master of Technology, Computer Engineering by Amol
15-418 Final Project Report. Trading Platform Server
15-418 Final Project Report Yinghao Wang [email protected] May 8, 214 Trading Platform Server Executive Summary The final project will implement a trading platform server that provides back-end support
OpenACC 2.0 and the PGI Accelerator Compilers
OpenACC 2.0 and the PGI Accelerator Compilers Michael Wolfe The Portland Group [email protected] This presentation discusses the additions made to the OpenACC API in Version 2.0. I will also present
CUDA Basics. Murphy Stein New York University
CUDA Basics Murphy Stein New York University Overview Device Architecture CUDA Programming Model Matrix Transpose in CUDA Further Reading What is CUDA? CUDA stands for: Compute Unified Device Architecture
Rootbeer: Seamlessly using GPUs from Java
Rootbeer: Seamlessly using GPUs from Java Phil Pratt-Szeliga. Dr. Jim Fawcett. Dr. Roy Welch. Syracuse University. Rootbeer Overview and Motivation Rootbeer allows a developer to program a GPU in Java
12 Tips for Maximum Performance with PGI Directives in C
12 Tips for Maximum Performance with PGI Directives in C List of Tricks 1. Eliminate Pointer Arithmetic 2. Privatize Arrays 3. Make While Loops Parallelizable 4. Rectangles are Better Than Triangles 5.
Image Processing & Video Algorithms with CUDA
Image Processing & Video Algorithms with CUDA Eric Young & Frank Jargstorff 8 NVIDIA Corporation. introduction Image processing is a natural fit for data parallel processing Pixels can be mapped directly
Introduction to GPU hardware and to CUDA
Introduction to GPU hardware and to CUDA Philip Blakely Laboratory for Scientific Computing, University of Cambridge Philip Blakely (LSC) GPU introduction 1 / 37 Course outline Introduction to GPU hardware
U N C L A S S I F I E D
CUDA and Java GPU Computing in a Cross Platform Application Scot Halverson [email protected] LA-UR-13-20719 Slide 1 What s the goal? Run GPU code alongside Java code Take advantage of high parallelization Utilize
GPU Parallel Computing Architecture and CUDA Programming Model
GPU Parallel Computing Architecture and CUDA Programming Model John Nickolls Outline Why GPU Computing? GPU Computing Architecture Multithreading and Arrays Data Parallel Problem Decomposition Parallel
Intro to GPU computing. Spring 2015 Mark Silberstein, 048661, Technion 1
Intro to GPU computing Spring 2015 Mark Silberstein, 048661, Technion 1 Serial vs. parallel program One instruction at a time Multiple instructions in parallel Spring 2015 Mark Silberstein, 048661, Technion
Debugging CUDA Applications Przetwarzanie Równoległe CUDA/CELL
Debugging CUDA Applications Przetwarzanie Równoległe CUDA/CELL Michał Wójcik, Tomasz Boiński Katedra Architektury Systemów Komputerowych Wydział Elektroniki, Telekomunikacji i Informatyki Politechnika
Optimizing Application Performance with CUDA Profiling Tools
Optimizing Application Performance with CUDA Profiling Tools Why Profile? Application Code GPU Compute-Intensive Functions Rest of Sequential CPU Code CPU 100 s of cores 10,000 s of threads Great memory
APPLICATIONS OF LINUX-BASED QT-CUDA PARALLEL ARCHITECTURE
APPLICATIONS OF LINUX-BASED QT-CUDA PARALLEL ARCHITECTURE Tuyou Peng 1, Jun Peng 2 1 Electronics and information Technology Department Jiangmen Polytechnic, Jiangmen, Guangdong, China, [email protected]
Multi-GPU Load Balancing for Simulation and Rendering
Multi-GPU Load Balancing for Simulation and Rendering Yong Cao Computer Science Department, Virginia Tech, USA In-situ Visualization and Visual Analytics Instant visualization and interaction of computing tasks
CudaDMA: Optimizing GPU Memory Bandwidth via Warp Specialization
CudaDMA: Optimizing GPU Memory Bandwidth via Warp Specialization Michael Bauer Stanford University [email protected] Henry Cook UC Berkeley [email protected] Brucek Khailany NVIDIA Research [email protected]
CUDA SKILLS. Yu-Hang Tang. June 23-26, 2015 CSRC, Beijing
CUDA SKILLS Yu-Hang Tang June 23-26, 2015 CSRC, Beijing day1.pdf at /home/ytang/slides Reference solutions coming soon Online CUDA API documentation http://docs.nvidia.com/cuda/index.html Yu-Hang Tang @
CUDA Debugging. GPGPU Workshop, August 2012. Sandra Wienke Center for Computing and Communication, RWTH Aachen University
CUDA Debugging GPGPU Workshop, August 2012 Sandra Wienke Center for Computing and Communication, RWTH Aachen University Nikolay Piskun, Chris Gottbrath Rogue Wave Software Rechen- und Kommunikationszentrum
OpenACC Basics Directive-based GPGPU Programming
OpenACC Basics Directive-based GPGPU Programming Sandra Wienke, M.Sc. [email protected] Center for Computing and Communication RWTH Aachen University Rechen- und Kommunikationszentrum (RZ) PPCES,
GPU Accelerated Monte Carlo Simulations and Time Series Analysis
GPU Accelerated Monte Carlo Simulations and Time Series Analysis Institute of Physics, Johannes Gutenberg-University of Mainz Center for Polymer Studies, Department of Physics, Boston University Artemis
CUDA Optimization with NVIDIA Tools. Julien Demouth, NVIDIA
CUDA Optimization with NVIDIA Tools Julien Demouth, NVIDIA What Will You Learn? An iterative method to optimize your GPU code A way to conduct that method with Nvidia Tools 2 What Does the Application
PARALLEL JAVASCRIPT. Norm Rubin (NVIDIA) Jin Wang (Georgia School of Technology)
PARALLEL JAVASCRIPT Norm Rubin (NVIDIA) Jin Wang (Georgia School of Technology) JAVASCRIPT Not connected with Java Scheme and self (dressed in c clothing) Lots of design errors (like automatic semicolon
GPU File System Encryption Kartik Kulkarni and Eugene Linkov
GPU File System Encryption Kartik Kulkarni and Eugene Linkov 5/10/2012 SUMMARY. We implemented a file system that encrypts and decrypts files. The implementation uses the AES algorithm computed through
GPU Tools Sandra Wienke
Sandra Wienke Center for Computing and Communication, RWTH Aachen University MATSE HPC Battle 2012/13 Rechen- und Kommunikationszentrum (RZ) Agenda IDE Eclipse Debugging (CUDA) TotalView Profiling (CUDA
Applications to Computational Financial and GPU Computing. May 16th. Dr. Daniel Egloff +41 44 520 01 17 +41 79 430 03 61
F# Applications to Computational Financial and GPU Computing May 16th Dr. Daniel Egloff +41 44 520 01 17 +41 79 430 03 61 Today! Why care about F#? Just another fashion?! Three success stories! How Alea.cuBase
Overview. Lecture 1: an introduction to CUDA. Hardware view. Hardware view. hardware view software view CUDA programming
Overview Lecture 1: an introduction to CUDA Mike Giles [email protected] hardware view software view Oxford University Mathematical Institute Oxford e-research Centre Lecture 1 p. 1 Lecture 1 p.
Advanced CUDA Webinar. Memory Optimizations
Advanced CUDA Webinar Memory Optimizations Outline Overview Hardware Memory Optimizations Data transfers between host and device Device memory optimizations Summary Measuring performance effective bandwidth
ACCELERATING SELECT WHERE AND SELECT JOIN QUERIES ON A GPU
Computer Science 14 (2) 2013 http://dx.doi.org/10.7494/csci.2013.14.2.243 Marcin Pietroń Pawe l Russek Kazimierz Wiatr ACCELERATING SELECT WHERE AND SELECT JOIN QUERIES ON A GPU Abstract This paper presents
A Pattern-Based Comparison of OpenACC & OpenMP for Accelerators
A Pattern-Based Comparison of OpenACC & OpenMP for Accelerators Sandra Wienke 1,2, Christian Terboven 1,2, James C. Beyer 3, Matthias S. Müller 1,2 1 IT Center, RWTH Aachen University 2 JARA-HPC, Aachen
High Performance Cloud: a MapReduce and GPGPU Based Hybrid Approach
High Performance Cloud: a MapReduce and GPGPU Based Hybrid Approach Beniamino Di Martino, Antonio Esposito and Andrea Barbato Department of Industrial and Information Engineering Second University of Naples
Overview on Modern Accelerators and Programming Paradigms Ivan Girotto [email protected]
Overview on Modern Accelerators and Programming Paradigms Ivan Girotto [email protected] Information & Communication Technology Section (ICTS) International Centre for Theoretical Physics (ICTP) Multiple Socket
Introduction to Numerical General Purpose GPU Computing with NVIDIA CUDA. Part 1: Hardware design and programming model
Introduction to Numerical General Purpose GPU Computing with NVIDIA CUDA Part 1: Hardware design and programming model Amin Safi Faculty of Mathematics, TU dortmund January 22, 2016 Table of Contents Set
Sélection adaptative de codes polyédriques pour GPU/CPU
Sélection adaptative de codes polyédriques pour GPU/CPU Jean-François DOLLINGER, Vincent LOECHNER, Philippe CLAUSS INRIA - Équipe CAMUS Université de Strasbourg Saint-Hippolyte - Le 6 décembre 2011 1 Sommaire
Case Study on Productivity and Performance of GPGPUs
Case Study on Productivity and Performance of GPGPUs Sandra Wienke [email protected] ZKI Arbeitskreis Supercomputing April 2012 Rechen- und Kommunikationszentrum (RZ) RWTH GPU-Cluster 56 Nvidia
Porting the Plasma Simulation PIConGPU to Heterogeneous Architectures with Alpaka
Porting the Plasma Simulation PIConGPU to Heterogeneous Architectures with Alpaka René Widera1, Erik Zenker1,2, Guido Juckeland1, Benjamin Worpitz1,2, Axel Huebl1,2, Andreas Knüpfer2, Wolfgang E. Nagel2,
HP ProLiant SL270s Gen8 Server. Evaluation Report
HP ProLiant SL270s Gen8 Server Evaluation Report Thomas Schoenemeyer, Hussein Harake and Daniel Peter Swiss National Supercomputing Centre (CSCS), Lugano Institute of Geophysics, ETH Zürich [email protected]
Design and Optimization of a Portable Lattice Boltzmann Code for Heterogeneous Architectures
Design and Optimization of a Portable Lattice Boltzmann Code for Heterogeneous Architectures E Calore, S F Schifano, R Tripiccione Enrico Calore INFN Ferrara, Italy Perspectives of GPU Computing in Physics
Parallel & Distributed Optimization. Based on Mark Schmidt s slides
Parallel & Distributed Optimization Based on Mark Schmidt s slides Motivation behind using parallel & Distributed optimization Performance Computational throughput have increased exponentially in linear
OpenACC Programming on GPUs
OpenACC Programming on GPUs Directive-based GPGPU Programming Sandra Wienke, M.Sc. [email protected] Center for Computing and Communication RWTH Aachen University Rechen- und Kommunikationszentrum
Performance Characteristics of Large SMP Machines
Performance Characteristics of Large SMP Machines Dirk Schmidl, Dieter an Mey, Matthias S. Müller [email protected] Rechen- und Kommunikationszentrum (RZ) Agenda Investigated Hardware Kernel Benchmark
GPU Performance Analysis and Optimisation
GPU Performance Analysis and Optimisation Thomas Bradley, NVIDIA Corporation Outline What limits performance? Analysing performance: GPU profiling Exposing sufficient parallelism Optimising for Kepler
Embedded Systems: map to FPGA, GPU, CPU?
Embedded Systems: map to FPGA, GPU, CPU? Jos van Eijndhoven [email protected] Bits&Chips Embedded systems Nov 7, 2013 # of transistors Moore s law versus Amdahl s law Computational Capacity Hardware
CUDA programming on NVIDIA GPUs
p. 1/21 on NVIDIA GPUs Mike Giles [email protected] Oxford University Mathematical Institute Oxford-Man Institute for Quantitative Finance Oxford eresearch Centre p. 2/21 Overview hardware view
HPC Wales Skills Academy Course Catalogue 2015
HPC Wales Skills Academy Course Catalogue 2015 Overview The HPC Wales Skills Academy provides a variety of courses and workshops aimed at building skills in High Performance Computing (HPC). Our courses
OpenACC Parallelization and Optimization of NAS Parallel Benchmarks
OpenACC Parallelization and Optimization of NAS Parallel Benchmarks Presented by Rengan Xu GTC 2014, S4340 03/26/2014 Rengan Xu, Xiaonan Tian, Sunita Chandrasekaran, Yonghong Yan, Barbara Chapman HPC Tools
Scalable and High Performance Computing for Big Data Analytics in Understanding the Human Dynamics in the Mobile Age
Scalable and High Performance Computing for Big Data Analytics in Understanding the Human Dynamics in the Mobile Age Xuan Shi GRA: Bowei Xue University of Arkansas Spatiotemporal Modeling of Human Dynamics
Benchmark Hadoop and Mars: MapReduce on cluster versus on GPU
Benchmark Hadoop and Mars: MapReduce on cluster versus on GPU Heshan Li, Shaopeng Wang The Johns Hopkins University 3400 N. Charles Street Baltimore, Maryland 21218 {heshanli, shaopeng}@cs.jhu.edu 1 Overview
Lecture 1: an introduction to CUDA
Lecture 1: an introduction to CUDA Mike Giles [email protected] Oxford University Mathematical Institute Oxford e-research Centre Lecture 1 p. 1 Overview hardware view software view CUDA programming
Programming GPUs with CUDA
Programming GPUs with CUDA Max Grossman Department of Computer Science Rice University [email protected] COMP 422 Lecture 23 12 April 2016 Why GPUs? Two major trends GPU performance is pulling away from
OpenACC Programming and Best Practices Guide
OpenACC Programming and Best Practices Guide June 2015 2015 openacc-standard.org. All Rights Reserved. Contents 1 Introduction 3 Writing Portable Code........................................... 3 What
E6895 Advanced Big Data Analytics Lecture 14: NVIDIA GPU Examples and GPU on iOS devices
E6895 Advanced Big Data Analytics Lecture 14: NVIDIA GPU Examples and GPU on iOS devices Ching-Yung Lin, Ph.D. Adjunct Professor, Dept. of Electrical Engineering and Computer Science IBM Chief Scientist,
Performance Evaluations of Graph Database using CUDA and OpenMP Compatible Libraries
Performance Evaluations of Graph Database using CUDA and OpenMP Compatible Libraries Shin Morishima 1 and Hiroki Matsutani 1,2,3 1 Keio University, 3-14-1 Hiyoshi, Kohoku-ku, Yokohama, Japan 2 National Institute
Application Note 120 Communicating Through the 1-Wire Master
www.dalsemi.com Application Note 120 Communicating Through the 1-Wire Master INTRODUCTION The DS1WM 1-Wire Master was created to facilitate host CPU communication with devices over a 1-Wire bus without
Monte-Carlo Option Pricing. Victor Podlozhnyuk [email protected]
Monte-Carlo Option Pricing Victor Podlozhnyuk [email protected] Document Change History Version Date Responsible Reason for Change 1. 3//7 vpodlozhnyuk Initial release Abstract The pricing of options
Texture Cache Approximation on GPUs
Texture Cache Approximation on GPUs Mark Sutherland Joshua San Miguel Natalie Enright Jerger {suther68,enright}@ece.utoronto.ca, [email protected] 1 Our Contribution GPU Core Cache Cache
LBM BASED FLOW SIMULATION USING GPU COMPUTING PROCESSOR
LBM BASED FLOW SIMULATION USING GPU COMPUTING PROCESSOR Frédéric Kuznik, [email protected] 1 Framework Introduction Hardware architecture CUDA overview Implementation details A simple case:
ultra fast SOM using CUDA
ultra fast SOM using CUDA SOM (Self-Organizing Map) is one of the most popular artificial neural network algorithms in the unsupervised learning category. Sijo Mathew Preetha Joy Sibi Rajendra Manoj A
Black-Scholes option pricing. Victor Podlozhnyuk [email protected]
Black-Scholes option pricing Victor Podlozhnyuk [email protected] June 2007 Document Change History Version Date Responsible Reason for Change 0.9 2007/03/19 vpodlozhnyuk Initial release 1.0 2007/04/06
Automatic CUDA Code Synthesis Framework for Multicore CPU and GPU architectures
Automatic CUDA Code Synthesis Framework for Multicore CPU and GPU architectures 1 Hanwoong Jung, and 2 Youngmin Yi, 1 Soonhoi Ha 1 School of EECS, Seoul National University, Seoul, Korea {jhw7884, sha}@iris.snu.ac.kr
IMAGE PROCESSING WITH CUDA
IMAGE PROCESSING WITH CUDA by Jia Tse Bachelor of Science, University of Nevada, Las Vegas 2006 A thesis submitted in partial fulfillment of the requirements for the Master of Science Degree in Computer
The Uintah Framework: A Unified Heterogeneous Task Scheduling and Runtime System
The Uintah Framework: A Unified Heterogeneous Task Scheduling and Runtime System Qingyu Meng, Alan Humphrey, Martin Berzins Thanks to: John Schmidt and J. Davison de St. Germain, SCI Institute Justin Luitjens
Unleashing the Performance Potential of GPUs for Atmospheric Dynamic Solvers
Unleashing the Performance Potential of GPUs for Atmospheric Dynamic Solvers Haohuan Fu [email protected] High Performance Geo-Computing (HPGC) Group Center for Earth System Science Tsinghua University
ANALYSIS OF RSA ALGORITHM USING GPU PROGRAMMING
ANALYSIS OF RSA ALGORITHM USING GPU PROGRAMMING Sonam Mahajan 1 and Maninder Singh 2 1 Department of Computer Science Engineering, Thapar University, Patiala, India 2 Department of Computer Science Engineering,
NVIDIA CUDA Software and GPU Parallel Computing Architecture. David B. Kirk, Chief Scientist
NVIDIA CUDA Software and GPU Parallel Computing Architecture David B. Kirk, Chief Scientist Outline Applications of GPU Computing CUDA Programming Model Overview Programming in CUDA The Basics How to Get
INTEL PARALLEL STUDIO EVALUATION GUIDE. Intel Cilk Plus: A Simple Path to Parallelism
Intel Cilk Plus: A Simple Path to Parallelism Compiler extensions to simplify task and data parallelism Intel Cilk Plus adds simple language extensions to express data and task parallelism to the C and
Introduction to Parallel Programming (w/ JAVA)
Introduction to Parallel Programming (w/ JAVA) Dec. 21st, 2015 Christian Terboven IT Center, RWTH Aachen University [email protected] 1 Moore's Law Cramming More Components onto Integrated Circuits
Packet-based Network Traffic Monitoring and Analysis with GPUs
Packet-based Network Traffic Monitoring and Analysis with GPUs Wenji Wu, Phil DeMar [email protected], [email protected] GPU Technology Conference 2014 March 24-27, 2014 SAN JOSE, CALIFORNIA Background Main
NVIDIA CUDA GETTING STARTED GUIDE FOR MAC OS X
NVIDIA CUDA GETTING STARTED GUIDE FOR MAC OS X DU-05348-001_v6.5 August 2014 Installation and Verification on Mac OS X TABLE OF CONTENTS Chapter 1. Introduction...1 1.1. System Requirements... 1 1.2. About
Compute Spearman Correlation Coefficient with Matlab/CUDA
Compute Spearman Correlation Coefficient with Matlab/CUDA Seongho Kim Bioinformatics & Biostatistics Department University of Louisville Louisville, Kentucky 40292 USA [email protected] Ming Ouyang
<Insert Picture Here> An Experimental Model to Analyze OpenMP Applications for System Utilization
An Experimental Model to Analyze OpenMP Applications for System Utilization Mark Woodyard Principal Software Engineer 1 The following is an overview of a research project. It is intended
Experiences on using GPU accelerators for data analysis in ROOT/RooFit
Experiences on using GPU accelerators for data analysis in ROOT/RooFit Sverre Jarp, Alfio Lazzaro, Julien Leduc, Yngve Sneen Lindal, Andrzej Nowak European Organization for Nuclear Research (CERN), Geneva,
Part I Courses Syllabus
Part I Courses Syllabus This document provides detailed information about the basic courses of the MHPC first part activities. The list of courses is the following 1.1 Scientific Programming Environment
Hardware-Aware Analysis and. Presentation Date: Sep 15th 2009 Chrissie C. Cui
Hardware-Aware Analysis and Optimization of Stable Fluids Presentation Date: Sep 15th 2009 Chrissie C. Cui Outline Introduction Highlights Flop and Bandwidth Analysis Mehrstellen Schemes Advection Caching
Optimizing a 3D-FWT code in a cluster of CPUs+GPUs
Optimizing a 3D-FWT code in a cluster of CPUs+GPUs Gregorio Bernabé Javier Cuenca Domingo Giménez Universidad de Murcia Scientific Computing and Parallel Programming Group XXIX Simposium Nacional de la
Evaluation of CUDA Fortran for the CFD code Strukti
Evaluation of CUDA Fortran for the CFD code Strukti Practical term report from Stephan Soller High performance computing center Stuttgart 1 Stuttgart Media University 2 High performance computing center
CUDA Tools for Debugging and Profiling. Jiri Kraus (NVIDIA)
Mitglied der Helmholtz-Gemeinschaft CUDA Tools for Debugging and Profiling Jiri Kraus (NVIDIA) GPU Programming@Jülich Supercomputing Centre Jülich 7-9 April 2014 What you will learn How to use cuda-memcheck
Pricing of cross-currency interest rate derivatives on Graphics Processing Units
Pricing of cross-currency interest rate derivatives on Graphics Processing Units Duy Minh Dang Department of Computer Science University of Toronto Toronto, Canada [email protected] Joint work with
