# Accelerate your Application with CUDA C

1 Accelerate your Application with CUDA C

2 2D Minimum Algorithm: Consider applying a 2D window to a 2D array of elements. Each output element is the minimum of the input elements within the window. (Figure labels: Window, Input Matrix, Output Matrix.) 2D operations like this are found in many fundamental algorithms: interpolation, convolution, filtering. Applications include seismic processing, weather simulation, image processing, etc.

3 2D Minimum: C Version

```c
#define WIN_SIZE 16
#define N /* value lost in extraction */

int main()
{
    int size = N*N*sizeof(int);
    int i, j, x, y, temp;

    //allocate resources
    int *cell = (int *)malloc(size); //input
    int *node = (int *)malloc(size); //output

    initializearray(cell, N);
    initializearray(node, N);

    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++) {
            //find minimum in window
            temp = node[i][j];
            for (x = 0; x < WIN_SIZE; x++)
                for (y = 0; y < WIN_SIZE; y++)
                    if (temp > cell[i+x][j+y])
                        temp = cell[i+x][j+y];
            node[i][j] = temp;
        }

    //free resources
    free(cell);
    free(node);
}
```

Slide callouts, in order: Allocate Resources · Initialize data · Loop over dataset · Loop over window · Find min · Cleanup

4 2D Minimum: C Version

```c
#define WIN_SIZE 16
#define N /* value lost in extraction */

int main()
{
    int size = N*N*sizeof(int);
    int i, j, x, y, temp;

    //allocate resources
    int *cell = (int *)malloc(size); //input
    int *node = (int *)malloc(size); //output

    initializearray(cell, N);
    initializearray(node, N);

    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++) {
            //find minimum in window
            temp = node[i][j];
            for (x = 0; x < WIN_SIZE; x++)
                for (y = 0; y < WIN_SIZE; y++)
                    if (temp > cell[i+x][j+y])
                        temp = cell[i+x][j+y];
            node[i][j] = temp;
        }

    //free resources
    free(cell);
    free(node);
}
```

| Algorithm | Device | Compute | Speedup |
|---|---|---|---|
| C (serial) | Xeon X5650 | 96.1 sec | -- |

5 CPU + GPU = Big Speedup. The application code is split in two: the compute-intensive functions run on the GPU (use CUDA to parallelize them), and the rest of the sequential code runs on the CPU.

6 Explicit Data Movement CPU (host) GPU (device) CPU Memory in out PCI Bus GPU Memory d_in d_out

7 2D Window: CUDA Main Program

C:

```c
int main()
{
    int size = N*N*sizeof(int);

    //allocate resources
    int *cell = (int*)malloc(size); //input
    int *node = (int*)malloc(size); //output

    initializearray(cell, N);
    initializearray(node, N);

    // nested for loops
    for ... for ... for ... for ...

    //free resources
    free(cell); free(node);
}
```

CUDA C:

```c
int main()
{
    int size = N*N*sizeof(int);

    //allocate resources
    int *cell = (int*)malloc(size);
    int *node = (int*)malloc(size);
    int *d_cell; cudaMalloc(&d_cell, size);
    int *d_node; cudaMalloc(&d_node, size);

    initializearray(cell, N);
    initializearray(node, N);

    cudaMemcpy(d_cell, cell, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_node, node, size, cudaMemcpyHostToDevice);

    compute_win2d<<<nblocks, nthreads>>>(d_node, d_cell);

    //free resources
    free(cell); free(node);
    cudaFree(d_cell); cudaFree(d_node);
}
```

8 2D Window: CUDA Main Program

C:

```c
int main()
{
    int size = N*N*sizeof(int);

    //allocate resources
    int *cell = (int*)malloc(size); //input
    int *node = (int*)malloc(size); //output

    initializearray(cell, N);
    initializearray(node, N);

    // nested for loops
    for ... for ... for ... for ...

    //free resources
    free(cell); free(node);
}
```

CUDA C:

```c
int main()
{
    int size = N*N*sizeof(int);

    //allocate resources
    int *cell = (int*)malloc(size);
    int *node = (int*)malloc(size);
    int *d_cell; cudaMalloc(&d_cell, size);
    int *d_node; cudaMalloc(&d_node, size);

    initializearray(cell, N);
    initializearray(node, N);

    cudaMemcpy(d_cell, cell, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_node, node, size, cudaMemcpyHostToDevice);

    compute_win2d<<<nblocks, nthreads>>>(d_node, d_cell);

    //free resources
    free(cell); free(node);
    cudaFree(d_cell); cudaFree(d_node);
}
```

Slide callout: Allocate Device Memory (the `cudaMalloc` calls).

9 2D Window: CUDA Main Program

C:

```c
int main()
{
    int size = N*N*sizeof(int);

    //allocate resources
    int *cell = (int*)malloc(size); //input
    int *node = (int*)malloc(size); //output

    initializearray(cell, N);
    initializearray(node, N);

    // nested for loops
    for ... for ... for ... for ...

    //free resources
    free(cell); free(node);
}
```

CUDA C:

```c
int main()
{
    int size = N*N*sizeof(int);

    //allocate resources
    int *cell = (int*)malloc(size);
    int *node = (int*)malloc(size);
    int *d_cell; cudaMalloc(&d_cell, size);
    int *d_node; cudaMalloc(&d_node, size);

    initializearray(cell, N);
    initializearray(node, N);

    cudaMemcpy(d_cell, cell, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_node, node, size, cudaMemcpyHostToDevice);

    compute_win2d<<<nblocks, nthreads>>>(d_node, d_cell);

    //free resources
    free(cell); free(node);
    cudaFree(d_cell); cudaFree(d_node);
}
```

Slide callout: Copy Data to the Device (the `cudaMemcpy` calls).

10 2D Window: CUDA Main Program

C:

```c
int main()
{
    int size = N*N*sizeof(int);

    //allocate resources
    int *cell = (int*)malloc(size); //input
    int *node = (int*)malloc(size); //output

    initializearray(cell, N);
    initializearray(node, N);

    // nested for loops
    for ... for ... for ... for ...

    //free resources
    free(cell); free(node);
}
```

CUDA C:

```c
int main()
{
    int size = N*N*sizeof(int);

    //allocate resources
    int *cell = (int*)malloc(size);
    int *node = (int*)malloc(size);
    int *d_cell; cudaMalloc(&d_cell, size);
    int *d_node; cudaMalloc(&d_node, size);

    initializearray(cell, N);
    initializearray(node, N);

    cudaMemcpy(d_cell, cell, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_node, node, size, cudaMemcpyHostToDevice);

    compute_win2d<<<nblocks, nthreads>>>(d_node, d_cell);

    //free resources
    free(cell); free(node);
    cudaFree(d_cell); cudaFree(d_node);
}
```

Slide callouts on the kernel launch line: Call Cuda Function (`compute_win2d`) · Launch Parameters (`<<<nblocks, nthreads>>>`) · Device Pointers (`d_node`, `d_cell`).

11 2D Window: CUDA Main Program

C:

```c
int main()
{
    int size = N*N*sizeof(int);

    //allocate resources
    int *cell = (int*)malloc(size); //input
    int *node = (int*)malloc(size); //output

    initializearray(cell, N);
    initializearray(node, N);

    // nested for loops
    for ... for ... for ... for ...

    //free resources
    free(cell); free(node);
}
```

CUDA C:

```c
int main()
{
    int size = N*N*sizeof(int);

    //allocate resources
    int *cell = (int*)malloc(size);
    int *node = (int*)malloc(size);
    int *d_cell; cudaMalloc(&d_cell, size);
    int *d_node; cudaMalloc(&d_node, size);

    initializearray(cell, N);
    initializearray(node, N);

    cudaMemcpy(d_cell, cell, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_node, node, size, cudaMemcpyHostToDevice);

    compute_win2d<<<nblocks, nthreads>>>(d_node, d_cell);

    //free resources
    free(cell); free(node);
    cudaFree(d_cell); cudaFree(d_node);
}
```

Slide callout: Cleanup (the `free` and `cudaFree` calls).

12 Parallel Execution Model: A CUDA C function is executed by many parallel threads. Threads are organized as a grid of independent thread blocks. (Diagram: Grid = Block 0, Block 1, …, Block N_B-2, Block N_B-1.)

13 2D Window: CUDA Kernel Function

C:

```c
for (i = 0; i < N; i++)
    for (j = 0; j < N; j++) {
        //find minimum in window
        temp = node[i][j];
        for (x = 0; x < WIN_SIZE; x++)
            for (y = 0; y < WIN_SIZE; y++)
                if (temp > cell[i+x][j+y])
                    temp = cell[i+x][j+y];
        node[i][j] = temp;
    }
```

CUDA C:

```c
__global__ void compute_win2d(int knode[][N], int kcell[][N])
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int temp, x, y;

    if ((idx < N) && (idy < N)) {
        //find minimum in window
        temp = knode[idx][idy];
        for (x = 0; x < WIN_SIZE; x++)
            for (y = 0; y < WIN_SIZE; y++)
                if (temp > kcell[idx+x][idy+y])
                    temp = kcell[idx+x][idy+y];
        knode[idx][idy] = temp;
    }
}
```

14 2D Window: CUDA Kernel Function

C:

```c
for (i = 0; i < N; i++)
    for (j = 0; j < N; j++) {
        //find minimum in window
        temp = node[i][j];
        for (x = 0; x < WIN_SIZE; x++)
            for (y = 0; y < WIN_SIZE; y++)
                if (temp > cell[i+x][j+y])
                    temp = cell[i+x][j+y];
        node[i][j] = temp;
    }
```

CUDA C:

```c
__global__ void compute_win2d(int knode[][N], int kcell[][N])
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int temp, x, y;

    if ((idx < N) && (idy < N)) {
        //find minimum in window
        temp = knode[idx][idy];
        for (x = 0; x < WIN_SIZE; x++)
            for (y = 0; y < WIN_SIZE; y++)
                if (temp > kcell[idx+x][idy+y])
                    temp = kcell[idx+x][idy+y];
        knode[idx][idy] = temp;
    }
}
```

Slide callout: Add `__global__` Keyword.

15 2D Window: CUDA Kernel Function

C:

```c
for (i = 0; i < N; i++)
    for (j = 0; j < N; j++) {
        //find minimum in window
        temp = node[i][j];
        for (x = 0; x < WIN_SIZE; x++)
            for (y = 0; y < WIN_SIZE; y++)
                if (temp > cell[i+x][j+y])
                    temp = cell[i+x][j+y];
        node[i][j] = temp;
    }
```

CUDA C:

```c
__global__ void compute_win2d(int knode[][N], int kcell[][N])
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int temp, x, y;

    if ((idx < N) && (idy < N)) {
        //find minimum in window
        temp = knode[idx][idy];
        for (x = 0; x < WIN_SIZE; x++)
            for (y = 0; y < WIN_SIZE; y++)
                if (temp > kcell[idx+x][idy+y])
                    temp = kcell[idx+x][idy+y];
        knode[idx][idy] = temp;
    }
}
```

Slide callout: Replace Outer Loops With an Index Calculation. (Diagram labels: blockIdx.x, 0, N_B, threadIdx.x, threadIdx.x.)

16 2D Window: CUDA Kernel Function

C:

```c
for (i = 0; i < N; i++)
    for (j = 0; j < N; j++) {
        //find minimum in window
        temp = node[i][j];
        for (x = 0; x < WIN_SIZE; x++)
            for (y = 0; y < WIN_SIZE; y++)
                if (temp > cell[i+x][j+y])
                    temp = cell[i+x][j+y];
        node[i][j] = temp;
    }
```

CUDA C:

```c
__global__ void compute_win2d(int knode[][N], int kcell[][N])
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int temp, x, y;

    if ((idx < N) && (idy < N)) {
        //find minimum in window
        temp = knode[idx][idy];
        for (x = 0; x < WIN_SIZE; x++)
            for (y = 0; y < WIN_SIZE; y++)
                if (temp > kcell[idx+x][idy+y])
                    temp = kcell[idx+x][idy+y];
        knode[idx][idy] = temp;
    }
}
```

| Algorithm | Device | Compute | Speedup |
|---|---|---|---|
| C (serial) | X5650 | 96.1 sec | -- |
| CUDA | M… (model number lost in extraction) | … sec (value lost in extraction) | 11.5x |

17 Observation: Data Reuse. (Diagram: Thread(i) and Thread(i+1) reading from Global Memory.) Neighboring threads read the same elements.

18 Optimization: Use Shared Memory Global Memory SMEM COPY BLK_SIZE WSIZE Shared Memory

29 Questions?

