ACCELERATING HPC APPLICATIONS ON NVIDIA GPUS WITH OPENACC
Doug Miles, PGI Compilers & Tools, NVIDIA
High Performance Computing Advisory Council, February 21, 2018

PGI — THE NVIDIA HPC SDK

Fortran, C & C++ Compilers
- Optimizing, SIMD Vectorizing, OpenMP
Accelerated Computing Features
- CUDA Fortran, OpenACC Directives
Multi-Platform Solution
- X86-64 and OpenPOWER Multicore CPUs
- NVIDIA Tesla GPUs
- Supported on Linux, macOS, Windows
MPI/OpenMP/OpenACC Tools
- Debugger
- Performance Profiler
- Interoperable with DDT, TotalView

Programming GPU-Accelerated Systems
Separate CPU System and GPU Memories

[Diagram: GPU Developer View — System Memory and GPU Memory connected by PCIe]

Programming GPU-Accelerated Systems
Separate CPU System and GPU Memories

[Diagram: GPU Developer View — System Memory and GPU Memory connected by NVLink]
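A minimal CUDA Fortran sketch (not from the slides; names are illustrative) of what this separate-memory developer view implies: the device copy is declared and filled explicitly, and each assignment between host and device arrays is a transfer across PCIe or NVLink.

   ! Illustrative sketch only: explicit data movement with separate memories
   program explicit_data_movement
     use cudafor
     implicit none
     integer, parameter :: n = 1024
     real :: a(n)                 ! resides in system (CPU) memory
     real, device :: a_d(n)       ! resides in GPU memory
     a = 1.0
     a_d = a                      ! host-to-device copy
     ! ... launch kernels that read/write a_d here ...
     a = a_d                      ! device-to-host copy of the results
     print *, a(1)
   end program explicit_data_movement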

CUDA FORTRAN

Tesla Code

   attributes(global) subroutine mm_kernel ( A, B, C, N, M, L )
     real :: A(N,M), B(M,L), C(N,L), Cij
     integer, value :: N, M, L
     integer :: i, j, kb, k, tx, ty
     real, shared :: Asub(16,16), Bsub(16,16)
     tx = threadidx%x
     ty = threadidx%y
     i = (blockidx%x-1) * 16 + tx
     j = (blockidx%y-1) * 16 + ty
     Cij = 0.0
     do kb = 1, M, 16
       Asub(tx,ty) = A(i,kb+tx-1)
       Bsub(tx,ty) = B(kb+ty-1,j)
       call syncthreads()
       do k = 1,16
         Cij = Cij + Asub(tx,k) * Bsub(k,ty)
       enddo
       call syncthreads()
     enddo
     C(i,j) = Cij
   end subroutine mm_kernel

CPU Code

   real, device, allocatable, dimension(:,:) :: Adev, Bdev, Cdev
   . . .
   allocate (Adev(N,M), Bdev(M,L), Cdev(N,L))
   Adev = A(1:N,1:M)
   Bdev = B(1:M,1:L)
   call mm_kernel <<<dim3(N/16,L/16),dim3(16,16)>>> ( Adev, Bdev, Cdev, N, M, L )
   C(1:N,1:L) = Cdev
   deallocate ( Adev, Bdev, Cdev )
   . . .

CUDA FORTRAN !$CUF KERNEL Directives

   module madd_device_module
     use cudafor
   contains
     subroutine madd_dev(a,b,c,sum,n1,n2)
       real, dimension(:,:), device :: a,b,c
       real :: sum
       integer :: n1,n2
       type(dim3) :: grid, block
       !$cuf kernel do (2) <<<(*,*),(32,4)>>>
       do j = 1,n2
         do i = 1,n1
           a(i,j) = b(i,j) + c(i,j)
           sum = sum + a(i,j)
         enddo
       enddo
     end subroutine
   end module

Equivalent hand-written CUDA kernels

   module madd_device_module
     use cudafor
     implicit none
   contains
     attributes(global) subroutine madd_kernel(a,b,c,blocksum,n1,n2)
       real, dimension(:,:) :: a,b,c
       real, dimension(:) :: blocksum
       integer, value :: n1,n2
       integer :: i,j,tindex,tneighbor,bindex
       real :: mysum
       real, shared :: bsum(256)
       ! Do this thread's work
       mysum = 0.0
       do j = threadidx%y + (blockidx%y-1)*blockdim%y, n2, blockdim%y*griddim%y
         do i = threadidx%x + (blockidx%x-1)*blockdim%x, n1, blockdim%x*griddim%x
           a(i,j) = b(i,j) + c(i,j)
           mysum = mysum + a(i,j)  ! accumulates partial sum per thread
         enddo
       enddo
       ! Now add up all partial sums for the whole thread block
       ! Compute this thread's linear index in the thread block
       ! We assume 256 threads in the thread block
       tindex = threadidx%x + (threadidx%y-1)*blockdim%x
       ! Store this thread's partial sum in the shared memory block
       bsum(tindex) = mysum
       call syncthreads()
       ! Accumulate all the partial sums for this thread block to a single value
       tneighbor = 128
       do while( tneighbor >= 1 )
         if( tindex <= tneighbor ) &
           bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor)
         tneighbor = tneighbor / 2
         call syncthreads()
       enddo
       ! Store the partial sum for the thread block
       bindex = blockidx%x + (blockidx%y-1)*griddim%x
       if( tindex == 1 ) blocksum(bindex) = bsum(1)
     end subroutine

     ! Add up partial sums for all thread blocks to a single cumulative sum
     attributes(global) subroutine madd_sum_kernel(blocksum,dsum,nb)
       real, dimension(:) :: blocksum
       real :: dsum
       integer, value :: nb
       real, shared :: bsum(256)
       integer :: tindex,tneighbor,i
       ! Again, we assume 256 threads in the thread block
       ! accumulate a partial sum for each thread
       tindex = threadidx%x
       bsum(tindex) = 0.0
       do i = tindex, nb, blockdim%x
         bsum(tindex) = bsum(tindex) + blocksum(i)
       enddo
       call syncthreads()
       ! This code is copied from the previous kernel
       ! Accumulate all the partial sums for this thread block to a single value
       ! Since there is only one thread block, this single value is the final result
       tneighbor = 128
       do while( tneighbor >= 1 )
         if( tindex <= tneighbor ) &
           bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor)
         tneighbor = tneighbor / 2
         call syncthreads()
       enddo
       if( tindex == 1 ) dsum = bsum(1)
     end subroutine

     subroutine madd_dev(a,b,c,dsum,n1,n2)
       real, dimension(:,:), device :: a,b,c
       real, device :: dsum
       real, dimension(:), allocatable, device :: blocksum
       integer :: n1,n2,nb
       type(dim3) :: grid, block
       integer :: r
       ! Compute grid/block size; block size must be 256 threads
       grid = dim3((n1+31)/32, (n2+7)/8, 1)
       block = dim3(32,8,1)
       nb = grid%x * grid%y
       allocate(blocksum(1:nb))
       call madd_kernel<<< grid, block >>>(a,b,c,blocksum,n1,n2)
       call madd_sum_kernel<<< 1, 256 >>>(blocksum,dsum,nb)
       r = cudaThreadSynchronize() ! don't deallocate too early
       deallocate(blocksum)
     end subroutine
   end module

OpenACC Directives

   #pragma acc data copyin(a,b) copyout(c)        // Manage Data Movement
   {
     ...
     #pragma acc parallel                         // Initiate Parallel Execution
     {
       #pragma acc loop gang vector               // Optimize Loop Mappings
       for (i = 0; i < n; ++i) {
         c[i] = a[i] + b[i];
         ...
       }
     }
     ...
   }

- Incremental
- Single source
- Interoperable
- Performance portable
- CPU, GPU, Manycore

OpenACC for GPUs in a Nutshell

   ...
   #pragma acc data copy(b[0:n][0:m]) \
                    create(a[0:n][0:m])
   {
     for (iter = 1; iter <= p; ++iter){
       #pragma acc parallel loop
       for (i = 1; i < n-1; ++i){
         for (j = 1; j < m-1; ++j){
           a[i][j]=w0*b[i][j]+
                   w1*(b[i-1][j]+b[i+1][j]+
                       b[i][j-1]+b[i][j+1])+
                   w2*(b[i-1][j-1]+b[i-1][j+1]+
                       b[i+1][j-1]+b[i+1][j+1]);
         }
       }
       #pragma acc parallel loop
       for( i = 1; i < n-1; ++i )
         for( j = 1; j < m-1; ++j )
           b[i][j] = a[i][j];
     }
   }
   ...

[Diagram: copies of the a and b arrays in System Memory and GPU Memory, with data movement at the boundaries of the acc data region]

OpenACC is for Multicore, Manycore & GPUs

    98 !$acc parallel
    99 !$acc loop independent
   100 do k=y_min-depth,y_max+depth
   101   !$acc loop independent
   102   do j=1,depth
   103     density0(x_min-j,k)=left_density0(left_xmax+1-j,k)
   104   enddo
   105 enddo
   106 !$acc end parallel

Multicore CPU:

   % pgfortran -ta=multicore -fast -Minfo=acc -c update_tile_halo_kernel.f90
   ...
   100, Loop is parallelizable
        Generating Multicore code
        100, !$acc loop gang
   102, Loop is parallelizable
   ...

Tesla GPU:

   % pgfortran -ta=tesla -fast -Minfo=acc -c update_tile_halo_kernel.f90
   ...
   100, Loop is parallelizable
   102, Loop is parallelizable
        Accelerator kernel generated
        Generating Tesla code
        100, !$acc loop gang, vector(4)  ! blockidx%y threadidx%y
        102, !$acc loop gang, vector(32) ! blockidx%x threadidx%x
   ...

SPEC ACCEL 1.2 BENCHMARKS

[Charts: estimated SPEC ACCEL 1.2 geometric-mean run times in seconds.
 OpenACC (PGI 18.1): 2-socket Broadwell vs. 1x Volta V100, with the V100 about 4.4x faster.
 OpenMP 4.5: Intel 2018 on 2-socket Skylake (40 cores / 80 threads), PGI 18.1 on 2-socket EPYC (48 cores / 48 threads) and 2-socket Broadwell (40 cores / 80 threads).]

Performance measured February, 2018. Skylake: Two 20 core Intel Xeon Gold 6148 CPUs @ 2.4GHz w/ 376GB memory, hyperthreading enabled. EPYC: Two 24 core AMD EPYC 7451 CPUs @ 2.3GHz w/ 256GB memory. Broadwell: Two 20 core Intel Xeon E5-2698 v4 CPUs @ 3.6GHz w/ 256GB memory, hyperthreading enabled. Volta: NVIDIA DGX1 system with two 20 core Intel Xeon E5-2698 v4 CPUs @ 2.20GHz, 256GB memory, one NVIDIA Tesla V100-SXM2-16GB GPU @ 1.53GHz. SPEC® is a registered trademark of the Standard Performance Evaluation Corporation (www.spec.org).

OPENACC APPLICATIONS

GAUSSIAN 16

Parallelization Strategy

Within Gaussian 16, GPUs are used for a small fraction of code that consumes a large fraction of the execution time. The implementation of GPU parallelism conforms to Gaussian's general parallelization strategy. Its main tenets are to avoid changing the underlying source code and to avoid modifications which negatively affect CPU performance. For these reasons, OpenACC was used for GPU parallelization.

The Gaussian approach to parallelization relies on environment-specific parallelization frameworks and tools: OpenMP for shared-memory, Linda for cluster and network parallelization across discrete nodes, and OpenACC for GPUs.

The process of implementing GPU support involved many different aspects:

- Identifying places where GPUs could be beneficial. These are a subset of areas which are parallelized for other execution contexts, because using GPUs requires fine-grained parallelism.
- Understanding and optimizing data movement/storage at a high level to maximize GPU efficiency.

PGI Accelerator Compilers with OpenACC

PGI compilers fully support the current OpenACC standard as well as important extensions to it. PGI is an important contributor to the ongoing development of OpenACC. OpenACC enables developers to implement GPU parallelism by adding compiler directives to their source code, often eliminating the need for rewriting or restructuring. For example, the following Fortran compiler directive identifies a loop which the compiler should parallelize:

   !$acc parallel loop

Other directives allocate GPU memory, copy data to/from GPUs, specify data to remain on the GPU, combine or split loops and other code sections, and generally provide hints for optimal work distribution management, and more.

The OpenACC project is very active, and the specifications and tools are changing fairly rapidly. This has been true throughout the lifetime of this project. Indeed, one of its major challenges has been using OpenACC in the midst of its development. The talented people at PGI were instrumental in addressing issues that arose in one of the very first uses of OpenACC for a large commercial software package. PGI's sophisticated profiling and performance evaluation tools were vital to the success of the effort.

Specifying GPUs to Gaussian 16

The GPU implementation in Gaussian 16 is sophisticated and complex, but using it is simple and straightforward. GPUs are specified with one additional Link 0 command (or equivalent Default.Route file entry/command line option). For example, the following commands tell Gaussian to run the calculation using 24 compute cores plus 8 GPUs + 8 controlling cores (32 cores total):

   %CPU=0-31         Request 32 CPUs for the calculation: 24 cores for computation, and 8 cores to control GPUs (see below).
   %GPUCPU=0-7=0-7   Use GPUs 0-7 with CPUs 0-7 as their controllers.

Detailed information is available on our website.
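As an illustration of the directive style just described — a hypothetical sketch, not Gaussian source code; the routine and array names are invented — a Fortran loop can be offloaded with the quoted directive plus data clauses of the kind mentioned above:

   ! Hypothetical sketch: the directive identifies the loop to parallelize,
   ! and the data clauses move the (illustrative) arrays to and from the GPU.
   subroutine axpy(n, alpha, x, y)
     implicit none
     integer, intent(in)    :: n
     real(8), intent(in)    :: alpha, x(n)
     real(8), intent(inout) :: y(n)
     integer :: i
     !$acc parallel loop copyin(x) copy(y)
     do i = 1, n
        y(i) = y(i) + alpha*x(i)
     end do
   end subroutine axpy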

Project Contributors: Roberto Gomperts (NVIDIA), Michael Frisch (Gaussian), Brent Leback (NVIDIA/PGI), Giovanni Scalmani (Gaussian)

Mike Frisch, Ph.D.
President and CEO
Gaussian, Inc.

Using OpenACC allowed us to continue development of our fundamental algorithms and software capabilities simultaneously with the GPU-related work. In the end, we could use the same code base for SMP, cluster/network and GPU parallelism. PGI's compilers were essential to the success of our efforts.

ANSYS FLUENT

Sunil Sathe Lead Software Developer ANSYS Fluent

We’ve effectively used OpenACC for heterogeneous computing in ANSYS Fluent with impressive performance. We’re now applying this work to more of our models and new platforms.

Image courtesy: ANSYS

VASP

Prof. Georg Kresse Computational Materials Physics University of Vienna

For VASP, OpenACC is the way forward for GPU acceleration. Performance is similar and in some cases better than CUDA C, and OpenACC dramatically decreases GPU development and maintenance efforts. We’re excited to collaborate with NVIDIA and PGI as an early adopter of CUDA Unified Memory.

NUMECA FINE/Open

David Gutzwiller Lead Software Developer NUMECA

Porting our unstructured C++ CFD solver FINE/Open to GPUs using OpenACC would have been impossible two or three years ago, but OpenACC has developed enough that we’re now getting some really good results.

MPAS-A

Richard Loft Director, Technology Development NCAR

Our team has been evaluating OpenACC as a pathway to performance portability for the Model for Prediction Across Scales (MPAS) atmospheric model. Using this approach on the MPAS dynamical core, we have achieved performance on a single P100 GPU equivalent to 2.7 dual-socket Intel Xeon nodes on our new Cheyenne supercomputer.

Image courtesy: NCAR

COSMO

Dr. Oliver Fuhrer Senior Scientist Meteoswiss

OpenACC made it practical to develop for GPU-based hardware while retaining a single source for almost all the COSMO physics code.

GAMERA FOR GPU

Takuma Yamaguchi, Kohei Fujita, Tsuyoshi Ichimura, Muneo Hori, Lalith Wijerathne The University of Tokyo

With OpenACC and a compute node based on NVIDIA's Tesla P100 GPU, we achieved more than a 14X speed-up over a K Computer node running our earthquake disaster simulation code.

Map courtesy University of Tokyo

QUANTUM ESPRESSO

Filippo Spiga Head of Research Software Engineering University of Cambridge

CUDA Fortran gives us the full performance potential of the CUDA programming model and NVIDIA GPUs. !$CUF KERNELS directives give us productivity and source code maintainability. It’s the best of both worlds.

OPENACC AND CUDA UNIFIED MEMORY

Programming GPU-Accelerated Systems
CUDA Unified Memory for Dynamically Allocated Data

[Diagram: GPU Developer View (separate System Memory and GPU Memory over PCIe) vs. GPU Developer View With CUDA Unified Memory (a single Unified Memory)]

How CUDA Unified Memory Works on TESLA GPUs
Servicing CPU and GPU Page Faults for Allocatable Data

   __global__
   void setValue(char *ptr, int index, char val)
   {
     ptr[index] = val;
   }

   cudaMallocManaged(&array, size);
   memset(array, 0, size);
   setValue<<<...>>>(array, size/2, 5);
   ...

[Diagram: CPU Memory Mapping and GPU Memory Mapping of the same managed array; CPU and GPU page faults are serviced over PCIe or NVLink]

PGI OpenACC and CUDA Unified Memory
Compiling with the -ta=tesla:managed option

[Diagram: GPU Developer View With CUDA Unified Memory — a single Unified Memory]

   #pragma acc data copyin(a,b) copyout(c)
   {
     ...
     #pragma acc parallel
     {
       #pragma acc loop gang vector
       for (i = 0; i < n; ++i) {
         c[i] = a[i] + b[i];
         ...
       }
     }
     ...
   }

C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory

PGI OpenACC and CUDA Unified Memory
Compiling with the -ta=tesla:managed option

[Diagram: GPU Developer View With CUDA Unified Memory — a single Unified Memory]

   ...
   #pragma acc parallel
   {
     #pragma acc loop gang vector
     for (i = 0; i < n; ++i) {
       c[i] = a[i] + b[i];
       ...
     }
   }
   ...

C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory
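A minimal Fortran sketch of that last point (assumed example, not from the slides): built with -ta=tesla:managed, the allocatable arrays below are placed in CUDA Unified Memory, so the compute region needs no explicit data clauses.

   ! Assumed example: build with  pgfortran -fast -ta=tesla:managed vecadd.f90
   program vecadd
     implicit none
     integer, parameter :: n = 1000000
     real, allocatable :: a(:), b(:), c(:)
     integer :: i
     allocate(a(n), b(n), c(n))      ! allocate is mapped to CUDA Unified Memory
     a = 1.0
     b = 2.0
     !$acc parallel loop             ! no copyin/copyout clauses required
     do i = 1, n
        c(i) = a(i) + b(i)
     end do
     print *, c(1), c(n)
     deallocate(a, b, c)
   end program vecadd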

Center for Accelerated Application Readiness (CAAR)
Oak Ridge Leadership Computing Facility

IBM POWER9 CPUs

NVIDIA Volta V100 GPUs

GTC: An OpenACC Production Application
Being ported for runs on the ORNL Summit supercomputer

The gyrokinetic toroidal code (GTC) is a massively parallel, particle-in-cell production code for turbulence simulation in support of the burning plasma experiment ITER, the crucial next step in the quest for fusion energy.

http://phoenix.ps.uci.edu/gtc_group

GTC Performance using OpenACC
OpenPOWER | NVLink | Unified Memory | P100 | V100

[Chart: GTC speedup relative to a 20-core POWER8 node — roughly 6x with 2x P100, 12x with 4x P100, and 16.5x with x64 + 4x V100. The CUDA Unified Memory (UM) and explicit data-directive versions perform within a few percent of each other.]

P8: IBM POWER8NVL, 2 sockets, 20 cores, NVLINK
UM: No Data Directives in sources, compiled with -ta=tesla:managed

DEEP COPY

Managing Aggregate Data Structures with OpenACC
An Example from the OpenACC port of VASP

- Real-world applications often have complex, aggregate data structures
- CUDA Unified Memory can automatically manage Deep Copy, but ...
- CUDA Unified Memory is only for allocatable data today

[Diagram: a tree of derived types from VASP —
 Derived Type 1: 3 dynamic members, 1 member of derived type 2;
 Derived Type 2: 21 dynamic members, 1 member of derived type 3, 1 member of derived type 4;
 Derived Type 3: only static members;
 Derived Type 4: 8 dynamic members, 4 members of derived type 5, 2 members of derived type 6;
 Derived Type 5: 3 dynamic members;
 Derived Type 6: 8 dynamic members]
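To make the problem concrete, here is a hedged Fortran sketch — hypothetical type, member and routine names, not VASP code — of the kind of aggregate structure involved and how the automatic full deep copy named on the next slide is meant to handle it; the build option is assumed to be the PGI deepcopy sub-option.

   ! Hypothetical sketch of Fortran automatic full deep copy.
   ! Assumed build: pgfortran -fast -ta=tesla:deepcopy grid_mod.f90
   module grid_mod
     implicit none
     type :: grid_t
        real, allocatable :: x(:), y(:)   ! dynamic members
        real :: scale                     ! static member
     end type grid_t
   contains
     subroutine update(g, n)
       type(grid_t), intent(inout) :: g
       integer, intent(in) :: n
       integer :: i
       ! With deep copy enabled, copy(g) is intended to move g and its
       ! allocatable members to the GPU and back; without it, only the
       ! top-level descriptor would move and the members would need
       ! manual handling.
       !$acc parallel loop copy(g)
       do i = 1, n
          g%y(i) = g%y(i) + g%scale * g%x(i)
       end do
     end subroutine update
   end module grid_mod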

FORTRAN AUTOMATIC FULL DEEP COPY
Fortran Derived Types

OPENACC 2.6 MANUAL DEEP COPY
Supported Today in PGI Compilers

   typedef struct points {
     float* x;  float* y;  float* z;
     int n;
     float coef, direction;
   } points;

   void sub ( int n, float* y ) {
     points p;
     #pragma acc data create (p)
     {
       p.n = n;
       p.x = ( float*) malloc ( sizeof ( float )*n );
       p.y = ( float*) malloc ( sizeof ( float )*n );
       p.z = ( float*) malloc ( sizeof ( float )*n );
       #pragma acc update device (p.n)
       #pragma acc data copyin (p.x[0:n], p.y[0:n])
       {
         #pragma acc parallel loop
         for ( i = 0; i < n; ++i ) {
           ...
         }
       }
     }
   }

   typedef struct points {
     float* x;  float* y;  float* z;
     int n;
     float coef, direction;
     #pragma acc policy inout(x[0:n],y[0:n])
   } points;

   void sub ( int n, float* y ) {
     points p;

     p.n = n;
     p.x = ( float*) malloc ( sizeof ( float )*n );
     p.y = ( float*) malloc ( sizeof ( float )*n );
     p.z = ( float*) malloc ( sizeof ( float )*n );

     #pragma acc data copy (p)
     {
       #pragma acc parallel loop
       for ( i = 0; i < n; ++i ) {
         ...
       }
     }
   }

CLOVERLEAF
AWE Hydrodynamics mini-App, bm32 data set
http://uk-mac.github.io/CloverLeaf

[Chart: CloverLeaf speedup vs. a single Haswell core. PGI 18.1 OpenACC and Intel 2018 OpenMP on multicore Haswell, Broadwell and Skylake reach roughly 7.6x to 15x; PGI OpenACC on Kepler, Pascal and Volta V100 GPUs reaches roughly 40x to 142x.]

Systems: Haswell: 2x16 core Haswell server, four K80s, CentOS 7.2 (perf-hsw10); Broadwell: 2x20 core Broadwell server, eight P100s (dgx1-prd-01); Broadwell server, eight V100s (dgx07); Skylake: 2x20 core Xeon Gold server (sky-4). Compilers: Intel 2018.0.128, PGI 18.1. Benchmark: CloverLeaf v1.3 downloaded from http://uk-mac.github.io/CloverLeaf the week of November 7, 2016; CloverLeaf_Serial; CloverLeaf_ref (MPI+OpenMP); CloverLeaf_OpenACC (MPI+OpenACC). Data compiled by PGI February 2018.

OPENACC DIRECTIVES FOR GPUS

   75 !$ACC KERNELS
   76 !$ACC LOOP INDEPENDENT
   77 DO k=y_min,y_max
   78 !$ACC LOOP INDEPENDENT PRIVATE(right_flux,left_flux,top_flux,bottom_flux,total_flux, &
          min_cell_volume,energy_change,recip_volume)
   79 DO j=x_min,x_max
   80
   81   left_flux=  (xarea(j  ,k  )*(xvel0(j  ,k  )+xvel0(j  ,k+1) &
   82               +xvel0(j  ,k  )+xvel0(j  ,k+1)))*0.25_8*dt*0.5
   83   right_flux= (xarea(j+1,k  )*(xvel0(j+1,k  )+xvel0(j+1,k+1) &
   84               +xvel0(j+1,k  )+xvel0(j+1,k+1)))*0.25_8*dt*0.5
   85   bottom_flux=(yarea(j  ,k  )*(yvel0(j  ,k  )+yvel0(j+1,k  ) &
   86               +yvel0(j  ,k  )+yvel0(j+1,k  )))*0.25_8*dt*0.5
   87   top_flux=   (yarea(j  ,k+1)*(yvel0(j  ,k+1)+yvel0(j+1,k+1) &
   88               +yvel0(j  ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5
   89   total_flux=right_flux-left_flux+top_flux-bottom_flux
   90
   91   volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux)
   92
   93   min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux &
   94                      ,volume(j,k)+right_flux-left_flux &
   95                      ,volume(j,k)+top_flux-bottom_flux)
   97   recip_volume=1.0/volume(j,k)
   99   energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*...
  101   energy1(j,k)=energy0(j,k)-energy_change
  103   density1(j,k)=density0(j,k)*volume_change(j,k)
  105 ENDDO
  106 ENDDO
  107 !$ACC END KERNELS

   % pgfortran -fast -ta=tesla -Minfo -c PdV_kernel.f90
   pdv_kernel:
   ...
   77, Loop is parallelizable
   79, Loop is parallelizable
       Accelerator kernel generated
       Generating Tesla code
       77, !$acc loop gang, vector(4)  ! blockidx%y threadidx%y
       79, !$acc loop gang, vector(32) ! blockidx%x threadidx%x
   ...

http://uk-mac.github.io/CloverLeaf

OPENACC DIRECTIVES FOR MULTICORE CPUS

(Same PdV_kernel.f90 source, lines 75-107, as on the previous slide.)

   % pgfortran -fast -ta=multicore ... PdV_kernel.f90
   pdv_kernel:
   ...
   77, Loop is parallelizable
       Generating Multicore code
       77, !$acc loop gang
   79, Loop is parallelizable
       3 loop-carried redundant expressions removed with 9 operations and 9 arrays
       Innermost loop distributed: 2 new loops
       Generated vector SIMD code for the loop
       Generated 2 prefetch instructions for the loop
       Generated 12 prefetch instructions for the loop
   ...

http://uk-mac.github.io/CloverLeaf

FORTRAN 2018 DO CONCURRENT

   77 DO CONCURRENT (k=y_min:y_max, j=x_min:x_max) &
   78    LOCAL (right_flux,left_flux,top_flux,bottom_flux,total_flux, &
              min_cell_volume,energy_change,recip_volume)
   ...
      (lines 81-103: the same loop body as in the OpenACC version above)
   ...
  106 ENDDO

Fortran 2018 DO CONCURRENT
+ True Parallel Loops
+ Loop-scope shared/private data
− No support for reductions
− No support for atomics
− No support for data management

OPENACC FOR EVERYONE
The PGI Community Edition, pgicompilers.com/community

                        Community Edition       Professional           Enterprise
                        FREE

  PROGRAMMING MODELS    OpenACC, CUDA Fortran, OpenMP, C/C++/Fortran Compilers and Tools
  PLATFORMS             X86, OpenPOWER, NVIDIA GPU
  UPDATES               1-2 times a year        6-9 times a year       6-9 times a year
  SUPPORT               User Forums             PGI Support            PGI Premier Services
  LICENSE               Annual                  Perpetual              Volume/Site
