Multiresolution Parallel Programming with Chapel

Vassily Litvinov and the Chapel team, Cray Inc.
HPC Advisory Council Conference, 13 September 2012

Outline:
  • Motivation: Programming Models
  • Multiresolution Programming
  • Empirical Evaluation
  • About the Project

HPC performance milestones, and how each was programmed:
  • 1 GF – 1988: Cray Y-MP; 8 processors
      Static finite element analysis
      Fortran77 + Cray autotasking + vectorization
  • 1 TF – 1998: Cray T3E; 1,024 processors
      Modeling of metallic magnet atoms
      Fortran + MPI (Message Passing Interface)
  • 1 PF – 2008: Cray XT5; 150,000 processors
      Superconductive materials
      C++/Fortran + MPI + vectorization
  • 1 EF – ~2018: Cray ____; ~10,000,000 processors
      Application: TBD
      Programming model: TBD (C/C++/Fortran + MPI + CUDA/OpenCL/OpenMP/OpenACC?
      Or perhaps something completely different?)

The STREAM Triad computation:

  Given:   m-element vectors A, B, C
  Compute: ∀ i in 1..m, A_i = B_i + α · C_i

  [Figure: the element-wise computation A = B + α·C, shown first serially, then in
  parallel, then in parallel across distributed memories, then in parallel across
  distributed memories with multiple cores per node.]

MPI version (from the HPCC benchmark suite):

  #include <hpcc.h>

  static int VectorSize;
  static double *a, *b, *c;

  int HPCC_StarStream(HPCC_Params *params) {
    int myRank, commSize;
    int rv, errCount;
    MPI_Comm comm = MPI_COMM_WORLD;

    MPI_Comm_size( comm, &commSize );
    MPI_Comm_rank( comm, &myRank );

    rv = HPCC_Stream( params, 0 == myRank );
    MPI_Reduce( &rv, &errCount, 1, MPI_INT, MPI_SUM, 0, comm );

    return errCount;
  }

  int HPCC_Stream(HPCC_Params *params, int doIO) {
    register int j;
    double scalar;

    VectorSize = HPCC_LocalVectorSize( params, 3, sizeof(double), 0 );

    a = HPCC_XMALLOC( double, VectorSize );
    b = HPCC_XMALLOC( double, VectorSize );
    c = HPCC_XMALLOC( double, VectorSize );

    if (!a || !b || !c) {
      if (c) HPCC_free(c);
      if (b) HPCC_free(b);
      if (a) HPCC_free(a);
      if (doIO) {
        fprintf( outFile, "Failed to allocate memory (%d).\n", VectorSize );
        fclose( outFile );
      }
      return 1;
    }

    for (j=0; j<VectorSize; j++) {
      b[j] = 2.0;
      c[j] = 3.0;
    }

    scalar = 3.0;

    for (j=0; j<VectorSize; j++)
      a[j] = b[j]+scalar*c[j];

    HPCC_free(c);
    HPCC_free(b);
    HPCC_free(a);

    return 0;
  }
MPI + OpenMP version: the same listing as above, with OpenMP added in three places,
an include at the top and a parallel-for pragma on each of the two loops:

  #ifdef _OPENMP
  #include <omp.h>
  #endif

  ...

  #ifdef _OPENMP
  #pragma omp parallel for
  #endif
  for (j=0; j<VectorSize; j++) {
    b[j] = 2.0;
    c[j] = 3.0;
  }

  ...

  #ifdef _OPENMP
  #pragma omp parallel for
  #endif
  for (j=0; j<VectorSize; j++)
    a[j] = b[j]+scalar*c[j];

CUDA version; the slide juxtaposes it with the MPI + OpenMP code and asks:
"Where is the programmer productivity??"

  #define N 2000000

  int main() {
    float *d_a, *d_b, *d_c;
    float scalar;

    cudaMalloc((void**)&d_a, sizeof(float)*N);
    cudaMalloc((void**)&d_b, sizeof(float)*N);
    cudaMalloc((void**)&d_c, sizeof(float)*N);

    dim3 dimBlock(128);
    dim3 dimGrid(N/dimBlock.x);
    if( N % dimBlock.x != 0 ) dimGrid.x+=1;

    set_array<<<dimGrid,dimBlock>>>(d_b, 2.0f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, 3.0f, N);

    scalar=3.0f;
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    cudaThreadSynchronize();

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
  }

  __global__ void set_array(float *a, float value, int len) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < len) a[idx] = value;
  }

  __global__ void STREAM_Triad( float *a, float *b, float *c,
                                float scalar, int len) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < len) c[idx] = a[idx]+scalar*b[idx];
  }

Examples:

  Type of HW parallelism              Programming model       Unit of parallelism
  ----------------------------------  ----------------------  -------------------
  Inter-node                          MPI                     process
  Intra-node/multicore                OpenMP/pthreads         iteration/task
  Instruction-level vectors/threads   pragmas                 iteration
  GPU/accelerator                     CUDA/OpenCL/OpenACC     SIMD function/task

HPC has traditionally given users…
  • …low-level, control-centric programming models
  • …ones that are closely tied to the underlying hardware
  • …ones that support only a single type of parallelism

  Benefits:  lots of control; decent generality; easy to implement
  Downsides: lots of user-managed detail; brittle to changes

Chapel version, shown on the slide overlaid on the greyed-out MPI + OpenMP and CUDA
listings, with the dmapped clause labeled "the special sauce":

  config const m = 1000,
               alpha = 3.0;

  const ProblemSpace = [1..m] dmapped …;

  var A, B, C: [ProblemSpace] real;

  B = 2.0;
  C = 3.0;

  A = B + alpha * C;

Philosophy: Good language design can tease details of locality and parallelism away
from an algorithm, permitting the compiler, runtime, applied scientist, and HPC
expert each to focus on their strengths.
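For reference, here is a minimal, self-contained sketch of the fragment above that
compiles and runs with chpl as-is. It is a sketch rather than the official HPCC STREAM
entry: it keeps the slide's names but leaves the distribution out (the slide's dmapped
clause stays elided), so the arrays live on a single locale, and it uses the
curly-brace domain literals of present-day Chapel where the 2012 slide used square
brackets.

  // Minimal single-locale sketch (an illustration, not the HPCC entry).
  config const m = 1000,            // problem size; can be overridden on the
               alpha = 3.0;         // command line, e.g. --m=10000

  const ProblemSpace = {1..m};      // the slide maps this domain with "dmapped ..."
                                    // to spread the arrays over compute nodes; the
                                    // distribution itself is elided on the slide

  var A, B, C: [ProblemSpace] real; // three m-element vectors

  B = 2.0;                          // whole-array assignments
  C = 3.0;

  A = B + alpha * C;                // the Triad as one promoted array statement,
                                    // executed in parallel by the compiler/runtime

  writeln(A[1]);                    // prints 11.0

The point the slide makes with "the special sauce" is that moving this code from
shared to distributed memory only requires giving ProblemSpace a distribution via
dmapped; the declarations of A, B, C and the computation itself do not change.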
Chapel: a parallel language that has emerged from DARPA HPCS
  • general parallelism: data-, task-, and nested parallelism; highly dynamic
    multithreading or static SPMD-style
  • multiresolution philosophy: high-level features built on low-level to provide
    “manual overrides”, to support a separation of concerns
    (application vs. parallel experts)
  • locality control: explicit or data-driven placement of data and tasks;
    locality expressed distinctly from parallelism
  • features for productivity: type inference, iterators, rich array types
  • portable: designed and implemented to support diverse systems
  • open source: developed and distributed under the BSD license
  • plausibly adoptable: forward-thinking HPC users want a mature version

Outline:
  • Motivation
  • Multiresolution programming
  • Empirical Evaluation
  • About the Project

1) General Parallel Programming
2) Global-View Abstractions
3) Multiresolution
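To make the "general parallelism" and "locality control" bullets above concrete, here
is a short, hypothetical sketch (it does not appear in the talk; the array A, the size
n, and the printed messages are made up for illustration): forall expresses data
parallelism, coforall creates one explicit task per iteration, and on clauses place
execution on a particular locale (compute node), independently of how parallelism is
expressed.

  config const n = 8;
  var A: [1..n] real;

  // Data parallelism: the iterations of a forall may execute in parallel.
  forall i in 1..n do
    A[i] = i * 2.0;

  // Task parallelism + locality: one task per locale, each placed on its locale.
  coforall loc in Locales do
    on loc do
      writeln("task on locale ", here.id, " of ", numLocales);

  // Locality on its own: 'on' just moves execution, with no new parallelism.
  on Locales[numLocales-1] do
    writeln("running on the last locale");

This is the multiresolution idea in miniature: the promoted array statement in the
STREAM example and these explicit forall/coforall/on constructs are layers of one
language, so a program can start at the high level and drop down for manual control
only where it is needed.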
