Multiresolution Parallel Programming with Chapel
Vassily Litvinov and the Chapel team, Cray Inc.
HPC Advisory Council Conference, 13 September 2012

Outline:
• Motivation: Programming Models
• Multiresolution Programming
• Empirical Evaluation
• About the Project

Three decades of Cray systems and the applications they ran:
• 1 GF, 1988: Cray Y-MP, 8 processors; static finite element analysis
• 1 TF, 1998: Cray T3E, 1,024 processors; modeling of metallic magnet atoms
• 1 PF, 2008: Cray XT5, 150,000 processors; superconductive materials
• 1 EF, ~2018: Cray ____, ~10,000,000 processors; TBD

The same systems, seen through the programming models used on them:
• 1 GF, 1988: Cray Y-MP; Fortran77 + Cray autotasking + vectorization
• 1 TF, 1998: Cray T3E; Fortran + MPI (Message Passing Interface)
• 1 PF, 2008: Cray XT5; C++/Fortran + MPI + vectorization
• 1 EF, ~2018: TBD; perhaps C/C++/Fortran + MPI + CUDA/OpenCL/OpenMP/OpenACC, or perhaps something completely different?

The running example, STREAM Triad:
Given: m-element vectors A, B, C
Compute: for all i in 1..m, A(i) = B(i) + α·C(i)

[Figure: vectors A, B, C and scalar α, computed element by element]
[Figure: the same computation divided among parallel tasks]
[Figure: the same computation with the vectors partitioned across distributed-memory nodes]
[Figure: the same computation, distributed memory with multiple cores per node]

MPI version (from the HPCC benchmark suite):

#include <hpcc.h>

static int VectorSize;
static double *a, *b, *c;

int HPCC_StarStream(HPCC_Params *params) {
  int myRank, commSize;
  int rv, errCount;
  MPI_Comm comm = MPI_COMM_WORLD;

  MPI_Comm_size( comm, &commSize );
  MPI_Comm_rank( comm, &myRank );

  rv = HPCC_Stream( params, 0 == myRank);
  MPI_Reduce( &rv, &errCount, 1, MPI_INT, MPI_SUM, 0, comm );

  return errCount;
}

int HPCC_Stream(HPCC_Params *params, int doIO) {
  register int j;
  double scalar;

  VectorSize = HPCC_LocalVectorSize( params, 3, sizeof(double), 0 );

  a = HPCC_XMALLOC( double, VectorSize );
  b = HPCC_XMALLOC( double, VectorSize );
  c = HPCC_XMALLOC( double, VectorSize );

  if (!a || !b || !c) {
    if (c) HPCC_free(c);
    if (b) HPCC_free(b);
    if (a) HPCC_free(a);
    if (doIO) {
      fprintf( outFile, "Failed to allocate memory (%d).\n", VectorSize );
      fclose( outFile );
    }
    return 1;
  }

  /* initialize the input vectors */
  for (j=0; j<VectorSize; j++) {
    b[j] = 2.0;
    c[j] = 3.0;
  }

  scalar = 3.0;

  /* the Triad computation itself */
  for (j=0; j<VectorSize; j++)
    a[j] = b[j]+scalar*c[j];

  HPCC_free(c);
  HPCC_free(b);
  HPCC_free(a);

  return 0;
}

MPI + OpenMP version: the same code, with OpenMP directives added. At the top:

#ifdef _OPENMP
#include <omp.h>
#endif

and immediately before each of the two loops (the initialization loop and the Triad loop):

#ifdef _OPENMP
#pragma omp parallel for
#endif

CUDA version (single GPU; shown on the slide alongside the MPI + OpenMP code):

#define N 2000000

/* one thread per element: fill an array with a constant */
__global__ void set_array(float *a, float value, int len) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) a[idx] = value;
}

/* one thread per element: c = a + scalar*b */
__global__ void STREAM_Triad(float *a, float *b, float *c,
                             float scalar, int len) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx]+scalar*b[idx];
}

int main() {
  float *d_a, *d_b, *d_c;
  float scalar;

  cudaMalloc((void**)&d_a, sizeof(float)*N);
  cudaMalloc((void**)&d_b, sizeof(float)*N);
  cudaMalloc((void**)&d_c, sizeof(float)*N);

  dim3 dimBlock(128);
  dim3 dimGrid(N/dimBlock.x);
  if( N % dimBlock.x != 0 ) dimGrid.x+=1;

  set_array<<<dimGrid,dimBlock>>>(d_b, 2.0f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, 3.0f, N);

  scalar = 3.0f;
  STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  cudaThreadSynchronize();

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  return 0;
}

Where is programmer productivity??

Examples of the models in use today:

Type of HW parallelism              Programming model       Unit of parallelism
Inter-node                          MPI                     process
Intra-node / multicore              OpenMP / pthreads       iteration / task
Instruction-level vectors/threads   pragmas                 iteration
GPU / accelerator                   CUDA/OpenCL/OpenACC     SIMD function / task

HPC has traditionally given users…
…low-level, control-centric programming models
…ones that are closely tied to the underlying hardware
…ones that support only a single type of parallelism
Benefits: lots of control; decent generality; easy to implement.
Downsides: lots of user-managed detail; brittle to changes.

The same computation in Chapel (the entire program, replacing all of the MPI + OpenMP + CUDA code above):

config const m = 1000,
             alpha = 3.0;

const ProblemSpace = [1..m] dmapped …;    // "the special sauce"

var A, B, C: [ProblemSpace] real;

B = 2.0;
C = 3.0;

A = B + alpha * C;

Philosophy: Good language design can tease details of locality and parallelism away from an algorithm, permitting the compiler, runtime, applied scientist, and HPC expert each to focus on their strengths.
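The slide leaves the dmapped clause elided; the distribution it names is what maps ProblemSpace, and therefore the arrays and the work, across compute nodes. As an illustration only (my assumption, not something the slide specifies), one common way to complete it is with Chapel's standard Block distribution, written here in the era's syntax:

use BlockDist;                  // standard distributions module (assumed completion)

config const m = 1000,
             alpha = 3.0;

// Block-distribute the indices 1..m evenly across the locales (compute nodes)
// the program runs on; this stands in for the elided "dmapped …" above.
const ProblemSpace = [1..m] dmapped Block(boundingBox=[1..m]);

// Arrays declared over the domain inherit its distribution.
var A, B, C: [ProblemSpace] real;

B = 2.0;
C = 3.0;

// The same whole-array statement as on the slide; each locale now computes
// the elements it owns, in parallel.
A = B + alpha * C;

Swapping in a different distribution (for example, a cyclic one) would change how data and work are mapped to nodes without touching the computation itself.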
Chapel: a parallel language that has emerged from DARPA HPCS
• general parallelism: data-, task-, and nested parallelism; highly dynamic multithreading or static SPMD-style
• multiresolution philosophy: high-level features built on low-level ones to provide "manual overrides", supporting a separation of concerns between application experts and parallel experts (see the sketch after this list)
• locality control: explicit or data-driven placement of data and tasks; locality is expressed distinctly from parallelism
• features for productivity: type inference, iterators, rich array types
• portable: designed and implemented to support diverse systems
• open source: developed and distributed under the BSD license
• plausibly adoptable: forward-thinking HPC users want a mature version
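To make the "manual overrides" idea concrete, here is a small sketch of my own (not from the slides) of the Triad written at three levels; the syntax follows the 2012-era Chapel used elsewhere in this talk:

config const m = 1000,
             alpha = 3.0;

var A, B, C: [1..m] real;

B = 2.0;
C = 3.0;

// Level 1: global-view, whole-array expression; the compiler and runtime decide
// how to parallelize and place the work.
A = B + alpha * C;

// Level 2: an explicit data-parallel loop; the programmer names the iteration
// structure but still leaves scheduling and placement to the implementation.
forall i in 1..m do
  A[i] = B[i] + alpha * C[i];

// Level 3 (a "manual override"): explicit tasks and explicit placement,
// one task per locale, each handling its own block of indices.
// (With non-distributed arrays, as here, remote locales access A, B, C on
// locale 0; with a dmapped domain, each locale would own its own block.)
coforall loc in Locales do
  on loc {
    const chunk = (m + numLocales - 1) / numLocales;
    const myLo  = loc.id * chunk + 1;
    const myHi  = min(m, myLo + chunk - 1);
    for i in myLo..myHi do
      A[i] = B[i] + alpha * C[i];
  }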
Outline, revisited:
• Motivation (above)
• Multiresolution Programming (next)
• Empirical Evaluation
• About the Project

1) General Parallel Programming
2) Global-View Abstractions
3) Multiresolution
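The next part of the talk develops these three themes. As a small illustrative preview of the first, and an assumption of mine rather than slide content, Chapel's basic task-parallel constructs (which complement the data-parallel features shown above) look like this in the Chapel of this era:

config const numTasks = 4;

// begin: spawn an asynchronous task; a sync variable lets the main task wait for it.
var done: sync bool;
begin {
  writeln("hello from an asynchronous task");
  done.writeEF(true);        // fill the sync variable
}
done.readFE();               // block until the spawned task has filled it

// cobegin: run each statement in the block as its own task, then join them all.
cobegin {
  writeln("task A");
  writeln("task B");
}

// coforall: one task per loop iteration, all joined at the end of the loop.
coforall tid in 1..numTasks do
  writeln("hello from task ", tid);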