
2020 IEEE Intl Conf on Parallel & Distributed Processing with Applications, Big Data & Cloud Computing, Sustainable Computing & Communications, Social Computing & Networking (ISPA/BDCloud/SocialCom/SustainCom)

        

State Key Laboratory of Computer Architecture, Institute of Computing Technology, Chinese Academy of Sciences, Beijing
School of Computer and Control Engineering, University of Chinese Academy of Sciences, Beijing
Email: caohuawei@ict.ac.cn

978-0-7381-3199-3/20/$31.00 ©2020 IEEE    DOI 10.1109/ISPA-BDCloud-SocialCom-SustainCom51426.2020.00094

Abstract—With the development of computer technology, the data generated in our daily life is growing rapidly. Data-intensive algorithms take a more and more important part in high performance computing. Breadth-First Search (BFS) is a typical data-intensive algorithm, characterized by intensive irregular memory access, low computation intensity and strong data dependency. Although the Graphics Processing Unit (GPU) offers massive parallelism, BFS is not GPU-friendly due to the above characteristics. To utilize the power of the GPU, efficient scheduling of massive threads and better utilization of the GPU memory hierarchy are required. In this paper, we focus on a highly efficient implementation of BFS on the GPU platform. We propose three optimization techniques, including fine-grained parallelism, a GPU-oriented CSR structure and vertex quick-search, to overcome performance bottlenecks in BFS. Fine-grained parallelism can improve the workload balance for parallel implementation in the top-down stage. The GPU-oriented structure employs a GPU-friendly data layout which can improve the efficiency of memory access. Vertex quick-search is proposed to reduce redundant graph computations on the GPU. Finally, we conduct extensive experiments on a GPU-based platform to verify the effectiveness of these techniques. We achieve 237.94 GTEPS for the Kronecker graph with 2^26 vertices and 2^30 edges. In terms of energy efficiency, our implementation ranks 1st place on the November 2019 Green Graph500 list.

I. INTRODUCTION

With the development of the information society, data is continuously generated in our daily life. Graph analytics, a wave of big data analysis, has emerged as a new method to explore and use these data to facilitate people's lives [?]. Many problems in reality can be abstracted and described with graphs. As one of the most important data structures, the graph is widely used in various fields including protein interaction analysis, ground transportation, social science and machine learning [?]. Breadth-First Search (BFS) is a typical graph algorithm and the core component of many high-level graph analyses such as connected components, centrality and single-source shortest paths [?]. BFS is characterized by intensive irregular memory access, low computation intensity and strong data dependency, which are quite different from compute-intensive workloads. The Graph500 list was introduced to rank computer performance towards data-intensive applications, and BFS is one of the key kernels of the Graph500 benchmark [?].

Originally, GPUs were created to solve the problem of graphics rendering. Due to its massive parallelism, high bandwidth and low energy consumption, the GPU has become an attractive platform for high performance computing. Compared to the CPU, the GPU has a different memory hierarchy, a different execution mode and more computation units. However, due to the irregular characteristics of graph traversal, it is difficult to achieve high performance on traditional multicore platforms, and especially on the GPU. The problem is further aggravated for scale-free graphs, an essential class of real-world graphs which follow a power-law distribution [?]. The topology of scale-free graphs restricts efficient BFS implementation on the GPU and can cause severe workload imbalance. Memory divergence creates additional challenges in BFS processing: due to inconsecutive memory accesses within a warp, it can cause a large amount of load and store transactions during the traversal.

To tackle such challenges and efficiently utilize the massive parallelism of the GPU, many optimization methods have been put forward in recent years. Harish and Narayanan proposed a BFS implementation on GPU based on vertex-centric processing that identifies active vertices by scanning vertex status [?]. Hong et al. put forward the virtual warp to improve workload balance [?]; the neighbor list of each active vertex is processed by a group of threads instead of one thread. Merrill et al. proposed a linear parallelization of the BFS algorithm that mapped the workload of a single vertex to a single thread, warp or block depending on its out-degree, and achieved high performance on GPU [10]. Beamer et al. proposed a direction-optimizing scheme that combined the traditional top-down approach with a novel bottom-up approach, which can dramatically reduce the number of redundant edge traversals [9]. Liu et al. implemented an efficient hybrid BFS algorithm with degree-based classification of vertices to deal with the workload imbalance problem [13]. Sabet et al. constructed a virtually transformed structure which limits the workload of each vertex [16]. Although the above techniques have improved the efficiency of BFS on GPU, further performance gains can be achieved with GPU-specific optimizations.

In this paper, we focus on improving BFS performance on the NVIDIA GPU platform and use a Tesla P100 in our experiments. The detailed optimizations are presented in order to deal with workload imbalance, memory access divergence and redundant calculations on the GPU.

Specifically, we make the following contributions:

1. Due to graph topology and SIMT execution, there exists severe workload imbalance on scale-free graphs. We develop a fine-grained parallelism method to improve the workload balance.

2. The original CSR data structure is not GPU-friendly, which causes a memory divergence problem. We develop a GPU-oriented CSR layout to improve the efficiency of memory access.

3. By leveraging the bitmap structure, we further propose a vertex quick-search method to find all unvisited vertices. It can highly reduce the amount of redundant computations in the status check procedure.

4. We conduct extensive experiments on the P100 platform to verify the effectiveness of the proposed techniques. Our implementation achieves 237.94 GTEPS for the Kronecker graph with 2^26 vertices and 2^30 edges. It ranks 1st on the November 2019 Green Graph500 list.

II. BACKGROUND

BFS is a widely used graph algorithm and an important building block of many graph analysis algorithms. To facilitate BFS performance, there has been a lot of work on parallel implementations of the BFS algorithm. In this section, we will present some preliminary concepts concerning the GPU and some state-of-the-art optimizations for BFS.

A. GPU Concepts

Normally, one GPU contains dozens of Streaming Multiprocessors (SMs). For example, the P100 consists of 56 SMs. Each SM contains 64 single-precision CUDA cores and 32 double-precision cores. With numerous processing units, the GPU can offer outstanding computing power.

The execution model of the GPU is quite different from the CPU. The GPU schedules threads in the form of warps (32 adjacent threads) and executes in Single-Instruction Multiple-Threads (SIMT) fashion. The SIMT execution model is very efficient for regular computations [20].

The memory hierarchy of the GPU is also different from the CPU. The P100 offers 16 GB global memory and 4096 KB L2 cache. Each SM contains a 256 KB register file and 64 KB dedicated shared memory. The shared memory is a configurable cache in the SM. All the threads in the same Cooperative Thread Array (CTA) can communicate through shared memory and execute in the same SM.

B. CSR Format

In order to reduce the memory footprint of graph data, a graph is usually stored in Compressed Sparse Row (CSR) format, which allows streaming access to the neighboring edges of each vertex. CSR format is the compressed row storage of the adjacency matrix. The basic idea of CSR is shown in Figure 1. The CSR structure includes two arrays: the row list and the adjacency list. The adjacency list stores all the neighboring vertices of each vertex, and its size is bounded by the number of edges of the graph. The row list stores the offset of the first neighbor in the adjacency list for each vertex. The difference between adjacent values in the row list is the degree of each vertex.

Figure 1: Illustration of CSR format
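The layout maps directly onto two flat arrays. The following minimal C++ sketch (our own illustration; the paper's code is not shown) makes the two arrays and the degree computation concrete:

    // Minimal CSR container matching Figure 1. row_list[v] holds the offset of
    // vertex v's first neighbor in adj_list; degrees fall out of adjacent offsets.
    #include <vector>

    struct CsrGraph {
        std::vector<int> row_list;   // size: num_vertices + 1
        std::vector<int> adj_list;   // size: num_edges (2x for undirected graphs)

        int degree(int v) const {
            // difference between adjacent row-list values is the degree
            return row_list[v + 1] - row_list[v];
        }
        const int* neighbors_begin(int v) const { return adj_list.data() + row_list[v]; }
        const int* neighbors_end(int v)   const { return adj_list.data() + row_list[v + 1]; }
    };

Iterating from neighbors_begin(v) to neighbors_end(v) is the streaming access to a vertex's edges mentioned above.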

C. Top-down BFS

Traditional BFS is presented in a top-down manner. Given a graph G = (V, E) with vertex set V and edge set E, BFS traverses all reachable vertices starting at a source vertex. The result of the algorithm is the BFS search tree rooted at the source vertex. The pseudocode of BFS is shown in Algorithm 1.

Algorithm 1: Top-down BFS
Input: undirected graph G=(V,E), level array LA, current frontier CF, next frontier NF, adjacency list A, source vertex s.
Output: level array LA, parent map PM.
1:  LA[v] ← inf, for all v ∈ V
2:  lvl ← 0
3:  LA[s] ← lvl
4:  PM[s] ← s
5:  CF ← {s}
6:  NF ← ∅
7:  while CF is not empty do
8:      lvl++
9:      for u ∈ CF in parallel do
10:         for w ∈ A[u] do
11:             if LA[w] == inf then
12:                 PM[w] ← u
13:                 LA[w] ← lvl
14:                 NF ← NF ∪ {w}
15:     swap CF with NF
16:     NF ← ∅

At the beginning, all data structures are initialized and a random source vertex s is generated. Then, the source vertex s is put into the current frontier. If the current frontier is not empty, all the neighbors of vertices in the current frontier will be traversed. The traversal procedure marks the status of the unvisited neighbors as visited, maps their parent vertices, and puts these vertices into the next frontier. After all the vertices in the current frontier are processed, the current frontier is cleared and swapped with the next frontier. Then a new iteration starts. As described above, traditional BFS is performed in a top-down manner and generates the BFS search tree at the end of the algorithm.

D. Direction-optimizing Approach

The top-down BFS approach performs well when the number of vertices in the current frontier is small. After several iterations, the number of vertices in the current frontier increases rapidly and many vertices in the adjacency list have already been visited. However, the top-down approach still traverses all the neighbors of vertices in the current frontier, leading to massive redundant edge traversals. On the GPU platform, due to massive launching threads and limited global memory, plenty of atomic operations are needed to obtain an accurate next frontier, causing expensive computational overhead.

To overcome the limitations of the top-down approach, an effective bottom-up approach was proposed by Beamer et al. [9]. The bottom-up approach works in the opposite way compared to the top-down approach. In each level, the top-down approach traverses neighbors of vertices in the current frontier, which were expanded in the previous level, while the bottom-up approach traverses neighbors of unvisited vertices. The visiting status of vertices can be identified by a status array, which avoids the overhead of atomic operations. In the bottom-up approach, the traversal procedure of one unvisited vertex stops once a neighbor is found in the current frontier. To stop the traversal procedure earlier, a degree-aware optimization was proposed that sorts the adjacency list of each vertex by degree in descending order [21]. In our algorithm, the level array records the distance to the source vertex and also acts as the status array. The pseudocode of the bottom-up approach is shown in Algorithm 2.

Algorithm 2: Bottom-up BFS
Input: undirected graph G=(V,E), level array LA, current frontier CF, next frontier NF, adjacency list A, source vertex s.
Output: level array LA, parent map PM.
1:  LA[v] ← inf, for all v ∈ V
2:  lvl ← 0
3:  LA[s] ← lvl
4:  PM[s] ← s
5:  CF ← {s}
6:  NF ← ∅
7:  while CF is not empty do
8:      lvl++
9:      for u with LA[u] == inf in parallel do
10:         for w ∈ A[u] do
11:             if LA[w] == lvl-1 then
12:                 PM[u] ← w
13:                 LA[u] ← lvl
14:                 NF ← NF ∪ {u}
15:                 break
16:     swap CF with NF
17:     NF ← ∅

However, the bottom-up approach also has its drawbacks. When there are few vertices in the current frontier, most vertices will not be expanded in this level, causing plenty of redundant edge checks.

As we can see, bottom-up BFS is advantageous when the size of the current frontier is large, while top-down BFS is efficient when the size is small. So the top-down and bottom-up approaches are complementary. In Beamer's direction-optimizing approach [9], BFS starts to run in the top-down approach and switches to the bottom-up approach when the size of the current frontier is large enough. In the last several iterations, BFS switches back to the top-down approach when the size of the current frontier becomes small. Under an appropriate switching policy, BFS performance improves a lot [9].

E. Bitmap Optimizing

To further reduce the cost of memory access and the cache miss rate, a bitmap based optimization was proposed by Agarwal et al. [22]. It uses one bit to represent the status of one vertex. The bit value "1" means the vertex is visited, while the value "0" means the vertex is unvisited. The bitmap technique is mainly used in the bottom-up stage and many optimizing techniques are put forward based on this data structure.

In the bottom-up stage, the whole status array needs to be scanned to identify unvisited vertices. With the bitmap technique, we can load the status of more vertices in one operation and reduce the number of memory accesses. To improve the data locality, Yasui et al. proposed a vertex sorting technique that sorts the vertex indices by degree in descending order [21]. By combining the bitmap and locality-friendly techniques, memory access overhead and the cache miss rate are highly reduced.
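As an illustration of the one-bit-per-vertex idea, the following CUDA device helpers (hypothetical names; not the paper's code) pack the status into 32-bit units so that a single load fetches the status of 32 vertices, following the convention above that "1" means visited:

    // Illustrative sketch: visited_bm is a bitmap in GPU global memory with
    // one bit per vertex, packed into 32-bit units.
    __device__ inline bool is_visited(const unsigned *visited_bm, int v) {
        unsigned unit = visited_bm[v >> 5];   // one load covers 32 vertices
        return (unit >> (v & 31)) & 1u;       // test this vertex's bit
    }

    __device__ inline void mark_visited(unsigned *visited_bm, int v) {
        atomicOr(&visited_bm[v >> 5], 1u << (v & 31));
    }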

III. CHALLENGES OF BFS PARALLELISM

Although the GPU has large potential in exploiting the parallelism of BFS implementations, it incurs severe workload imbalance, memory divergence and costly status checks. These problems raise the difficulty of implementing BFS efficiently on the GPU.

A. Workload Imbalance

Normally, a scale-free graph follows a power-law distribution, which brings a serious workload imbalance problem in the top-down phase. For parallel BFS implementations, the workload is usually divided based on the number of active vertices and distributed evenly to the available threads [9, 11, 21, 22]. Due to the power-law nature, the degree of active vertices varies significantly. Although the GPU provides numerous computing cores and threads, the running time is dominated by the vertices with heavy workload. What's worse, the workload imbalance phenomenon is magnified in the SIMT execution model. The threads in the same warp are executed in SIMT fashion: although some threads finish their tasks early, their computation units can't be assigned to other threads until all threads in the same warp have finished. Another problem is the mismatch between the number of threads and the size of the current frontier. When the size of the current frontier is small, many launched threads will be idle.

In recent years, some prior works have explored fine-grained workload partition methods on GPU [10, 13, 15, 16]. These works tried to partition the processed edges evenly to each thread, so that the workload of each thread is similar. However, fine-grained methods are only beneficial in the top-down phase and most of them introduce extra overhead for task partition. According to our experiments with Kronecker graphs, less than 5% of vertices are expanded in the top-down phase, so the overhead may dilute the advantage of fine-grained partition. In this paper, we present a fine-grained partition solution to deal with workload imbalance.

B. Memory Divergence

The GPU provides not only massive parallelism, but also high memory bandwidth. On NVIDIA GPUs, regular and sequential global memory accesses can reach high bandwidth utilization. However, scale-free graphs, like social and web networks, are highly irregularly distributed. According to Khorasani, the memory access efficiency of BFS is only between 12.8% and 15.8% [12]. The irregularity of graph algorithms is the major challenge to efficiently utilizing the memory bandwidth of the GPU. Irregular memory accesses in the same warp can't be coalesced (memory divergence), leading to a large amount of load and store transactions and high latency.

There are some works focusing on improving memory efficiency during the traversal procedure by utilizing memory coalescing [10, 14, 18]. The works [10, 14] aimed at the inefficient memory problem by employing a global next frontier in the top-down approach. Zhong et al. [18] proposed a column-major layout to access message buffers efficiently. Memory coalescing is always an effective way to improve memory efficiency on GPU. In this work, we focus on the improvement of the graph data layout by taking the benefit of memory coalescing. The original CSR format is not GPU-friendly in the bottom-up phase. To make BFS more GPU-aware, we propose an improved graph data structure to achieve high memory efficiency.

C. Costly Status Check

In the bottom-up phase, the status array is checked thoroughly in each iteration to find all unvisited vertices instead of using the current frontier. Under these circumstances, atomic operations on the current frontier are avoided, but many costly status checks are involved. After a few iterations, the ratio of unvisited vertices becomes relatively small and displays a sparse distribution. However, the bottom-up algorithm continues to scan the status of all visited and unvisited vertices. For large-scale graphs, this sequential scanning has a great impact on performance.

Table I shows the number of redundant checks for the graph with scale 26 in the bottom-up phase and their ratio among all status checks. Clearly, each iteration involves redundant checks. Especially in the last few iterations, the ratio of redundant checks is up to 99%. Therefore, costly status checks introduce redundant overhead and unnecessary memory accesses, leading to performance degradation.

Table I: The number of redundant checks

Iteration | Redundant checks | Ratio
3         | 27350            | 0.08%
4         | 16444674         | 50.13%
5         | 32642738         | 99.51%
6         | 32782985         | 99.93%

IV. ALGORITHM OPTIMIZATION

In this section, we illustrate the detailed optimizations for BFS implementation on GPU. On top of the techniques in Section II, we propose three techniques addressing the challenges of BFS parallelism. These GPU-specific optimizations are used to balance workload, improve memory bandwidth utilization, and reduce redundant calculations.

A. Fine-grained Parallelism

The architecture of the GPU is far different from the CPU. One GPU device has many SMs and one SM has many CUDA cores. As a result, the number of processing units is far beyond that of a CPU. Due to the massive number of registers in an SM and memory coalescing, the number of created threads is much larger than the number of CUDA cores. Therefore, the load imbalance problem among threads cannot be ignored. With the support of dynamic parallelism in CUDA, fine-grained parallelism is proposed to solve the load imbalance problem in the top-down phase. Dynamic parallelism is supported via an extension of the CUDA library that allows a CUDA kernel to create and synchronize new nested kernels.

Most previous work performed a coarse-grained workload distribution: each thread deals with one vertex in the current frontier and gathers all its assigned neighbors in the adjacency list. Fine-grained parallelism allows us to do out-degree based task partition. Our fine-grained parallelism is similar to that in [10], but our solution limits the number of child kernels to avoid launching too many kernels, and combines it with a GPU-friendly bottom-up approach. At the beginning, a master kernel is launched. The size of the master kernel is the same as the length of the current frontier, and each thread of the master kernel deals with one vertex in the current frontier. A child kernel may then be created in a thread of the master kernel; whether the child kernel is created depends on the vertex's out-degree, and the size of the child kernel differs with varied out-degrees. Load balance in the top-down approach can be achieved through the above fine-grained dynamic mechanism. In case too many threads would be created in master and child kernels, we limit the largest dimension of the master and child kernels. In other words, one thread in the master kernel may deal with more than one vertex, and one thread in a child kernel may deal with more than one neighboring vertex.

Figure 2: Fine-grained strategy with dynamic parallelism

Figure 2 shows the fine-grained strategy in the top-down procedure. The threads in the master kernel are created to process vertices in the current frontier. A child kernel is created when the out-degree of one vertex is large enough. By using the fine-grained parallelism technique, the workload imbalance of the top-down procedure is greatly alleviated.
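The sketch below is a minimal illustration of the master/child split just described, assuming CUDA dynamic parallelism (compiled with -rdc=true on sm_35 or newer). The kernel names, the threshold, the -1 unvisited marker and the frontier layout are our assumptions, not the paper's actual code; the grid-stride loops reflect the capped kernel dimensions mentioned above.

    #define CHILD_THRESHOLD 1024   // out-degree above which a child kernel is used

    __global__ void child_expand(const int *adj, int first, int degree, int *level,
                                 int lvl, int *next_frontier, int *next_size) {
        // grid-stride loop: the child kernel's dimension is bounded by the caller
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < degree;
             i += gridDim.x * blockDim.x) {
            int w = adj[first + i];
            if (atomicCAS(&level[w], -1, lvl) == -1) {   // first visit wins
                next_frontier[atomicAdd(next_size, 1)] = w;
            }
        }
    }

    __global__ void master_expand(const int *frontier, int n, const int *row,
                                  const int *adj, int *level, int lvl,
                                  int *next_frontier, int *next_size) {
        // one master thread per frontier vertex (grid-stride if n is very large)
        for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < n;
             t += gridDim.x * blockDim.x) {
            int u = frontier[t];
            int first = row[u], degree = row[u + 1] - first;
            if (degree > CHILD_THRESHOLD) {
                // heavy vertex: delegate its whole neighbor list to a nested kernel
                int blocks = min((degree + 255) / 256, 64);  // cap child dimension
                child_expand<<<blocks, 256>>>(adj, first, degree, level, lvl,
                                              next_frontier, next_size);
            } else {
                for (int i = 0; i < degree; ++i) {
                    int w = adj[first + i];
                    if (atomicCAS(&level[w], -1, lvl) == -1)
                        next_frontier[atomicAdd(next_size, 1)] = w;
                }
            }
        }
    }

With this split, the cost of a high-degree vertex is spread over a whole nested grid instead of serializing one warp lane, which is exactly the imbalance the section targets.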

B. GPU-oriented CSR Structure

Firstly, we introduce the idea of bitmap adaptive CSR. Normally, the bitmap optimizing technique is used to reduce memory accesses in the bottom-up stage. Each thread deals with one unit of the bitmap (the size of a bitmap unit is usually 4 bytes) at a time. With the vertex sorting technique, the bitmap can offer good locality. If we use the bitmap optimizing technique on the GPU platform, the threads in the same warp face a memory divergence problem while accessing the index mapping lists, including the row list and some other auxiliary structures. If we remap the vertex indices in the bitmap, the locality of the bitmap optimizing technique will be broken. So, bitmap adaptive CSR is proposed to access the index mapping lists efficiently without breaking the benefit of the bitmap.

Figure 3: The data structure of bitmap adaptive CSR

[Equations (1) and (2): the translation from a vertex index to its location in the index mapping lists; the formulas are not recoverable from this copy.]
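Since Equations (1) and (2) are lost here, the following device-code fragment only contrasts the two access patterns the text describes; it is not the paper's actual mapping, and row_remap, warp_base and lane are illustrative names:

    // With plain CSR, lane i of a warp dereferences the row list at scattered
    // vertex ids: up to 32 separate memory transactions.
    __device__ int location_plain(const int *row_list, int vertex_id) {
        return row_list[vertex_id];
    }

    // With a bitmap-adaptive layout, the 32 entries a warp needs are assumed
    // to be stored side by side, so one coalesced transaction serves the warp.
    __device__ int location_adaptive(const int *row_remap, int warp_base, int lane) {
        return row_remap[warp_base + lane];
    }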


Warp-aligned Adjacency List. The warp-aligned adjacency list is used to improve global memory access efficiency in the edge traversal procedure. Under SIMT execution, all threads in the same warp execute the edge traversal procedure of different vertices at the same time, leading to a severe memory divergence problem under the original adjacency list. Coalesced memory access to the adjacency list requires that adjacent threads in the same warp address neighboring cells in global memory. Therefore, the warp-aligned adjacency list mixes the neighbors of the vertices processed in the same warp to follow the rule of memory coalescing.

Adjacent w_size vertices are processed by the same warp in the SIMT execution model, so all vertices are formed into groups in units of w_size. The warp-aligned adjacency list of each group is shown in Figure 4. The warp-aligned adjacency list is derived from the original adjacency list. As shown in the figure, the neighbors of vertex 0 are rearranged with the neighbors of the other vertices which will be processed in the same warp. The minus sign (-) means padding space due to the varied degrees of vertices. If we mixed all neighbors of each vertex, the padding cost would occupy a huge memory footprint: there is a trade-off between memory efficiency and memory footprint. Under the degree-aware optimization, the traversal procedure of the bottom-up approach stops within a small number of neighboring vertices. As a result, the warp-aligned adjacency list is constructed by choosing a certain number of edges. Considering memory efficiency and padding cost, the warp-aligned adjacency list only includes the first 30% of the edges of the original adjacency list. According to our experiments, the new structure brings no more than 7% memory overhead. By using this technique, the memory accesses are coalesced, thus highly improving the memory efficiency.

Figure 4: The data structure of warp-aligned adjacency list

GPU-aware Bottom-up Kernel. The bottom-up kernel with the GPU-oriented CSR structure is shown in Algorithm 3, which is different from typical bitmap-optimizing bottom-up approaches [17, 21, 23]. Unvisited vertices are identified by scanning the visit bitmap (BV). The current frontier and next frontier are replaced by the current frontier bitmap (BC) and the next frontier bitmap (BN). In the bottom-up approach, the traversal procedure of each unvisited vertex stops when one neighbor is found in the current frontier. In our algorithm, the traversal process is divided into two parts. Firstly, the warp-aligned adjacency list is used (lines 7-15). If one visited neighbor is found in the warp-aligned adjacency list, the traversal process stops immediately. Otherwise, the remaining edges in the adjacency list of this vertex still need to be traversed (lines 16-21). Due to the bitmap adaptive CSR technique, the location in the index mapping lists is given by the id_to_loc() expression.

Algorithm 3: Bottom-up with GPU-oriented CSR
Input: undirected graph G=(V,E), level array LA, adjacency list A, warp-aligned adjacency list WA, vertex visit bitmap BV, current frontier bitmap BC, next frontier bitmap BN, source vertex s.
Output: level array LA, parent map PM.
1:  for bit_unit in BV in parallel do
2:      for each bit in bit_unit do
3:          if bit == 1 then
4:              continue
5:          u ← get_vertex_id(bit)
6:          loc ← id_to_loc(u)
7:          start, edge_count ← align_edge_info(loc)
8:          i ← 0, find ← false
9:          while i < edge_count do
10:             w ← WA[start + i*w_size]
11:             if is_visited(w, BC) then
12:                 record u in LA & PM
13:                 find ← true
14:                 break
15:             i++
16:         if find == false then
17:             for w ∈ A[u] do
18:                 if is_visited(w, BC) then
19:                     record u in LA & PM
20:                     find ← true
21:                     break
22:         if find == true then
23:             change status of u in BV, BN

As can be seen in the bottom-up algorithm, accessing the index mapping lists and the adjacency list is the main overhead of memory access. In our bottom-up kernel, threads in a warp access consecutive addresses in the index mapping lists. The adjacency list is divided into two parts, the warp-aligned adjacency list and the remaining adjacency list. The memory access to the warp-aligned adjacency list is coalesced, but the access to the remainder is not.
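A host-side sketch of building the warp-aligned list of Figure 4, under stated assumptions: vertices are grouped in units of w_size, each vertex keeps only its first keep_ratio edges (the paper keeps about 30%, with neighbors already sorted by the degree-aware optimization), and -1 marks padding. Per-group start offsets and edge counts (what align_edge_info() returns in Algorithm 3) are omitted for brevity; all names are ours.

    #include <vector>
    #include <algorithm>

    std::vector<int> build_warp_aligned(const std::vector<int> &row,
                                        const std::vector<int> &adj,
                                        int num_vertices, int w_size,
                                        double keep_ratio) {
        std::vector<int> wa;
        for (int g = 0; g < num_vertices; g += w_size) {
            int group_end = std::min(g + w_size, num_vertices);
            int max_kept = 0;                 // longest truncated list in this group
            for (int v = g; v < group_end; ++v) {
                int deg = row[v + 1] - row[v];
                max_kept = std::max(max_kept, (int)(deg * keep_ratio + 0.5));
            }
            std::size_t start = wa.size();
            wa.resize(start + (std::size_t)max_kept * w_size, -1);  // -1 = padding
            for (int v = g; v < group_end; ++v) {
                int deg  = row[v + 1] - row[v];
                int kept = (int)(deg * keep_ratio + 0.5);
                for (int i = 0; i < kept; ++i)    // neighbor i of lane (v - g)
                    wa[start + (std::size_t)i * w_size + (v - g)] = adj[row[v] + i];
            }
        }
        return wa;
    }

With this layout, lane l of a warp reading WA[start + i*w_size + l] touches addresses adjacent to its neighboring lanes, which is precisely the coalescing rule Algorithm 3 exploits.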

According to our experiments, more than 95% of expanded vertices stop the traversal procedure within the warp-aligned adjacency list. Therefore, our bottom-up kernel is GPU-friendly and highly improves memory efficiency in the bottom-up phase.

C. Vertex Quick-search

By combining with the bottom-up algorithm, the redundant edge traversal can be highly reduced. In order to avoid atomic operations while using the current frontier, the bottom-up algorithm scans the status of each vertex in BV to identify unvisited vertices in each iteration. Due to the power-law distribution, the number of unvisited vertices decreases sharply and distributes sparsely after several iterations. If we continue to scan each vertex in BV, there will exist many redundant status checks, as shown in Table I. When processing large-scale graphs, sequential scanning of BV has a great impact on performance.

In order to reduce the costly status check, we propose a vertex quick-search method to effectively locate unvisited vertices in each iteration. Using the bitmap optimizing technique, each time one thread loads one bitmap unit from global memory to a register and processes the unvisited vertices in it. Normally, the width of one bitmap unit is 32 or 64. The procedure of this technique is shown in Algorithm 4. At the beginning, the bitmap unit i of BV is located in global memory and loaded into a register. The unvisited bitmap unit is generated from the loaded bitmap unit by a bitwise NOT operation. In unvis_bitunit, "1" represents unvisited status and "0" represents visited status. If all bits in unvis_bitunit are "0", the scanning process on this unit is skipped. Otherwise, the algorithm enters the vertex quick-search procedure. first_unvis contains only the least significant 1 bit of unvis_bitunit and is used to calculate the position of this bit. Then, the unvisited vertex represented by this bit can be identified. After one vertex search iteration, the least significant 1 bit in unvis_bitunit is turned into "0" before the next step.

Algorithm 4: Vertex quick-search on one bitmap unit
Input: one unvisited bitmap unit unvis_bitunit.
Output: the lowest unvisited vertex position pos, updated unvis_bitunit.
1: first_unvis = unvis_bitunit & (~unvis_bitunit + 1)
2: mask = first_unvis - 1
3: pos = __popc(mask)
4: unvis_bitunit = unvis_bitunit & ~first_unvis
5: return pos

All the unvisited vertices in this bitmap unit are located in this way. The execution flow on one bitmap unit is shown in Figure 5. The iteration of vertex quick-search proceeds from the least significant 1 bit in unvis_bitunit to the most significant 1 bit. The number of iterations is only related to the number of "1" bits in the bitmap unit instead of the bitmap unit width. The vertex quick-search technique changes the originally bitwise status scanning process into coarse-grained positioning; the time complexity of the scanning process is improved from the original O(k) to O(log(k)). In the last several iterations of the bottom-up phase, the costly status check is highly reduced. In addition, the processed bitmap unit is preloaded into a register instead of being accessed directly, which avoids the impact of the poor locality of BFS and greatly reduces the cache miss rate.

Figure 5: The iterations of vertex quick-search

Table II shows the effectiveness of the vertex quick-search technique for the graph with scale 26. The first column lists the iterations where the bottom-up policy is taken. The second and third columns show the total number of visited vertices and the number of vertices skipped by using vertex quick-search. The last column is the ratio of the skipped number to the total visited vertices. The table shows that the skip ratio increases with the visited vertices marked in the bitmap. Especially in the last bottom-up iteration, the status check of nearly 98% of vertices is skipped.

Table II: The number of skipped vertices using the vertex quick-search technique

Iteration | Visited number | Skipped number | Skip ratio
3         | 27350          | 32             | 0.12%
4         | 16444674       | 5588960        | 33.99%
5         | 32642738       | 28894304       | 88.52%
6         | 32782985       | 32133184       | 98.02%
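Algorithm 4 translates almost line for line into a CUDA device function; the wrapper name and the commented usage are ours:

    // Pops the lowest set ("unvisited") bit of a 32-bit unit and returns its
    // position; the loop count depends on the number of 1 bits, not the width.
    __device__ inline int pop_lowest_unvisited(unsigned &unvis_bitunit) {
        unsigned first_unvis = unvis_bitunit & (~unvis_bitunit + 1u); // lowest 1 bit
        int pos = __popc(first_unvis - 1u);  // count of bits below it = its index
        unvis_bitunit &= ~first_unvis;       // clear it for the next iteration
        return pos;
    }

    // Typical use on one bitmap unit preloaded into a register:
    //   unsigned unvis = ~BV[i];            // bitwise NOT: 1 = unvisited
    //   while (unvis != 0u) {
    //       int v = (i << 5) + pop_lowest_unvisited(unvis);
    //       ... process unvisited vertex v ...
    //   }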

D. Big Data Extension

Although the GPU offers high memory bandwidth, host CPU memory capacity is far beyond that of the GPU device. Compared with host memory, GPU memory is limited and precious. With the support of unified memory technology (UM) in CUDA, the GPU can process relatively large-scale graphs beyond GPU global memory by using host memory. When data is allocated in UM, the hardware and software of the CUDA system help to migrate memory pages while kernels are running.

However, if the whole graph is allocated in this way, BFS performance will be limited by the PCIe bandwidth. Similar to the idea of prefetching, we put some highly effective data in GPU memory, including the bitmaps, the warp-aligned adjacency list, etc. The bitmap representation of the graph highly reduces the memory cost and shows good locality based on vertex sorting. The warp-aligned adjacency list includes a small part of the edges of the original adjacency list, and most expanded vertices in the bottom-up phase stop the traversal procedure within it.
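A hedged sketch of this allocation split: the bulky full adjacency list goes into managed memory and may spill to the host, while the compact hot structures are plain device allocations that stay resident. Function and variable names are illustrative; the cudaMemAdvise hint is optional.

    #include <cstddef>
    #include <cuda_runtime.h>

    void allocate_graph_storage(std::size_t num_edges, std::size_t bitmap_bytes,
                                std::size_t wa_bytes) {
        int dev = 0;
        cudaSetDevice(dev);

        int *adj_um = nullptr;          // full adjacency list: may exceed GPU memory
        cudaMallocManaged((void **)&adj_um, num_edges * sizeof(int));

        unsigned *bitmaps = nullptr;    // visit/frontier bitmaps: small and hot
        cudaMalloc((void **)&bitmaps, bitmap_bytes);

        int *wa = nullptr;              // warp-aligned list: roughly 30% of edges
        cudaMalloc((void **)&wa, wa_bytes);

        // Prefer keeping the managed pages near the GPU until memory pressure
        // forces eviction to host memory over PCIe.
        cudaMemAdvise(adj_um, num_edges * sizeof(int),
                      cudaMemAdviseSetPreferredLocation, dev);
    }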

With these techniques, our implementation extends to process very large-scale graphs and achieves good performance with one GPU device. The size of the graph with scale 30 is nearly 192 GB, which is 12 times larger than the P100 global memory. We have launched our tests on Tesla P100 and V100, respectively. The P100 achieves 159.76 GTEPS for the graph with scale 29 and the V100 achieves 158.22 GTEPS for the graph with scale 30. The big data extension gives a way for large-scale graph processing on heterogeneous systems.

V. EXPERIMENTS

This work is implemented in C++ and CUDA. The code is compiled with NVIDIA nvcc 9.0 and GCC 4.8.5 with the -O3 optimization flag. All experiments and work comparisons are evaluated on the same platform, with a Xeon Gold 5118 CPU and an NVIDIA Tesla P100 GPU. The main dataset used in our evaluation are Kronecker graphs, which are generated using the Graph500 generator. The size of the graph is specified by the scale parameter; edgefactor is the average degree of the graph. The generator creates Kronecker graphs with an average edgefactor of 16 and coefficients A = 0.57, B = 0.19, C = 0.19. The generated graphs have 2^scale vertices and 2^scale × edgefactor edges, and follow a power-law degree distribution.

Performance of BFS is measured in Giga Traversed Edges Per Second (GTEPS), by taking the ratio of the number of edges in the graph over the traversal time. The traversal time of the algorithm starts when one source vertex is given and ends when the BFS search is completed, taking into account the time for writing the results to device memory or host memory. For each experiment, we launch 64 BFS searches with randomly selected source vertices and take the average as the performance metric.
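The metric as defined above can be computed as follows; the timing harness is a sketch, and pick_random_source() and bfs() are hypothetical entry points assumed to be provided by the surrounding implementation:

    #include <chrono>
    #include <cstddef>

    extern int pick_random_source();   // hypothetical helper
    extern void bfs(int source);       // must include writing results back

    double measure_gteps(std::size_t num_edges, int runs /* = 64 */) {
        double total_teps = 0.0;
        for (int r = 0; r < runs; ++r) {
            int src = pick_random_source();
            auto t0 = std::chrono::steady_clock::now();
            bfs(src);
            auto t1 = std::chrono::steady_clock::now();
            total_teps += (double)num_edges /
                          std::chrono::duration<double>(t1 - t0).count();
        }
        return total_teps / runs / 1e9;   // average over runs, in GTEPS
    }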

A. Performance Variation with Optimizations

In our experiment, we choose direction-optimizing BFS as the baseline (BL). Traditional optimizations (TDO) use the existing techniques mentioned in Section II, including the degree-aware, vertex sorting and bitmap lookup approaches. Fine-grained parallelism (FGP), the GPU-oriented CSR structure (GCS) and vertex quick-search (VQS) are the GPU-specific optimizations presented in this work. Figure 6 shows a comparison of the performance with the various optimizations for graphs with different scales, ranging from 2^21 to 2^26.

The TDO outperforms the BL by 1 to 1.6 times across different scales. We can see that these optimizations improve the performance of the BFS algorithm on GPU and achieve 13.93 GTEPS for the graph with scale 26. However, this performance is lower than the state-of-the-art work on a CPU-based platform [24]. To fully utilize the massive processing units of the GPU, the workload balance among GPU threads is very important.

FGP solves the problem of load imbalance in the top-down stage. The speedup ratio of FGP with respect to TDO ranges from 4.5 to 7.9, and it achieves 61.9 GTEPS for the graph with scale 26. After employing FGP, the workload balance in the top-down stage improves a lot.

GCS and VQS focus on the performance improvement in the bottom-up stage. GCS uses the memory coalescing technique to improve the efficiency of memory access. VQS further reduces the amount of redundant graph computations. After employing these two techniques, the performance of our algorithm generally increases with scale for Kronecker graphs. The highest performance achieves 237.94 GTEPS with graph scale 26. Among all single-node systems in the Graph500 rankings, our implementation on the P100 is leading the way in the small data category.

Thus far, we have discussed the BFS results with respect to performance. In terms of energy efficiency, the power consumption of the P100 and of the whole single-node system on a Kronecker graph with scale 26 is 52.56 watts and 130.04 watts, respectively. Specifically, our BFS shows an energy-efficient performance of 1830.31 MTEPS/W, and ranks 1st place on the November 2019 Green Graph500 list [25].

Figure 6: Comparison of the performance with various optimizations

B. Performance Variation with Scale and Edgefactor

We evaluate the scalability of our algorithm with the scale and edgefactor of the Kronecker graphs. Figure 7 shows the performance of graphs with edgefactor ranging from 16 to 64. Our algorithm achieves a peak performance at graph scale 26 for every edgefactor and is more efficient for graphs with a larger edgefactor. When the edgefactor is set to 64, the peak value is nearly 833.19 GTEPS, which is more than three times better than the graph with edgefactor 16.

Figure 7: Performance of graph with edgefactor ranging from 16 to 64

This speedup is afforded by the bottom-up approach, which reduces the edge traversal required for a dense graph (large edgefactor). With reference to the Graph500 benchmark, we choose 16 as the default edgefactor in the experiments below.

To show the scalability with graph scale, the performance for graphs with scale ranging from 21 to 30 is presented in Figure 8. When the scale is below 26, the performance increases with the scale, reaching the peak performance of 237.94 GTEPS at scale 26. With the big data extension technique, our algorithm can process graphs with scale larger than 26, which exceed the capacity of the P100 global memory. Due to insufficient global memory and low PCIe bandwidth, the performance drops as the scale increases; for the graph with scale 30, the performance is only 30.82 GTEPS. If the GPU has a larger memory space or NVLink technology is supported, the performance degradation will be mitigated, improving the efficiency of processing large graphs. In order to prove this point, we launched the algorithm on a Tesla V100, which is equipped with 32 GB global memory. There, the algorithm achieves 146.79 GTEPS at scale 30.

Figure 8: Performance of graph with different scales

C. Comparison with Other Implementations

In order to show the effectiveness of our optimizations, we compare our algorithm with several BFS implementations on the GPU platform, including Tigr [16], Enterprise [13] and Gunrock [17]. Our work is primarily designed for scale-free graphs with small diameter, not for graphs with high diameter like road networks. Besides the Kronecker graph, we also evaluate some other small graphs including Hollywood [26], Stackoverflow [27] and Flickr-large [28], as well as some large graphs like Twitter [29] and Friendster [30], which have more than 1.4 billion edges. The scale and average degree vary among these graph datasets.

The results are summarized in Table III. For Kron-24-16 (the Kronecker graph with scale 24 and edgefactor 16), our algorithm achieves 194.27 GTEPS, which is much higher than the other works. For the other scale-free graphs, Enterprise faces out-of-memory (OOM) problems when launched with Stackoverflow, Twitter and Friendster. For small graphs, our work performs 5.9, 2.1 and 12.93 times better than Tigr, Enterprise and Gunrock on average. For large graphs, our work performs 18.7 and 15.61 times better than Tigr and Gunrock, respectively. Apart from the Kronecker graph, our optimizations show their effectiveness on social and web networks, which follow a power-law distribution, and achieve better acceleration on large graphs.

Table III: Performance comparison with other implementations (GTEPS)

Dataset       | Tigr  | Enterprise | Gunrock | Our work
Kron-24-16    | 2.45  | 19.9       | 2.08    | 194.27
Hollywood     | 19.41 | 29.24      | 7.67    | 42.28
Flickr-large  | 4.18  | 17.83      | 2.06    | 49.35
Stackoverflow | 7.43  | OOM        | 2.96    | 27.7
Twitter       | 3.39  | OOM        | 3.91    | 74.49
Friendster    | 2.17  | OOM        | 3.29    | 33.41

D. Analysis of GPU Profiling

BFS is a data-intensive algorithm and its performance is significantly influenced by memory access efficiency. In this part, we systematically analyze our algorithm with the GPU profiling tools supplied by NVIDIA. Since the performance of BFS is mainly dominated by the bottom-up stage, we focus the analysis on the bottom-up stage. We profile our techniques on graphs with scale ranging from 20 to 25 in Figure 9, and choose TDO as the baseline.

As shown in Figures 9(a) and 9(b), our work improves global memory load efficiency by 72% on average and store efficiency by 40% on average, compared with TDO. This means our algorithm effectively improves memory access on the GPU. The global hit rate is improved by 127% on average by combining our optimizations, as shown in Figure 9(c). The downtrend of the hit rate with the increase of graph scale is mainly due to the limited L1 cache and shared memory. Under the condition of relatively good workload balance, the effective memory access leads to performance improvement.

Figure 9: The profiling results in the bottom-up stage

The IPC of our algorithm is improved by 50% on average.
