Zoltan2_AlgHybrid2GL.hpp
1// @HEADER
2// *****************************************************************************
3// Zoltan2: A package of combinatorial algorithms for scientific computing
4//
5// Copyright 2012 NTESS and the Zoltan2 contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef _ZOLTAN2_2GHOSTLAYER_HPP_
11#define _ZOLTAN2_2GHOSTLAYER_HPP_
12
13#include <vector>
14#include <unordered_map>
15#include <iostream>
16#include <queue>
17#ifdef _WIN32
18#include <time.h>
19#else
20#include <sys/time.h>
21#endif
22
23#include "Zoltan2_Algorithm.hpp"
24#include "Zoltan2_GraphModel.hpp"
25#include "Zoltan2_ColoringSolution.hpp"
26#include "Zoltan2_Util.hpp"
27#include "Zoltan2_TPLTraits.hpp"
28#include "Zoltan2_AlltoAll.hpp"
29
30
31#include "Tpetra_Core.hpp"
32#include "Teuchos_RCP.hpp"
33#include "Tpetra_Import.hpp"
34#include "Tpetra_FEMultiVector.hpp"
35
36#include "KokkosKernels_Handle.hpp"
37#include "KokkosKernels_IOUtils.hpp"
38#include "KokkosGraph_Distance1Color.hpp"
39#include "KokkosGraph_Distance1ColorHandle.hpp"
40
44// Base class for hybrid coloring methods that use two layers of ghosts.
45
46
47namespace Zoltan2{
48
49template <typename Adapter>
50class AlgTwoGhostLayer : public Algorithm<Adapter> {
51
52 public:
53
54 using lno_t = typename Adapter::lno_t;
55 using gno_t = typename Adapter::gno_t;
56 using offset_t = typename Adapter::offset_t;
57 using scalar_t = typename Adapter::scalar_t;
58 using base_adapter_t = typename Adapter::base_adapter_t;
59 using map_t = Tpetra::Map<lno_t,gno_t>;
60 using femv_scalar_t = int;
61 using femv_t = Tpetra::FEMultiVector<femv_scalar_t, lno_t, gno_t>;
62 using device_type = typename femv_t::device_type;
63 using execution_space = typename device_type::execution_space;
64 using memory_space = typename device_type::memory_space;
65 using host_exec = typename femv_t::host_view_type::device_type::execution_space;
66 using host_mem = typename femv_t::host_view_type::device_type::memory_space;
67
68 double timer(){
69 struct timeval tp;
70 gettimeofday(&tp, NULL);
71 return ((double) (tp.tv_sec) + 1e-6 * tp.tv_usec);
72 }
73 private:
74
75 //Entry point for parallel local coloring
76 // nVtx is the number of vertices owned by the current process
77 //
78 // adjs_view is the adjacencies, indexed by offset_view
79 //
80 // offset_view is the CSR row map, used to index the adjs_view
81 //
82 // femv is the FEMultiVector that holds the colors for the vertices
83 // the colors will change in this function.
84 //
85 // vertex_list is a list of vertices to recolor
86 //
87 // vertex_list_size is the size of the list of vertices to recolor
88 // vertex_list_size = 0 means recolor all uncolored vertices
89 //
90 // recolor decides which KokkosKernels algorithm to use.
91 //
92 virtual void colorInterior(const size_t nVtx,
93 Kokkos::View<lno_t*, device_type > adjs_view,
94 Kokkos::View<offset_t*,device_type > offset_view,
95 Teuchos::RCP<femv_t> femv,
96 Kokkos::View<lno_t*, device_type> vertex_list,
97 size_t vertex_list_size = 0,
98 bool recolor=false) = 0;
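  // Illustrative call (a sketch based on how twoGhostLayer() below drives this
  // interface, not an additional API): the initial full-graph pass is invoked as
  //   colorInterior(n_local, adjs_dev, offsets_dev, femv, adjs_dev, 0, use_vbbit);
  // i.e. vertex_list_size = 0 colors all uncolored vertices, while recoloring
  // passes would instead supply the verts_to_recolor list and its current size.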
99
100 //Entry point for serial local coloring
101 virtual void colorInterior_serial(const size_t nVtx,
102 typename Kokkos::View<lno_t*, device_type >::host_mirror_type adjs_view,
103 typename Kokkos::View<offset_t*,device_type >::host_mirror_type offset_view,
104 Teuchos::RCP<femv_t> femv,
105 typename Kokkos::View<lno_t*, device_type>::host_mirror_type vertex_list,
106 size_t vertex_list_size = 0,
107 bool recolor=false) = 0;
108 public:
109 //Entry point for parallel conflict detection
110 //INPUT ARGS
111 // n_local is the number of vertices owned by the current process
112 //
113 // dist_offsets_dev is the device view that holds the CSR offsets
114 // It holds offsets for all vertices, owned and ghosts.
115 // It is organized with owned first, and then ghost layers:
116 // -----------------------------------------------
117 // | owned | first | second |
118 // | vtx | ghost | ghost |
119 // | offsets | layer | layer |
120 // -----------------------------------------------
121 // The allocated length is equal to the sum of owned
122 // and ghost vertices + 1. Any vertex with LID >= n_local
123 // is a ghost.
124 //
125 // dist_adjs_dev is the device view that holds CSR adjacencies
126 //
127 // boundary_verts_view holds the local IDs of vertices in the boundary.
128 // By definition, boundary vertices are owned vertices
129 // that have a ghost in their neighborhood.
130 // (distance-1 coloring = 1-hop neighborhood)
131 // (distance-2 coloring = 2-hop neighborhood)
132 // Note: for D1-2GL we do not use this argument.
133 // It is possible to detect all D1 conflicts by only
134 // checking the ghost vertices, without constructing
135 // the boundary.
136 //
137 // rand is a device view that holds random numbers generated from GIDs;
138 // they are generated consistently across all processes
139 //
140 // gid is a device view that holds the GIDs of each vertex on this process.
141 // It is indexable by local ID. It stores both owned and ghost
142 // vertices, and LIDs are ordered so that the structure looks like:
143 // -----------------------------------------
144 // | | first | second | owned LIDs are smallest
145 // | owned | ghost | ghost | ghost LIDs increase based
146 // | vtx | layer | layer | on layer (1 < 2)
147 // -----------------------------------------
148 // The allocated size is dist_offsets_dev.extent(0)-1.
149 //
150 //
151 // ghost_degrees is a device view that holds the degrees of ghost vertices only.
152 // A ghost with local ID n_local will be the first entry in this view.
153 // So, ghost_degrees[i] is the degree of the vtx with
154 // GID = gid[n_local+i].
155 //
156 // recolor_degrees is a boolean that determines whether or not we factor in vertex
157 // degrees on recoloring
158 //
159 //OUTPUT ARGS
160 // femv_colors is the device color view.
161 // After this function call, conflicts between vertices will
162 // be resolved via setting a vertex's color to 0. The vertex
163 // to be uncolored is determined by rules that are consistent
164 // across multiple processes.
165 //
166 // verts_to_recolor_view is a device view that holds the list
167 // of vertices to recolor. Any vertex
168 // uncolored by this function will appear in this
169 // view after the function returns.
170 //
171 // verts_to_recolor_size_atomic is an atomic device view that holds the
172 // size of verts_to_recolor_view.
173 // Effectively it represents how many
174 // vertices need to be recolored after
175 // conflict detection.
176 // It differs in size from the allocated size of
177 // verts_to_recolor_view.
178 // Atomicity is required for building
179 // verts_to_recolor_view in parallel, which
180 // is the reason this is a view instead of
181 // just an integer type.
182 //
183 // verts_to_send_view is a device view that holds the list of vertices
184 // that will need to be sent to remotes after recoloring
185 // Note: Only locally owned vertices should ever be in
186 // this view. A process cannot color ghosts correctly.
187 //
188 // verts_to_send_size_atomic is an atomic device view that holds the size of
189 // verts_to_send_view. It differs in size from the
190 // allocated size of verts_to_send_view.
191 // Atomicity is required for building
192 // verts_to_send_view in parallel,
193 // which is the reason this is a view instead of
194 // just an integer type.
195 //
196 // recoloringSize is a device view that stores the total number of
197 // vertices that were uncolored by the conflict detection.
198 //
199 virtual void detectConflicts(const size_t n_local,
200 Kokkos::View<offset_t*, device_type > dist_offsets_dev,
201 Kokkos::View<lno_t*, device_type > dist_adjs_dev,
202 Kokkos::View<int*,device_type > femv_colors,
203 Kokkos::View<lno_t*, device_type > boundary_verts_view,
204 Kokkos::View<lno_t*,
205 device_type > verts_to_recolor_view,
206 Kokkos::View<int*,
207 device_type,
208 Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_recolor_size_atomic,
209 Kokkos::View<lno_t*,
210 device_type > verts_to_send_view,
211 Kokkos::View<size_t*,
212 device_type,
213 Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic,
214 Kokkos::View<size_t*, device_type> recoloringSize,
215 Kokkos::View<int*,
216 device_type> rand,
217 Kokkos::View<gno_t*,
218 device_type> gid,
219 Kokkos::View<gno_t*,
220 device_type> ghost_degrees,
221 bool recolor_degrees) = 0;
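  // Indexing sketch (conceptual only, following the layouts described above):
  // for a ghost vertex with local ID lid >= n_local,
  //   gno_t g   = gid(lid);                      // its global ID
  //   gno_t deg = ghost_degrees(lid - n_local);  // its full (global) degree
  //   for(offset_t j = dist_offsets_dev(lid); j < dist_offsets_dev(lid+1); j++){
  //     lno_t nbr = dist_adjs_dev(j);            // locally known neighbors of lid
  //   }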
222
223 //Entry point for serial conflict detection
224 virtual void detectConflicts_serial(const size_t n_local,
225 typename Kokkos::View<offset_t*, device_type >::host_mirror_type dist_offsets_host,
226 typename Kokkos::View<lno_t*, device_type >::host_mirror_type dist_adjs_host,
227 typename Kokkos::View<int*,device_type >::host_mirror_type femv_colors,
228 typename Kokkos::View<lno_t*, device_type >::host_mirror_type boundary_verts_view,
229 typename Kokkos::View<lno_t*,device_type>::host_mirror_type verts_to_recolor_view,
230 typename Kokkos::View<int*,device_type>::host_mirror_type verts_to_recolor_size_atomic,
231 typename Kokkos::View<lno_t*,device_type>::host_mirror_type verts_to_send_view,
232 typename Kokkos::View<size_t*,device_type>::host_mirror_type verts_to_send_size_atomic,
233 typename Kokkos::View<size_t*, device_type>::host_mirror_type recoloringSize,
234 typename Kokkos::View<int*, device_type>::host_mirror_type rand,
235 typename Kokkos::View<gno_t*,device_type>::host_mirror_type gid,
236 typename Kokkos::View<gno_t*,device_type>::host_mirror_type ghost_degrees,
237 bool recolor_degrees) = 0;
238 //Entry point for the construction of the boundary
239 //INPUT ARGS
240 // n_local is the number of vertices owned by the current process
241 //
242 // dist_offsets_dev is the device view that holds the CSR offsets
243 //
244 // dist_adjs_dev is the device view that holds CSR adjacencies
245 //
246 // dist_offsets_host is the hostmirror that holds the CSR offsets
247 //
248 // dist_adjs_host is the hostmirror that holds the CSR adjacencies
249 //
250 //OUTPUT ARGS
251 // boundary_verts is an unallocated device view that will hold the
252 // list of boundary vertices.
253 //
254 // verts_to_send_view will hold the list of vertices to send
255 //
256 // verts_to_send_size_atomic will hold the number of vertices to
257 // send currently held in verts_to_send_view.
258 // verts_to_send_size_atomic differs
259 // from the allocated size of verts_to_send_view
260 // Atomicity is required for building
261 // verts_to_send_view in parallel, which is
262 // the reason this is a view instead of an
263 // integer type.
264 //
265 virtual void constructBoundary(const size_t n_local,
266 Kokkos::View<offset_t*, device_type> dist_offsets_dev,
267 Kokkos::View<lno_t*, device_type> dist_adjs_dev,
268 typename Kokkos::View<offset_t*, device_type>::host_mirror_type dist_offsets_host,
269 typename Kokkos::View<lno_t*, device_type>::host_mirror_type dist_adjs_host,
270 Kokkos::View<lno_t*, device_type>& boundary_verts,
271 Kokkos::View<lno_t*,
272 device_type > verts_to_send_view,
273 Kokkos::View<size_t*,
274 device_type,
275 Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic) = 0;
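  // For distance-1 methods the boundary test reduces to checking whether an
  // owned vertex v has any ghosted neighbor; a sketch mirroring the check used
  // for the verbose 2GL statistics later in this file:
  //   bool in_boundary = false;
  //   for(offset_t j = dist_offsets_host(v); j < dist_offsets_host(v+1); j++){
  //     if((size_t)dist_adjs_host(j) >= n_local){ in_boundary = true; break; }
  //   }
  // Distance-2 methods additionally have to inspect the 2-hop neighborhood.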
276
277 protected:
278 RCP<const base_adapter_t> adapter;
279 RCP<Teuchos::ParameterList> pl;
280 RCP<Environment> env;
281 RCP<const Teuchos::Comm<int> > comm;
282 bool verbose;
283 bool timing;
284
285 private:
286 //This function constructs a CSR with complete adjacency information for
287 //first-layer ghost vertices. This CSR can contain edges from:
288 // first-layer ghosts to owned;
289 // first-layer ghosts to first-layer ghosts;
290 // first-layer ghosts to second-layer ghosts;
291 //1) Each process sends a list of ghost GIDs back to their owning process
292 // to request full adjacency information for that vertex.
293 //
294 //2) Initially each process sends back degree information to the requesting
295 // process.
296 //
297 //3) Then, each process can construct send buffers and receive schedules for
298 // the adjacency information for each requested GID it received,
299 // in addition to building the 2GL offsets and relabeling ghost LIDs
300 // to make the final CSR construction easier.
301 //
302 // 3a) We can construct the 2GL offsets here because each process has
303 // already received the degree information for each first-layer
304 // ghost vertex.
305 //
306 // 3b) The original ghost LIDs may not correspond to the order in which
307 // we will receive the adjacency information. Instead of reordering
308 // the received adjacencies, we relabel the GIDs of first-layer
309 // ghosts with new LIDs that correspond to the order in which we
310 // receive the adjacency information. Only first-layer ghost LIDs
311 // are changed.
312 //
313 //4) Due to limitations on the size of an MPI message, we split sending and
314 // receiving into rounds to accommodate arbitrarily large graphs.
315 //
316 //5) As send rounds progress, we construct the 2GL adjacency structure.
317 // Because we relabel ghost LIDs, we do not need to reorder the
318 // data after we receive it.
319 //
320 //
321 //
322 //OUTPUT ARGS
323 // ownedPlusGhosts Initially holds the list of GIDs for owned and ghost
324 // vertices. The only hard constraint on the ordering is
325 // that owned vertices come before ghost vertices.
326 // This function can re-assign the LIDs of ghost vertices
327 // in order to make the final construction of the 2GL
328 // CSR easier. After this function call, some ghost
329 // LIDs may be changed, but they will still be greater
330 // than owned LIDs. No owned LIDs will be changed.
331 //
332 // adjs_2GL holds the second ghost layer CSR adjacencies. The 2GL CSR
333 // contains complete adjacency information for the first-layer
334 // ghosts. These adjacencies can hold both owned vertices and
335 // second-layer ghosts, as well as first-layer ghosts.
336 //
337 // offsets_2GL holds CSR offsets for vertices in the first ghost layer only.
338 // That is, a vertex with GID = gid[n_local] will be the first
339 // vertex with information in this offset structure.
340 //
341 //
342 //INPUT ARGS
343 // owners holds the owning proc for a given vertex, indexed by local ID.
344 //
345 // adjs is the CSR adjacencies of the local graph with a single ghost layer.
346 //
347 // offsets is the CSR offsets of the local graph with a single ghost layer.
348 //
349 // mapOwned translates from Owned GID to LID. We only need this translation
350 // for owned vertices.
351 //
352 //TODO: This function uses many vectors of size comm->getSize();
353 // consider changes to increase memory scalability.
354 void constructSecondGhostLayer(std::vector<gno_t>& ownedPlusGhosts,
355 const std::vector<int>& owners,
356 ArrayView<const gno_t> adjs,
357 ArrayView<const offset_t> offsets,
358 RCP<const map_t> mapOwned,
359 std::vector< gno_t>& adjs_2GL,
360 std::vector< offset_t>& offsets_2GL) {
361
362 //first, we send ghost GIDs back to owners to receive the
363 //degrees of each first-layer ghost.
364 std::vector<int> sendcounts(comm->getSize(),0);
365 std::vector<size_t> sdispls(comm->getSize()+1,0);
366 //loop through owners, count how many vertices we'll send to each processor
367 //We send each ghost GID back to its owning process.
368 if(verbose) std::cout<<comm->getRank()<<": building sendcounts\n";
369 for(size_t i = 0; i < owners.size(); i++){
370 if(owners[i] != comm->getRank()&& owners[i] !=-1) sendcounts[owners[i]]++;
371 }
372 //construct sdispls (for building sendbuf), and sum the total sendcount
373 if(verbose) std::cout<<comm->getRank()<<": building sdispls\n";
374 size_t sendcount = 0;
375 for(int i = 1; i < comm->getSize()+1; i++){
376 sdispls[i] = sdispls[i-1] + sendcounts[i-1];
377 sendcount += sendcounts[i-1];
378 }
379
380 if(verbose) std::cout<<comm->getRank()<<": building idx\n";
381 std::vector<gno_t> idx(comm->getSize(),0);
382 for(int i = 0; i < comm->getSize(); i++){
383 idx[i]=sdispls[i];
384 }
385 //construct sendbuf to send GIDs to owning processes
386 if(verbose) std::cout<<comm->getRank()<<": building sendbuf\n";
387
388 std::vector<gno_t> sendbuf(sendcount,0);
389 for(size_t i = offsets.size()-1; i < owners.size(); i++){
390 if(owners[i] != comm->getRank() && owners[i] != -1){
391 sendbuf[idx[owners[i]]++] = ownedPlusGhosts[i];
392 }
393 }
394
395 //communicate GIDs to owners
396 if(verbose) std::cout<<comm->getRank()<<": requesting GIDs from owners\n";
397 Teuchos::ArrayView<int> sendcounts_view = Teuchos::arrayViewFromVector(sendcounts);
398 Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
399 Teuchos::ArrayRCP<gno_t> recvbuf;
400 std::vector<int> recvcounts(comm->getSize(),0);
401 Teuchos::ArrayView<int> recvcounts_view = Teuchos::arrayViewFromVector(recvcounts);
402 Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcounts_view, recvbuf, recvcounts_view);
403
404 if(verbose) std::cout<<comm->getRank()<<": done communicating\n";
405 //replace entries in recvGIDs with their degrees
406
407 if(verbose) std::cout<<comm->getRank()<<": building rdispls\n";
408 gno_t recvcounttotal = 0;
409 std::vector<int> rdispls(comm->getSize()+1,0);
410 for(size_t i = 1; i<recvcounts.size()+1; i++){
411 rdispls[i] = rdispls[i-1] + recvcounts[i-1];
412 recvcounttotal += recvcounts[i-1];
413 }
414 //send back the degrees to the requesting processes,
415 //build the adjacency counts
416 std::vector<offset_t> sendDegrees(recvcounttotal,0);
417 gno_t adj_len = 0;
418 std::vector<int> adjsendcounts(comm->getSize(),0);
419 if(verbose) std::cout<<comm->getRank()<<": building adjacency counts\n";
420 for(int i = 0; i < comm->getSize(); i++){
421 adjsendcounts[i] = 0;
422 for(int j = rdispls[i]; j < rdispls[i+1]; j++){
423 lno_t lid = mapOwned->getLocalElement(recvbuf[j]);
424 offset_t degree = offsets[lid+1] - offsets[lid];
425 sendDegrees[j] = degree;
426 adj_len +=degree;
427 adjsendcounts[i] += degree;
428 }
429 }
430 //communicate the degrees back to the requesting processes
431 if(verbose) std::cout<<comm->getRank()<<": sending degrees back to requestors\n";
432 Teuchos::ArrayView<offset_t> sendDegrees_view = Teuchos::arrayViewFromVector(sendDegrees);
433 Teuchos::ArrayRCP<offset_t> recvDegrees;
434 std::vector<int> recvDegreesCount(comm->getSize(),0);
435 Teuchos::ArrayView<int> recvDegreesCount_view = Teuchos::arrayViewFromVector(recvDegreesCount);
436 Zoltan2::AlltoAllv<offset_t>(*comm, *env, sendDegrees_view, recvcounts_view, recvDegrees, recvDegreesCount_view);
437
438 //calculate number of rounds of AlltoAllv's that are necessary on this process
439
440 if(verbose) std::cout<<comm->getRank()<<": determining number of rounds necessary\n";
441 int rounds = 1;
442 for(int i = 0; i < comm->getSize(); i++){
443 if(adjsendcounts[i]*sizeof(gno_t)/ INT_MAX > (size_t)rounds){
444 rounds = (adjsendcounts[i]*sizeof(gno_t)/INT_MAX)+1;
445 }
446 }
447
448 //see what the max number of rounds will be globally
449 int max_rounds = 0;
450 Teuchos::reduceAll<int>(*comm, Teuchos::REDUCE_MAX, 1, &rounds, &max_rounds);
451
452 if(verbose) std::cout<<comm->getRank()<<": building per_proc sums\n";
453 //compute send and receive schedules to and from each process
454 std::vector<std::vector<uint64_t>> per_proc_round_adj_sums(max_rounds+1,std::vector<uint64_t>(comm->getSize(),0));
455 std::vector<std::vector<uint64_t>> per_proc_round_vtx_sums(max_rounds+1,std::vector<uint64_t>(comm->getSize(),0));
456
457 if(verbose) std::cout<<comm->getRank()<<": filling per_proc sums\n";
458 //fill out the send schedules
459 for(int proc_to_send = 0; proc_to_send < comm->getSize(); proc_to_send++){
460 int curr_round = 0;
461 for(size_t j = sdispls[proc_to_send]; j < sdispls[proc_to_send+1]; j++){
462 if((per_proc_round_adj_sums[curr_round][proc_to_send] + recvDegrees[j])*sizeof(gno_t) > INT_MAX){
463 curr_round++;
464 }
465 per_proc_round_adj_sums[curr_round][proc_to_send] += recvDegrees[j];
466 per_proc_round_vtx_sums[curr_round][proc_to_send]++;
467 }
468 }
469
470 if(verbose) std::cout<<comm->getRank()<<": building recv GID schedule\n";
471
472 //A 3D vector to hold the GIDs so we can see how this process will receive
473 //the vertices in each round, from each process. This way we can reorder things
474 //locally so that the data we receive will automatically construct the CSR correctly
475 //without any other computation. We do the reordering before we start receiving.
476 std::vector<std::vector<std::vector<gno_t>>> recv_GID_per_proc_per_round(
477 max_rounds+1,std::vector<std::vector<gno_t>>(
478 comm->getSize(),std::vector<gno_t>(0)));
479 for(int i = 0; i < max_rounds; i++){
480 for(int j = 0; j < comm->getSize(); j++){
481 recv_GID_per_proc_per_round[i][j] = std::vector<gno_t>(sendcounts[j],0);
482 }
483 }
484
485 if(verbose) std::cout<<comm->getRank()<<": filling out recv GID schedule\n";
486 for(int i = 0; i < comm->getSize(); i++){
487 int curr_round = 0;
488 size_t curr_idx = 0;
489 for(size_t j = sdispls[i]; j < sdispls[i+1]; j++){
490 if(curr_idx > per_proc_round_vtx_sums[curr_round][i]){
491 curr_round++;
492 curr_idx = 0;
493 }
494 recv_GID_per_proc_per_round[curr_round][i][curr_idx++] = j;
495 }
496 }
497
498 if(verbose) std::cout<<comm->getRank()<<": reordering gids and degrees in the order they'll be received\n";
499 //reorder the GIDs and degrees locally:
500 // - The way we previously stored GIDs and degrees had no relation
501 // to the order that we'll be receiving the adjacency data.
502 // For the CSR to be complete (and usable), we reorder the LIDs of
503 // ghosts so that they are in the order we will receive the adjacency
504 // data.
505 //
506 // - For graphs that are not large enough to trigger sending in multiple
507 // rounds of communication, the LIDs of the ghosts will be ordered
508 // by owning process. If multiple rounds are used, the LIDs of
509 // ghosts will be ordered by rounds in addition to owning process.
510 //
511 // - final_gid_vec and final_degree_vec hold the reorganized gids and
512 // degrees, the final re-mapping happens further down
513 std::vector<gno_t> final_gid_vec(sendcount, 0);
514 std::vector<offset_t> final_degree_vec(sendcount,0);
515 gno_t reorder_idx = 0;
516 for(int i = 0; i < max_rounds; i++){
517 for(int j = 0; j < comm->getSize(); j++){
518 for(size_t k = 0; k < per_proc_round_vtx_sums[i][j]; k++){
519 final_gid_vec[reorder_idx] = sendbuf[recv_GID_per_proc_per_round[i][j][k]];
520 final_degree_vec[reorder_idx++] = recvDegrees[recv_GID_per_proc_per_round[i][j][k]];
521 }
522 }
523 }
524
525 //check to see if the reorganization has happened correctly
526 if(verbose){
527 //do a quick check to see if we ended up reorganizing anything
528 bool reorganized = false;
529 for(size_t i = 0; i < sendcount; i++){
530 if(final_gid_vec[i] != sendbuf[i]) reorganized = true;
531 }
532
533 //if we have more than a single round of communication, we need to reorganize.
534 //this alerts of unnecessary reorganization, and a failure to perform necessary
535 //reorganization.
536 if(!reorganized && (max_rounds > 1)) std::cout<<comm->getRank()<<": did not reorganize GIDs, but probably should have\n";
537 if(reorganized && (max_rounds == 1)) std::cout<<comm->getRank()<<": reorganized GIDs, but probably should not have\n";
538 }
539 //remap the GIDs so we receive the adjacencies in the same order as the current process's LIDs
540 // originally, the GID->LID mapping has no relevance to how we'll be receiving adj data from
541 // remote processes. Here we make it so that the GID->LID mapping does correspond to the
542 // order we have received degree info and will receive adjacencies. ( LID n_local
543 // corresponds to the first GID whose adjacency info we will receive, LID n_local+1 the
544 // second, etc.)
545 // That way, we don't need to reorder the adjacencies after we receive them.
546 //
547 // This next loop is the final mapping step.
548
549 for (size_t i = 0; i < sendcount; i++){
550 ownedPlusGhosts[i+offsets.size()-1] = final_gid_vec[i];
551 }
552
553 //status report
554 if(verbose) {
555 std::cout<<comm->getRank()<<": done remapping\n";
556 std::cout<<comm->getRank()<<": building ghost offsets\n";
557 }
558 //start building the second ghost layer's offsets
559 std::vector<offset_t> ghost_offsets(sendcount+1,0);
560 for(size_t i = 1; i < sendcount+1; i++){
561 ghost_offsets[i] = ghost_offsets[i-1] + final_degree_vec[i-1];
562 }
563
564
565 if(verbose) std::cout<<comm->getRank()<<": going through the sending rounds\n";
566 //set up counters to keep track of where we are in the sending order
567 std::vector<uint64_t> curr_idx_per_proc(comm->getSize(),0);
568 for(int i = 0; i < comm->getSize(); i++) curr_idx_per_proc[i] = rdispls[i];
569 for(int round = 0; round < max_rounds; round++){
570 //send buffers
571 std::vector<gno_t> send_adj;
572 std::vector<int> send_adj_counts(comm->getSize(),0);
573 if(verbose) std::cout<<comm->getRank()<<": round "<<round<<", constructing send_adj\n";
574 //construct the adjacencies to send for this round
575 for(int curr_proc = 0; curr_proc < comm->getSize(); curr_proc++){
576 uint64_t curr_adj_sum = 0;
577 //keep going through adjacencies to send to this process until we're done
578 while( curr_idx_per_proc[curr_proc] < (size_t)rdispls[curr_proc+1]){
579 lno_t lid = mapOwned->getLocalElement(recvbuf[curr_idx_per_proc[curr_proc]++]);
580
581 //if the next adjacency would push us over the MPI message size max,
582 //stop for this round
583 if((curr_adj_sum + (offsets[lid+1]-offsets[lid]))*sizeof(gno_t) >= INT_MAX){
584 break;
585 }
586
587 //add the adjacencies to the send buffer
588 curr_adj_sum += (offsets[lid+1] - offsets[lid]);
589 for(offset_t j = offsets[lid]; j < offsets[lid+1]; j++){
590 send_adj.push_back(adjs[j]);
591 }
592 }
593 //update the send counts for this round
594 send_adj_counts[curr_proc] = curr_adj_sum;
595 }
596 if(verbose) std::cout<<comm->getRank()<<": round "<<round<<", sending...\n";
597 //do the sending...
598 Teuchos::ArrayView<gno_t> send_adjs_view = Teuchos::arrayViewFromVector(send_adj);
599 Teuchos::ArrayView<int> adjsendcounts_view = Teuchos::arrayViewFromVector(send_adj_counts);
600 Teuchos::ArrayRCP<gno_t> ghost_adjs;
601 std::vector<int> adjrecvcounts(comm->getSize(),0);
602 Teuchos::ArrayView<int> adjsrecvcounts_view = Teuchos::arrayViewFromVector(adjrecvcounts);
603 Zoltan2::AlltoAllv<gno_t>(*comm, *env, send_adjs_view, adjsendcounts_view, ghost_adjs, adjsrecvcounts_view);
604 //Because of the previous reordering, these adjacencies are
605 //in the correct order as they arrive on the process.
606 for(offset_t i = 0; i< (offset_t)ghost_adjs.size(); i++){
607 adjs_2GL.push_back(ghost_adjs[i]);
608 }
609 }
610 if(verbose) std::cout<<comm->getRank()<<": constructing offsets\n";
611 //put the offsets we computed into the output argument.
612 for(size_t i = 0; i < sendcount+1; i++){
613 offsets_2GL.push_back(ghost_offsets[i]);
614 }
615 if(verbose) std::cout<<comm->getRank()<<": done building 2nd ghost layer\n";
616 }
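  // How the outputs line up after this call (for reference): the first-layer
  // ghost that ends up with local ID lid (lid >= number of owned vertices)
  // has its complete adjacency list, stored as GIDs, in
  //   adjs_2GL[ offsets_2GL[lid - #owned] .. offsets_2GL[lid - #owned + 1] ),
  // which is why the received adjacencies never need to be reordered.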
617
618 //Communicates owned vertex colors to remote ghost copies.
619 //
620 //returns: the amount of time the communication call took.
621 //
622 //OUTPUT ARGS:
623 // colors: the owned vertices' colors are not changed,
624 // ghost vertex colors are updated from their owners.
625 //
626 // total_sent: reports the total size of the send buffer
627 //
628 // total_recv: reports the total size of the recv buffer
629 //
630 //INPUT ARGS:
631 //
632 // mapOwnedPlusGhosts: maps global IDs to local IDs and vice-versa.
633 //
634 // nVtx: the number of owned vertices on this process
635 //
636 // verts_to_send: hostmirror of the list of vertices to send.
637 // This list will only contain local vertices
638 // that are ghosted on a remote process.
639 //
640 // verts_to_send_size: hostmirror of the size of verts_to_send
641 //
642 // procs_to_send: map that takes a local ID and gives a vector of
643 // process IDs which have a ghost copy of that vertex.
644 //
645 double doOwnedToGhosts(RCP<const map_t> mapOwnedPlusGhosts,
646 size_t nVtx,
647 typename Kokkos::View<lno_t*,device_type>::host_mirror_type verts_to_send,
648 typename Kokkos::View<size_t*,device_type>::host_mirror_type verts_to_send_size,
649 Teuchos::RCP<femv_t> femv,
650 const std::unordered_map<lno_t, std::vector<int>>& procs_to_send,
651 gno_t& total_sent, gno_t& total_recvd){
652
653 auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
654 auto colors = subview(femvColors, Kokkos::ALL, 0);
655 //create vectors to hold send information
656 int nprocs = comm->getSize();
657 std::vector<int> sendcnts(comm->getSize(), 0);
658 std::vector<gno_t> sdispls(comm->getSize()+1, 0);
659
660 //calculate how much data we're sending to each process
661 for(size_t i = 0; i < verts_to_send_size(0); i++){
662 for(size_t j = 0; j < procs_to_send.at(verts_to_send(i)).size(); j++){
663 sendcnts[procs_to_send.at(verts_to_send(i))[j]] += 2;
664 }
665 }
666 //calculate sendsize and sdispls
667 sdispls[0] = 0;
668 gno_t sendsize = 0;
669 std::vector<int> sentcount(nprocs, 0);
670
671 for(int i = 1; i < comm->getSize()+1; i++){
672 sdispls[i] = sdispls[i-1] + sendcnts[i-1];
673 sendsize += sendcnts[i-1];
674 }
675 total_sent = sendsize;
676 std::vector<gno_t> sendbuf(sendsize,0);
677 //construct sendbuf, send each owned vertex's GID
678 //and its color to the processes that have a
679 //ghost copy of that vertex. If a vertex is not ghosted,
680 //it does not get sent anywhere.
681 for(size_t i = 0; i < verts_to_send_size(0); i++){
682 std::vector<int> procs = procs_to_send.at(verts_to_send(i));
683 for(size_t j = 0; j < procs.size(); j++){
684 size_t idx = sdispls[procs[j]] + sentcount[procs[j]];
685 sentcount[procs[j]] += 2;
686 sendbuf[idx++] = mapOwnedPlusGhosts->getGlobalElement(verts_to_send(i));
687 sendbuf[idx] = colors(verts_to_send(i));
688 }
689 }
690
691 Teuchos::ArrayView<int> sendcnts_view = Teuchos::arrayViewFromVector(sendcnts);
692 Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
693 Teuchos::ArrayRCP<gno_t> recvbuf;
694 std::vector<int> recvcnts(comm->getSize(), 0);
695 Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);
696
697 //if we're reporting times, remove the computation imbalance from the comm timer
698 if(timing) comm->barrier();
699 double comm_total = 0.0;
700 double comm_temp = timer();
701
702 Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcnts_view, recvbuf, recvcnts_view);
703 comm_total += timer() - comm_temp;
704
705 //compute total recvsize for updating local ghost colors
706 gno_t recvsize = 0;
707 for(int i = 0; i < recvcnts_view.size(); i++){
708 recvsize += recvcnts_view[i];
709 }
710 total_recvd = recvsize;
711 //update the local ghost copies with the color we just received.
712 for(int i = 0; i < recvsize; i+=2){
713 size_t lid = mapOwnedPlusGhosts->getLocalElement(recvbuf[i]);
714 colors(lid) = recvbuf[i+1];
715 }
716
717 return comm_total;
718 }
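  // Wire format used above, for reference: every owned vertex that is ghosted
  // remotely contributes the pair (GID, color) for each destination process,
  // so the send/receive counts are always even and recvbuf is decoded two
  // entries at a time, as in the final loop of doOwnedToGhosts().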
719
720 public:
721 //constructor
722 AlgTwoGhostLayer(
723 const RCP<const base_adapter_t> &adapter_,
724 const RCP<Teuchos::ParameterList> &pl_,
725 const RCP<Environment> &env_,
726 const RCP<const Teuchos::Comm<int> > &comm_)
727 : adapter(adapter_), pl(pl_), env(env_), comm(comm_){
728 verbose = pl->get<bool>("verbose",false);
729 timing = pl->get<bool>("timing", false);
730 }
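  // Example of the parameters consumed above (a sketch; "verbose" and "timing"
  // are the only options read here, both defaulting to false):
  //   Teuchos::RCP<Teuchos::ParameterList> params = Teuchos::parameterList();
  //   params->set("verbose", true);  // per-rank progress output
  //   params->set("timing", true);   // barrier-synchronized timing report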
731 //Main entry point for graph coloring
732 void color( const RCP<ColoringSolution<Adapter> > &solution){
733 //convert from global graph to local graph
734 ArrayView<const gno_t> vtxIDs;
735 ArrayView<StridedData<lno_t, scalar_t> > vwgts;
736
737 modelFlag_t flags;
738 flags.set(REMOVE_SELF_EDGES);
739 const auto model = rcp(new GraphModel<base_adapter_t>(this->adapter, this->env,
740 this->comm, flags));
741 size_t nVtx = model->getVertexList(vtxIDs, vwgts);
742 // the weights are not used at this point.
743
744 //get the adjacencies in a view that holds GIDs
745 ArrayView<const gno_t> adjs;
746 //get the CSR offsets
747 ArrayView<const offset_t> offsets;
748 ArrayView<StridedData<lno_t, scalar_t> > ewgts;
749 model->getEdgeList(adjs, offsets, ewgts);
750
751 //This map is used to map GID to LID initially
752 std::unordered_map<gno_t,lno_t> globalToLocal;
753 //this vector holds GIDs for owned and ghost vertices
754 //The final order of the gids is:
755 // -----------------------------------
756 // | | first | second |
757 // | owned | layer | layer |
758 // | | ghosts | ghosts |
759 // -----------------------------------
760 //
761 // constructSecondGhostLayer reorders the first layer ghost
762 // GIDs in the particular order that the communication requires.
763
764 std::vector<gno_t> ownedPlusGhosts;
765
766 //this vector holds the owning process for
767 //owned vertices and the first ghost layer.
768 //owners[i] gives the processor owning ownedPlusGhosts[i],
769 //for owned vertices and first layer ghosts.
770 //
771 //This should only be used BEFORE the call to constructSecondGhostLayer
772 std::vector<int> owners;
773
774 //fill these structures using owned vertices
775 for(int i = 0; i < vtxIDs.size(); i++){
776 globalToLocal[vtxIDs[i]] = i;
777 ownedPlusGhosts.push_back(vtxIDs[i]);
778 owners.push_back(comm->getRank());
779 }
780
781 //count the initial number of ghosts,
782 //map them to a local ID.
783 //The globalToLocal map has first layer ghosts
784 //from here on.
785 int nGhosts = 0;
786 std::vector<lno_t> local_adjs;
787 for(int i = 0; i < adjs.size(); i++){
788 if(globalToLocal.count(adjs[i])==0){
789 ownedPlusGhosts.push_back(adjs[i]);
790 globalToLocal[adjs[i]] = vtxIDs.size()+nGhosts;
791 nGhosts++;
792 }
793 local_adjs.push_back(globalToLocal[adjs[i]]);
794 }
795
796 //construct a Tpetra map for the FEMultiVector
797 Tpetra::global_size_t dummy = Teuchos::OrdinalTraits
798 <Tpetra::global_size_t>::invalid();
799 RCP<const map_t> mapOwned = rcp(new map_t(dummy, vtxIDs, 0, comm));
800
801 //GID and owner lookup for ghost vertices
802 std::vector<gno_t> ghosts;
803 std::vector<int> ghostowners;
804 for(size_t i = nVtx; i < nVtx+nGhosts; i++){
805 ghosts.push_back(ownedPlusGhosts[i]);
806 ghostowners.push_back(-1);
807 }
808
809 //get the owners of the ghost vertices
810 ArrayView<int> owningProcs = Teuchos::arrayViewFromVector(ghostowners);
811 ArrayView<const gno_t> gids = Teuchos::arrayViewFromVector(ghosts);
812 mapOwned->getRemoteIndexList(gids, owningProcs);
813
814 //add ghost owners to the total owner vector
815 for(size_t i = 0; i < ghostowners.size(); i++){
816 owners.push_back(ghostowners[i]);
817 }
818
819 //construct the second ghost layer.
820 //NOTE: this may reorder the LIDs of the ghosts.
821 //
822 //these vectors hold the CSR representation of the
823 // first ghost layer. This CSR contains:
824 // - edges from first layer ghosts to:
825 // - owned vertices
826 // - first layer ghosts
827 // - second layer ghosts
828 //
829 // - first_layer_ghost_adjs uses GIDs that need
830 // translated to LIDs.
831 //
832 // - first_layer_ghost_offsets are indexed by LID,
833 // first_layer_ghost_offsets[i] corresponds to
834 // the vertex with GID = ownedPlusGhosts[i+nVtx].
835 // first_layer_ghost_offsets.size() = #first layer ghosts + 1
836 //
837 std::vector< gno_t> first_layer_ghost_adjs;
838 std::vector< offset_t> first_layer_ghost_offsets;
839 constructSecondGhostLayer(ownedPlusGhosts,owners, adjs, offsets, mapOwned,
840 first_layer_ghost_adjs, first_layer_ghost_offsets);
841
842 //we potentially reordered the local IDs of the ghost vertices, so we need
843 //to re-insert the GIDs into the global to local ID mapping.
844 globalToLocal.clear();
845 for(size_t i = 0; i < ownedPlusGhosts.size(); i++){
846 globalToLocal[ownedPlusGhosts[i]] = i;
847 }
848
849 //use updated globalToLocal map to translate
850 //adjacency GIDs to LIDs
851 for(int i = 0 ; i < adjs.size(); i++){
852 local_adjs[i] = globalToLocal[adjs[i]];
853 }
854
855 //at this point, we have ownedPlusGhosts with first-layer ghosts' GIDs.
856 //Need to map the second ghost layer GIDs to new local IDs,
857 //and add them to the map, as well as translating second-layer ghost
858 //adjacencies to use those new LIDs.
859 size_t n2Ghosts = 0;
860
861 //local_ghost_adjs is the LID translation of first_layer_ghost_adjs.
862 std::vector<lno_t> local_ghost_adjs;
863 for(size_t i = 0; i< first_layer_ghost_adjs.size(); i++ ){
864 if(globalToLocal.count(first_layer_ghost_adjs[i]) == 0){
865 ownedPlusGhosts.push_back(first_layer_ghost_adjs[i]);
866 globalToLocal[first_layer_ghost_adjs[i]] = vtxIDs.size() + nGhosts + n2Ghosts;
867 n2Ghosts++;
868 }
869 local_ghost_adjs.push_back(globalToLocal[first_layer_ghost_adjs[i]]);
870 }
871
872 //construct data structures necessary for FEMultiVector
873 if(verbose) std::cout<<comm->getRank()<<": constructing Tpetra map with copies\n";
874 dummy = Teuchos::OrdinalTraits <Tpetra::global_size_t>::invalid();
875 RCP<const map_t> mapWithCopies = rcp(new map_t(dummy,
876 Teuchos::arrayViewFromVector(ownedPlusGhosts),
877 0, comm));
878 if(verbose) std::cout<<comm->getRank()<<": done constructing map with copies\n";
879
880 using import_t = Tpetra::Import<lno_t, gno_t>;
881 Teuchos::RCP<import_t> importer = rcp(new import_t(mapOwned,
882 mapWithCopies));
883 if(verbose) std::cout<<comm->getRank()<<": done constructing importer\n";
884 Teuchos::RCP<femv_t> femv = rcp(new femv_t(mapOwned,
885 importer, 1, true));
886 if(verbose) std::cout<<comm->getRank()<<": done constructing femv\n";
887
888 //Create random numbers seeded on global IDs to resolve conflicts
889 //consistently between processes.
890 std::vector<int> rand(ownedPlusGhosts.size());
891 for(size_t i = 0; i < rand.size(); i++){
892 std::srand(ownedPlusGhosts[i]);
893 rand[i] = std::rand();
894 }
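    // Because std::srand is seeded with the GID, every process that holds a
    // copy of a given vertex computes the same random value for it; this is
    // what lets conflict detection break ties identically on all processes.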
895
896 // get up-to-date owners for all ghosts.
897 std::vector<int> ghostOwners2(ownedPlusGhosts.size() -nVtx);
898 std::vector<gno_t> ghosts2(ownedPlusGhosts.size() - nVtx);
899 for(size_t i = nVtx; i < ownedPlusGhosts.size(); i++) ghosts2[i-nVtx] = ownedPlusGhosts[i];
900 Teuchos::ArrayView<int> owners2 = Teuchos::arrayViewFromVector(ghostOwners2);
901 Teuchos::ArrayView<const gno_t> ghostGIDs = Teuchos::arrayViewFromVector(ghosts2);
902 mapOwned->getRemoteIndexList(ghostGIDs,owners2);
903 if(verbose) std::cout<<comm->getRank()<<": done getting ghost owners\n";
904
905 //calculations for how many 2GL verts are in the boundary of another process, only
906 //do this if it's going to be displayed.
907 if(verbose) {
908 std::cout<<comm->getRank()<<": calculating 2GL stats...\n";
909
910 std::vector<int> sendcounts(comm->getSize(),0);
911 std::vector<gno_t> sdispls(comm->getSize()+1,0);
912 //loop through owners, count how many vertices we'll send to each processor
913 for(int i = nGhosts; i < ghostGIDs.size(); i++){
914 if(owners2[i] != comm->getRank()&& owners2[i] !=-1) sendcounts[owners2[i]]++;
915 }
916 //construct sdispls (for building sendbuf), and sum the total sendcount
917 gno_t sendcount = 0;
918 for(int i = 1; i < comm->getSize()+1; i++){
919 sdispls[i] = sdispls[i-1] + sendcounts[i-1];
920 sendcount += sendcounts[i-1];
921 }
922 std::vector<gno_t> idx(comm->getSize(),0);
923 for(int i = 0; i < comm->getSize(); i++){
924 idx[i]=sdispls[i];
925 }
926 //construct sendbuf to send GIDs to owning processes
927 std::vector<gno_t> sendbuf(sendcount,0);
928 for(size_t i = nGhosts; i < (size_t)ghostGIDs.size(); i++){
929 if(owners2[i] != comm->getRank() && owners2[i] != -1){
930 sendbuf[idx[owners2[i]]++] = ghostGIDs[i];
931 }
932 }
933 //do the communication
934 Teuchos::ArrayView<int> sendcounts_view = Teuchos::arrayViewFromVector(sendcounts);
935 Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
936 Teuchos::ArrayRCP<gno_t> recvbuf;
937 std::vector<int> recvcounts(comm->getSize(),0);
938 Teuchos::ArrayView<int> recvcounts_view = Teuchos::arrayViewFromVector(recvcounts);
939 Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcounts_view, recvbuf, recvcounts_view);
940 std::vector<int> is_bndry_send(recvbuf.size(),0);
941
942 //send back how many received vertices are in the boundary
943 for(int i = 0; i < recvbuf.size(); i++){
944 size_t lid = mapWithCopies->getLocalElement(recvbuf[i]);
945 is_bndry_send[i] = 0;
946 if(lid < nVtx){
947 for(offset_t j = offsets[lid]; j < offsets[lid+1]; j++){
948 if((size_t)local_adjs[j] >= nVtx) is_bndry_send[i] = 1;
949 }
950 } else{
951 for(offset_t j = first_layer_ghost_offsets[lid]; j < first_layer_ghost_offsets[lid+1]; j++){
952 if((size_t)local_ghost_adjs[j] >= nVtx) is_bndry_send[i] = 1;
953 }
954 }
955 }
956
957 //send back the boundary flags
958 Teuchos::ArrayView<int> is_bndry_send_view = Teuchos::arrayViewFromVector(is_bndry_send);
959 Teuchos::ArrayRCP<int> is_bndry_recv;
960 std::vector<int> bndry_recvcounts(comm->getSize(),0);
961 Teuchos::ArrayView<int> bndry_recvcounts_view = Teuchos::arrayViewFromVector(bndry_recvcounts);
962 Zoltan2::AlltoAllv<int> (*comm, *env, is_bndry_send_view, recvcounts_view, is_bndry_recv, bndry_recvcounts_view);
963
964 //add together the flags to compute how many boundary vertices are in the second ghost layer
965 int boundaryverts = 0;
966 for(int i = 0; i < is_bndry_recv.size(); i++){
967 boundaryverts += is_bndry_recv[i];
968 }
969 //this cout is guarded by a verbose check.
970 std::cout<<comm->getRank()<<": "<<boundaryverts<<" boundary verts out of "<<n2Ghosts<<" verts in 2GL\n";
971 }
972
973
974 //local CSR representation for the owned vertices:
975 Teuchos::ArrayView<const lno_t> local_adjs_view = Teuchos::arrayViewFromVector(local_adjs);
976 //NOTE: the offset ArrayView was constructed earlier, and is up-to-date.
977 //
978 //first ghost layer CSR representation:
979 Teuchos::ArrayView<const offset_t> ghost_offsets = Teuchos::arrayViewFromVector(first_layer_ghost_offsets);
980 Teuchos::ArrayView<const lno_t> ghost_adjacencies = Teuchos::arrayViewFromVector(local_ghost_adjs);
981 Teuchos::ArrayView<const int> rand_view = Teuchos::arrayViewFromVector(rand);
982 Teuchos::ArrayView<const gno_t> gid_view = Teuchos::arrayViewFromVector(ownedPlusGhosts);
983 //these ArrayViews contain LIDs that are ghosted remotely,
984 //and the Process IDs that have those ghost copies.
985 //An LID may appear in the exportLIDs more than once.
986 Teuchos::ArrayView<const lno_t> exportLIDs = importer->getExportLIDs();
987 Teuchos::ArrayView<const int> exportPIDs = importer->getExportPIDs();
988
989 //construct a quick lookup datastructure to map from LID to a
990 //list of PIDs we need to send data to.
991 std::unordered_map<lno_t, std::vector<int>> procs_to_send;
992 for(int i = 0; i < exportLIDs.size(); i++){
993 procs_to_send[exportLIDs[i]].push_back(exportPIDs[i]);
994 }
995
996 //call the coloring algorithm
997 twoGhostLayer(nVtx, nVtx+nGhosts, local_adjs_view, offsets, ghost_adjacencies, ghost_offsets,
998 femv, gid_view, rand_view, owners2, mapWithCopies, procs_to_send);
999
1000 //copy colors to the output array
1001 ArrayRCP<int> colors = solution->getColorsRCP();
1002 auto femvdata = femv->getData(0);
1003 for(size_t i=0; i<nVtx; i++){
1004 colors[i] = femvdata[i];
1005 }
1006
1007 }
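  // Typical use from calling code (a sketch; "algo" stands for a concrete
  // derived coloring algorithm, which is not defined in this file):
  //   Teuchos::RCP<Zoltan2::ColoringSolution<Adapter> > soln = ...;
  //   algo->color(soln);
  //   Teuchos::ArrayRCP<int> colors = soln->getColorsRCP();
  //   // colors[lid] is the color of owned vertex lid, 0 <= lid < nVtx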
1008// private:
1009
1010 //This is the coloring logic for all 2GL-based methods.
1011 //
1012 //INPUT ARGS:
1013 //
1014 // n_local: the number of local owned vertices
1015 //
1016 // n_total: the number of local owned vertices plus first layer ghosts
1017 //
1018 // adjs: the CSR adjacencies for the graph, only including owned vertices
1019 // and first layer ghosts
1020 //
1021 // offsets: the CSR offsets for the graph, has owned offsets into adjacencies
1022 //
1023 // ghost_adjs: the adjacency information for first-layer ghost vertices.
1024 // Includes all neighbors (owned, first-layer ghost,
1025 // second-layer ghost) for each first-layer ghost.
1026 //
1027 //
1028 // ghost_offsets: the offsets into ghost_adjs, first layer ghost LIDs
1029 // minus n_local are used to index this
1030 // datastructure. I.E. ghost_offsets(0)
1031 // corresponds to the ghost with LID n_local
1032 //
1033 // gids: a vector that holds GIDs for all vertices on this process
1034 // (includes owned, and all ghosts, indexable by LID)
1035 // The GIDs are ordered like this:
1036 // ----------------------------------
1037 // | | first | second |
1038 // | owned | layer | layer |
1039 // | | ghosts | ghosts |
1040 // ----------------------------------
1041 // ^ ^
1042 // n_local n_total
1043 //
1044 // rand: a vector that holds random numbers generated on GID for tie
1045 // breaking. Indexed by LID.
1046 //
1047 // ghost_owners: contains the process ID for the owner of each remote vertex.
1048 // Indexable by LID. owners[i] = the owning process for vertex
1049 // with GID = gids[i+n_local]
1050 //
1051 // mapOwnedPlusGhosts: map that can translate between GID and LID
1052 //
1053 // procs_to_send: for each vertex that is copied on a remote process,
1054 // this map contains the list of processes that have
1055 // a copy of a given vertex. Input LID, get the list
1056 // of PIDs that have a ghost copy of that LID.
1057 //
1058 //OUTPUT ARGS:
1059 //
1060 // femv: an FEMultiVector that holds a dual view of the colors.
1061 // After this call, femv contains updated colors.
1062 //
1063 void twoGhostLayer(const size_t n_local, const size_t n_total,
1064 const Teuchos::ArrayView<const lno_t>& adjs,
1065 const Teuchos::ArrayView<const offset_t>& offsets,
1066 const Teuchos::ArrayView<const lno_t>& ghost_adjs,
1067 const Teuchos::ArrayView<const offset_t>& ghost_offsets,
1068 const Teuchos::RCP<femv_t>& femv,
1069 const Teuchos::ArrayView<const gno_t>& gids,
1070 const Teuchos::ArrayView<const int>& rand,
1071 const Teuchos::ArrayView<const int>& ghost_owners,
1072 RCP<const map_t> mapOwnedPlusGhosts,
1073 const std::unordered_map<lno_t, std::vector<int>>& procs_to_send){
1074 //initialize timing variables
1075 double total_time = 0.0;
1076 double interior_time = 0.0;
1077 double comm_time = 0.0;
1078 double comp_time = 0.0;
1079 double recoloring_time=0.0;
1080 double conflict_detection = 0.0;
1081
1082 //Number of rounds we are saving statistics for
1083 //100 is a decent default. Reporting requires --verbose argument.
1084 const int numStatisticRecordingRounds = 100;
1085
1086 //includes all ghosts, including the second layer.
1087 const size_t n_ghosts = rand.size() - n_local;
1088
1089
1090 //Get the degrees of all ghost vertices
1091 //This is necessary for recoloring based on degrees,
1092 //we need ghost vertex degrees for consistency.
1093 std::vector<int> deg_send_cnts(comm->getSize(),0);
1094 std::vector<gno_t> deg_sdispls(comm->getSize()+1,0);
1095 for(int i = 0; i < ghost_owners.size(); i++){
1096 deg_send_cnts[ghost_owners[i]]++;
1097 }
1098 deg_sdispls[0] = 0;
1099 gno_t deg_sendsize = 0;
1100 std::vector<int> deg_sentcount(comm->getSize(),0);
1101 for(int i = 1; i < comm->getSize()+1; i++){
1102 deg_sdispls[i] = deg_sdispls[i-1] + deg_send_cnts[i-1];
1103 deg_sendsize += deg_send_cnts[i-1];
1104 }
1105 std::vector<gno_t> deg_sendbuf(deg_sendsize,0);
1106 for(int i = 0; i < ghost_owners.size(); i++){
1107 size_t idx = deg_sdispls[ghost_owners[i]] + deg_sentcount[ghost_owners[i]];
1108 deg_sentcount[ghost_owners[i]]++;
1109 deg_sendbuf[idx] = mapOwnedPlusGhosts->getGlobalElement(i+n_local);
1110 }
1111 Teuchos::ArrayView<int> deg_send_cnts_view = Teuchos::arrayViewFromVector(deg_send_cnts);
1112 Teuchos::ArrayView<gno_t> deg_sendbuf_view = Teuchos::arrayViewFromVector(deg_sendbuf);
1113 Teuchos::ArrayRCP<gno_t> deg_recvbuf;
1114 std::vector<int> deg_recvcnts(comm->getSize(),0);
1115 Teuchos::ArrayView<int> deg_recvcnts_view = Teuchos::arrayViewFromVector(deg_recvcnts);
1116 Zoltan2::AlltoAllv<gno_t>(*comm, *env, deg_sendbuf_view, deg_send_cnts_view, deg_recvbuf, deg_recvcnts_view);
1117
1118 //replace GID with the degree the owning process has for this vertex.
1119 //(this will include all edges present for this vertex globally)
1120 //This is safe to do, since we sent ghosts to their owners.
1121 for(int i = 0; i < deg_recvbuf.size(); i++){
1122 lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_recvbuf[i]);
1123 deg_recvbuf[i] = offsets[lid+1] - offsets[lid];
1124 }
1125 //send modified buffer back
1126 ArrayRCP<gno_t> ghost_degrees;
1127 Zoltan2::AlltoAllv<gno_t>(*comm, *env, deg_recvbuf(), deg_recvcnts_view, ghost_degrees, deg_send_cnts_view);
1128
1129 //create the ghost degree views, for use on host and device.
1130 Kokkos::View<gno_t*, device_type> ghost_degrees_dev("ghost degree view",ghost_degrees.size());
1131 typename Kokkos::View<gno_t*, device_type>::host_mirror_type ghost_degrees_host = Kokkos::create_mirror(ghost_degrees_dev);
1132 for(int i = 0; i < ghost_degrees.size(); i++){
1133 lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_sendbuf[i]);
1134 ghost_degrees_host(lid-n_local) = ghost_degrees[i];
1135 }
1136 Kokkos::deep_copy(ghost_degrees_dev, ghost_degrees_host);
1137
1138 //track the size of sends and receives through coloring rounds.
1139 gno_t recvPerRound[numStatisticRecordingRounds];
1140 gno_t sentPerRound[numStatisticRecordingRounds];
1141
1142 //find global max degree to determine which
1143 //coloring algorithm will be the most efficient
1144 //(For D1-2GL this is important; D2 and PD2 should always use NBBIT)
1145 offset_t local_max_degree = 0;
1146 offset_t global_max_degree = 0;
1147 for(size_t i = 0; i < n_local; i++){
1148 offset_t curr_degree = offsets[i+1] - offsets[i];
1149 if(curr_degree > local_max_degree){
1150 local_max_degree = curr_degree;
1151 }
1152 }
1153 Teuchos::reduceAll<int, offset_t>(*comm, Teuchos::REDUCE_MAX,1, &local_max_degree, &global_max_degree);
1154 if(comm->getRank() == 0 && verbose) std::cout<<"Input has max degree "<<global_max_degree<<"\n";
1155
1156 if(verbose) std::cout<<comm->getRank()<<": constructing Kokkos Views for initial coloring\n";
1157
1158 //the initial coloring is able to use a standard CSR representation, so we construct that here.
1159 //This is a direct translation of the offsets and adjs arguments into Kokkos::Views.
1160 Kokkos::View<offset_t*, device_type> offsets_dev("Host Offset View", offsets.size());
1161 typename Kokkos::View<offset_t*, device_type>::host_mirror_type offsets_host = Kokkos::create_mirror(offsets_dev);
1162 Kokkos::View<lno_t*, device_type> adjs_dev("Host Adjacencies View", adjs.size());
1163 typename Kokkos::View<lno_t*, device_type>::host_mirror_type adjs_host = Kokkos::create_mirror(adjs_dev);
1164 for(Teuchos_Ordinal i = 0; i < offsets.size(); i++) offsets_host(i) = offsets[i];
1165 for(Teuchos_Ordinal i = 0; i < adjs.size(); i++) adjs_host(i) = adjs[i];
1166 Kokkos::deep_copy(offsets_dev,offsets_host);
1167 Kokkos::deep_copy(adjs_dev, adjs_host);
1168
1169
1170 //create the graph structures which allow KokkosKernels to recolor the conflicting vertices
1171 if(verbose) std::cout<<comm->getRank()<<": constructing Kokkos Views for recoloring\n";
1172
1173 //in order to correctly recolor, all ghost vertices on this process need an entry in the CSR offsets.
1174 //Otherwise, the color of the ghosts will be ignored, and the coloring will not converge.
1175 Kokkos::View<offset_t*, device_type> dist_degrees_dev("Owned+Ghost degree view",rand.size());
1176 typename Kokkos::View<offset_t*, device_type>::host_mirror_type dist_degrees_host = Kokkos::create_mirror(dist_degrees_dev);
1177
1178 //calculate the local degrees for the owned, first layer ghosts, and second layer ghosts
1179 //to be used to construct the CSR representation of the local graph.
1180 //owned
1181 for(Teuchos_Ordinal i = 0; i < offsets.size()-1; i++) dist_degrees_host(i) = offsets[i+1] - offsets[i];
1182 //first layer ghosts
1183 for(Teuchos_Ordinal i = 0; i < ghost_offsets.size()-1; i++) dist_degrees_host(i+n_local) = ghost_offsets[i+1] - ghost_offsets[i];
1184 //second layer ghosts
1185 for(Teuchos_Ordinal i = 0; i < ghost_adjs.size(); i++){
1186 //second layer ghosts have LID >= n_total
1187 if((size_t)ghost_adjs[i] >= n_total ){
1188 dist_degrees_host(ghost_adjs[i])++;
1189 }
1190 }
1191
1192 //The structure of the final CSR constructed here looks like:
1193 //
1194 // Index by LID--v
1195 // --------------------------------------------
1196 // | | first |second |
1197 // dist_offsets_dev/host |owned verts | layer |layer |
1198 // | | ghosts |ghosts |
1199 // --------------------------------------------
1200 // |indexes
1201 // v
1202 // --------------------------------------------
1203 // | |first |second |
1204 // dist_adjs_dev/host |owned adjs |layer |layer |
1205 // | |ghost adjs |ghost adjs |
1206 // --------------------------------------------
1207 //
1208 // We end up with a CSR that has adjacency information for all the vertices on
1209 // this process. We include only edges on the process, so ghosts have only partial
1210 // adjacency information.
1211 //
1212 // We symmetrize the local graph representation as well, for
1213 // KokkosKernels to behave as we need it to. This means that edges to
1214 // second layer ghosts must be symmetrized, so we end up with offsets
1215 // that correspond to second layer ghosts.
1216 Kokkos::View<offset_t*, device_type> dist_offsets_dev("Owned+Ghost Offset view", rand.size()+1);
1217 typename Kokkos::View<offset_t*, device_type>::host_mirror_type dist_offsets_host = Kokkos::create_mirror(dist_offsets_dev);
1218
1219 //we can construct the entire offsets using the degrees we computed above
1220 dist_offsets_host(0) = 0;
1221 offset_t total_adjs = 0;
1222 for(Teuchos_Ordinal i = 1; i < rand.size()+1; i++){
1223 dist_offsets_host(i) = dist_degrees_host(i-1) + dist_offsets_host(i-1);
1224 total_adjs += dist_degrees_host(i-1);
1225 }
1226 Kokkos::View<lno_t*, device_type> dist_adjs_dev("Owned+Ghost adjacency view", total_adjs);
1227 typename Kokkos::View<lno_t*, device_type>::host_mirror_type dist_adjs_host = Kokkos::create_mirror(dist_adjs_dev);
1228
1229 //zero out the degrees to use it as a counter.
1230 //The offsets now allow us to compute degrees,
1231 //so we aren't losing anything.
1232 for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
1233 dist_degrees_host(i) = 0;
1234 }
1235 //We can just copy the adjacencies for the owned verts and first layer ghosts
1236 for(Teuchos_Ordinal i = 0; i < adjs.size(); i++) dist_adjs_host(i) = adjs[i];
1237 for(Teuchos_Ordinal i = adjs.size(); i < adjs.size() + ghost_adjs.size(); i++) dist_adjs_host(i) = ghost_adjs[i-adjs.size()];
1238
1239 //We have to symmetrize edges that involve a second layer ghost.
1240 //Add edges from the second layer ghosts to their neighbors.
1241 for(Teuchos_Ordinal i = 0; i < ghost_offsets.size()-1; i++){
1242 //loop through each first layer ghost
1243 for(offset_t j = ghost_offsets[i]; j < ghost_offsets[i+1]; j++){
1244 //if an adjacency is a second layer ghost
1245 if((size_t)ghost_adjs[j] >= n_total){
1246 //compute the offset to place the symmetric edge, and place it.
1247 dist_adjs_host(dist_offsets_host(ghost_adjs[j]) + dist_degrees_host(ghost_adjs[j])) = i + n_local;
1248 //increment the counter to keep track of how many adjacencies
1249 //have been placed.
1250 dist_degrees_host(ghost_adjs[j])++;
1251 }
1252 }
1253 }
1254 //copy the host views back to the device views
1255 Kokkos::deep_copy(dist_degrees_dev,dist_degrees_host);
1256 Kokkos::deep_copy(dist_offsets_dev,dist_offsets_host);
1257 Kokkos::deep_copy(dist_adjs_dev, dist_adjs_host);
1258
1259 //this view represents how many conflicts were found
1260 Kokkos::View<size_t*, device_type> recoloringSize("Recoloring Queue Size",1);
1261 typename Kokkos::View<size_t*, device_type>::host_mirror_type recoloringSize_host = Kokkos::create_mirror(recoloringSize);
1262 recoloringSize_host(0) = 0;
1263 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1264
1265 //create a view for the tie-breaking random numbers.
1266 if(verbose) std::cout<<comm->getRank()<<": constructing rand and GIDs views\n";
1267 Kokkos::View<int*, device_type> rand_dev("Random View", rand.size());
1268 typename Kokkos::View<int*, device_type>::host_mirror_type rand_host = Kokkos::create_mirror(rand_dev);
1269 for(Teuchos_Ordinal i = 0; i < rand.size(); i ++) rand_host(i) = rand[i];
1270 Kokkos::deep_copy(rand_dev,rand_host);
1271
1272 //create a view for the global IDs of each vertex.
1273 Kokkos::View<gno_t*, device_type> gid_dev("GIDs", gids.size());
1274 typename Kokkos::View<gno_t*, device_type>::host_mirror_type gid_host = Kokkos::create_mirror(gid_dev);
1275 for(Teuchos_Ordinal i = 0; i < gids.size(); i++) gid_host(i) = gids[i];
1276 Kokkos::deep_copy(gid_dev,gid_host);
1277
1278 //These variables will be initialized by constructBoundary()
1279 //
1280 //boundary_verts_dev holds all owned vertices that could possibly conflict
1281 //with a remote vertex. Stores LIDs.
1282 Kokkos::View<lno_t*, device_type> boundary_verts_dev;
1283 //this is the number of vertices in the boundary.
1284 if(verbose) std::cout<<comm->getRank()<<": constructing communication and recoloring lists\n";
1285
1286 //We keep track of the vertices that need to get recolored.
1287 //This list can contain ghost vertices, so it needs to be sized
1288 //to the total number of vertices on the local process;
1289 //rand has an entry for every such vertex, so rand.size() is exactly the size needed.
1290 Kokkos::View<lno_t*, device_type> verts_to_recolor_view("verts to recolor", rand.size());
1291 typename Kokkos::View<lno_t*, device_type>::host_mirror_type verts_to_recolor_host = create_mirror(verts_to_recolor_view);
1292
1293 //This view keeps track of the size of the list of vertices to recolor.
1294 Kokkos::View<int*, device_type> verts_to_recolor_size("verts to recolor size",1);
1295 Kokkos::View<int*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_recolor_size_atomic = verts_to_recolor_size;
1296 typename Kokkos::View<int*, device_type>::host_mirror_type verts_to_recolor_size_host = create_mirror(verts_to_recolor_size);
1297
1298 //initialize the host view
1299 verts_to_recolor_size_host(0) = 0;
1300 //copy to device
1301 Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1302
1303 //verts to send can only include locally owned vertices,
1304 //so we can safely allocate the view to size n_local.
1305 //
1306 //This view is a list of local vertices that are going to be
1307 //recolored, and need to be sent to their remote copies.
1308 Kokkos::View<lno_t*, device_type> verts_to_send_view("verts to send", n_local);
1309 typename Kokkos::View<lno_t*, device_type>::host_mirror_type verts_to_send_host = create_mirror(verts_to_send_view);
1310
1311 //this view keeps track of the size of verts_to_send.
1312 Kokkos::View<size_t*, device_type> verts_to_send_size("verts to send size",1);
1313 Kokkos::View<size_t*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic = verts_to_send_size;
1314 typename Kokkos::View<size_t*, device_type>::host_mirror_type verts_to_send_size_host = create_mirror(verts_to_send_size);
1315
1316 verts_to_send_size_host(0) = 0;
1317 Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
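//A minimal sketch of how the atomic size aliases are used to build these lists
//(the real logic lives in the derived-class implementations of constructBoundary()
//and detectConflicts(); the filter condition here is a placeholder):
//
//  Kokkos::parallel_for("append to send list",
//    Kokkos::RangePolicy<execution_space, int>(0, n_local),
//    KOKKOS_LAMBDA(const int& v){
//      if(/*v must be sent to a remote process*/){
//        //the Atomic memory trait makes the post-increment a safe fetch-add
//        verts_to_send_view(verts_to_send_size_atomic(0)++) = v;
//      }
//    });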
1318
1319 if(verbose) std::cout<<comm->getRank()<<": Constructing the boundary\n";
1320
1321 //constructBoundary allocates and initializes the boundary_verts_dev view,
1322 //and appends the boundary vertices to verts_to_send_view,
1323 //updating verts_to_send_size_atomic as it goes.
1324 constructBoundary(n_local, dist_offsets_dev, dist_adjs_dev, dist_offsets_host, dist_adjs_host, boundary_verts_dev,
1325 verts_to_send_view, verts_to_send_size_atomic);
1326
1327 //this boolean chooses which KokkosKernels coloring algorithm to use.
1328 //The threshold of 6000 was chosen experimentally for distance-1 coloring;
1329 //it should be disregarded for distance-2 colorings.
1330 bool use_vbbit = (global_max_degree < 6000);
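//One plausible mapping of this flag inside a derived colorInterior(), sketched with
//assumed KokkosKernelsHandle template parameters (the actual handle setup is defined
//by the derived classes):
//
//  KokkosKernels::Experimental::KokkosKernelsHandle
//      <offset_t, lno_t, femv_scalar_t, execution_space, memory_space, memory_space> kh;
//  kh.create_graph_coloring_handle(use_vbbit ? KokkosGraph::COLORING_VBBIT
//                                            : KokkosGraph::COLORING_EB);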
1331 //Done initializing, start coloring!
1332
1333 //use a barrier if we are reporting timing info
1334 if(timing) comm->barrier();
1335 interior_time = timer();
1336 total_time = timer();
1337 //give the entire local graph to KokkosKernels to color (vertex_list_size = 0 requests coloring of all uncolored vertices)
1338 this->colorInterior(n_local, adjs_dev, offsets_dev, femv,adjs_dev,0,use_vbbit);
1339 interior_time = timer() - interior_time;
1340 comp_time = interior_time;
1341
1342 //ghost_colors holds the colors of only ghost vertices.
1343 //ghost_colors(0) holds the color of a vertex with LID n_local.
1344 //To index this with LIDs, subtract n_local. If an LID is < n_local,
1345 //it will not have a color stored in this view.
1346 Kokkos::View<int*,device_type> ghost_colors("ghost color backups", n_ghosts);
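//For example, the color of the ghost with LID g (where n_local <= g < n_local + n_ghosts)
//is stored at ghost_colors(g - n_local).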
1347
1348 //communicate the initial coloring.
1349 if(verbose) std::cout<<comm->getRank()<<": communicating\n";
1350
1351 //update the host views for the verts to send;
1352 //doOwnedToGhosts needs host-memory views, but does not
1353 //change their values, so no copy back is needed afterwards
1354 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1355 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1356 gno_t recv,sent;
1357 comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,n_local,verts_to_send_host,verts_to_send_size_host,femv,procs_to_send,sent,recv);
1358 sentPerRound[0] = sent;
1359 recvPerRound[0] = recv;
1360
1361 //store ghost colors so we can restore them after recoloring.
1362 //the local process can't color ghosts correctly, so we
1363 //reset the colors to avoid consistency issues.
1364 //get the color view from the FEMultiVector
1365 auto femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
1366 auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
1367 Kokkos::parallel_for("get femv colors",
1368 Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1369 KOKKOS_LAMBDA(const int& i){
1370 ghost_colors(i) = femv_colors(i+n_local);
1371 });
1372 Kokkos::fence();
1373
1374 double temp = timer();
1375 //detect conflicts only for ghost vertices
1376 bool recolor_degrees = this->pl->template get<bool>("recolor_degrees",false);
1377 if(verbose) std::cout<<comm->getRank()<<": detecting conflicts\n";
1378
1379 //these sizes will be updated by detectConflicts, zero them out beforehand
1380 verts_to_send_size_host(0) = 0;
1381 verts_to_recolor_size_host(0) = 0;
1382 recoloringSize_host(0) = 0;
1383 Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1384 Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1385 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1386
1387 detectConflicts(n_local, dist_offsets_dev, dist_adjs_dev, femv_colors, boundary_verts_dev,
1388 verts_to_recolor_view, verts_to_recolor_size_atomic, verts_to_send_view, verts_to_send_size_atomic,
1389 recoloringSize, rand_dev, gid_dev, ghost_degrees_dev, recolor_degrees);
1390
1391 //these sizes were updated by detectConflicts,
1392 //copy the device views back into host memory
1393 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1394 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1395 Kokkos::deep_copy(recoloringSize_host, recoloringSize);
1396 Kokkos::deep_copy(verts_to_recolor_size_host, verts_to_recolor_size);
1397
1398 if(comm->getSize() > 1){
1399 conflict_detection = timer() - temp;
1400 comp_time += conflict_detection;
1401 }
1402 //all conflicts detected!
1403 if(verbose) std::cout<<comm->getRank()<<": starting to recolor\n";
1404 //variables for statistics
1405 double totalPerRound[numStatisticRecordingRounds] = {0.0}; //zero-initialize: every entry is reduced across ranks later, even for rounds that never run
1406 double commPerRound[numStatisticRecordingRounds] = {0.0};
1407 double compPerRound[numStatisticRecordingRounds] = {0.0};
1408 double recoloringPerRound[numStatisticRecordingRounds] = {0.0};
1409 double conflictDetectionPerRound[numStatisticRecordingRounds] = {0.0};
1410 uint64_t vertsPerRound[numStatisticRecordingRounds] = {0};
1411 uint64_t incorrectGhostsPerRound[numStatisticRecordingRounds] = {0};
1412 int distributedRounds = 1;
1413 totalPerRound[0] = interior_time + comm_time + conflict_detection;
1414 recoloringPerRound[0] = 0;
1415 commPerRound[0] = comm_time;
1416 compPerRound[0] = interior_time + conflict_detection;
1417 conflictDetectionPerRound[0] = conflict_detection;
1419 vertsPerRound[0] = 0;
1420 incorrectGhostsPerRound[0]=0;
1421 typename Kokkos::View<int*, device_type>::host_mirror_type ghost_colors_host;
1422 typename Kokkos::View<lno_t*, device_type>::host_mirror_type boundary_verts_host;
1423 size_t serial_threshold = this->pl->template get<int>("serial_threshold",0);
1424 //see if recoloring is necessary.
1425 size_t totalConflicts = 0;
1426 size_t localConflicts = recoloringSize_host(0);
1427 Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localConflicts, &totalConflicts);
1428 bool done = !totalConflicts;
1429 if(comm->getSize()==1) done = true;
1430
1431 //recolor until no conflicts are left
1432 while(!done){
1433 //if the number of vertices left to recolor is less than
1434 //the serial threshold set by the user, finish the coloring
1435 //only using host views in a serial execution space.
1436 if(recoloringSize_host(0) < serial_threshold) break;
1437 if(distributedRounds < numStatisticRecordingRounds) {
1438 vertsPerRound[distributedRounds] = verts_to_recolor_size_host(0);
1439 }
1440
1441 if(timing) comm->barrier();
1442 double recolor_temp = timer();
1443 //recolor using KokkosKernels' coloring function
1444 if(verts_to_recolor_size_host(0) > 0){
1445 this->colorInterior(femv_colors.size(), dist_adjs_dev, dist_offsets_dev,femv,verts_to_recolor_view,verts_to_recolor_size_host(0),use_vbbit);
1446 }
1447
1448 if(distributedRounds < numStatisticRecordingRounds){
1449 recoloringPerRound[distributedRounds] = timer() - recolor_temp;
1450 recoloring_time += recoloringPerRound[distributedRounds];
1451 comp_time += recoloringPerRound[distributedRounds];
1452 compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1453 totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1454 } else if(timing){
1455 double recoloring_round_time = timer() - recolor_temp;
1456 recoloring_time += recoloring_round_time;
1457 comp_time += recoloring_round_time;
1458 }
1459
1460 //reset the ghost colors to what they were before recoloring
1461 //the recoloring does not have enough information to color
1462 //ghosts correctly, so we set the colors to what they were before
1463 //to avoid consistency issues.
1464 Kokkos::parallel_for("set femv colors",
1465 Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1466 KOKKOS_LAMBDA(const int& i){
1467 femv_colors(i+n_local) = ghost_colors(i);
1468 });
1469 Kokkos::fence();
1470
1471 //send views are up-to-date, they were copied after conflict detection.
1472 //communicate the new colors
1473
1474 // Release the device views of the colors; doOwnedToGhosts below updates the femv, and the device views are re-acquired afterwards
1475 femvColors = decltype(femvColors)();
1476 femv_colors = decltype(femv_colors)();
1477 double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,n_local,verts_to_send_host,verts_to_send_size_host,femv,procs_to_send,sent,recv);
1478 comm_time += curr_comm_time;
1479
1480 if(distributedRounds < numStatisticRecordingRounds){
1481 commPerRound[distributedRounds] = curr_comm_time;
1482 recvPerRound[distributedRounds] = recv;
1483 sentPerRound[distributedRounds] = sent;
1484 totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1485 }
1486
1487 //store the ghost colors we just received from their owners,
1488 //so they can be restored after the next recoloring;
1489 //this process doesn't have enough info to color ghosts
1490 //correctly on its own.
1491 femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
1492 femv_colors = subview(femvColors, Kokkos::ALL, 0);
1493 Kokkos::parallel_for("get femv colors 2",
1494 Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1495 KOKKOS_LAMBDA(const int& i){
1496 ghost_colors(i) = femv_colors(i+n_local);
1497 });
1498 Kokkos::fence();
1499
1500
1501 //these views will change in detectConflicts, they need
1502 //to be zeroed out beforehand
1503 verts_to_send_size_host(0) = 0;
1504 verts_to_recolor_size_host(0) = 0;
1505 recoloringSize_host(0) = 0;
1506 Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1507 Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1508 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1509
1510 //check for further conflicts
1511 double detection_temp = timer();
1512
1513 detectConflicts(n_local, dist_offsets_dev, dist_adjs_dev,femv_colors, boundary_verts_dev,
1514 verts_to_recolor_view, verts_to_recolor_size_atomic, verts_to_send_view, verts_to_send_size_atomic,
1515 recoloringSize, rand_dev, gid_dev, ghost_degrees_dev, recolor_degrees);
1516
1517 //copy the updated device views back into host memory where necessary
1518 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1519 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1520 //we only use the list of verts to recolor on device, no need to copy to host.
1521 Kokkos::deep_copy(verts_to_recolor_size_host, verts_to_recolor_size);
1522 Kokkos::deep_copy(recoloringSize_host, recoloringSize);
1523
1524 if(distributedRounds < numStatisticRecordingRounds){
1525 conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1526 conflict_detection += conflictDetectionPerRound[distributedRounds];
1527 compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1528 totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1529 comp_time += conflictDetectionPerRound[distributedRounds];
1530 } else if(timing){
1531 double conflict_detection_round_time = timer() - detection_temp;
1532 conflict_detection += conflict_detection_round_time;
1533 comp_time += conflict_detection_round_time;
1534 }
1535
1536 distributedRounds++;
1537 size_t localDone = recoloringSize_host(0);
1538 size_t globalDone = 0;
1539 Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
1540 done = !globalDone;
1541
1542 }//end device coloring
1543
1544
1545 //If we reach this point and still have vertices to color,
1546 //the rest of the coloring happens in serial on the host.
1547 //
1548 //First, we need to copy device-only views to host mirrors
1549 if(recoloringSize_host(0) > 0 || !done){
1550 ghost_colors_host = Kokkos::create_mirror_view_and_copy(host_mem(),ghost_colors,"ghost_colors host mirror");
1551 boundary_verts_host = Kokkos::create_mirror_view_and_copy(host_mem(),boundary_verts_dev,"boundary_verts host mirror");
1552 }
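//create_mirror_view_and_copy(host_mem(), v, ...) used above returns a host-accessible
//copy of v; when v's memory is already host-accessible it simply returns v itself,
//otherwise it behaves roughly like
//
//  auto v_host = Kokkos::create_mirror(v);
//  Kokkos::deep_copy(v_host, v);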
1553
1554 //Now we do a similar coloring loop to before,
1555 //but using only host views in a serial execution space.
1556 // Release the device color views; the serial loop below accesses the femv through host views
1557 femvColors = decltype(femvColors)();
1558 femv_colors = decltype(femv_colors)();
1559 while(recoloringSize_host(0) > 0 || !done){
1560 auto femvColors_host = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
1561 auto colors_host = subview(femvColors_host, Kokkos::ALL, 0);
1562 if(distributedRounds < numStatisticRecordingRounds){
1563 vertsPerRound[distributedRounds] = recoloringSize_host(0);
1564 }
1565 if(verbose) std::cout<<comm->getRank()<<": starting to recolor, serial\n";
1566 if(timing) comm->barrier();
1567
1568 double recolor_temp = timer();
1569 if(verts_to_recolor_size_host(0) > 0){
1570 this->colorInterior_serial(colors_host.size(), dist_adjs_host, dist_offsets_host, femv,
1571 verts_to_recolor_host, verts_to_recolor_size_host(0), true);
1572 }
1573 if(distributedRounds < numStatisticRecordingRounds){
1574 recoloringPerRound[distributedRounds] = timer() - recolor_temp;
1575 recoloring_time += recoloringPerRound[distributedRounds];
1576 comp_time += recoloringPerRound[distributedRounds];
1577 compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1578 totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1579 } else if(timing){
1580 double recoloring_serial_round_time = timer() - recolor_temp;
1581 recoloring_time += recoloring_serial_round_time;
1582 comp_time += recoloring_serial_round_time;
1583 }
1584
1585 //reset the ghost colors to their previous values to avoid
1586 //consistency issues.
1587 for(size_t i = 0; i < n_ghosts; i++){
1588 colors_host(i+n_local) = ghost_colors_host(i);
1589 }
1590
1591 double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts, n_local,verts_to_send_host, verts_to_send_size_host, femv, procs_to_send, sent,recv);
1592 comm_time += curr_comm_time;
1593
1594 if(distributedRounds < numStatisticRecordingRounds){
1595 commPerRound[distributedRounds] = curr_comm_time;
1596 recvPerRound[distributedRounds] = recv;
1597 sentPerRound[distributedRounds] = sent;
1598 totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1599 }
1600
1601 //store updated ghost colors we received from their owners
1602 //before conflict detection and recoloring changes them locally.
1603 for(size_t i = 0; i < n_ghosts; i++){
1604 ghost_colors_host(i) = colors_host(i+n_local);
1605 }
1606
1607 if(timing) comm->barrier();
1608 double detection_temp = timer();
1609
1610 //zero these out, they'll be updated by detectConflicts_serial
1611 verts_to_recolor_size_host(0) = 0;
1612 verts_to_send_size_host(0) = 0;
1613 recoloringSize_host(0) = 0;
1614
1615 detectConflicts_serial(n_local,dist_offsets_host, dist_adjs_host, colors_host, boundary_verts_host,
1616 verts_to_recolor_host, verts_to_recolor_size_host, verts_to_send_host, verts_to_send_size_host,
1617 recoloringSize_host, rand_host, gid_host, ghost_degrees_host, recolor_degrees);
1618
1619 //no device copies are needed here; from now on we only use host views.
1620
1621 if(distributedRounds < numStatisticRecordingRounds){
1622 conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1623 conflict_detection += conflictDetectionPerRound[distributedRounds];
1624 compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1625 totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1626 comp_time += conflictDetectionPerRound[distributedRounds];
1627 } else if(timing){
1628 double conflict_detection_serial_round_time = timer() - detection_temp;
1629 conflict_detection += conflict_detection_serial_round_time;
1630 comp_time += conflict_detection_serial_round_time;
1631 }
1632
1633 size_t globalDone = 0;
1634 size_t localDone = recoloringSize_host(0);
1635 Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
1636 distributedRounds++;
1637 done = !globalDone;
1638 }
1639
1640 total_time = timer() - total_time;
1641 //only compute stats if they will be displayed
1642 if(verbose){
1643 uint64_t localBoundaryVertices = 0;
1644 for(size_t i = 0; i < n_local; i++){
1645 for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
1646 if((size_t)adjs[j] >= n_local){
1647 localBoundaryVertices++;
1648 break;
1649 }
1650 }
1651 }
1652
1653 //if(comm->getRank() == 0) printf("did %d rounds of distributed coloring\n", distributedRounds);
1654 uint64_t totalVertsPerRound[numStatisticRecordingRounds];
1655 uint64_t totalBoundarySize = 0;
1656 uint64_t totalIncorrectGhostsPerRound[numStatisticRecordingRounds];
1657 double finalTotalPerRound[numStatisticRecordingRounds];
1658 double maxRecoloringPerRound[numStatisticRecordingRounds];
1659 double minRecoloringPerRound[numStatisticRecordingRounds];
1660 double finalCommPerRound[numStatisticRecordingRounds];
1661 double finalCompPerRound[numStatisticRecordingRounds];
1662 double finalConflictDetectionPerRound[numStatisticRecordingRounds];
1663 gno_t finalRecvPerRound[numStatisticRecordingRounds];
1664 gno_t finalSentPerRound[numStatisticRecordingRounds];
1665 for(int i = 0; i < numStatisticRecordingRounds; i++){
1666 totalVertsPerRound[i] = 0;
1667 finalTotalPerRound[i] = 0.0;
1668 maxRecoloringPerRound[i] = 0.0;
1669 minRecoloringPerRound[i] = 0.0;
1670 finalCommPerRound[i] = 0.0;
1671 finalCompPerRound[i] = 0.0;
1672 finalConflictDetectionPerRound[i] = 0.0;
1673 finalRecvPerRound[i] = 0;
1674 finalSentPerRound[i] = 0;
1675 }
1676 Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,1,&localBoundaryVertices, &totalBoundarySize);
1677 Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,vertsPerRound,totalVertsPerRound);
1678 Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,incorrectGhostsPerRound,totalIncorrectGhostsPerRound);
1679 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,totalPerRound, finalTotalPerRound);
1680 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,recoloringPerRound,maxRecoloringPerRound);
1681 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,numStatisticRecordingRounds,recoloringPerRound,minRecoloringPerRound);
1682 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,commPerRound,finalCommPerRound);
1683 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,compPerRound,finalCompPerRound);
1684 Teuchos::reduceAll<int,double>(*comm,
1685 Teuchos::REDUCE_MAX,numStatisticRecordingRounds,conflictDetectionPerRound,finalConflictDetectionPerRound);
1686 Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound,finalRecvPerRound);
1687 Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound,finalSentPerRound);
1688 std::cout << "Rank " << comm->getRank()
1689 << ": boundary size: " << localBoundaryVertices << std::endl;
1690 if(comm->getRank() == 0)
1691 std::cout << "Total boundary size: " << totalBoundarySize << std::endl;
1692 for(int i = 0; i < std::min((int)distributedRounds,numStatisticRecordingRounds); i++){
1693 std::cout << "Rank " << comm->getRank()
1694 << ": recolor " << vertsPerRound[i]
1695 << " vertices in round " << i << std::endl;
1696 std::cout << "Rank " << comm->getRank()
1697 << " sentbuf had " << sentPerRound[i]
1698 << " entries in round " << i << std::endl;
1699 if(comm->getRank()==0){
1700 std::cout << "recolored " << totalVertsPerRound[i]
1701 << " vertices in round " << i << std::endl;
1702 std::cout << totalIncorrectGhostsPerRound[i]
1703 << " inconsistent ghosts in round " << i << std::endl;
1704 std::cout << "total time in round " << i
1705 << ": " << finalTotalPerRound[i] << std::endl;
1706 std::cout << "recoloring time in round " << i
1707 << ": " << maxRecoloringPerRound[i] << std::endl;
1708 std::cout << "min recoloring time in round " << i
1709 << ": " << minRecoloringPerRound[i] << std::endl;
1710 std::cout << "conflict detection time in round " << i
1711 << ": " << finalConflictDetectionPerRound[i] << std::endl;
1712 std::cout << "comm time in round " << i
1713 << ": " << finalCommPerRound[i] << std::endl;
1714 std::cout << "recvbuf size in round " << i
1715 << ": " << finalRecvPerRound[i] << std::endl;
1716 std::cout << "sendbuf size in round " << i
1717 << ": " << finalSentPerRound[i] << std::endl;
1718 std::cout << "comp time in round " << i
1719 << ": " << finalCompPerRound[i] << std::endl;
1720 }
1721 }
1722 } else if (timing){
1723 double global_total_time = 0.0;
1724 double global_recoloring_time = 0.0;
1725 double global_min_recoloring_time = 0.0;
1726 double global_conflict_detection=0.0;
1727 double global_comm_time=0.0;
1728 double global_comp_time=0.0;
1729 double global_interior_time=0.0;
1730 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&total_time,&global_total_time);
1731 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&recoloring_time,&global_recoloring_time);
1732 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,1,&recoloring_time,&global_min_recoloring_time);
1733 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&conflict_detection,&global_conflict_detection);
1734 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comm_time,&global_comm_time);
1735 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comp_time,&global_comp_time);
1736 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&interior_time,&global_interior_time);
1737 comm->barrier();
1738 fflush(stdout);
1739 if(comm->getRank()==0){
1740 std::cout << "Total Time: " << global_total_time << std::endl;
1741 std::cout << "Interior Time: " << global_interior_time << std::endl;
1742 std::cout << "Recoloring Time: " << global_recoloring_time << std::endl;
1743 std::cout << "Min Recoloring Time: " << global_min_recoloring_time << std::endl;
1744 std::cout << "Conflict Detection Time: " << global_conflict_detection << std::endl;
1745 std::cout << "Comm Time: " << global_comm_time << std::endl;
1746 std::cout << "Comp Time: " << global_comp_time << std::endl;
1747 }
1748 }
1749 }
1750}; //end class
1751
1752}//end namespace Zoltan2
1753
1754#endif