Zoltan2
Loading...
Searching...
No Matches
Zoltan2_AlgHybridD1.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Zoltan2: A package of combinatorial algorithms for scientific computing
4//
5// Copyright 2012 NTESS and the Zoltan2 contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef _ZOLTAN2_ALGHYBRIDD1_HPP_
11#define _ZOLTAN2_ALGHYBRIDD1_HPP_
12
13#include <vector>
14#include <unordered_map>
15#include <iostream>
16#include <fstream>
17#include <queue>
18#ifdef _WIN32
19#include <time.h>
20#else
21#include <sys/time.h>
22#endif
23#include <algorithm>
24
25#include "Zoltan2_Algorithm.hpp"
28#include "Zoltan2_Util.hpp"
29#include "Zoltan2_TPLTraits.hpp"
30#include "Zoltan2_AlltoAll.hpp"
31
32#include "Tpetra_Core.hpp"
33#include "Teuchos_RCP.hpp"
34#include "Tpetra_Import.hpp"
35#include "Tpetra_FEMultiVector.hpp"
36
37#include "KokkosKernels_Handle.hpp"
38#include "KokkosKernels_IOUtils.hpp"
39#include "KokkosGraph_Distance1Color.hpp"
40#include "KokkosGraph_Distance1ColorHandle.hpp"
41#include <stdlib.h>
42
46
47namespace Zoltan2 {
48
49template <typename Adapter>
50class AlgDistance1 : public Algorithm<Adapter>
51{
52 public:
53
54 using lno_t = typename Adapter::lno_t;
55 using gno_t = typename Adapter::gno_t;
56 using offset_t = typename Adapter::offset_t;
57 using scalar_t = typename Adapter::scalar_t;
58 using base_adapter_t = typename Adapter::base_adapter_t;
59 using map_t = Tpetra::Map<lno_t, gno_t>;
60 using femv_scalar_t = int;
61 using femv_t = Tpetra::FEMultiVector<femv_scalar_t, lno_t, gno_t>;
62 using device_type = typename femv_t::device_type;
63 using execution_space = typename device_type::execution_space;
64 using memory_space = typename device_type::memory_space;
65 using host_exec = typename femv_t::host_view_type::device_type::execution_space;
66 using host_mem = typename femv_t::host_view_type::device_type::memory_space;
67 double timer() {
68 struct timeval tp;
69 gettimeofday(&tp, NULL);
70 return ((double) (tp.tv_sec) + 1e-6 * tp.tv_usec);
71 }
72
73 private:
74
75 void buildModel(modelFlag_t &flags);
76
77 //function to invoke KokkosKernels distance-1 coloring
78 //
79 //OUTPUT ARG:
80 //
81 // femv: FEMultiVector containing dual-view of colors.
82 // After this call each vertex will have a locally-valid color.
83 //
84 //INPUT ARGS:
85 //
86 // nVtx: number of local owned vertices
87 //
88 // adjs_view: CSR adjacencies for the local graph
89 //
90 // offset_view: CSR offsets for indexing adjs_view
91 //
92 // vertex_list: list of local IDs of vertices that
93 // need to be recolored
94 //
95 // vertex_list_size: 0 means no vertex list given,
96 // otherwise it simply is the size
97 // of the vertex_list
98 //
99 // recolor: switches between VB_BIT and EB KokkosKernels
100 // algorithms
101 //
102 template <class ExecutionSpace, typename MemorySpace>
103 void colorInterior(const size_t nVtx,
104 Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace,MemorySpace> > adjs_view,
105 Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace> > offset_view,
106 Teuchos::RCP<femv_t> femv,
107 Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace> > vertex_list,
108 size_t vertex_list_size = 0,
109 bool recolor=false){
110
111 //set up types to be used by KokkosKernels
112 using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle
113 <offset_t, lno_t, lno_t, ExecutionSpace, MemorySpace,
114 MemorySpace>;
115 using lno_row_view_t = Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace>>;
116 using lno_nnz_view_t = Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>>;
117
118 KernelHandle kh;
119
120 //pick which KokkosKernels algorithm to use.
121 //VBBIT is more efficient for inputs with max degree < ~6000
122 //EB is more efficient for inputs with max degree > ~6000
123 if(recolor){
124 kh.create_graph_coloring_handle(KokkosGraph::COLORING_VBBIT);
125 } else {
126 kh.create_graph_coloring_handle(KokkosGraph::COLORING_EB);
127 }
128
129 //vertex_list_size indicates whether we have provided a list of vertices to recolor.
130 //Only makes a difference if the algorithm to be used is VBBIT.
131 if(vertex_list_size != 0){
132 kh.get_graph_coloring_handle()->set_vertex_list(vertex_list,vertex_list_size);
133 }
134
135 kh.set_verbose(verbose);
136
137 //set the initial coloring of the kh.get_graph_coloring_handle() to be
138 //the data view from the femv.
139 auto femvColors = femv->template getLocalView<Kokkos::Device<ExecutionSpace,MemorySpace> >(Tpetra::Access::ReadWrite);
140 auto sv = subview(femvColors, Kokkos::ALL, 0);
141 kh.get_graph_coloring_handle()->set_vertex_colors(sv);
142 kh.get_graph_coloring_handle()->set_tictoc(verbose);
143 KokkosGraph::Experimental::graph_color_symbolic<KernelHandle, lno_row_view_t, lno_nnz_view_t>
144 (&kh, nVtx, nVtx, offset_view, adjs_view);
145 //numColors = kh.get_graph_coloring_handle()->get_num_colors();
146
147 if(verbose){
148 std::cout<<"\nKokkosKernels Coloring: "<<kh.get_graph_coloring_handle()->get_overall_coloring_time()<<" iterations: "<<kh.get_graph_coloring_handle()->get_num_phases()<<"\n\n";
149 }
150 }
151
152 public:
153 //contains all distance-1 conflict detection logic
154 //
155 //OUTPUT ARGS:
156 //
157 // femv_colors: This function uncolors vertices that have
158 // distance-1 conflicts, so these colors will
159 // change if there are any conflicts present
160 //
161 //INPUT ARGS:
162 //
163 // n_local: number of locally owned vertices
164 //
165 // n_ghosts: number of ghosts on this process
166 //
167 // dist_offsets: CSR offsets of the local graph
168 //
169 // dist_adjs: CSR adjacencies of the local graph
170 //
171 // verts_to_send_view: Used to construct a list of verts to send on device.
172 //
173 // verts_to_send_size_atomic: atomic version of the verts_to_send_size view
174 // Used to construct a list on device,
175 // the atomic is necessary for correctness.
176 //
177 // recoloringSize: holds the total amount of recoloring that will be done
178 // locally. Does not need to be atomic.
179 //
180 // rand: view that holds the tie-breaking random numbers indexed by LID.
181 //
182 // gid: view that holds GIDs, for tie breaking in the case that rand
183 // numbers are the same for two vertices.
184 //
185 // ghost_degrees: view that holds degrees only for ghost vertices.
186 // LID n_local corresponds to ghost_degrees(0).
187 //
188 // recolor_degrees: if true, we factor degrees into the conflict detection
189 // if false, we resolve using only consistent random numbers.
190 //
191 template <class ExecutionSpace, typename MemorySpace>
192 void detectConflicts(const size_t n_local, const size_t n_ghosts,
193 Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> dist_offsets,
194 Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> dist_adjs,
195 Kokkos::View<int*, Kokkos::Device<ExecutionSpace, MemorySpace>> femv_colors,
196 Kokkos::View<lno_t*,
197 Kokkos::Device<ExecutionSpace, MemorySpace>> verts_to_send_view,
198 Kokkos::View<size_t*,
199 Kokkos::Device<ExecutionSpace, MemorySpace>,
200 Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic,
201 Kokkos::View<gno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> recoloringSize,
202 Kokkos::View<int*,
203 Kokkos::Device<ExecutionSpace, MemorySpace>> rand,
204 Kokkos::View<gno_t*,
205 Kokkos::Device<ExecutionSpace, MemorySpace>> gid,
206 Kokkos::View<gno_t*,
207 Kokkos::Device<ExecutionSpace, MemorySpace>> ghost_degrees,
208 bool recolor_degrees){
209 gno_t local_recoloring_size;
210 Kokkos::parallel_reduce("Conflict Detection",
211 Kokkos::RangePolicy<ExecutionSpace>(0,n_ghosts),
212 KOKKOS_LAMBDA(const int& i,
213 gno_t& recoloring_size){
214 lno_t lid = i+n_local;
215 int currColor = femv_colors(lid);
216 int currDegree = ghost_degrees(i);
217 for(offset_t j = dist_offsets(lid); j < dist_offsets(lid+1); j++){
218 int nborColor = femv_colors(dist_adjs(j));
219 int nborDegree = 0;
220 if((size_t)dist_adjs(j) < n_local) nborDegree = dist_offsets(dist_adjs(j)+1) - dist_offsets(dist_adjs(j));
221 else nborDegree = ghost_degrees(dist_adjs(j) - n_local);
222 if(currColor == nborColor ){
223 if(currDegree < nborDegree && recolor_degrees){
224 femv_colors(lid) = 0;
225 recoloring_size++;
226 }else if (nborDegree < currDegree && recolor_degrees){
227 femv_colors(dist_adjs(j)) = 0;
228 recoloring_size++;
229 }else if(rand(lid) > rand(dist_adjs(j))){
230 femv_colors(lid) = 0;
231 recoloring_size++;
232 break;
233 }else if (rand(dist_adjs(j)) > rand(lid)){
234 femv_colors(dist_adjs(j)) = 0;
235 recoloring_size++;
236 } else {
237 if (gid(lid) >= gid(dist_adjs(j))){
238 femv_colors(lid) = 0;
239 recoloring_size++;
240 break;
241 } else {
242 femv_colors(dist_adjs(j)) = 0;
243 recoloring_size++;
244 }
245 }
246 }
247 }
248 },local_recoloring_size);
249 Kokkos::deep_copy(recoloringSize, local_recoloring_size);
250 Kokkos::fence();
251 Kokkos::parallel_for("Rebuild verts_to_send_view",
252 Kokkos::RangePolicy<ExecutionSpace>(0,n_local),
253 KOKKOS_LAMBDA(const int& i){
254 if(femv_colors(i) == 0){
255 verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
256 }
257 });
258 Kokkos::fence();
259 }
260
261 private:
262 //Communicates owned vertex colors to ghost copies.
263 //
264 //RETURN VALUE:
265 //
266 // returns the time it took for the communication call to complete
267 //
268 //OUTPUT ARGS:
269 //
270 // femv: FEMultivector that holds the dual-view of the colors
271 // After this call, ghost colors will be up-to-date.
272 //
273 // recv: returns the size of the recv buffer
274 //
275 // send: returns the size of the send buffer
276 //
277 //INPUT ARGS:
278 //
279 // mapOwnedPlusGhosts: a Tpetra map that translates between
280 // LID and GID for any vertex on this process.
281 //
282 // nVtx: the number of locally owned vertices.
283 //
284 // verts_to_send: hostmirror of verts to send. This function sends
285 // all vertices in this list to their ghost copies.
286 //
287 // verts_to_send_size: hostmirror of verts_to_send_size, holds the
288 // size of verts_to_send.
289 //
290 // procs_to_send: map that translates LID into a list of processes that
291 // have a ghost copy of the vertex.
292 //
293 double doOwnedToGhosts(RCP<const map_t> mapOwnedPlusGhosts,
294 size_t nVtx,
295 typename Kokkos::View<lno_t*, device_type>::host_mirror_type verts_to_send,
296 typename Kokkos::View<size_t*, device_type>::host_mirror_type& verts_to_send_size,
297 Teuchos::RCP<femv_t> femv,
298 const std::unordered_map<lno_t, std::vector<int>>& procs_to_send,
299 gno_t& recv, gno_t& send){
300 auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
301 auto colors = subview(femvColors, Kokkos::ALL, 0);
302 int nprocs = comm->getSize();
303
304 std::vector<int> sendcnts(comm->getSize(), 0);
305 std::vector<gno_t> sdispls(comm->getSize()+1, 0);
306 for(size_t i = 0; i < verts_to_send_size(0); i++){
307 for(size_t j = 0; j < procs_to_send.at(verts_to_send(i)).size(); j++){
308 sendcnts[procs_to_send.at(verts_to_send(i))[j]] += 2;
309 }
310 }
311
312 sdispls[0] = 0;
313 gno_t sendsize = 0;
314 std::vector<int> sentcount(nprocs, 0);
315
316 for(int i = 1; i < comm->getSize()+1; i++){
317 sdispls[i] = sdispls[i-1] + sendcnts[i-1];
318 sendsize += sendcnts[i-1];
319 }
320 send = sendsize;
321 std::vector<gno_t> sendbuf(sendsize,0);
322
323 for(size_t i = 0; i < verts_to_send_size(0); i++){
324 std::vector<int> procs = procs_to_send.at(verts_to_send(i));
325 for(size_t j = 0; j < procs.size(); j++){
326 size_t idx = sdispls[procs[j]] + sentcount[procs[j]];
327 sentcount[procs[j]] += 2;
328 sendbuf[idx++] = mapOwnedPlusGhosts->getGlobalElement(verts_to_send(i));
329 sendbuf[idx] = colors(verts_to_send(i));
330 }
331 }
332
333 Teuchos::ArrayView<int> sendcnts_view = Teuchos::arrayViewFromVector(sendcnts);
334 Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
335 Teuchos::ArrayRCP<gno_t> recvbuf;
336 std::vector<int> recvcnts(comm->getSize(), 0);
337 Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);
338
339 //if we're reporting timings, remove the computation imbalance from the comm timer.
340 if(timing) comm->barrier();
341 double comm_total = 0.0;
342 double comm_temp = timer();
343
344 AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcnts_view, recvbuf, recvcnts_view);
345 comm_total += timer() - comm_temp;
346
347 gno_t recvsize = 0;
348
349 for(int i = 0; i < recvcnts_view.size(); i++){
350 recvsize += recvcnts_view[i];
351 }
352 recv = recvsize;
353 for(int i = 0; i < recvsize; i+=2){
354 size_t lid = mapOwnedPlusGhosts->getLocalElement(recvbuf[i]);
355 if(lid<nVtx && verbose) std::cout<<comm->getRank()<<": received a locally owned vertex, somehow\n";
356 colors(lid) = recvbuf[i+1];
357 }
358
359 return comm_total;
360 }
361
362 RCP<const base_adapter_t> adapter;
363 RCP<GraphModel<base_adapter_t> > model;
364 RCP<Teuchos::ParameterList> pl;
365 RCP<Environment> env;
366 RCP<const Teuchos::Comm<int> > comm;
367 bool verbose;
368 bool timing;
369 public:
370 //constructor for the hybrid distributed distance-1 algorithm
372 const RCP<const base_adapter_t> &adapter_,
373 const RCP<Teuchos::ParameterList> &pl_,
374 const RCP<Environment> &env_,
375 const RCP<const Teuchos::Comm<int> > &comm_)
376 : adapter(adapter_), pl(pl_), env(env_), comm(comm_) {
377 verbose = pl->get<bool>("verbose",false);
378 timing = pl->get<bool>("timing", false);
379 if(verbose) std::cout<<comm->getRank()<<": inside coloring constructor\n";
380 modelFlag_t flags;
381 flags.reset();
382 buildModel(flags);
383 if(verbose) std::cout<<comm->getRank()<<": done constructing coloring class\n";
384 }
385
386
387 //Main entry point for graph coloring
388 void color( const RCP<ColoringSolution<Adapter> > &solution ) {
389 if(verbose) std::cout<<comm->getRank()<<": inside coloring function\n";
390 //this will color the global graph in a manner similar to Zoltan
391
392 //get vertex GIDs in a locally indexed array
393 if(verbose) std::cout<<comm->getRank()<<": getting owned vtxIDs\n";
394 ArrayView<const gno_t> vtxIDs;
395 ArrayView<StridedData<lno_t, scalar_t> > vwgts;
396 size_t nVtx = model->getVertexList(vtxIDs, vwgts);
397 //we do not use weights at this point
398 if(verbose) std::cout<<comm->getRank()<<": getting edge list\n";
399 //get edge information from the model
400 ArrayView<const gno_t> adjs;
401 ArrayView<const offset_t> offsets;
402 ArrayView<StridedData<lno_t, scalar_t> > ewgts;
403 model->getEdgeList(adjs, offsets, ewgts);
404 //again, weights are not used
405
406 RCP<const map_t> mapOwned;
407 RCP<const map_t> mapWithCopies;
408
409 std::vector<gno_t> finalGIDs;
410 std::vector<offset_t> finalOffset_vec;
411 std::vector<lno_t> finalAdjs_vec;
412
413 std::vector<lno_t> reorderToLocal;
414 for(size_t i = 0; i< nVtx; i++) reorderToLocal.push_back(i);
415 if(verbose) std::cout<<comm->getRank()<<": Setting up local datastructures\n";
416 //Set up a typical local mapping here.
417 std::unordered_map<gno_t,lno_t> globalToLocal;
418 std::vector<gno_t> ownedPlusGhosts;
419 for (gno_t i = 0; i < vtxIDs.size(); i++){
420 if(vtxIDs[i] < 0 && verbose) std::cout<<comm->getRank()<<": found a negative GID\n";
421 globalToLocal[vtxIDs[i]] = i;
422 ownedPlusGhosts.push_back(vtxIDs[i]);
423 }
424 gno_t nghosts = 0;
425 for (int i = 0; i < adjs.size(); i++){
426 if(globalToLocal.count(adjs[i]) == 0){
427 //new unique ghost found
428 if(adjs[i] < 0 && verbose) std::cout<<comm->getRank()<<": found a negative adjacency\n";
429 ownedPlusGhosts.push_back(adjs[i]);
430 globalToLocal[adjs[i]] = vtxIDs.size() + nghosts;
431 nghosts++;
432
433 }
434 }
435 if(verbose) std::cout<<comm->getRank()<<": vec.max_size() = "<<finalAdjs_vec.max_size()<<", adjs.size() = "<<adjs.size()<<"\n";
436 finalAdjs_vec.resize(adjs.size());
437 for(size_t i = 0; i < finalAdjs_vec.size();i++){
438 finalAdjs_vec[i] = globalToLocal[adjs[i]];
439 }
440 for(int i = 0; i < offsets.size(); i++) finalOffset_vec.push_back(offsets[i]);
441 finalGIDs = ownedPlusGhosts;
442
443
444 if(verbose) std::cout<<comm->getRank()<<": creating Tpetra Maps\n";
445 Tpetra::global_size_t dummy = Teuchos::OrdinalTraits
446 <Tpetra::global_size_t>::invalid();
447 mapOwned = rcp(new map_t(dummy, vtxIDs, 0, comm));
448
449 dummy = Teuchos::OrdinalTraits <Tpetra::global_size_t>::invalid();
450 mapWithCopies = rcp(new map_t(dummy,
451 Teuchos::arrayViewFromVector(ownedPlusGhosts),
452 0, comm));
453
454 //create the FEMultiVector for the distributed communication.
455 //We also use the views from this datastructure as arguments to
456 //KokkosKernels coloring functions.
457 if(verbose)std::cout<<comm->getRank()<<": creating FEMultiVector\n";
458 typedef Tpetra::Import<lno_t, gno_t> import_t;
459 Teuchos::RCP<import_t> importer = rcp(new import_t(mapOwned,
460 mapWithCopies));
461 Teuchos::RCP<femv_t> femv = rcp(new femv_t(mapOwned,
462 importer, 1, true));
463 //Get color array to fill
464 ArrayRCP<int> colors = solution->getColorsRCP();
465 for(size_t i=0; i<nVtx; i++){
466 colors[i] = 0;
467 }
468
469 //Create random numbers seeded on global IDs so that we don't
470 //need to communicate for consistency. These numbers determine
471 //which vertex gets recolored in the event of a conflict.
472 //taken directly from the Zoltan coloring implementation
473 std::vector<int> rand(finalGIDs.size());
474 for(size_t i = 0; i < finalGIDs.size(); i++){
475 std::srand(finalGIDs[i]);
476 rand[i] = std::rand();
477 }
478
479 //find out who owns the ghosts on this process
480 std::vector<int> ghostOwners(finalGIDs.size() - nVtx);
481 std::vector<gno_t> ghosts(finalGIDs.size() - nVtx);
482 for(size_t i = nVtx; i < finalGIDs.size(); i++) ghosts[i-nVtx] = finalGIDs[i];
483 ArrayView<int> owners = Teuchos::arrayViewFromVector(ghostOwners);
484 ArrayView<const gno_t> ghostGIDs = Teuchos::arrayViewFromVector(ghosts);
485 mapOwned->getRemoteIndexList(ghostGIDs, owners);
486
487 //create const views of the CSR representation of the local graph
488 ArrayView<const offset_t> finalOffsets = Teuchos::arrayViewFromVector(finalOffset_vec);
489 ArrayView<const lno_t> finalAdjs = Teuchos::arrayViewFromVector(finalAdjs_vec);
490 ArrayView<const int> rand_view = Teuchos::arrayViewFromVector(rand);
491 ArrayView<const gno_t> gid_view = Teuchos::arrayViewFromVector(finalGIDs);
492
493 //find out which remote processes have a ghost copy of a local owned vertex.
494 Teuchos::ArrayView<const lno_t> exportLIDs = importer->getExportLIDs();
495 Teuchos::ArrayView<const int> exportPIDs = importer->getExportPIDs();
496
497 //create a quick lookup from LID -> remote processes with a ghost copy
498 std::unordered_map<lno_t, std::vector<int>> procs_to_send;
499 for(int i = 0; i < exportLIDs.size(); i++){
500 procs_to_send[exportLIDs[i]].push_back(exportPIDs[i]);
501 }
502
503 // call coloring function
504 hybridGMB(nVtx, finalAdjs, finalOffsets,femv,gid_view,rand_view,owners,mapWithCopies, procs_to_send);
505
506 //copy colors to the output array.
507 auto femvdata = femv->getData(0);
508 for(int i = 0; i < colors.size(); i++){
509 colors[reorderToLocal[i]] = femvdata[i];
510 }
511
512 }
513
514 //This function contains all coloring logic.
515 //
516 //OUTPUT ARGS:
517 //
518 // femv: FEMultiVector with a dual-view of the vertex colors.
519 // The colors will be a valid distance-1 coloring after this
520 // function returns.
521 //
522 //INPUT ARGS:
523 //
524 // nVtx: number of locally owned vertices
525 //
526 // adjs: CSR adjacencies for the local graph
527 //
528 // offsets: CSR offsets into adjs for the local graph
529 //
530 // gids: global IDs for every vertex on this process.
531 // indexed by local ID
532 //
533 // rand: random number generated for every vertex on
534 // this process. Indexed by local ID
535 //
536 // owners: owners of each ghost vertex.
537 // owners[i] = the owning process for vertex with GID gids[i+nVtx]
538 //
539 // mapOwnedPlusGhosts: Tpetra map that translates between LIDs and GIDs
540 //
541 // procs_to_send: maps from LIDs to Process IDs.
542 // procs_to_send[LID] gives a list of processes that have
543 // a ghost copy of LID.
544 //
545 void hybridGMB(const size_t nVtx,
546 const Teuchos::ArrayView<const lno_t>& adjs,
547 const Teuchos::ArrayView<const offset_t>& offsets,
548 const Teuchos::RCP<femv_t>& femv,
549 const Teuchos::ArrayView<const gno_t>& gids,
550 const Teuchos::ArrayView<const int>& rand,
551 const Teuchos::ArrayView<const int>& owners,
552 RCP<const map_t> mapOwnedPlusGhosts,
553 const std::unordered_map<lno_t, std::vector<int>>& procs_to_send){
554 if(verbose) std::cout<<comm->getRank()<<": inside coloring algorithm\n";
555
556 //initialize timers
557 double total_time = 0.0;
558 double interior_time = 0.0;
559 double comm_time = 0.0;
560 double comp_time = 0.0;
561 double recoloring_time = 0.0;
562 double conflict_detection = 0.0;
563
564 const int numStatisticRecordingRounds = 100;
565
566 //Put together local and remote degrees
567 //1. Communicate remote GIDs to owning processes
568 std::vector<int> deg_send_cnts(comm->getSize(),0);
569 std::vector<gno_t> deg_sdispls(comm->getSize()+1,0);
570 for(int i = 0; i < owners.size(); i++){
571 deg_send_cnts[owners[i]]++;
572 }
573 deg_sdispls[0] = 0;
574 gno_t deg_sendsize = 0;
575 std::vector<int> deg_sentcount(comm->getSize(),0);
576 for(int i = 1; i < comm->getSize()+1; i++){
577 deg_sdispls[i] = deg_sdispls[i-1] + deg_send_cnts[i-1];
578 deg_sendsize += deg_send_cnts[i-1];
579 }
580 std::vector<gno_t> deg_sendbuf(deg_sendsize,0);
581 for(int i = 0; i < owners.size(); i++){
582 size_t idx = deg_sdispls[owners[i]] + deg_sentcount[owners[i]];
583 deg_sentcount[owners[i]]++;
584 deg_sendbuf[idx] = mapOwnedPlusGhosts->getGlobalElement(i+nVtx);
585 }
586 Teuchos::ArrayView<int> deg_send_cnts_view = Teuchos::arrayViewFromVector(deg_send_cnts);
587 Teuchos::ArrayView<gno_t> deg_sendbuf_view = Teuchos::arrayViewFromVector(deg_sendbuf);
588 Teuchos::ArrayRCP<gno_t> deg_recvbuf;
589 std::vector<int> deg_recvcnts(comm->getSize(),0);
590 Teuchos::ArrayView<int> deg_recvcnts_view = Teuchos::arrayViewFromVector(deg_recvcnts);
591 AlltoAllv<gno_t>(*comm, *env, deg_sendbuf_view, deg_send_cnts_view, deg_recvbuf, deg_recvcnts_view);
592
593 //2. replace GID with local degree
594 for(int i = 0; i < deg_recvbuf.size(); i++){
595 lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_recvbuf[i]);
596 deg_recvbuf[i] = offsets[lid+1] - offsets[lid];
597 }
598 //3. send modified buffer back
599 ArrayRCP<gno_t> ghost_degrees;
600 AlltoAllv<gno_t>(*comm, *env, deg_recvbuf(), deg_recvcnts_view, ghost_degrees, deg_send_cnts_view);
601 //determine max degree locally and globally
602 Kokkos::View<gno_t*, device_type> ghost_degrees_dev("ghost degree view",ghost_degrees.size());
603 typename Kokkos::View<gno_t*, device_type>::host_mirror_type ghost_degrees_host = Kokkos::create_mirror(ghost_degrees_dev);
604 for(int i = 0; i < ghost_degrees.size(); i++){
605 lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_sendbuf[i]);
606 ghost_degrees_host(lid-nVtx) = ghost_degrees[i];
607 }
608 Kokkos::deep_copy(ghost_degrees_dev, ghost_degrees_host);
609
610 offset_t local_max_degree = 0;
611 offset_t global_max_degree = 0;
612 for(size_t i = 0; i < nVtx; i++){
613 offset_t curr_degree = offsets[i+1] - offsets[i];
614 if(curr_degree > local_max_degree){
615 local_max_degree = curr_degree;
616 }
617 }
618 Teuchos::reduceAll<int, offset_t>(*comm,Teuchos::REDUCE_MAX,1, &local_max_degree, &global_max_degree);
619 if(comm->getRank() == 0 && verbose) std::cout<<"Input has max degree "<<global_max_degree<<"\n";
620 if(verbose)std::cout<<comm->getRank()<<": creating Kokkos Views\n";
621
622 Kokkos::View<offset_t*, device_type> dist_degrees("Owned+Ghost degree view",rand.size());
623 typename Kokkos::View<offset_t*, device_type>::host_mirror_type dist_degrees_host = Kokkos::create_mirror(dist_degrees);
624 //set degree counts for ghosts
625 for(int i = 0; i < adjs.size(); i++){
626 if((size_t)adjs[i] < nVtx) continue;
627 dist_degrees_host(adjs[i])++;
628 }
629 //set degree counts for owned verts
630 for(int i = 0; i < offsets.size()-1; i++){
631 dist_degrees_host(i) = offsets[i+1] - offsets[i];
632 }
633
634 Kokkos::View<offset_t*, device_type> dist_offsets("Owned+Ghost Offset view", rand.size()+1);
635 typename Kokkos::View<offset_t*, device_type>::host_mirror_type dist_offsets_host = Kokkos::create_mirror(dist_offsets);
636
637 //set offsets and total # of adjacencies
638 dist_offsets_host(0) = 0;
639 uint64_t total_adjs = 0;
640 for(Teuchos_Ordinal i = 1; i < rand.size()+1; i++){
641 dist_offsets_host(i) = dist_degrees_host(i-1) + dist_offsets_host(i-1);
642 total_adjs+= dist_degrees_host(i-1);
643 }
644
645 Kokkos::View<lno_t*, device_type> dist_adjs("Owned+Ghost adjacency view", total_adjs);
646 typename Kokkos::View<lno_t*, device_type>::host_mirror_type dist_adjs_host = Kokkos::create_mirror(dist_adjs);
647 //now, use the degree view as a counter
648 for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
649 dist_degrees_host(i) = 0;
650 }
651 for(int i = 0; i < adjs.size(); i++) dist_adjs_host(i) = adjs[i];
652 if(comm->getSize() > 1){
653 for(size_t i = 0; i < nVtx; i++){
654 for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
655 //if the adjacency is a ghost
656 if( (size_t)adjs[j] >= nVtx){
657 //add the symmetric edge to its adjacency list (already accounted for by offsets)
658 dist_adjs_host(dist_offsets_host(adjs[j]) + dist_degrees_host(adjs[j])) = i;
659 dist_degrees_host(adjs[j])++;
660 }
661 }
662 }
663 }
664
665 if(verbose) std::cout<<comm->getRank()<<": copying host mirrors to device views\n";
666 //copy the host data to device memory
667 Kokkos::deep_copy(dist_degrees, dist_degrees_host); //may be unnecessary
668 Kokkos::deep_copy(dist_offsets, dist_offsets_host);
669 Kokkos::deep_copy(dist_adjs, dist_adjs_host);
670 if(verbose) std::cout<<comm->getRank()<<": done copying to device\n";
671
672 //counter in UVM memory for how many vertices need recoloring.
673 Kokkos::View<gno_t*, device_type> recoloringSize("Recoloring Queue Size",1);
674 typename Kokkos::View<gno_t*, device_type>::host_mirror_type recoloringSize_host = Kokkos::create_mirror(recoloringSize);
675 recoloringSize_host(0) = 0;
676 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
677
678 //device copy of the random tie-breakers
679 Kokkos::View<int*,device_type> rand_dev("randVec",rand.size());
680 typename Kokkos::View<int*, device_type>::host_mirror_type rand_host = Kokkos::create_mirror(rand_dev);
681 for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
682 rand_host(i) = rand[i];
683 }
684
685 //device copy of global IDs
686 Kokkos::View<gno_t*, device_type> gid_dev("GIDs",gids.size());
687 typename Kokkos::View<gno_t*,device_type>::host_mirror_type gid_host = Kokkos::create_mirror(gid_dev);
688 for(Teuchos_Ordinal i = 0; i < gids.size(); i++){
689 gid_host(i) = gids[i];
690 }
691
692 //copy host views to device memory
693 Kokkos::deep_copy(rand_dev,rand_host);
694 Kokkos::deep_copy(gid_dev, gid_host);
695
696 if(verbose)std::cout<<comm->getRank()<<": done creating recoloring datastructures\n";
697 //count boundary size to allocate list of vertices to recolor.
698 offset_t boundary_size = 0;
699 for(size_t i = 0; i < nVtx; i++){
700 for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
701 if((size_t)adjs[j] >= nVtx) {
702 boundary_size++;
703 break;
704 }
705 }
706 }
707 if(verbose)std::cout<<comm->getRank()<<": creating send views\n";
708
709 //list of vertices to send to remote processes
710 Kokkos::View<lno_t*, device_type> verts_to_send_view("verts to send",boundary_size);
711 Kokkos::parallel_for("init verts_to_send_view",
712 Kokkos::RangePolicy<execution_space, int>(0,boundary_size),
713 KOKKOS_LAMBDA(const int& i){
714 verts_to_send_view(i) = -1;
715 });
716
717 //size information for the list of vertices to send. Also includes an atomic copy
718 Kokkos::View<size_t*, device_type> verts_to_send_size("verts to send size",1);
719 Kokkos::View<size_t*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic> > verts_to_send_size_atomic = verts_to_send_size;
720 typename Kokkos::View<lno_t*, device_type>::host_mirror_type verts_to_send_host = create_mirror(verts_to_send_view);
721 typename Kokkos::View<size_t*,device_type>::host_mirror_type verts_to_send_size_host = create_mirror(verts_to_send_size);
722 //initialize the device view with a value of zero
723 verts_to_send_size_host(0) = 0;
724 deep_copy(verts_to_send_size, verts_to_send_size_host);
725
726 if(verbose)std::cout<<comm->getRank()<<": Done creating send views, initializing...\n";
727 if(verbose)std::cout<<comm->getRank()<<": boundary_size = "<<boundary_size<<" verts_to_send_size_atomic(0) = "<<verts_to_send_size_atomic(0)<<"\n";
728 //initially the verts to send include all boundary vertices.
729 Kokkos::parallel_for("Initialize verts_to_send",
730 Kokkos::RangePolicy<execution_space, int>(0,nVtx),
731 KOKKOS_LAMBDA(const int&i){
732 for(offset_t j = dist_offsets(i); j < dist_offsets(i+1); j++){
733 if((size_t)dist_adjs(j) >= nVtx){
734 verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
735 break;
736 }
737 }
738 });
739 Kokkos::fence();
740
741
742 //used to replace the incorrectly recolored ghosts
743 Kokkos::View<int*, device_type> ghost_colors("ghost color backups", rand.size()-nVtx);
744 if(verbose)std::cout<<comm->getRank()<<": Done initializing\n";
745 gno_t sentPerRound[numStatisticRecordingRounds];
746 gno_t recvPerRound[numStatisticRecordingRounds];
747
748 if(verbose) std::cout<<comm->getRank()<<": Coloring interior\n";
749 //initialize interior and total timers, barrier to prevent any imbalance from setup.
750 //Only use a barrier if timing is happening.
751 if(timing) comm->barrier();
752 interior_time = timer();
753 total_time = timer();
754 //call the KokkosKernels coloring function with the Tpetra default spaces.
755 bool use_vbbit = (global_max_degree < 6000);
756 this->colorInterior<execution_space,memory_space>
757 (nVtx, dist_adjs, dist_offsets, femv,dist_adjs,0,use_vbbit);
758 if(timing){
759 interior_time = timer() - interior_time;
760 comp_time = interior_time;
761 }
762 if(verbose) std::cout<<comm->getRank()<<": Going to recolor\n";
763 bool recolor_degrees = this->pl->template get<bool>("recolor_degrees", true);
764
765 //if there is more than a single process, check distributed conflicts and recolor
766 if(comm->getSize() > 1){
767
768 if(verbose)std::cout<<comm->getRank()<<": going to communicate\n";
769
770 //communicate
771 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
772 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
773 gno_t recv, sent;
774 comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
775 nVtx,
776 verts_to_send_host,
777 verts_to_send_size_host,
778 femv,
779 procs_to_send,
780 recv,
781 sent);
782 sentPerRound[0] = sent;
783 recvPerRound[0] = recv;
784 if(verbose) std::cout<<comm->getRank()<<": done communicating\n";
785 verts_to_send_size_host(0) = 0;
786 deep_copy(verts_to_send_size, verts_to_send_size_host);
787 //set the old ghost colors
788 //get the colors from the femv
789 Kokkos::View<int**, Kokkos::LayoutLeft, device_type> femvColors =
790 femv->template getLocalView<device_type>(Tpetra::Access::ReadWrite); // Partial write
791 Kokkos::View<int*, device_type> femv_colors = subview(femvColors, Kokkos::ALL, 0);
792 Kokkos::parallel_for("get colors from femv",
793 Kokkos::RangePolicy<execution_space, int>(0,rand.size()-nVtx),
794 KOKKOS_LAMBDA(const int& i){
795 ghost_colors(i) = femv_colors(i+nVtx);
796 });
797 Kokkos::fence();
798 //detect conflicts on the device, uncolor conflicts consistently.
799 double temp = timer();
800 detectConflicts<execution_space, memory_space>(nVtx,
801 rand.size()-nVtx,
802 dist_offsets,
803 dist_adjs,
804 femv_colors,
805 verts_to_send_view,
806 verts_to_send_size_atomic,
807 recoloringSize,
808 rand_dev,
809 gid_dev,
810 ghost_degrees_dev,
811 recolor_degrees);
812 deep_copy(recoloringSize_host, recoloringSize);
813 conflict_detection += timer() - temp;
814 comp_time += conflict_detection;
815 }
816 //end the initial recoloring
817 if(verbose)std::cout<<comm->getRank()<<": done initial recoloring, begin recoloring loop\n";
818 double totalPerRound[numStatisticRecordingRounds];
819 double commPerRound[numStatisticRecordingRounds];
820 double compPerRound[numStatisticRecordingRounds];
821 double recoloringPerRound[numStatisticRecordingRounds];
822 double conflictDetectionPerRound[numStatisticRecordingRounds];
823 double serialRecoloringPerRound[numStatisticRecordingRounds];
824 int vertsPerRound[numStatisticRecordingRounds];
825 bool done = false; //We're only done when all processors are done
826 if(comm->getSize() == 1) done = true;
827 totalPerRound[0] = interior_time + comm_time + conflict_detection;
828 recoloringPerRound[0] = 0;
829 commPerRound[0] = comm_time;
830 compPerRound[0] = interior_time + conflict_detection;
831 conflictDetectionPerRound[0] = conflict_detection;
832 recoloringPerRound[0] = 0;
833 vertsPerRound[0] = 0;
834 int distributedRounds = 1; //this is the same across all processors
835 int serial_threshold = this->pl->template get<int>("serial_threshold",0);
836
837 Kokkos::View<lno_t*, device_type> verts_to_recolor("verts_to_recolor", boundary_size);
838 typename Kokkos::View<int*, device_type>::host_mirror_type ghost_colors_host;
839 //while the queue is not empty
840 while(recoloringSize_host(0) > 0 || !done){
841 if(recoloringSize_host(0) < serial_threshold) break;
842 //get a subview of the colors:
843 auto femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
844 auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
845 //color everything in the recoloring queue, put everything on conflict queue
846 if(distributedRounds < numStatisticRecordingRounds) {
847 vertsPerRound[distributedRounds] = recoloringSize_host(0);
848 }
849
850 //copying the send view to the recolor view is necessary because
851 //KokkosKernels can change the view passed in, and we need the send view
852 //intact for communication.
853 Kokkos::deep_copy(verts_to_recolor, verts_to_send_view);
854
855 double recolor_temp = timer();
856 //use KokkosKernels to recolor the conflicting vertices.
857 deep_copy(verts_to_send_size_host, verts_to_send_size);
858 if(verts_to_send_size_host(0) > 0){
859 this->colorInterior<execution_space,
860 memory_space>(femv_colors.size(),
861 dist_adjs,dist_offsets,
862 femv,
863 verts_to_recolor,
864 verts_to_send_size_host(0),
865 use_vbbit);
866 }
867
868 if(distributedRounds < numStatisticRecordingRounds){
869 recoloringPerRound[distributedRounds] = timer() - recolor_temp;
870 recoloring_time += recoloringPerRound[distributedRounds];
871 comp_time += recoloringPerRound[distributedRounds];
872 compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
873 totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
874 } else if(timing) {
875 double recolor_round_time = timer() - recolor_temp;
876 recoloring_time += recolor_round_time;
877 comp_time += recolor_round_time;
878 }
879
880 //reset the recoloringSize device host and device views
881 //to zero
882 recoloringSize_host(0) = 0;
883 Kokkos::deep_copy(recoloringSize,recoloringSize_host);
884
885 Kokkos::parallel_for("set femv colors",
886 Kokkos::RangePolicy<execution_space, int>(0,rand.size()-nVtx),
887 KOKKOS_LAMBDA(const int& i){
888 femv_colors(i+nVtx) = ghost_colors(i);
889 });
890 Kokkos::fence();
891 //communicate
892 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
893 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
894 gno_t sent,recv;
895 // Reset device views
896 femvColors = decltype(femvColors)();
897 femv_colors = decltype(femv_colors)();
898
899 double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
900 nVtx,
901 verts_to_send_host,
902 verts_to_send_size_host,
903 femv,
904 procs_to_send,
905 recv,
906 sent);
907 comm_time += curr_comm_time;
908 if(distributedRounds < numStatisticRecordingRounds){
909 commPerRound[distributedRounds] = curr_comm_time;
910 sentPerRound[distributedRounds] = sent;
911 recvPerRound[distributedRounds] = recv;
912 totalPerRound[distributedRounds] += commPerRound[distributedRounds];
913 }
914 //detect conflicts in parallel. For a detected conflict,
915 //reset the vertex-to-be-recolored's color to 0, in order to
916 //allow KokkosKernels to recolor correctly.
917
918 femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
919 femv_colors = subview(femvColors, Kokkos::ALL, 0);
920 Kokkos::parallel_for("get femv colors 2",
921 Kokkos::RangePolicy<execution_space, int>(0,rand.size()-nVtx),
922 KOKKOS_LAMBDA(const int& i){
923 ghost_colors(i) = femv_colors(i+nVtx);
924 });
925 Kokkos::fence();
926 verts_to_send_size_host(0) = 0;
927 deep_copy(verts_to_send_size, verts_to_send_size_host);
928 double detection_temp = timer();
929 detectConflicts<execution_space, memory_space>(nVtx,
930 rand.size()-nVtx,
931 dist_offsets,
932 dist_adjs,
933 femv_colors,
934 verts_to_send_view,
935 verts_to_send_size_atomic,
936 recoloringSize,
937 rand_dev,
938 gid_dev,
939 ghost_degrees_dev,
940 recolor_degrees);
941
942 Kokkos::deep_copy(recoloringSize_host, recoloringSize);
943
944 if(distributedRounds < numStatisticRecordingRounds){
945 conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
946 conflict_detection += conflictDetectionPerRound[distributedRounds];
947 compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
948 totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
949 comp_time += conflictDetectionPerRound[distributedRounds];
950 } else if(timing){
951 double conflict_detection_round_time = timer()- detection_temp;
952 conflict_detection += conflict_detection_round_time;
953 comp_time += conflict_detection_round_time;
954 }
955 //do a reduction to determine if we're done
956 int globalDone = 0;
957 int localDone = recoloringSize_host(0);
958 Teuchos::reduceAll<int, int>(*comm,Teuchos::REDUCE_SUM,1, &localDone, &globalDone);
959 //We're only allowed to stop once everyone has no work to do.
960 //collectives will hang if one process exits.
961 distributedRounds++;
962 done = !globalDone;
963 }
964
965 if(recoloringSize_host(0) > 0 || !done){
966 ghost_colors_host = Kokkos::create_mirror_view(ghost_colors);
967 deep_copy(ghost_colors_host, ghost_colors);
968 deep_copy(verts_to_send_host, verts_to_send_view);
969 deep_copy(verts_to_send_size_host, verts_to_send_size);
970 }
971
972
973 //finish the local coloring in serial on the host
974 while(recoloringSize_host(0) > 0 || !done){
975 //Use non-templated call to get the Host view
976 auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
977 auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
978 /*Kokkos::View<int*, Kokkos::Device<Kokkos::Serial, Kokkos::HostSpace>>femv_colors_host = create_mirror(femv_colors);
979 Kokkos::deep_copy(femv_colors_host, femv_colors);*/
980 if(distributedRounds < 100){
981 vertsPerRound[distributedRounds] = recoloringSize_host(0);
982 }
983
984 double recolor_temp = timer();
985 //use KokkosKernels to recolor the conflicting vertices
986 if(verts_to_send_size_host(0) > 0){
987 this->colorInterior<host_exec,
988 host_mem>
989 (femv_colors.size(), dist_adjs_host, dist_offsets_host, femv, verts_to_send_host, verts_to_send_size_host(0), true);
990 }
991
992 if(distributedRounds < numStatisticRecordingRounds){
993 recoloringPerRound[distributedRounds] = timer() - recolor_temp;
994 recoloring_time += recoloringPerRound[distributedRounds];
995 comp_time += recoloringPerRound[distributedRounds];
996 compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
997 totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
998 } else if(timing){
999 double recolor_serial_round_time = timer() - recolor_temp;
1000 recoloring_time += recolor_serial_round_time;
1001 comp_time += recolor_serial_round_time;
1002 }
1003
1004 recoloringSize_host(0) = 0;
1005
1006 for(size_t i = 0; i < rand.size() -nVtx; i++){
1007 femv_colors(i+nVtx) = ghost_colors_host(i);
1008 }
1009
1010 gno_t sent,recv;
1011 double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
1012 nVtx,
1013 verts_to_send_host,
1014 verts_to_send_size_host,
1015 femv,
1016 procs_to_send,
1017 recv,
1018 sent);
1019 comm_time += curr_comm_time;
1020
1021 if(distributedRounds < numStatisticRecordingRounds){
1022 commPerRound[distributedRounds] = curr_comm_time;
1023 sentPerRound[distributedRounds] = sent;
1024 recvPerRound[distributedRounds] = recv;
1025 totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1026 }
1027 for(size_t i = 0; i < rand.size()-nVtx; i++){
1028 ghost_colors_host(i) = femv_colors(i+nVtx);
1029 }
1030
1031 verts_to_send_size_host(0) = 0;
1032 double detection_temp = timer();
1033 detectConflicts<host_exec, host_mem>(nVtx,
1034 rand.size()-nVtx,
1035 dist_offsets_host,
1036 dist_adjs_host,
1037 femv_colors,
1038 verts_to_send_host,
1039 verts_to_send_size_host,
1040 recoloringSize_host,
1041 rand_host,
1042 gid_host,
1043 ghost_degrees_host,
1044 recolor_degrees);
1045 if(distributedRounds < numStatisticRecordingRounds){
1046 conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1047 conflict_detection += conflictDetectionPerRound[distributedRounds];
1048 compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1049 totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1050 comp_time += conflictDetectionPerRound[distributedRounds];
1051 } else if(timing){
1052 double conflict_detection_serial_round_time = timer() - detection_temp;
1053 conflict_detection += conflict_detection_serial_round_time;
1054 comp_time += conflict_detection_serial_round_time;
1055 }
1056 //do a reduction to determine if we're done
1057 int globalDone = 0;
1058 int localDone = recoloringSize_host(0);
1059 Teuchos::reduceAll<int, int>(*comm, Teuchos::REDUCE_SUM,1, &localDone, &globalDone);
1060 distributedRounds++;
1061 done = !globalDone;
1062 }
1063 total_time = timer() - total_time;
1064
1065 //only take time to compute statistics if the user wants them
1066 if(verbose){
1067 std::cout<<comm->getRank()<<": done recoloring loop, computing statistics\n";
1068 int localBoundaryVertices = 0;
1069 for(size_t i = 0; i < nVtx; i++){
1070 for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
1071 if((size_t)adjs[j] >= nVtx){
1072 localBoundaryVertices++;
1073 break;
1074 }
1075 }
1076 }
1077 //print how many rounds of speculating/correcting happened (this should be the same for all ranks):
1078 //if(comm->getRank()==0) printf("did %d rounds of distributed coloring\n", distributedRounds);
1079 int totalBoundarySize = 0;
1080 int totalVertsPerRound[numStatisticRecordingRounds];
1081 double finalTotalPerRound[numStatisticRecordingRounds];
1082 double maxRecoloringPerRound[numStatisticRecordingRounds];
1083 double finalSerialRecoloringPerRound[numStatisticRecordingRounds];
1084 double minRecoloringPerRound[numStatisticRecordingRounds];
1085 double finalCommPerRound[numStatisticRecordingRounds];
1086 double finalCompPerRound[numStatisticRecordingRounds];
1087 double finalConflictDetectionPerRound[numStatisticRecordingRounds];
1088 gno_t finalRecvPerRound[numStatisticRecordingRounds];
1089 gno_t finalSentPerRound[numStatisticRecordingRounds];
1090 for(int i = 0; i < numStatisticRecordingRounds; i++) {
1091 totalVertsPerRound[i] = 0;
1092 finalTotalPerRound[i] = 0.0;
1093 maxRecoloringPerRound[i] = 0.0;
1094 minRecoloringPerRound[i] = 0.0;
1095 finalCommPerRound[i] = 0.0;
1096 finalCompPerRound[i] = 0.0;
1097 finalConflictDetectionPerRound[i] = 0.0;
1098 finalSentPerRound[i] = 0;
1099 finalRecvPerRound[i] = 0;
1100 }
1101 Teuchos::reduceAll<int,int>(*comm, Teuchos::REDUCE_SUM,1, &localBoundaryVertices,&totalBoundarySize);
1102 Teuchos::reduceAll<int,int>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,vertsPerRound,totalVertsPerRound);
1103 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,totalPerRound,finalTotalPerRound);
1104 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,recoloringPerRound,maxRecoloringPerRound);
1105 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,numStatisticRecordingRounds,recoloringPerRound,minRecoloringPerRound);
1106 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,serialRecoloringPerRound,finalSerialRecoloringPerRound);
1107 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,commPerRound,finalCommPerRound);
1108 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,compPerRound,finalCompPerRound);
1109 Teuchos::reduceAll<int,double>(*comm,
1110 Teuchos::REDUCE_MAX,numStatisticRecordingRounds,conflictDetectionPerRound,finalConflictDetectionPerRound);
1111 Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound, finalRecvPerRound);
1112 Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound, finalSentPerRound);
1113
1114 std::cout << "Rank " << comm->getRank()
1115 << ": boundary size: " << localBoundaryVertices << std::endl;
1116 if(comm->getRank()==0)
1117 std::cout << "Total boundary size: " << totalBoundarySize << std::endl;
1118 for(int i = 0; i < std::min(distributedRounds,numStatisticRecordingRounds); i++){
1119 std::cout << "Rank " << comm->getRank()
1120 << ": recolor " << vertsPerRound[i]
1121 << " vertices in round " << i << std::endl;
1122 if(comm->getRank()==0) {
1123 std::cout << "recolored " << totalVertsPerRound[i]
1124 << " vertices in round " << i << std::endl;
1125 std::cout << "total time in round " << i
1126 << ": " << finalTotalPerRound[i] << std::endl;;
1127 std::cout << "recoloring time in round " << i
1128 << ": " << maxRecoloringPerRound[i] << std::endl;
1129 std::cout << "serial recoloring time in round " << i
1130 << ": " << finalSerialRecoloringPerRound[i] << std::endl;
1131 std::cout << "min recoloring time in round " << i
1132 << ": " << minRecoloringPerRound[i] << std::endl;
1133 std::cout << "conflict detection time in round " << i
1134 << ": " << finalConflictDetectionPerRound[i] << std::endl;
1135 std::cout << "comm time in round " << i
1136 << ": " << finalCommPerRound[i] << std::endl;
1137 std::cout << "total sent in round " << i
1138 << ": " << finalSentPerRound[i] << std::endl;
1139 std::cout << "total recv in round " << i
1140 << ": " << finalRecvPerRound[i] << std::endl;
1141 std::cout << "comp time in round " << i
1142 << ": " << finalCompPerRound[i] << std::endl;
1143 }
1144 }
1145 } else if(timing){
1146 double global_total_time = 0.0;
1147 double global_recoloring_time=0.0;
1148 double global_min_recoloring_time=0.0;
1149 double global_conflict_detection=0.0;
1150 double global_comm_time=0.0;
1151 double global_comp_time=0.0;
1152 double global_interior_time = 0.0;
1153 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&total_time,&global_total_time);
1154 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&recoloring_time,&global_recoloring_time);
1155 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,1,&recoloring_time,&global_min_recoloring_time);
1156 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&conflict_detection,&global_conflict_detection);
1157 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comm_time,&global_comm_time);
1158 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comp_time,&global_comp_time);
1159 Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&interior_time,&global_interior_time);
1160 comm->barrier();
1161 fflush(stdout);
1162 if(comm->getRank()==0){
1163 std::cout << "Total Time: " << global_total_time << std::endl;
1164 std::cout << "Interior Time: " << global_interior_time << std::endl;
1165 std::cout << "Recoloring Time: " << global_recoloring_time << std::endl;
1166 std::cout << "Min Recoloring Time: " << global_min_recoloring_time << std::endl;
1167 std::cout << "Conflict Detection Time: " << global_conflict_detection << std::endl;
1168 std::cout << "Comm Time: " << global_comm_time << std::endl;
1169 std::cout << "Comp Time: " << global_comp_time << std::endl;
1170 }
1171 }
1172 if(verbose) std::cout<<comm->getRank()<<": exiting coloring\n";
1173 }
1174};
1175
1176template <typename Adapter>
1177void AlgDistance1<Adapter>::buildModel(modelFlag_t &flags){
1178 flags.set(REMOVE_SELF_EDGES);
1179
1180 this->env->debug(DETAILED_STATUS, " building graph model");
1181 if(verbose) std::cout<<comm->getRank()<<": starting to construct graph model\n";
1182 this->model = rcp(new GraphModel<base_adapter_t>(this->adapter, this->env,
1183 this->comm, flags));
1184 if(verbose) std::cout<<comm->getRank()<<": done constructing graph model\n";
1185 this->env->debug(DETAILED_STATUS, " graph model built");
1186}
1187
1188
1189}//end namespace Zoltan2
1190#endif
AlltoAll communication methods.
Defines the ColoringSolution class.
Defines the GraphModel interface.
Traits class to handle conversions between gno_t/lno_t and TPL data types (e.g., ParMETIS's idx_t,...
A gathering of useful namespace methods.
Tpetra::FEMultiVector< femv_scalar_t, lno_t, gno_t > femv_t
typename femv_t::device_type device_type
Tpetra::Map< lno_t, gno_t > map_t
typename femv_t::host_view_type::device_type::memory_space host_mem
typename Adapter::gno_t gno_t
typename Adapter::scalar_t scalar_t
void detectConflicts(const size_t n_local, const size_t n_ghosts, Kokkos::View< offset_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > dist_offsets, Kokkos::View< lno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > dist_adjs, Kokkos::View< int *, Kokkos::Device< ExecutionSpace, MemorySpace > > femv_colors, Kokkos::View< lno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > verts_to_send_view, Kokkos::View< size_t *, Kokkos::Device< ExecutionSpace, MemorySpace >, Kokkos::MemoryTraits< Kokkos::Atomic > > verts_to_send_size_atomic, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > recoloringSize, Kokkos::View< int *, Kokkos::Device< ExecutionSpace, MemorySpace > > rand, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > gid, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace > > ghost_degrees, bool recolor_degrees)
typename Adapter::base_adapter_t base_adapter_t
typename Adapter::lno_t lno_t
typename device_type::execution_space execution_space
typename femv_t::host_view_type::device_type::execution_space host_exec
void color(const RCP< ColoringSolution< Adapter > > &solution)
Coloring method.
typename device_type::memory_space memory_space
void hybridGMB(const size_t nVtx, const Teuchos::ArrayView< const lno_t > &adjs, const Teuchos::ArrayView< const offset_t > &offsets, const Teuchos::RCP< femv_t > &femv, const Teuchos::ArrayView< const gno_t > &gids, const Teuchos::ArrayView< const int > &rand, const Teuchos::ArrayView< const int > &owners, RCP< const map_t > mapOwnedPlusGhosts, const std::unordered_map< lno_t, std::vector< int > > &procs_to_send)
AlgDistance1(const RCP< const base_adapter_t > &adapter_, const RCP< Teuchos::ParameterList > &pl_, const RCP< Environment > &env_, const RCP< const Teuchos::Comm< int > > &comm_)
typename Adapter::offset_t offset_t
Algorithm defines the base class for all algorithms.
The class containing coloring solution.
map_t::local_ordinal_type lno_t
map_t::global_ordinal_type gno_t
Created by mbenlioglu on Aug 31, 2020.
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
@ DETAILED_STATUS
sub-steps, each method's entry and exit
@ REMOVE_SELF_EDGES
algorithm requires no self edges