Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
11#define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
12
13#include "TpetraCore_config.h"
14#include "Teuchos_Array.hpp"
15#include "Teuchos_ArrayView.hpp"
24#include "Kokkos_Core.hpp"
25#include <memory>
26#include <string>
27
46
47namespace Tpetra {
48
49//
50// Users must never rely on anything in the Details namespace.
51//
52namespace Details {
53
54namespace UnpackAndCombineCrsGraphImpl {
55
65template <class Packet, class GO, class Device, class BufferDevice>
66KOKKOS_FUNCTION int
67unpackRow(const Kokkos::View<GO*, Device, Kokkos::MemoryUnmanaged>& gids_out,
68 const Kokkos::View<int*, Device, Kokkos::MemoryUnmanaged>& pids_out,
69 const Kokkos::View<const Packet*, BufferDevice>& imports,
70 const size_t offset,
71 const size_t num_ent) {
72 using size_type = typename Kokkos::View<GO*, Device>::size_type;
73
74 if (num_ent == 0) {
75 // Empty rows always take zero bytes, to ensure sparsity.
76 return 0;
77 }
78
79 // Unpack GIDs
80 for (size_type k = 0; k < num_ent; k++)
81 gids_out(k) = imports(offset + k);
82
83 // Unpack PIDs
84 if (pids_out.size() > 0) {
85 for (size_type k = 0; k < num_ent; k++) {
86 pids_out(k) = static_cast<int>(imports(offset + num_ent + k));
87 }
88 }
89
90 return 0;
91}
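// Illustrative sketch (not part of the original source) of the packet layout
// this unpacker assumes for one row of `imports`. With num_ent = 3 and PID
// unpacking enabled, the row occupies 2*num_ent packets starting at `offset`:
//
//   imports(offset + 0 .. offset + 2)  -> global column indices g0, g1, g2
//   imports(offset + 3 .. offset + 5)  -> owning process ranks  p0, p1, p2
//
// so unpackRow copies g0..g2 into gids_out(0..2) and casts p0..p2 into
// pids_out(0..2). If pids_out has zero size, only the GIDs are read.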
92
103template <class LocalOrdinal,
104 class Packet,
105 class RowView,
106 class IndicesView,
107 class BufferDevice>
108class UnpackAndCombineCrsGraphFunctor {
109 using LO = LocalOrdinal;
110 using GO = typename IndicesView::value_type;
111 using packet_type = Packet;
112 using row_ptrs_type = RowView;
113 using indices_type = IndicesView;
114 using buffer_device_type = BufferDevice;
115
116 using device_type = typename IndicesView::device_type;
117 using execution_space = typename device_type::execution_space;
118
119 using num_packets_per_lid_type = Kokkos::View<const size_t*, buffer_device_type>;
120 using offsets_type = Kokkos::View<const size_t*, device_type>;
121 using input_buffer_type = Kokkos::View<const packet_type*, buffer_device_type>;
122 using import_lids_type = Kokkos::View<const LO*, buffer_device_type>;
123
124 using gids_scratch_type = Kokkos::View<GO*, device_type>;
125 using pids_scratch_type = Kokkos::View<int*, device_type>;
126
127 row_ptrs_type row_ptrs_beg;
128 row_ptrs_type row_ptrs_end;
129 indices_type indices;
130 input_buffer_type imports;
131 num_packets_per_lid_type num_packets_per_lid;
132 import_lids_type import_lids;
133 offsets_type offsets;
134 size_t max_num_ent;
135 bool unpack_pids;
136 Kokkos::Experimental::UniqueToken<execution_space,
137 Kokkos::Experimental::UniqueTokenScope::Global>
138 tokens;
139 gids_scratch_type gids_scratch;
140 pids_scratch_type pids_scratch;
141
142 public:
143 using value_type = Kokkos::pair<int, LO>;
144
144
145 UnpackAndCombineCrsGraphFunctor(
146     const row_ptrs_type& row_ptrs_beg_in,
147     const row_ptrs_type& row_ptrs_end_in,
148     const indices_type& indices_in,
149     const input_buffer_type& imports_in,
150 const num_packets_per_lid_type& num_packets_per_lid_in,
151 const import_lids_type& import_lids_in,
152 const offsets_type& offsets_in,
153 const size_t max_num_ent_in,
154 const bool unpack_pids_in)
155 : row_ptrs_beg(row_ptrs_beg_in)
156 , row_ptrs_end(row_ptrs_end_in)
157 , indices(indices_in)
158 , imports(imports_in)
159 , num_packets_per_lid(num_packets_per_lid_in)
160 , import_lids(import_lids_in)
161 , offsets(offsets_in)
162 , max_num_ent(max_num_ent_in)
163 , unpack_pids(unpack_pids_in)
164 , tokens(execution_space())
165 , gids_scratch("gids_scratch", tokens.size() * max_num_ent)
166 , pids_scratch("pids_scratch", tokens.size() * max_num_ent) {}
167
168 KOKKOS_INLINE_FUNCTION void init(value_type& dst) const {
169 using Tpetra::Details::OrdinalTraits;
170 dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
171 }
172
173 KOKKOS_INLINE_FUNCTION void
174 join(value_type& dst, const value_type& src) const {
175 // `dst` should reflect the first (least) bad index and
176 // all other associated error codes and data. Thus, we need only
177 // check if the `src` object shows an error and if its associated
178 // bad index is less than `dst`'s bad index.
179 using Tpetra::Details::OrdinalTraits;
180 if (src.second != OrdinalTraits<LO>::invalid()) {
181 // An error in the src; check if
182 // 1. `dst` shows errors
183 // 2. If `dst` does show errors, if src's bad index is less than
184 // *this' bad index
185 if (dst.second == OrdinalTraits<LO>::invalid() ||
186 src.second < dst.second) {
187 dst = src;
188 }
189 }
190 }
191
192 KOKKOS_INLINE_FUNCTION
193 void operator()(const LO i, value_type& dst) const {
194 using Kokkos::MemoryUnmanaged;
195 using Kokkos::subview;
196 using Kokkos::View;
197 using size_type = typename execution_space::size_type;
198 using slice = typename Kokkos::pair<size_type, size_type>;
199
200 using pids_out_type = View<int*, device_type, MemoryUnmanaged>;
201 using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
202
203 const size_t num_packets_this_lid = num_packets_per_lid(i);
204 const size_t num_ent = (unpack_pids) ? num_packets_this_lid / 2
205                                       : num_packets_this_lid;
206 if (unpack_pids && num_packets_this_lid % 2 != 0) {
207 // Attempting to unpack PIDs, but num_packets_this_lid is not even; this
208 // should never happen.
209 dst = Kokkos::make_pair(1, i);
210 return;
211 }
212
213 // Only unpack data if there is a nonzero number to unpack
214 if (num_ent == 0) {
215 return;
216 }
217
218 // there is actually something in the row
219 const size_t buf_size = imports.size();
220 const size_t offset = offsets(i);
221
222 if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
223 dst = Kokkos::make_pair(2, i); // out of bounds
224 return;
225 }
226
227 // Get subviews in to the scratch arrays. The token returned from acquire
228 // is an integer in [0, tokens.size()). It is used to grab a unique (to
229 // this thread) subview of the scratch arrays.
230 const size_type token = tokens.acquire();
231 const size_t a = static_cast<size_t>(token) * max_num_ent;
232 const size_t b = a + num_ent;
233 gids_out_type gids_out = subview(gids_scratch, slice(a, b));
234 pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
235
236 const int err = unpackRow(gids_out, pids_out, imports, offset, num_ent);
237
238 if (err != 0) {
239 dst = Kokkos::make_pair(3, i);
240 tokens.release(token);
241 return;
242 }
243
244 auto import_lid = import_lids(i);
245 for (size_t k = 0; k < num_ent; ++k) {
246 indices(row_ptrs_end(import_lid)) = gids_out(k);
247 // this is OK; don't need atomic, since LIDs to pack don't have repeats.
248 row_ptrs_end(import_lid) += 1;
249 }
250
251 tokens.release(token);
252 }
253};
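// Illustrative sketch (not part of the original source) of how the scratch
// views are partitioned with UniqueToken. With max_num_ent = 4 and, say,
// tokens.size() = 2 concurrently active threads, gids_scratch has length 8,
// and a thread holding token t unpacks its row into the slice
//
//   gids_scratch[t*4 .. t*4 + num_ent)
//
// so concurrently executing rows never share scratch entries. The reduction
// result is a pair (error_code, first_bad_local_row); join() keeps the entry
// whose bad row index is smallest.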
254
261template <class LocalOrdinal, class GlobalOrdinal, class Node,
262 class RowView, class IndicesView, class BufferDevice>
263void unpackAndCombine(const RowView& row_ptrs_beg,
264 const RowView& row_ptrs_end,
265 IndicesView& indices,
266 const Kokkos::View<const GlobalOrdinal*, BufferDevice,
267 Kokkos::MemoryUnmanaged>& imports,
268 const Kokkos::View<const size_t*, BufferDevice,
269 Kokkos::MemoryUnmanaged>& num_packets_per_lid,
270 const Kokkos::View<const LocalOrdinal*, BufferDevice,
271 Kokkos::MemoryUnmanaged>& import_lids,
272 const typename CrsGraph<LocalOrdinal, GlobalOrdinal,
273 Node>::padding_type& padding,
274 const bool unpack_pids,
275 const int myRank,
276 const bool verbose) {
277 using LO = LocalOrdinal;
278 using GO = GlobalOrdinal;
279 using device_type = typename Node::device_type;
280 using execution_space = typename BufferDevice::execution_space;
281 using range_policy =
282 Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
283 using unpack_functor_type =
284     UnpackAndCombineCrsGraphFunctor<LO, GO, RowView, IndicesView, BufferDevice>;
285
286 const char prefix[] =
287 "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
288
289 const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
290 if (num_import_lids == 0) {
291 // Nothing to unpack
292 return;
293 }
294
295 // Resize row pointers and indices to accommodate incoming data
296 padCrsArrays(row_ptrs_beg, row_ptrs_end, indices, padding,
297 myRank, verbose);
298
299 // Get the offsets
300 Kokkos::View<size_t*, device_type> offsets("offsets", num_import_lids + 1);
301 computeOffsetsFromCounts(offsets, num_packets_per_lid);
302
303 // Determine the maximum number of entries in any row in the graph. The
304 // maximum number of entries is needed to allocate unpack buffers on the
305 // device.
306 size_t max_num_ent;
307 Kokkos::parallel_reduce(
308 "MaxReduce",
309 range_policy(0, LO(num_packets_per_lid.size())),
310 KOKKOS_LAMBDA(const LO i, size_t& running_max_num_ent) {
311 const size_t num_packets_this_lid = num_packets_per_lid(i);
312 const size_t num_ent = (unpack_pids) ? num_packets_this_lid / 2 : num_packets_this_lid;
313        if (num_ent > running_max_num_ent) {
314          running_max_num_ent = num_ent;
315        }
316 },
317 Kokkos::Max<size_t>(max_num_ent));
318
319 // Now do the actual unpack!
320 unpack_functor_type f(row_ptrs_beg, row_ptrs_end, indices, imports,
321 num_packets_per_lid, import_lids, offsets,
322 max_num_ent, unpack_pids);
323
324 typename unpack_functor_type::value_type x;
325 Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
326 auto x_h = x.to_std_pair();
327 TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
328 prefix << "UnpackAndCombineFunctor reported error code "
329 << x_h.first << " for the first bad row " << x_h.second);
330}
331
332template <class Packet, class LocalGraph, class BufferDevice>
333size_t
334unpackAndCombineWithOwningPIDsCount(
335    const LocalGraph& local_graph,
336 const Kokkos::View<const typename LocalGraph::data_type*,
337 typename LocalGraph::device_type,
338 Kokkos::MemoryUnmanaged>
339        permute_from_lids,
340    const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
341 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
342 const size_t num_same_ids) {
343 using Kokkos::parallel_reduce;
344 using local_graph_type = LocalGraph;
345 using LO = typename local_graph_type::data_type;
346 using device_type = typename local_graph_type::device_type;
347 using execution_space = typename device_type::execution_space;
348 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
349
350 size_t count = 0;
351 LO num_items;
352
353 // Number of graph entries to unpack (returned by this function).
354 num_items = static_cast<LO>(num_same_ids);
355 if (num_items) {
356 size_t kcnt = 0;
357    parallel_reduce(
358        range_policy(0, num_items),
359        KOKKOS_LAMBDA(const LO lid, size_t& update) {
360 update += static_cast<size_t>(local_graph.row_map[lid + 1] - local_graph.row_map[lid]);
361 },
362 kcnt);
363 count += kcnt;
364 }
365
366 // Count entries copied directly from the source graph with permuting.
367 num_items = static_cast<LO>(permute_from_lids.extent(0));
368 if (num_items) {
369 size_t kcnt = 0;
370 parallel_reduce(
371 range_policy(0, num_items),
372 KOKKOS_LAMBDA(const LO i, size_t& update) {
373 const LO lid = permute_from_lids(i);
374 update += static_cast<size_t>(local_graph.row_map[lid + 1] - local_graph.row_map[lid]);
375 },
376 kcnt);
377 count += kcnt;
378 }
379
380 {
381 // Count entries received from other MPI processes.
382 size_t tot_num_ent = 0;
383 parallel_reduce(
384 "SumReduce",
385 range_policy(0, num_packets_per_lid.size()),
386 KOKKOS_LAMBDA(const int& i, size_t& lsum) {
387 lsum += num_packets_per_lid(i) / 2;
388 },
389 Kokkos::Sum<size_t>(tot_num_ent));
390 count += tot_num_ent;
391 }
392
393 return count;
394}
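// Worked example (illustrative, not part of the original source): suppose
// num_same_ids = 2 with source rows of length 3 and 1, one permuted row of
// length 2, and num_packets_per_lid = {4, 6} for two received rows (each
// packet list holds GIDs followed by PIDs, hence the division by 2). Then
//
//   count = (3 + 1) + 2 + (4/2 + 6/2) = 11
//
// graph entries must be allocated in the target's CRS arrays.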
395
397template <class Packet, class LO, class Device, class BufferDevice>
398void setupRowPointersForRemotes(
399    const Kokkos::View<size_t*, Device>& tgt_rowptr,
400 const Kokkos::View<const LO*, BufferDevice>& import_lids,
401 const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
402 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid) {
403 using Kokkos::parallel_reduce;
404 using device_type = Device;
405 using execution_space = typename device_type::execution_space;
406 using size_type = typename Kokkos::View<size_t*, device_type>::size_type;
407 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
408
409 const size_type N = num_packets_per_lid.extent(0);
411 "Setup row pointers for remotes",
412 range_policy(0, N),
413 KOKKOS_LAMBDA(const size_t i) {
414 using atomic_incr_type = typename std::remove_reference<decltype(tgt_rowptr(0))>::type;
415 const size_t num_packets_this_lid = num_packets_per_lid(i);
416 const size_t num_ent = num_packets_this_lid / 2;
417 Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
418 });
419}
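// Illustrative sketch (not part of the original source): each received LID i
// contributes num_packets_per_lid(i)/2 entries (half of its packets are PIDs)
// to the length slot of its target row. For import_lids = {4, 4, 7} and
// num_packets_per_lid = {2, 4, 6}, the adds accumulate 1 + 2 = 3 into
// tgt_rowptr(4) and 3 into tgt_rowptr(7); the atomics are needed because
// several import LIDs may map to the same target row.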
420
421// Convert array of row lengths to a CRS pointer array
422template <class Device>
423void makeCrsRowPtrFromLengths(
424 const Kokkos::View<size_t*, Device, Kokkos::MemoryUnmanaged>& tgt_rowptr,
425 const Kokkos::View<size_t*, Device>& new_start_row) {
426 using Kokkos::parallel_scan;
427 using device_type = Device;
428 using execution_space = typename device_type::execution_space;
429 using size_type = typename Kokkos::View<size_t*, device_type>::size_type;
430 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
431 const size_type N = new_start_row.extent(0);
432  parallel_scan(
433      range_policy(0, N),
434 KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
435 auto cur_val = tgt_rowptr(i);
436 if (final) {
437 tgt_rowptr(i) = update;
438        new_start_row(i) = update;
439      }
440 update += cur_val;
441 });
442}
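// Illustrative sketch (not part of the original source): the scan above turns
// per-row lengths into an exclusive prefix sum in place. For row lengths
// tgt_rowptr = {2, 0, 3, 0} (the last slot is the extra rowptr entry), the
// scan produces
//
//   tgt_rowptr    = {0, 2, 2, 5}
//   new_start_row = {0, 2, 2, 5}
//
// new_start_row starts as a copy of the row offsets and is later advanced
// with atomic adds as entries are appended to each row.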
443
444template <class LocalGraph, class LocalMap>
445void copyDataFromSameIDs(
446 const Kokkos::View<typename LocalMap::global_ordinal_type*,
447 typename LocalMap::device_type>& tgt_colind,
448 const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
449 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
450 const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
451 const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
452 const LocalGraph& local_graph,
453 const LocalMap& local_col_map,
454 const size_t num_same_ids,
455 const int my_pid) {
456 using Kokkos::parallel_for;
457 using device_type = typename LocalMap::device_type;
458 using LO = typename LocalMap::local_ordinal_type;
459 using execution_space = typename device_type::execution_space;
460 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
461
462 parallel_for(
463 range_policy(0, num_same_ids),
464 KOKKOS_LAMBDA(const size_t i) {
465 using atomic_incr_type = typename std::remove_reference<decltype(new_start_row(0))>::type;
466
467 const LO src_lid = static_cast<LO>(i);
468 size_t src_row = local_graph.row_map(src_lid);
469
470 const LO tgt_lid = static_cast<LO>(i);
471 const size_t tgt_row = tgt_rowptr(tgt_lid);
472
473 const size_t nsr = local_graph.row_map(src_lid + 1) - local_graph.row_map(src_lid);
474 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
475
476 for (size_t j = local_graph.row_map(src_lid);
477 j < local_graph.row_map(src_lid + 1); ++j) {
478 LO src_col = local_graph.entries(j);
479 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
480 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
481 }
482 });
483}
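// Illustrative sketch (not part of the original source) of the index
// arithmetic above: if source row src_lid spans local_graph.entries(j) for
// j in [src_row, src_row + nsr) and the target row starts at tgt_row, then
// entry j lands at tgt_colind(tgt_row + j - src_row); the source row is
// copied contiguously into the target row, converting local columns to
// global IDs and marking columns owned by my_pid with PID -1.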
484
485template <class LocalGraph, class LocalMap, class BufferDevice>
486void copyDataFromPermuteIDs(
487 const Kokkos::View<typename LocalMap::global_ordinal_type*,
488 typename LocalMap::device_type>& tgt_colind,
489 const Kokkos::View<int*,
490 typename LocalMap::device_type>& tgt_pids,
491 const Kokkos::View<size_t*,
492 typename LocalMap::device_type>& new_start_row,
493 const Kokkos::View<size_t*,
494 typename LocalMap::device_type>& tgt_rowptr,
495 const Kokkos::View<const int*,
496 typename LocalMap::device_type>& src_pids,
497 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
498 BufferDevice, Kokkos::MemoryUnmanaged>& permute_to_lids,
499 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
500 BufferDevice, Kokkos::MemoryUnmanaged>& permute_from_lids,
501 const LocalGraph& local_graph,
502 const LocalMap& local_col_map,
503 const int my_pid) {
504 using Kokkos::parallel_for;
505 using device_type = typename LocalMap::device_type;
506 using LO = typename LocalMap::local_ordinal_type;
507 using execution_space = typename device_type::execution_space;
508 using size_type = typename Kokkos::View<LO*, device_type>::size_type;
509 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
510
511 const size_type num_permute_to_lids = permute_to_lids.extent(0);
512
513 parallel_for(
514 range_policy(0, num_permute_to_lids),
515 KOKKOS_LAMBDA(const size_t i) {
516 using atomic_incr_type = typename std::remove_reference<decltype(new_start_row(0))>::type;
517
518 const LO src_lid = permute_from_lids(i);
519 const size_t src_row = local_graph.row_map(src_lid);
520
521 const LO tgt_lid = permute_to_lids(i);
522 const size_t tgt_row = tgt_rowptr(tgt_lid);
523
524 size_t nsr = local_graph.row_map(src_lid + 1) - local_graph.row_map(src_lid);
525 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
526
527 for (size_t j = local_graph.row_map(src_lid);
528 j < local_graph.row_map(src_lid + 1); ++j) {
529 LO src_col = local_graph.entries(j);
530 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
531 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
532 }
533 });
534}
535
536template <class Packet, class LocalGraph, class LocalMap, class BufferDevice>
537void unpackAndCombineIntoCrsArrays2(
538 const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
539 const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
540 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
541 const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
542 const Kokkos::View<
543 const typename LocalMap::local_ordinal_type*,
544 BufferDevice,
545 Kokkos::MemoryUnmanaged>& import_lids,
546 const Kokkos::View<const Packet*, BufferDevice>& imports,
547 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
548 const LocalGraph& /* local_graph */,
549 const LocalMap /*& local_col_map*/,
550 const int my_pid) {
551 using Kokkos::atomic_fetch_add;
552 using Kokkos::MemoryUnmanaged;
553 using Kokkos::parallel_reduce;
554 using Kokkos::subview;
555 using Kokkos::View;
556
557 using device_type = typename LocalMap::device_type;
558 using LO = typename LocalMap::local_ordinal_type;
559 using GO = typename LocalMap::global_ordinal_type;
560 using execution_space = typename device_type::execution_space;
561 using size_type = typename Kokkos::View<LO*, device_type>::size_type;
562 using slice = typename Kokkos::pair<size_type, size_type>;
563 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
564
565 using pids_out_type = View<int*, device_type, MemoryUnmanaged>;
566 using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
567
568 const size_type num_import_lids = import_lids.size();
569 const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
570
571 // RemoteIDs: Loop structure following UnpackAndCombine
572 int gbl_err_count;
573 parallel_reduce(
574 "Unpack and combine into CRS",
575 range_policy(0, num_import_lids),
576 KOKKOS_LAMBDA(const size_t i, int& err) {
577 using atomic_incr_type = typename std::remove_reference<decltype(new_start_row(0))>::type;
578 const size_t num_packets_this_lid = num_packets_per_lid(i);
579 const size_t num_ent = num_packets_this_lid / 2;
580 const size_t offset = offsets(i);
581 const LO lcl_row = import_lids(i);
582 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
583 const size_t end_row = start_row + num_ent;
584
585 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
586 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
587
588 err += unpackRow(gids_out, pids_out, imports, offset, num_ent);
589
590 // Correct target PIDs.
591 for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
592 const int pid = pids_out(j);
593 pids_out(j) = (pid != my_pid) ? pid : -1;
594 }
595 },
596 gbl_err_count);
597
598 TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
599 std::invalid_argument, prefix << "Attempting to unpack PIDs, but num_ent is not even; this should never "
600 "happen! Please report this bug to the Tpetra developers.");
601
602 return;
603}
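// Illustrative sketch (not part of the original source): for a received row
// with num_ent = 2 landing in local row lcl_row, atomic_fetch_add returns the
// current insertion position new_start_row(lcl_row) (say 7) and bumps it to 9.
// The two unpacked GIDs then go to tgt_colind(7..8) and their owning ranks to
// tgt_pids(7..8), with any entry owned by my_pid flipped to -1 to mark it as
// locally owned.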
604
605template <class Packet, class LocalGraph, class LocalMap, class BufferDevice>
606void unpackAndCombineIntoCrsArrays(
607    const LocalGraph& local_graph,
608 const LocalMap& local_col_map,
609 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
610 BufferDevice,
611 Kokkos::MemoryUnmanaged>& import_lids,
612 const Kokkos::View<const Packet*, BufferDevice>& imports,
613 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
614 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
615 BufferDevice,
616 Kokkos::MemoryUnmanaged>& permute_to_lids,
617 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
618 BufferDevice,
619 Kokkos::MemoryUnmanaged>& permute_from_lids,
620 const Kokkos::View<size_t*,
621 typename LocalMap::device_type,
622 Kokkos::MemoryUnmanaged>& tgt_rowptr,
623 const Kokkos::View<typename LocalMap::global_ordinal_type*,
624 typename LocalMap::device_type,
625 Kokkos::MemoryUnmanaged>& tgt_colind,
626 const Kokkos::View<const int*,
627 typename LocalMap::device_type,
628 Kokkos::MemoryUnmanaged>& src_pids,
629 const Kokkos::View<int*,
630 typename LocalMap::device_type,
631 Kokkos::MemoryUnmanaged>& tgt_pids,
632 const size_t num_same_ids,
633 const size_t tgt_num_rows,
634 const size_t tgt_num_nonzeros,
635 const int my_tgt_pid) {
636 using Kokkos::MemoryUnmanaged;
637 using Kokkos::parallel_for;
638 using Kokkos::subview;
639 using Kokkos::View;
640 using packet_type = Packet;
641 using local_map_type = LocalMap;
642 using local_graph_type = LocalGraph;
643 using buffer_device_type = BufferDevice;
644 using device_type = typename LocalMap::device_type;
645 using LO = typename LocalMap::local_ordinal_type;
646 using execution_space = typename device_type::execution_space;
647 using size_type = typename Kokkos::View<LO*, device_type>::size_type;
648 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
649
650 const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
651
652 const size_t N = tgt_num_rows;
653 const size_t mynnz = tgt_num_nonzeros;
654
655 // In the case of reduced communicators, the sourceGraph won't have
656 // the right "my_pid", so thus we have to supply it.
657 const int my_pid = my_tgt_pid;
658
659 // FIXME (mfh 24 Jun 2019)
660 //
661 // 1. Only zero the entries of tgt_rowptr that actually need it.
662 // 2. Consider merging these three kernels into one.
663
664 // Zero the rowptr
665 parallel_for(
666 range_policy(0, N + 1),
667 KOKKOS_LAMBDA(const size_t i) {
668 tgt_rowptr(i) = 0;
669 });
670
671 // same IDs: Always first, always in the same place
672 parallel_for(
673 range_policy(0, num_same_ids),
674 KOKKOS_LAMBDA(const size_t i) {
675 const LO tgt_lid = static_cast<LO>(i);
676 const LO src_lid = static_cast<LO>(i);
677 tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid + 1) - local_graph.row_map(src_lid);
678 });
679
680 // Permute IDs: Still local, but reordered
681 const size_type num_permute_to_lids = permute_to_lids.extent(0);
682 parallel_for(
683 range_policy(0, num_permute_to_lids),
684 KOKKOS_LAMBDA(const size_t i) {
685 const LO tgt_lid = permute_to_lids(i);
686 const LO src_lid = permute_from_lids(i);
687 tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid + 1) - local_graph.row_map(src_lid);
688 });
689
690 // Get the offsets from the number of packets per LID
691 const size_type num_import_lids = import_lids.extent(0);
692 View<size_t*, device_type> offsets("offsets", num_import_lids + 1);
693 computeOffsetsFromCounts(offsets, num_packets_per_lid);
694
695#ifdef HAVE_TPETRA_DEBUG
696 {
697 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
698 const bool condition =
699 nth_offset_h != static_cast<size_t>(imports.extent(0));
700 TEUCHOS_TEST_FOR_EXCEPTION(condition, std::logic_error, prefix << "The final offset in bytes " << nth_offset_h << " != imports.size() = " << imports.extent(0) << ". Please report this bug to the Tpetra developers.");
701 }
702#endif // HAVE_TPETRA_DEBUG
703
704 // Setup row pointers for remotes
705 setupRowPointersForRemotes<packet_type, LO, device_type, buffer_device_type>(
706 tgt_rowptr, import_lids, imports, num_packets_per_lid);
707
708 // If multiple processes contribute to the same row, we may need to
709 // update row offsets. This tracks that.
710 View<size_t*, device_type> new_start_row("new_start_row", N + 1);
711
712 // Turn row length into a real CRS row pointer
713 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
714 {
715 auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
716 bool condition = nth_tgt_rowptr_h != mynnz;
717 TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
718                             prefix << "CRS_rowptr[last] = " << nth_tgt_rowptr_h << " != mynnz = " << mynnz << ".");
719 }
720
721 // SameIDs: Copy the data over
722 copyDataFromSameIDs<LocalGraph, LocalMap>(tgt_colind, tgt_pids, new_start_row,
723 tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
724
725 copyDataFromPermuteIDs<LocalGraph, LocalMap>(tgt_colind, tgt_pids, new_start_row,
726 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
727 local_graph, local_col_map, my_pid);
728
729 if (imports.extent(0) <= 0) {
730 return;
731 }
732
733 unpackAndCombineIntoCrsArrays2<
734 packet_type, local_graph_type, local_map_type, buffer_device_type>(
735 tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
736 num_packets_per_lid, local_graph, local_col_map, my_pid);
737
738 return;
739}
740
741} // namespace UnpackAndCombineCrsGraphImpl
742
790template <class LocalOrdinal, class GlobalOrdinal, class Node>
791size_t
792unpackAndCombineWithOwningPIDsCount(
793    const CrsGraph<LocalOrdinal, GlobalOrdinal, Node>& sourceGraph,
794    const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
795 const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::packet_type>& imports,
796 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
797 size_t /* constantNumPackets */,
798 CombineMode /* combineMode */,
799 size_t numSameIDs,
800 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
801 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs) {
802 using Kokkos::MemoryUnmanaged;
803 using Kokkos::View;
804 using device_type = typename Node::device_type;
805  using packet_type = typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::packet_type;
806  using local_graph_device_type = typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::local_graph_device_type;
807 using buffer_device_type = typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::buffer_device_type;
808 const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
809
810 TEUCHOS_TEST_FOR_EXCEPTION(permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
811 prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != "
812 "permuteFromLIDs.size() = "
813 << permuteFromLIDs.size() << ".");
814 // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
815 // process, then the graph is neither locally nor globally indexed.
816 const bool locallyIndexed = sourceGraph.isLocallyIndexed();
817 TEUCHOS_TEST_FOR_EXCEPTION(!locallyIndexed, std::invalid_argument, prefix << "The input "
818 "CrsGraph 'sourceGraph' must be locally indexed.");
819 TEUCHOS_TEST_FOR_EXCEPTION(importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
820 prefix << "importLIDs.size() = " << importLIDs.size() << " != "
821 "numPacketsPerLID.size() = "
822 << numPacketsPerLID.size() << ".");
823
824 auto local_graph = sourceGraph.getLocalGraphDevice();
825  auto permute_from_lids_d =
826      create_mirror_view_from_raw_host_array(buffer_device_type(),
827                                             permuteFromLIDs.getRawPtr(),
828 permuteFromLIDs.size(), true,
829 "permute_from_lids");
830 auto imports_d =
831 create_mirror_view_from_raw_host_array(buffer_device_type(),
832 imports.getRawPtr(),
833 imports.size(), true,
834 "imports");
835  auto num_packets_per_lid_d =
836      create_mirror_view_from_raw_host_array(buffer_device_type(),
837 numPacketsPerLID.getRawPtr(),
838 numPacketsPerLID.size(), true,
839 "num_packets_per_lid");
840
841 return UnpackAndCombineCrsGraphImpl::unpackAndCombineWithOwningPIDsCount<
842 packet_type, local_graph_device_type, buffer_device_type>(
843 local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
844}
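// Hypothetical usage sketch (not part of the original source). The ArrayView
// arguments stand in for buffers produced by a preceding pack-and-communicate
// step (for example a packCrsGraphWithOwningPIDs call followed by a
// Distributor exchange); LO, GO, Node are the usual ordinal and node
// template parameters:
//
//   const size_t numEntries =
//       Tpetra::Details::unpackAndCombineWithOwningPIDsCount<LO, GO, Node>(
//           sourceGraph, importLIDs(), imports(), numPacketsPerLID(),
//           0 /* constantNumPackets (ignored) */,
//           Tpetra::INSERT /* combineMode (ignored) */,
//           numSameIDs, permuteToLIDs(), permuteFromLIDs());
//
// The result is the number of graph entries the caller must allocate in the
// target CRS arrays before calling unpackAndCombineIntoCrsArrays below.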
845
859template <class LocalOrdinal, class GlobalOrdinal, class Node>
860void unpackAndCombineIntoCrsArrays(
861    const CrsGraph<LocalOrdinal, GlobalOrdinal, Node>& sourceGraph,
862    const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
863 const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::packet_type>& imports,
864 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
865 const size_t /* constantNumPackets */,
866 const CombineMode /* combineMode */,
867 const size_t numSameIDs,
868 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
869 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
870 size_t TargetNumRows,
871 size_t TargetNumNonzeros,
872 const int MyTargetPID,
873 const Teuchos::ArrayView<size_t>& CRS_rowptr,
874 const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
875 const Teuchos::ArrayView<const int>& SourcePids,
876 Teuchos::Array<int>& TargetPids) {
877 using Kokkos::deep_copy;
878 using Kokkos::View;
879 using Teuchos::outArg;
880 using Teuchos::REDUCE_MAX;
881 using Teuchos::reduceAll;
882 using LO = LocalOrdinal;
883 using GO = GlobalOrdinal;
884 using crs_graph_type = CrsGraph<LO, GO, Node>;
885 using packet_type = typename crs_graph_type::packet_type;
886 using local_graph_device_type = typename crs_graph_type::local_graph_device_type;
887 using buffer_device_type = typename crs_graph_type::buffer_device_type;
888 using device_type = typename Node::device_type;
889 using size_type = typename Teuchos::ArrayView<const LO>::size_type;
890
891 const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
892
893  TEUCHOS_TEST_FOR_EXCEPTION(
894      TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
895      std::invalid_argument, prefix << "CRS_rowptr.size() = " << CRS_rowptr.size() << " != TargetNumRows+1 = " << TargetNumRows + 1 << ".");
896
897  TEUCHOS_TEST_FOR_EXCEPTION(
898      permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
899      prefix << "permuteToLIDs.size() = " << permuteToLIDs.size()
900             << " != permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
901 const size_type numImportLIDs = importLIDs.size();
902
903  TEUCHOS_TEST_FOR_EXCEPTION(
904      numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
905 prefix << "importLIDs.size() = " << numImportLIDs << " != "
906 "numPacketsPerLID.size() = "
907 << numPacketsPerLID.size() << ".");
908
909 // Preseed TargetPids with -1 for local
910 if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
911    TargetPids.resize(TargetNumNonzeros);
912  }
913 TargetPids.assign(TargetNumNonzeros, -1);
914
915 // Grab pointers for sourceGraph
916 auto local_graph = sourceGraph.getLocalGraphDevice();
917 auto local_col_map = sourceGraph.getColMap()->getLocalMap();
918
919 // Convert input arrays to Kokkos::View
920 device_type outputDevice;
921 buffer_device_type bufferOutputDevice;
922
923 Kokkos::View<const LO*, buffer_device_type> import_lids_d =
924      create_mirror_view_from_raw_host_array(bufferOutputDevice, importLIDs.getRawPtr(),
925                                             importLIDs.size(), true, "import_lids");
926
927 Kokkos::View<const packet_type*, buffer_device_type> imports_d =
928      create_mirror_view_from_raw_host_array(bufferOutputDevice, imports.getRawPtr(),
929                                             imports.size(), true, "imports");
930
931 Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_d =
932      create_mirror_view_from_raw_host_array(bufferOutputDevice,
933                                             numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
934 true, "num_packets_per_lid");
935
936 Kokkos::View<const LO*, buffer_device_type> permute_to_lids_d =
937      create_mirror_view_from_raw_host_array(bufferOutputDevice,
938                                             permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
939 true, "permute_to_lids");
940
941 Kokkos::View<const LO*, buffer_device_type> permute_from_lids_d =
942      create_mirror_view_from_raw_host_array(bufferOutputDevice,
943                                             permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
944 true, "permute_from_lids");
945
946 Kokkos::View<size_t*, device_type> crs_rowptr_d =
947      create_mirror_view_from_raw_host_array(outputDevice,
948                                             CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
949 true, "crs_rowptr");
950
951 Kokkos::View<GO*, device_type> crs_colind_d =
952      create_mirror_view_from_raw_host_array(outputDevice,
953                                             CRS_colind.getRawPtr(), CRS_colind.size(),
954 true, "crs_colidx");
955
956 Kokkos::View<const int*, device_type> src_pids_d =
957      create_mirror_view_from_raw_host_array(outputDevice,
958                                             SourcePids.getRawPtr(), SourcePids.size(),
959 true, "src_pids");
960
961 Kokkos::View<int*, device_type> tgt_pids_d =
962      create_mirror_view_from_raw_host_array(outputDevice,
963                                             TargetPids.getRawPtr(), TargetPids.size(),
964 true, "tgt_pids");
965
966 using local_map_type = decltype(local_col_map);
967 UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays<
968 packet_type, local_graph_device_type, local_map_type, buffer_device_type>(
969 local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
970      permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d,
971      src_pids_d, tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
972
973 // FIXME (mfh 25 Jun 2019) host_mirror_type of CudaUVMSpace is CudaUVMSpace!!!
974
975 // Copy outputs back to host
976 typename decltype(crs_rowptr_d)::host_mirror_type crs_rowptr_h(
977 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
978  deep_copy(crs_rowptr_h, crs_rowptr_d);
979
980 typename decltype(crs_colind_d)::host_mirror_type crs_colind_h(
981 CRS_colind.getRawPtr(), CRS_colind.size());
982  deep_copy(crs_colind_h, crs_colind_d);
983
984 typename decltype(tgt_pids_d)::host_mirror_type tgt_pids_h(
985 TargetPids.getRawPtr(), TargetPids.size());
986  deep_copy(tgt_pids_h, tgt_pids_d);
987}
988
989} // namespace Details
990} // namespace Tpetra
991
992#define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT(LO, GO, NT) \
993 template void \
994 Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
995 const CrsGraph<LO, GO, NT>&, \
996 const Teuchos::ArrayView<const LO>&, \
997 const Teuchos::ArrayView<const typename CrsGraph<LO, GO, NT>::packet_type>&, \
998 const Teuchos::ArrayView<const size_t>&, \
999 const size_t, \
1000 const CombineMode, \
1001 const size_t, \
1002 const Teuchos::ArrayView<const LO>&, \
1003 const Teuchos::ArrayView<const LO>&, \
1004 size_t, \
1005 size_t, \
1006 const int, \
1007 const Teuchos::ArrayView<size_t>&, \
1008 const Teuchos::ArrayView<GO>&, \
1009 const Teuchos::ArrayView<const int>&, \
1010 Teuchos::Array<int>&); \
1011 template size_t \
1012 Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
1013 const CrsGraph<LO, GO, NT>&, \
1014 const Teuchos::ArrayView<const LO>&, \
1015 const Teuchos::ArrayView<const typename CrsGraph<LO, GO, NT>::packet_type>&, \
1016 const Teuchos::ArrayView<const size_t>&, \
1017 size_t, \
1018 CombineMode, \
1019 size_t, \
1020 const Teuchos::ArrayView<const LO>&, \
1021 const Teuchos::ArrayView<const LO>&);
1022
1023#endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP