Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
11#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
12
13#include <memory>
14#include <string>
15#include "TpetraCore_config.h"
16#include "Kokkos_Core.hpp"
17#include "Teuchos_Array.hpp"
18#include "Teuchos_ArrayView.hpp"
19#include "Teuchos_OrdinalTraits.hpp"
20#include "Teuchos_TimeMonitor.hpp"
28#include "Tpetra_Details_DefaultTypes.hpp"
30
51
52namespace Tpetra {
53
54//
55// Users must never rely on anything in the Details namespace.
56//
57namespace Details {
58
59namespace UnpackAndCombineCrsMatrixImpl {
60
// unpackRow: unpack one packed matrix row from the byte buffer `imports`,
// starting at byte `offset`, into the caller's output arrays.
//
// Buffer layout implied by the offsets computed below:
//   [entry count: packed size of one LO]
//   [global indices: gids_len bytes]
//   [process ids: num_ent packed ints, only when pids_out has nonzero size]
//   [values: num_ent * bytes_per_value bytes]
//
// Returns 0 on success (num_ent == 0 returns 0 immediately without touching
// the buffer). Nonzero return codes:
//   20       - the entry count read from the buffer differs from num_ent
//   21/22/23 - the unpack step just before reported p.first != 0
//              (22 is only reachable when pids are being unpacked)
//   24       - returned by the final check before the success return
// The num_bytes parameter is unnamed and therefore unused here.
//
// NOTE(review): this listing omits original lines 72-74, 91, 106, 113, 120,
// 127 and 134-135. Those lines presumably hold the function name, the leading
// output-array parameters (pids_out is used below but declared there), the
// gids_len expression, the calls that assign num_ent_out and p, and the
// condition guarding code 24. Restore them from the upstream file before
// changing this function.
70template <class ST, class LO, class GO>
71KOKKOS_FUNCTION int
75 const char imports[],
76 const size_t offset,
77 const size_t /* num_bytes */,
78 const size_t num_ent,
79 const size_t bytes_per_value) {
80 if (num_ent == 0) {
81 // Empty rows always take zero bytes, to ensure sparsity.
82 return 0;
83 }
// PIDs are unpacked only when the caller supplied a non-empty pids_out.
84 bool unpack_pids = pids_out.size() > 0;
85
// Byte offsets and lengths of each section of the packed row.
86 const size_t num_ent_beg = offset;
87 const size_t num_ent_len = PackTraits<LO>::packValueCount(LO(0));
88
89 const size_t gids_beg = num_ent_beg + num_ent_len;
90 const size_t gids_len =
92
93 const size_t pids_beg = gids_beg + gids_len;
94 const size_t pids_len = unpack_pids ? size_t(num_ent * PackTraits<int>::packValueCount(int(0))) : size_t(0);
95
96 const size_t vals_beg = gids_beg + gids_len + pids_len;
97 const size_t vals_len = num_ent * bytes_per_value;
98
// Start pointers of each section; pids_in stays null when PIDs are skipped.
99 const char* const num_ent_in = imports + num_ent_beg;
100 const char* const gids_in = imports + gids_beg;
101 const char* const pids_in = unpack_pids ? imports + pids_beg : nullptr;
102 const char* const vals_in = imports + vals_beg;
103
104 size_t num_bytes_out = 0;
105 LO num_ent_out;
107 if (static_cast<size_t>(num_ent_out) != num_ent) {
108 return 20; // error code
109 }
110
111 {
// p.first is checked as a status (nonzero = failure); p.second is added to
// the running byte count.
112 Kokkos::pair<int, size_t> p;
114 if (p.first != 0) {
115 return 21; // error code
116 }
117 num_bytes_out += p.second;
118
119 if (unpack_pids) {
121 if (p.first != 0) {
122 return 22; // error code
123 }
124 num_bytes_out += p.second;
125 }
126
128 if (p.first != 0) {
129 return 23; // error code
130 }
131 num_bytes_out += p.second;
132 }
133
136 return 24; // error code
137 }
138 return 0; // no errors
139} // unpackRow
140
// Type aliases, data members and constructor of the functor that unpacks
// received row data in batches and combines it into a local matrix (the
// struct is closed further down with the comment
// "UnpackCrsMatrixAndCombineFunctor").
//
// NOTE(review): this listing omits original lines 152-153, 158, 188-190 and
// 196. They presumably carry the struct header, the local_matrix_type and GO
// aliases (both are used but not declared in the visible lines), the
// constructor name with its first two parameters (local_matrix_in,
// local_col_map_in), and the combine_mode_in parameter.
151template <class LocalMatrix, class LocalMap, class BufferDeviceType>
154 typedef LocalMap local_map_type;
155
156 typedef typename local_matrix_type::value_type ST;
157 typedef typename local_map_type::local_ordinal_type LO;
159 typedef typename local_map_type::device_type DT;
160 typedef typename DT::execution_space XS;
161
// Input views: packet sizes, the packed buffer and the import list use
// BufferDeviceType; the offsets view uses the map's device type DT.
162 typedef Kokkos::View<const size_t*, BufferDeviceType>
163 num_packets_per_lid_type;
164 typedef Kokkos::View<const size_t*, DT> offsets_type;
165 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
166 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
167
// Rank-0 view holding a single int error code shared by all teams.
168 typedef Kokkos::View<int, DT> error_type;
169 using member_type = typename Kokkos::TeamPolicy<XS>::member_type;
170
171 static_assert(std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
172 "LocalMap::local_ordinal_type and "
173 "LocalMatrix::ordinal_type must be the same.");
174
175 local_matrix_type local_matrix;
176 local_map_type local_col_map;
177 input_buffer_type imports;
178 num_packets_per_lid_type num_packets_per_lid;
179 import_lids_type import_lids;
// batch_info(b, 0): position in the import list; batch_info(b, 1): batch
// number within that row (this is how operator() reads it).
180 Kokkos::View<const LO* [2], DT> batch_info;
181 offsets_type offsets;
182 Tpetra::CombineMode combine_mode;
183 size_t batch_size;
184 size_t bytes_per_value;
// Forwarded to the ADD path in operator() as "use atomic updates".
185 bool atomic;
186 error_type error_code;
187
// Constructor: stores every argument in the member of the same name and
// allocates error_code with the label "error".
// NOTE(review): operator() only overwrites error_code when it currently holds
// 0, so it relies on the freshly allocated view starting at 0 - confirm that
// the allocation zero-initializes.
191 const input_buffer_type& imports_in,
192 const num_packets_per_lid_type& num_packets_per_lid_in,
193 const import_lids_type& import_lids_in,
194 const Kokkos::View<const LO* [2], DT>& batch_info_in,
195 const offsets_type& offsets_in,
197 const size_t batch_size_in,
198 const size_t bytes_per_value_in,
199 const bool atomic_in)
200 : local_matrix(local_matrix_in)
201 , local_col_map(local_col_map_in)
202 , imports(imports_in)
203 , num_packets_per_lid(num_packets_per_lid_in)
204 , import_lids(import_lids_in)
205 , batch_info(batch_info_in)
206 , offsets(offsets_in)
207 , combine_mode(combine_mode_in)
208 , batch_size(batch_size_in)
209 , bytes_per_value(bytes_per_value_in)
210 , atomic(atomic_in)
211 , error_code("error") {}
212
// Team entry point: each team (league rank) handles one batch of one received
// row. batch_info(league_rank, 0) selects the position in the import list and
// batch_info(league_rank, 1) the batch number within that row. Rows whose
// packet size is 0 are skipped.
//
// Errors are reported by printing a message and storing a code in error_code
// with atomic_compare_exchange(..., 0, code), i.e. the code is only written
// while error_code still holds 0. Codes used here:
//   21 - expected byte count differs from the packet size for the row
//   22 - offset, or offset + num_bytes, lies past the end of the buffer
//   23 - entry count read from the buffer differs from num_entries_in_row
//   31 - combine_mode is neither ADD nor REPLACE
//
// NOTE(review): this listing omits original lines 213 (directly above this
// function), 237, 243-245, 248, 271, 275, 281, 283, 293, 312-313, 323, 331,
// 335-336, 343 and 347-348. Among other things, bytes_per_gid and gids_end
// are used below but declared on missing lines, and the statements that fill
// num_ent_LO, expected_num_bytes, num_ent_out, gid_out and val_out are not
// visible. Restore them from the upstream file before editing.
214 void operator()(member_type team_member) const {
215 using Kokkos::MemoryUnmanaged;
216 using Kokkos::subview;
217 using Kokkos::View;
218
219 const LO batch = team_member.league_rank();
220 const LO lid_no = batch_info(batch, 0);
221 const LO batch_no = batch_info(batch, 1);
222
223 const size_t num_bytes = num_packets_per_lid(lid_no);
224
225 // Only unpack data if there is a nonzero number of bytes.
226 if (num_bytes == 0)
227 return;
228
229 // there is actually something in the row
230 const LO import_lid = import_lids(lid_no);
231 const size_t buf_size = imports.size();
232 const size_t offset = offsets(lid_no);
233
234 // Get the number of entries to expect in the received data for this row.
235 LO num_ent_LO = 0;
236 const char* const in_buf = imports.data() + offset;
238 const size_t num_entries_in_row = static_cast<size_t>(num_ent_LO);
239
240 // Count the number of bytes expected to unpack
241 size_t expected_num_bytes = 0;
242 {
246 }
247
// NOTE(review): the size_t values are cast to int for the %d conversions
// here and in the messages below, so values above INT_MAX would not print
// correctly.
249 Kokkos::printf(
250 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
251 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
252 (int)lid_no, (int)expected_num_bytes, (int)num_bytes);
253
254 Kokkos::atomic_compare_exchange(error_code.data(), 0, 21);
255 return;
256 }
257
// NOTE(review): this bounds check runs after in_buf was formed from `offset`
// above; whether the buffer is already read before this point depends on the
// omitted line 237 - confirm the check should not come first.
258 if (offset > buf_size || offset + num_bytes > buf_size) {
259 Kokkos::printf(
260 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
261 "At row %d, the offset (%d) > buffer size (%d)\n",
262 (int)lid_no, (int)offset, (int)buf_size);
263
264 Kokkos::atomic_compare_exchange(error_code.data(), 0, 22);
265 return;
266 }
267
268 // Determine the number of entries to unpack in this batch
// Three cases: the row fits in one batch, this is a full batch, or this is
// the final partial batch (the first and last assignments are on lines
// missing from this listing).
269 size_t num_entries_in_batch = 0;
270 if (num_entries_in_row <= batch_size)
272 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
273 num_entries_in_batch = batch_size;
274 else
276
277 const size_t bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
278 const size_t num_ent_start = offset;
279 const size_t num_ent_end = num_ent_start + bytes_per_lid;
280
282 const size_t gids_start = num_ent_end;
284
285 const size_t vals_start = gids_end;
286
// `shift` is the number of entries consumed by earlier batches of this row;
// it advances the GID and value pointers to this batch's first entry.
287 const size_t shift = batch_no * batch_size;
288 const char* const num_ent_in = imports.data() + num_ent_start;
289 const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
290 const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value;
291
292 LO num_ent_out;
294 if (static_cast<size_t>(num_ent_out) != num_entries_in_row) {
295 Kokkos::printf(
296 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
297 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
298 (int)lid_no, (int)num_entries_in_row, (int)num_ent_out);
299
// NOTE(review): unlike codes 21 and 22, this branch does not return, so the
// batch is still unpacked after the error is recorded - confirm intended.
300 Kokkos::atomic_compare_exchange(error_code.data(), 0, 23);
301 }
302
303 constexpr bool matrix_has_sorted_rows = true; // see #6282
304 // Note BMK 6-22: this lambda must use capture-by-value [=] and not capture-by-ref [&].
305 // By ref triggers compiler bug in CUDA 10.
// One team thread per entry j of this batch: unpack a single (global column,
// value) pair, map the column to a local index, and combine into the matrix.
306 Kokkos::parallel_for(
307 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
308 [=, *this](const LO& j) {
309 size_t distance = 0;
310
311 GO gid_out;
314 auto lid_out = local_col_map.getLocalElement(gid_out);
315
316 // Column indices come in as global indices, in case the
317 // source object's column Map differs from the target object's
318 // (this's) column Map, and must be converted local index values
319
320 // assume that ST is default constructible
321 ST val_out;
322 distance = j * bytes_per_value;
324
325 if (combine_mode == ADD) {
326 // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless
327 // different threads don't touch the same row (i.e., no
328 // duplicates in incoming LIDs list).
329 const bool use_atomic_updates = atomic;
// The return value of sumIntoValues is deliberately discarded.
330 (void)local_matrix.sumIntoValues(
332 &lid_out,
333 1,
334 &val_out,
337 } else if (combine_mode == REPLACE) {
338 // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE
339 // combine mode with multiple incoming rows that touch the same
340 // target matrix entries, so we never need atomic updates.
341 const bool use_atomic_updates = false;
// The return value of replaceValues is deliberately discarded.
342 (void)local_matrix.replaceValues(
344 &lid_out,
345 1,
346 &val_out,
349 } else {
350 // should never get here
351 Kokkos::printf(
352 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
353 "At row %d, an unknown error occurred during unpack\n",
354 (int)lid_no);
355 Kokkos::atomic_compare_exchange(error_code.data(), 0, 31);
356 }
357 });
358
// Barrier across the team's threads after the per-entry loop.
359 team_member.team_barrier();
360 }
361
// Returns the current error code: copies the rank-0 error_code view into a
// HostSpace mirror and reads its single value. operator() stores nonzero
// codes (21, 22, 23, 31) into error_code on failure.
// NOTE(review): original line 362, directly above, is missing from this
// listing.
363 int error() const {
364 auto error_code_h = Kokkos::create_mirror_view_and_copy(
365 Kokkos::HostSpace(), error_code);
366 return error_code_h();
367 }
368
369}; // UnpackCrsMatrixAndCombineFunctor
370
// Empty tag types. They select which NumEntriesFunctor::operator() overload a
// Kokkos::RangePolicy dispatches to: MaxNumEntTag for the per-row maximum
// entry count, TotNumEntTag for the total entry count.
371struct MaxNumEntTag {};
372struct TotNumEntTag {};
373
// NumEntriesFunctor: reduction functor over received rows with two tagged
// modes.
//   MaxNumEntTag - keeps the largest per-row entry count (join() keeps the
//                  larger of dst and src).
//   TotNumEntTag - sums the per-row entry counts.
// For row i the entry count is read from the packed buffer at
// imports.data() + offsets(i); rows with num_packets_per_lid(i) == 0 are
// skipped and contribute nothing.
//
// NOTE(review): this listing omits original lines 383, 405, 412, 419, 426 and
// 433 - presumably the class header, the function annotations / return types
// of the three member functions, and the calls that fill num_ent_LO from
// in_buf. Restore them from the upstream file before editing.
382template <class LO, class DT, class BDT>
384 public:
385 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
386 typedef Kokkos::View<const size_t*, DT> offsets_type;
387 typedef Kokkos::View<const char*, BDT> input_buffer_type;
388 // This needs to be public, since it appears in the argument list of
389 // public methods (see below). Otherwise, build errors may happen.
390 typedef size_t value_type;
391
392 private:
393 num_packets_per_lid_type num_packets_per_lid;
394 offsets_type offsets;
395 input_buffer_type imports;
396
397 public:
// NOTE(review): num_packets_per_lid_in is taken by value while the other two
// views are taken by const reference.
398 NumEntriesFunctor(const num_packets_per_lid_type num_packets_per_lid_in,
399 const offsets_type& offsets_in,
400 const input_buffer_type& imports_in)
401 : num_packets_per_lid(num_packets_per_lid_in)
402 , offsets(offsets_in)
403 , imports(imports_in) {}
404
// Max mode: raise `update` to this row's entry count if that is larger.
406 operator()(const MaxNumEntTag, const LO i, value_type& update) const {
407 // Get how many entries to expect in the received data for this row.
408 const size_t num_bytes = num_packets_per_lid(i);
409 if (num_bytes > 0) {
410 LO num_ent_LO = 0; // output argument of unpackValue
411 const char* const in_buf = imports.data() + offsets(i);
413 const size_t num_ent = static_cast<size_t>(num_ent_LO);
414
415 update = (update < num_ent) ? num_ent : update;
416 }
417 }
418
// Combines two partial maxima for the MaxNumEntTag reduction.
420 join(const MaxNumEntTag,
421 value_type& dst,
422 const value_type& src) const {
423 if (dst < src) dst = src;
424 }
425
// Total mode: add this row's entry count to the running sum.
427 operator()(const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
428 // Get how many entries to expect in the received data for this row.
429 const size_t num_bytes = num_packets_per_lid(i);
430 if (num_bytes > 0) {
431 LO num_ent_LO = 0; // output argument of unpackValue
432 const char* const in_buf = imports.data() + offsets(i);
434 tot_num_ent += static_cast<size_t>(num_ent_LO);
435 }
436 }
437}; // NumEntriesFunctor
438
// Returns the largest entry count among the received rows. Runs a
// MaxNumEntTag-tagged parallel_reduce of NumEntriesFunctor over
// [0, num_packets_per_lid.extent(0)) on DT's execution space; the result
// starts at 0, so an empty range yields 0.
//
// NOTE(review): the line carrying the function name (original line 448) is
// missing from this listing. The call
// compute_maximum_num_entries<LO, DT>(num_packets_per_lid, offsets, imports)
// in unpackAndCombineIntoCrsMatrix matches this parameter list.
446template <class LO, class DT, class BDT>
447size_t
449 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
450 const Kokkos::View<const size_t*, DT>& offsets,
451 const Kokkos::View<const char*, BDT>& imports) {
452 typedef typename DT::execution_space XS;
453 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
454 MaxNumEntTag>
455 range_policy;
456
457 NumEntriesFunctor<LO, DT, BDT> functor(num_packets_per_lid, offsets,
458 imports);
// The row count is narrowed from the view extent to LO for the policy.
459 const LO numRowsToUnpack =
460 static_cast<LO>(num_packets_per_lid.extent(0));
461 size_t max_num_ent = 0;
462 Kokkos::parallel_reduce("Max num entries in CRS",
463 range_policy(0, numRowsToUnpack),
464 functor, max_num_ent);
465 return max_num_ent;
466}
467
// Returns the total number of entries across all received rows, using a
// TotNumEntTag-tagged parallel_reduce of NumEntriesFunctor over
// [0, num_packets_per_lid.extent(0)) on DT's execution space.
//
// NOTE(review): original lines 477 and 490 are missing from this listing -
// the function-name line and the tail of the parallel_reduce call
// (presumably the functor and tot_num_ent arguments). The call
// compute_total_num_entries<LO, device_type, BDT>(...) in
// unpackAndCombineWithOwningPIDsCount matches this parameter list.
475template <class LO, class DT, class BDT>
476size_t
478 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
479 const Kokkos::View<const size_t*, DT>& offsets,
480 const Kokkos::View<const char*, BDT>& imports) {
481 typedef typename DT::execution_space XS;
482 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
483 size_t tot_num_ent = 0;
484 NumEntriesFunctor<LO, DT, BDT> functor(num_packets_per_lid, offsets,
485 imports);
// The row count is narrowed from the view extent to LO for the policy.
486 const LO numRowsToUnpack =
487 static_cast<LO>(num_packets_per_lid.extent(0));
488 Kokkos::parallel_reduce("Total num entries in CRS to unpack",
489 range_policy(0, numRowsToUnpack),
491 return tot_num_ent;
492}
493
// unpackRowCount: read the leading entry count (one packed LO) of a packed
// row located at imports + offset and return it as size_t.
//
// Returns 0 when num_bytes == 0 (nothing is read in that case).
//
// NOTE(review): original lines 495 and 506 are missing from this listing.
// Line 506 is the body of the "packed LO does not fit in num_bytes" branch;
// callers compare the result with OrdinalTraits<size_t>::invalid(), so
// presumably that value is returned there - confirm against upstream.
494template <class LO>
496 size_t
497 unpackRowCount(const char imports[],
498 const size_t offset,
499 const size_t num_bytes) {
500 using PT = PackTraits<LO>;
501
502 LO num_ent_LO = 0;
503 if (num_bytes > 0) {
// A row that has bytes must hold at least the packed entry count.
504 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
505 if (p_num_bytes > num_bytes) {
507 }
508 const char* const in_buf = imports + offset;
// The return value of unpackValue is deliberately discarded.
509 (void)PT::unpackValue(num_ent_LO, in_buf);
510 }
511 return static_cast<size_t>(num_ent_LO);
512}
513
// Expands per-row batch counts into a flat list of (row index, batch number)
// pairs: for every i and every batch_no < batches_per_lid(i), writes
// batch_info(k, 0) = i and batch_info(k, 1) = batch_no for consecutive k.
// Returns true when the number of pairs written equals batch_info.extent(0).
//
// NOTE(review): the line carrying the function name (original line 520) is
// missing from this listing.
// NOTE(review): the loops write batch_info(batch, ...) without checking that
// batch stays below batch_info.extent(0); a mismatch is only detected by the
// final comparison, after any out-of-range writes would already have
// happened.
518template <class View1, class View2>
519inline bool
521 const View1& batches_per_lid,
522 View2& batch_info) {
523 using LO = typename View2::value_type;
524 size_t batch = 0;
525 for (size_t i = 0; i < batches_per_lid.extent(0); i++) {
526 for (size_t batch_no = 0; batch_no < batches_per_lid(i); batch_no++) {
527 batch_info(batch, 0) = static_cast<LO>(i);
// NOTE(review): batch_no (size_t) is stored without the explicit cast to LO
// that the line above uses.
528 batch_info(batch, 1) = batch_no;
529 batch++;
530 }
531 }
532 return batch == batch_info.extent(0);
533}
534
// unpackAndCombineIntoCrsMatrix (Kokkos version): unpack the received buffer
// `imports` and combine its entries into local_matrix using combine_mode.
//
// Visible steps:
//   1. Return immediately when import_lids is empty.
//   2. Validate input: ABSMAX, INSERT and any mode other than ADD/REPLACE
//      throw std::invalid_argument; so does a size mismatch between
//      import_lids and num_packets_per_lid.
//   3. Build per-row byte offsets from num_packets_per_lid.
//   4. Choose batch_size = min(default_batch_size, max entries per row) and
//      count how many batches each row needs (ceiling division, at least 1).
//   5. Build batch_info on the host and copy it to the device.
//   6. Launch UnpackCrsMatrixAndCombineFunctor with a TeamPolicy of
//      num_batches teams; throw std::runtime_error if it reports a nonzero
//      error code.
//
// NOTE(review): this listing omits original lines 543, 548, 585, 587, 600,
// 611, 613, 615, 617, 627, 637, 652 and 660. Among others, the function name
// line, the import_lids parameter, default_batch_size, num_entries_in_row,
// the per-row batch-count accumulation, the call that fills batch_info_h,
// the `functor` alias and team_size are declared or computed on missing
// lines. Restore them from the upstream file before editing.
542template <class LocalMatrix, class LocalMap, class BufferDeviceType>
544 const LocalMatrix& local_matrix,
545 const LocalMap& local_map,
546 const Kokkos::View<const char*, BufferDeviceType>& imports,
547 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
549 const Tpetra::CombineMode combine_mode) {
550 using ST = typename LocalMatrix::value_type;
551 using LO = typename LocalMap::local_ordinal_type;
552 using DT = typename LocalMap::device_type;
553 using XS = typename DT::execution_space;
554 const char prefix[] =
555 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
556 "unpackAndCombineIntoCrsMatrix: ";
557
558 const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
559 if (num_import_lids == 0) {
560 // Nothing to unpack
561 return;
562 }
563
564 {
565 // Check for correct input
566 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
567 std::invalid_argument,
568 prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
569 "static graph (i.e., was constructed with the CrsMatrix constructor "
570 "that takes a const CrsGraph pointer).");
571
572 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
573 std::invalid_argument,
574 prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
575 "(i.e., was constructed with the CrsMatrix constructor that takes a "
576 "const CrsGraph pointer).");
577
578 // Unknown combine mode!
579 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
580 std::invalid_argument,
581 prefix << "Invalid combine mode; should never get "
582 "here! Please report this bug to the Tpetra developers.");
583
584 // Check that sizes of input objects are consistent.
586 num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
588 std::invalid_argument,
589 prefix << "importLIDs.size() (" << num_import_lids << ") != "
590 "numPacketsPerLID.size() ("
591 << num_packets_per_lid.extent(0) << ").");
592 } // end QA error checking
593
594 // Get the offsets
// One more slot than there are rows.
595 Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids + 1);
596 computeOffsetsFromCounts(offsets, num_packets_per_lid);
597
598 // Determine the sizes of the unpack batches
599 size_t max_num_ent = compute_maximum_num_entries<LO, DT>(num_packets_per_lid, offsets, imports);
601 const size_t batch_size = std::min(default_batch_size, max_num_ent);
602
603 // To achieve some balance amongst threads, unpack each row in equal size batches
// batch_info is created with extent 0 here and resized once num_batches is
// known.
604 size_t num_batches = 0;
605 Kokkos::View<LO* [2], DT> batch_info("", num_batches);
606 Kokkos::View<size_t*, DT> batches_per_lid("", num_import_lids);
607 // Compute meta data that allows batch unpacking
608 Kokkos::parallel_reduce(
609 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
610 KOKKOS_LAMBDA(const size_t i, size_t& batches) {
612 imports.data(), offsets(i), num_packets_per_lid(i));
// Batches for this row: 1 if the row fits in a single batch, otherwise
// ceil(num_entries_in_row / batch_size). The first branch also covers
// batch_size == 0, so the division is never reached with a zero divisor.
614 (num_entries_in_row <= batch_size) ? 1 : num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
616 },
618 Kokkos::resize(batch_info, num_batches);
619
620 Kokkos::HostSpace host_space;
621 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
622 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
623 Kokkos::deep_copy(XS(), batches_per_lid_h, batches_per_lid);
624
625 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
626
628 // DEEP_COPY REVIEW - HOSTMIRROR-TO-DEVICE
629 Kokkos::deep_copy(XS(), batch_info, batch_info_h);
630
631 // FIXME (TJF SEP 2017)
632 // The scalar type is not necessarily default constructible
633 size_t bytes_per_value = PackTraits<ST>::packValueCount(ST());
634
635 // Now do the actual unpack!
// Atomic updates are requested whenever the execution space reports a
// concurrency other than 1.
636 const bool atomic = XS().concurrency() != 1;
638 functor f(
639 local_matrix,
640 local_map,
641 imports,
642 num_packets_per_lid,
643 import_lids,
644 batch_info,
645 offsets,
646 combine_mode,
647 batch_size,
648 bytes_per_value,
649 atomic);
650
// Non-GPU execution spaces, or an invalid team_size, use Kokkos::AUTO;
// otherwise the explicit team_size is used.
651 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
653 if (!Spaces::is_gpu_exec_space<XS>() || team_size == Teuchos::OrdinalTraits<size_t>::invalid()) {
654 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
655 } else {
656 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
657 }
658
659 auto error_code = f.error();
661 error_code != 0,
662 std::runtime_error,
663 prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code);
664} // unpackAndCombineIntoCrsMatrix (Kokkos version)
665
// unpackAndCombineWithOwningPIDsCount (Kokkos version): returns the sum of
// three entry counts:
//   1. entries in the first num_same_ids rows of local_matrix,
//   2. entries in the rows of local_matrix named by permute_from_lids,
//   3. entries contained in the received buffer (compute_total_num_entries).
// Row lengths are taken as row_map[lid + 1] - row_map[lid].
//
// NOTE(review): this listing omits original lines 668, 670-671 and 694 -
// presumably the function-name line, the permute_from_lids parameter, the
// preprocessor condition that opens the #else / #endif pair below, and the
// opening of the first parallel_reduce call. Restore them from the upstream
// file before editing.
666template <class LocalMatrix, class BufferDeviceType>
667size_t
669 const LocalMatrix& local_matrix,
672 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
673 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
674#else
675 const Kokkos::View<const char*, BufferDeviceType>& imports,
676 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
677#endif
678 const size_t num_same_ids) {
679 using Kokkos::parallel_reduce;
680 typedef typename LocalMatrix::ordinal_type LO;
681 typedef typename LocalMatrix::device_type device_type;
682 typedef typename device_type::execution_space XS;
683 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
684 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>> range_policy;
685 typedef BufferDeviceType BDT;
686
687 size_t count = 0;
688 LO num_items;
689
690 // Number of matrix entries to unpack (returned by this function).
// "Same" rows: source row lid is row lid itself, for lid in [0, num_same_ids).
691 num_items = static_cast<LO>(num_same_ids);
692 if (num_items) {
693 size_t kcnt = 0;
695 range_policy(0, num_items),
696 KOKKOS_LAMBDA(const LO lid, size_t& update) {
697 update += static_cast<size_t>(local_matrix.graph.row_map[lid + 1] - local_matrix.graph.row_map[lid]);
698 },
699 kcnt);
700 count += kcnt;
701 }
702
703 // Count entries copied directly from the source matrix with permuting.
704 num_items = static_cast<LO>(permute_from_lids.extent(0));
705 if (num_items) {
706 size_t kcnt = 0;
707 parallel_reduce(
708 range_policy(0, num_items),
709 KOKKOS_LAMBDA(const LO i, size_t& update) {
710 const LO lid = permute_from_lids(i);
711 update += static_cast<size_t>(local_matrix.graph.row_map[lid + 1] - local_matrix.graph.row_map[lid]);
712 },
713 kcnt);
714 count += kcnt;
715 }
716
717 {
718 // Count entries received from other MPI processes.
719 const size_type np = num_packets_per_lid.extent(0);
720 Kokkos::View<size_t*, device_type> offsets("offsets", np + 1);
721 computeOffsetsFromCounts(offsets, num_packets_per_lid);
722 count +=
723 compute_total_num_entries<LO, device_type, BDT>(num_packets_per_lid,
724 offsets, imports);
725 }
726
727 return count;
728} // unpackAndCombineWithOwningPIDsCount (Kokkos version)
729
// setupRowPointersForRemotes: for every received row i, read its entry count
// from the packed buffer (unpackRowCount) and atomically add that count to
// tgt_rowptr(import_lids(i)). The add is atomic, so several i may target the
// same row.
//
// Returns the number of rows whose unpacked count equals InvalidNum; 0 means
// no error was detected.
//
// NOTE(review): this listing omits original lines 733, 743 and 747 -
// presumably the tgt_rowptr parameter, the definition of InvalidNum, and the
// opening of the parallel_reduce call.
// NOTE(review): when num_ent == InvalidNum the lambda does not return early,
// so the atomic add below is still performed with that value.
// unpackAndCombineIntoCrsArrays2 returns early in the same situation.
731template <class LO, class DT, class BDT>
732int setupRowPointersForRemotes(
734 const typename PackTraits<LO>::input_array_type& import_lids,
735 const Kokkos::View<const char*, BDT>& imports,
736 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
737 const typename PackTraits<size_t>::input_array_type& offsets) {
738 using Kokkos::parallel_reduce;
739 typedef typename DT::execution_space XS;
740 typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
741 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
742
744 const size_type N = num_packets_per_lid.extent(0);
745
746 int errors = 0;
748 "Setup row pointers for remotes",
749 range_policy(0, N),
750 KOKKOS_LAMBDA(const size_t i, int& k_error) {
// Element type of tgt_rowptr, used so the increment has a matching type.
751 typedef typename std::remove_reference<decltype(tgt_rowptr(0))>::type atomic_incr_type;
752 const size_t num_bytes = num_packets_per_lid(i);
753 const size_t offset = offsets(i);
754 const size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
755 if (num_ent == InvalidNum) {
756 k_error += 1;
757 }
758 Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
759 },
760 errors);
761 return errors;
762}
763
764// Convert array of row lengths to a CRS pointer array
// On entry tgt_rowptr(i) holds the length of row i. The scan below replaces
// it, in the final pass, with the sum of all earlier lengths (an exclusive
// prefix sum). The scan runs over [0, new_start_row.extent(0)).
//
// NOTE(review): this listing omits original lines 767, 774 and 780 -
// presumably the tgt_rowptr parameter, the opening of the parallel_scan call,
// and a statement inside the `final` branch (likely the one that initializes
// new_start_row(i), which is otherwise never written here).
765template <class DT>
766void makeCrsRowPtrFromLengths(
768 const Kokkos::View<size_t*, DT>& new_start_row) {
769 using Kokkos::parallel_scan;
770 typedef typename DT::execution_space XS;
771 typedef typename Kokkos::View<size_t*, DT>::size_type size_type;
772 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
773 const size_type N = new_start_row.extent(0);
775 range_policy(0, N),
776 KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
// Read the length before it is overwritten in the final pass.
777 auto cur_val = tgt_rowptr(i);
778 if (final) {
779 tgt_rowptr(i) = update;
781 }
782 update += cur_val;
783 });
784}
785
// copyDataFromSameIDs: for rows i in [0, num_same_ids), where source and
// target local row index are both i, copy the row of local_matrix into the
// target arrays starting at tgt_rowptr(i):
//   tgt_vals   <- matrix values
//   tgt_colind <- global column index (local_col_map.getGlobalElement)
//   tgt_pids   <- src_pids(column), or -1 when that equals my_pid
// It also atomically adds the row length to new_start_row(i).
// NOTE(review): presumably this advance makes later writers to the same
// target row append after the copied entries - confirm against the caller.
//
// NOTE(review): this listing omits original lines 788 and 790 - presumably
// the tgt_colind and tgt_vals parameters, which are written below but not
// declared in the visible lines.
786template <class LocalMatrix, class LocalMap>
787void copyDataFromSameIDs(
789 const typename PackTraits<int>::output_array_type& tgt_pids,
791 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
792 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
793 const typename PackTraits<int>::input_array_type& src_pids,
794 const LocalMatrix& local_matrix,
795 const LocalMap& local_col_map,
796 const size_t num_same_ids,
797 const int my_pid) {
798 using Kokkos::parallel_for;
799 typedef typename LocalMap::device_type DT;
800 typedef typename LocalMap::local_ordinal_type LO;
801 typedef typename DT::execution_space XS;
802 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>> range_policy;
803
804 parallel_for(
805 range_policy(0, num_same_ids),
806 KOKKOS_LAMBDA(const size_t i) {
807 typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
808
809 const LO src_lid = static_cast<LO>(i);
810 size_t src_row = local_matrix.graph.row_map(src_lid);
811
812 const LO tgt_lid = static_cast<LO>(i);
813 const size_t tgt_row = tgt_rowptr(tgt_lid);
814
// nsr = number of entries in the source row.
815 const size_t nsr = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
816 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
817
// j indexes the source arrays; (tgt_row + j - src_row) is the matching
// position in the target arrays.
818 for (size_t j = local_matrix.graph.row_map(src_lid);
819 j < local_matrix.graph.row_map(src_lid + 1); ++j) {
820 LO src_col = local_matrix.graph.entries(j);
821 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
822 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
823 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
824 }
825 });
826}
827
// copyDataFromPermuteIDs: for each i, copy source row permute_from_lids(i) of
// local_matrix into target row permute_to_lids(i), starting at
// tgt_rowptr(target row):
//   tgt_vals   <- matrix values
//   tgt_colind <- global column index (local_col_map.getGlobalElement)
//   tgt_pids   <- src_pids(column), or -1 when that equals my_pid
// It also atomically adds the source row length to new_start_row(target row).
//
// NOTE(review): this listing omits original lines 830, 832 and 836-837 -
// presumably the tgt_colind, tgt_vals, permute_to_lids and permute_from_lids
// parameters, all of which are used below but not declared in the visible
// lines.
828template <class LocalMatrix, class LocalMap>
829void copyDataFromPermuteIDs(
831 const typename PackTraits<int>::output_array_type& tgt_pids,
833 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
834 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
835 const typename PackTraits<int>::input_array_type& src_pids,
838 const LocalMatrix& local_matrix,
839 const LocalMap& local_col_map,
840 const int my_pid) {
841 using Kokkos::parallel_for;
842 typedef typename LocalMap::device_type DT;
843 typedef typename LocalMap::local_ordinal_type LO;
844 typedef typename DT::execution_space XS;
845 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
846 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
847
// The loop length comes from permute_to_lids; permute_from_lids is indexed
// with the same i.
848 const size_type num_permute_to_lids = permute_to_lids.extent(0);
849
850 parallel_for(
851 range_policy(0, num_permute_to_lids),
852 KOKKOS_LAMBDA(const size_t i) {
853 typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
854
855 const LO src_lid = permute_from_lids(i);
856 const size_t src_row = local_matrix.graph.row_map(src_lid);
857
858 const LO tgt_lid = permute_to_lids(i);
859 const size_t tgt_row = tgt_rowptr(tgt_lid);
860
// nsr = number of entries in the source row.
861 size_t nsr = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
862 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
863
// j indexes the source arrays; (tgt_row + j - src_row) is the matching
// position in the target arrays.
864 for (size_t j = local_matrix.graph.row_map(src_lid);
865 j < local_matrix.graph.row_map(src_lid + 1); ++j) {
866 LO src_col = local_matrix.graph.entries(j);
867 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
868 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
869 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
870 }
871 });
872}
873
// unpackAndCombineIntoCrsArrays2: unpack every received row into the target
// CRS arrays. For each import i with a nonzero packet size:
//   1. read the row's entry count (unpackRowCount),
//   2. reserve [start_row, start_row + num_ent) in target row import_lids(i)
//      by atomically advancing new_start_row,
//   3. unpack GIDs, PIDs and values into subviews of tgt_colind, tgt_pids and
//      tgt_vals over that range (unpackRow),
//   4. replace every unpacked PID equal to my_pid with -1.
//
// Returns the reduction result `errors`; 0 means no error.
// NOTE(review): the value is a sum, not a count - an invalid entry count adds
// 1, but unpackRow failures add its return code (20-24).
// The local_matrix and local_col_map parameters are unnamed and unused here.
//
// NOTE(review): this listing omits original lines 876, 878, 881 and 898 -
// presumably the tgt_colind, tgt_vals and import_lids parameters and a using
// declaration that brings OrdinalTraits into scope.
874template <typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
875int unpackAndCombineIntoCrsArrays2(
877 const typename PackTraits<int>::output_array_type& tgt_pids,
879 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
880 const typename PackTraits<size_t>::input_array_type& offsets,
882#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
883 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
884 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
885#else
886 const Kokkos::View<const char*, BufferDeviceType>& imports,
887 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
888#endif
889 const LocalMatrix& /* local_matrix */,
890 const LocalMap /*& local_col_map*/,
891 const int my_pid,
892 const size_t bytes_per_value) {
893 using Kokkos::atomic_fetch_add;
894 using Kokkos::MemoryUnmanaged;
895 using Kokkos::parallel_reduce;
896 using Kokkos::subview;
897 using Kokkos::View;
899 typedef typename LocalMap::device_type DT;
900 typedef typename LocalMap::local_ordinal_type LO;
901 typedef typename LocalMap::global_ordinal_type GO;
902 typedef typename LocalMatrix::value_type ST;
903 typedef typename DT::execution_space XS;
904 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
905 typedef typename Kokkos::pair<size_type, size_type> slice;
906 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
907
// Unmanaged view types used for the per-row output subviews.
908 typedef View<int*, DT, MemoryUnmanaged> pids_out_type;
909 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
910 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
911
// Sentinel compared against the result of unpackRowCount.
912 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
913
914 int errors = 0;
915 const size_type num_import_lids = import_lids.size();
916
917 // RemoteIDs: Loop structure following UnpackAndCombine
918 parallel_reduce(
919 "Unpack and combine into CRS",
920 range_policy(0, num_import_lids),
921 KOKKOS_LAMBDA(const size_t i, int& k_error) {
922 typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
923 const size_t num_bytes = num_packets_per_lid(i);
924 const size_t offset = offsets(i);
925 if (num_bytes == 0) {
926 // Empty buffer means that the row is empty.
927 return;
928 }
929 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
930 if (num_ent == InvalidNum) {
931 k_error += 1;
932 return;
933 }
934 const LO lcl_row = import_lids(i);
// atomic_fetch_add returns the previous value, which becomes this row's
// exclusive write range start.
935 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
936 const size_t end_row = start_row + num_ent;
937
938 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
939 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
940 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
941
942 k_error += unpackRow<ST, LO, GO>(gids_out, pids_out, vals_out,
943 imports.data(), offset, num_bytes,
944 num_ent, bytes_per_value);
945
946 // Correct target PIDs.
// NOTE(review): this loop runs even when unpackRow returned an error, in
// which case pids_out may not have been fully written.
947 for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
948 const int pid = pids_out(j);
949 pids_out(j) = (pid != my_pid) ? pid : -1;
950 }
951 },
952 errors);
953
954 return errors;
955}
956
// unpackAndCombineIntoCrsArrays: build the target CRS arrays (tgt_rowptr,
// tgt_colind, tgt_vals, tgt_pids) from three sources - "same" rows and
// "permute" rows of local_matrix, and rows received in `imports`.
//
// Visible steps:
//   1. Zero tgt_rowptr over [0, tgt_num_rows].
//   2. Store the row lengths of same-ID and permuted rows in tgt_rowptr.
//   3. Compute byte offsets of the received rows and add their entry counts
//      to tgt_rowptr (setupRowPointersForRemotes).
//   4. Convert lengths to row pointers (makeCrsRowPtrFromLengths).
//   5. Copy same-ID and permuted rows, then - unless imports is empty -
//      unpack the received rows (unpackAndCombineIntoCrsArrays2).
// Throws std::logic_error when step 3 or the final unpack reports an error,
// and, in HAVE_TPETRA_DEBUG builds, when the last offset differs from
// imports.extent(0).
//
// NOTE(review): this listing omits original lines 958, 961, 969-970, 972-973
// and 985. Among others, the function-name line and the import_lids,
// permute_to_lids, permute_from_lids, tgt_colind and tgt_vals parameters are
// used below but declared on missing lines. Restore them from the upstream
// file before editing.
// NOTE(review): tgt_num_nonzeros is not referenced in any visible line.
957template <typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
959 const LocalMatrix& local_matrix,
960 const LocalMap& local_col_map,
962#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
963 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
964 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
965#else
966 const Kokkos::View<const char*, BufferDeviceType>& imports,
967 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
968#endif
971 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
974 const typename PackTraits<int>::input_array_type& src_pids,
975 const typename PackTraits<int>::output_array_type& tgt_pids,
976 const size_t num_same_ids,
977 const size_t tgt_num_rows,
978 const size_t tgt_num_nonzeros,
979 const int my_tgt_pid,
980 const size_t bytes_per_value) {
981 using Kokkos::MemoryUnmanaged;
982 using Kokkos::parallel_for;
983 using Kokkos::subview;
984 using Kokkos::View;
986 typedef typename LocalMap::device_type DT;
987 typedef typename LocalMap::local_ordinal_type LO;
988 typedef typename DT::execution_space XS;
989 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
990 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>> range_policy;
991 typedef BufferDeviceType BDT;
992
993 const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
994
995 const size_t N = tgt_num_rows;
996
997 // In the case of reduced communicators, the sourceMatrix won't have
998 // the right "my_pid", so thus we have to supply it.
999 const int my_pid = my_tgt_pid;
1000
1001 // Zero the rowptr
// N + 1 entries are cleared: one per row plus the trailing entry.
1002 parallel_for(
1003 range_policy(0, N + 1),
1004 KOKKOS_LAMBDA(const size_t i) {
1005 tgt_rowptr(i) = 0;
1006 });
1007
1008 // same IDs: Always first, always in the same place
// At this stage tgt_rowptr holds row lengths, not offsets.
1009 parallel_for(
1010 range_policy(0, num_same_ids),
1011 KOKKOS_LAMBDA(const size_t i) {
1012 const LO tgt_lid = static_cast<LO>(i);
1013 const LO src_lid = static_cast<LO>(i);
1014 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
1015 });
1016
1017 // Permute IDs: Still local, but reordered
1018 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1019 parallel_for(
1020 range_policy(0, num_permute_to_lids),
1021 KOKKOS_LAMBDA(const size_t i) {
1022 const LO tgt_lid = permute_to_lids(i);
1023 const LO src_lid = permute_from_lids(i);
1024 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
1025 });
1026
1027 // Get the offsets from the number of packets per LID
1028 const size_type num_import_lids = import_lids.extent(0);
1029 View<size_t*, DT> offsets("offsets", num_import_lids + 1);
1030 computeOffsetsFromCounts(offsets, num_packets_per_lid);
1031
1032#ifdef HAVE_TPETRA_DEBUG
1033 {
// Debug-only consistency check: the last offset must equal the buffer size.
1034 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1035 const bool condition =
1036 nth_offset_h != static_cast<size_t>(imports.extent(0));
1037 TEUCHOS_TEST_FOR_EXCEPTION(condition, std::logic_error, prefix << "The final offset in bytes " << nth_offset_h << " != imports.size() = " << imports.extent(0) << ". Please report this bug to the Tpetra developers.");
1038 }
1039#endif // HAVE_TPETRA_DEBUG
1040
1041 // Setup row pointers for remotes
1042 int k_error =
1043 setupRowPointersForRemotes<LO, DT, BDT>(tgt_rowptr,
1044 import_lids, imports, num_packets_per_lid, offsets);
1045 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix << " Error transferring data to target row pointers. "
1046 "Please report this bug to the Tpetra developers.");
1047
1048 // If multiple processes contribute to the same row, we may need to
1049 // update row offsets. This tracks that.
1050 View<size_t*, DT> new_start_row("new_start_row", N + 1);
1051
1052 // Turn row length into a real CRS row pointer
1053 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1054
1055 // SameIDs: Copy the data over
1056 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1057 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1058
1059 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1060 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1061 local_matrix, local_col_map, my_pid);
1062
// Nothing was received, so there is nothing left to unpack.
// NOTE(review): extent(0) is presumably unsigned, which would make "<= 0"
// equivalent to "== 0" - confirm.
1063 if (imports.extent(0) <= 0) {
1064 return;
1065 }
1066
1067 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1068 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1069 local_matrix, local_col_map, my_pid, bytes_per_value);
1070 TEUCHOS_TEST_FOR_EXCEPTION(
1071 unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1072 "should never happen. Please report this bug to the Tpetra developers.");
1073
1074 return;
1075}
1076
1077} // namespace UnpackAndCombineCrsMatrixImpl
1078
// Host-array (Teuchos::ArrayView) entry point for unpacking packed matrix
// data into the local device matrix of sourceMatrix.
//
// The three input arrays (imports, numPacketsPerLID, importLIDs) are turned
// into views named imports_d, num_packets_per_lid_d and import_lids_d, and
// those are handed to
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix together with
// the local matrix and the local column map. The constantNumPackets argument
// is unused (its name is commented out).
//
// NOTE(review): this listing is missing several lines of the definition: the
// return type / function name and the sourceMatrix parameter, the last
// parameter and opening brace, the calls that construct the three *_d views,
// and the final argument line of the Impl call. The explicit-instantiation
// macros at the end of this file list a matching signature under the name
// Details::unpackCrsMatrixAndCombine -- confirm against the real source.
1113template <typename ST, typename LO, typename GO, typename Node>
1116 const Teuchos::ArrayView<const char>& imports,
1117 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1118 const Teuchos::ArrayView<const LO>& importLIDs,
1119 size_t /* constantNumPackets */,
1121 using Kokkos::View;
1122 typedef typename Node::device_type device_type;
1123 typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_device_type local_matrix_device_type;
// Compile-time guard: the Node's device type and the local matrix's device
// type must be the same type.
1124 static_assert(std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1125 "Node::device_type and LocalMatrix::device_type must be the same.");
1126
1127 // Convert all Teuchos::Array to Kokkos::View.
1128 device_type outputDevice;
1129
1130 // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1131 // them to device. Since unpacking is done directly in to the local matrix
1132 // (lclMatrix), no copying needs to be performed after unpacking.
// NOTE(review): the declaration of num_packets_per_lid_d and the name of the
// function building each view are not visible here; only the trailing
// arguments (size, a `true` flag, and a label string) remain.
1135 numPacketsPerLID.size(), true, "num_packets_per_lid");
1136
1137 auto import_lids_d =
1139 importLIDs.size(), true, "import_lids");
1140
1141 auto imports_d =
1143 imports.size(), true, "imports");
1144
1145 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1146 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1147
1148 // KDDKDD This loop doesn't appear to do anything; what is it?
1149 // KDDKDD for (int i=0; i<importLIDs.size(); i++)
1150 // KDDKDD {
1151 // KDDKDD auto lclRow = importLIDs[i];
1152 // KDDKDD Teuchos::ArrayView<const LO> A_indices;
1153 // KDDKDD Teuchos::ArrayView<const ST> A_values;
1154 // KDDKDD sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
1155 // KDDKDD }
1156 // Now do the actual unpack!
1157 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1158 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1160}
1161
// Kokkos::DualView entry point for unpacking packed matrix data into the
// local device matrix of sourceMatrix.
//
// imports and numPacketsPerLID are synced to device here when they report
// need_sync_device(); importLIDs is only asserted to be current on device.
// The device views, the local matrix, the local column map and combineMode
// are then forwarded to
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix.
// The constantNumPackets argument is unused (its name is commented out).
//
// NOTE(review): some parameter lines (the sourceMatrix parameter, the second
// template argument of each DualView, and the names of two parameters) are
// missing from this listing; the names used in the body are numPacketsPerLID
// and importLIDs.
1162template <typename ST, typename LO, typename GO, typename NT>
1163void unpackCrsMatrixAndCombineNew(
1165 Kokkos::DualView<char*,
1167 imports,
1168 Kokkos::DualView<size_t*,
1171 const Kokkos::DualView<const LO*,
1173 const size_t /* constantNumPackets */,
1174 const CombineMode combineMode) {
1175 using Kokkos::View;
1176 using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1177 using dist_object_type = DistObject<char, LO, GO, NT>;
1178 using device_type = typename crs_matrix_type::device_type;
1179 using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
1180 using buffer_device_type = typename dist_object_type::buffer_device_type;
1181
// Compile-time guard: the matrix's device type and its local matrix's device
// type must be the same type.
1182 static_assert(std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1183 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1184 "must be the same.");
1185
// Bring the per-LID packet counts up to date on device before taking the
// device view.
1186 if (numPacketsPerLID.need_sync_device()) {
1187 numPacketsPerLID.sync_device();
1188 }
1189 auto num_packets_per_lid_d = numPacketsPerLID.view_device();
1190
// importLIDs is declared as a const DualView above, and it is checked rather
// than synced: the caller must supply it already current on device.
1191 TEUCHOS_ASSERT(!importLIDs.need_sync_device());
1192 auto import_lids_d = importLIDs.view_device();
1193
// Same treatment for the packed byte buffer.
1194 if (imports.need_sync_device()) {
1195 imports.sync_device();
1196 }
1197 auto imports_d = imports.view_device();
1198
1199 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1200 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1201 typedef decltype(local_col_map) local_map_type;
1202
// The Impl template arguments are spelled out explicitly here (local matrix
// type, local map type, buffer device type).
1203 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1204 local_matrix_device_type,
1205 local_map_type,
1206 buffer_device_type>(local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1207 import_lids_d, combineMode);
1208}
1209
1256//
// Teuchos::ArrayView front end for
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount.
//
// After validating its arguments, it prepares Kokkos::Views named
// permute_from_lids_d, imports_d and num_packets_per_lid_d (typed on
// kokkos_device_type, i.e. the Node's execution space paired with the
// communication-buffer memory space) and returns the size_t produced by the
// Impl routine for (local_matrix, those three views, numSameIDs).
//
// The constantNumPackets and combineMode arguments are unused (their names
// are commented out). importLIDs and permuteToLIDs are used only in the size
// checks below.
//
// Throws std::invalid_argument when:
//  - permuteToLIDs.size() != permuteFromLIDs.size(),
//  - sourceMatrix is not locally indexed,
//  - importLIDs.size() != numPacketsPerLID.size().
//
// NOTE(review): the function-name line, the sourceMatrix parameter line and
// the name of the call that builds each view are missing from this listing.
1265template <typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1266size_t
1269 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1270 const Teuchos::ArrayView<const char>& imports,
1271 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1272 size_t /* constantNumPackets */,
1273 CombineMode /* combineMode */,
1274 size_t numSameIDs,
1275 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1276 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs) {
1277 using Kokkos::MemoryUnmanaged;
1278 using Kokkos::View;
1279 typedef typename Node::device_type DT;
// Prepended to every exception message raised below.
1280 const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1281
1282 TEUCHOS_TEST_FOR_EXCEPTION(permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1283 prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != "
1284 "permuteFromLIDs.size() = "
1285 << permuteFromLIDs.size() << ".");
1286 // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1287 // process, then the matrix is neither locally nor globally indexed.
1288 const bool locallyIndexed = sourceMatrix.isLocallyIndexed();
1289 TEUCHOS_TEST_FOR_EXCEPTION(!locallyIndexed, std::invalid_argument, prefix << "The input "
1290 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1291 TEUCHOS_TEST_FOR_EXCEPTION(importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
1292 prefix << "importLIDs.size() = " << importLIDs.size() << " != "
1293 "numPacketsPerLID.size() = "
1294 << numPacketsPerLID.size() << ".");
1295
1296 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1297
// Device type used for the three views below: the Node's execution space
// together with the communication-buffer memory space.
1298 using kokkos_device_type = Kokkos::Device<typename Node::device_type::execution_space,
1299 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
1300
// When KOKKOS_ENABLE_DEPRECATED_CODE_4 is defined, the View types are
// spelled with two extra `void, void` template arguments; otherwise they are
// not. The same pattern repeats for each of the three views.
1301#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1302 Kokkos::View<LocalOrdinal const*, kokkos_device_type, void, void> permute_from_lids_d =
1303#else
1304 Kokkos::View<LocalOrdinal const*, kokkos_device_type> permute_from_lids_d =
1305#endif
1307 permuteFromLIDs.getRawPtr(),
1308 permuteFromLIDs.size(), true,
1309 "permute_from_lids");
1310
1311#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1312 Kokkos::View<const char*, kokkos_device_type, void, void> imports_d =
1313#else
1314 Kokkos::View<const char*, kokkos_device_type> imports_d =
1315#endif
1317 imports.getRawPtr(),
1318 imports.size(), true,
1319 "imports");
1320
1321#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1322 Kokkos::View<const size_t*, kokkos_device_type, void, void> num_packets_per_lid_d =
1323#else
1324 Kokkos::View<const size_t*, kokkos_device_type> num_packets_per_lid_d =
1325#endif
1327 numPacketsPerLID.getRawPtr(),
1328 numPacketsPerLID.size(), true,
1329 "num_packets_per_lid");
1330
// Delegate to the Impl routine and return its result unchanged.
1331 return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1332 local_matrix, permute_from_lids_d, imports_d,
1333 num_packets_per_lid_d, numSameIDs);
1334} // unpackAndCombineWithOwningPIDsCount (Teuchos::Array version)
1335
1350
// unpackAndCombineIntoCrsArrays, overload whose outputs are Kokkos::Views
// (crs_rowptr_d, crs_colind_d, crs_vals_d, TargetPids).
//
// Steps visible in this body:
//  1. check the permute / import array sizes and that sourceMatrix is
//     locally indexed (std::invalid_argument otherwise);
//  2. obtain TargetNumNonzeros from
//     UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount;
//  3. resize crs_rowptr_d to TargetNumRows + 1 and crs_colind_d, crs_vals_d
//     and TargetPids to TargetNumNonzeros, then fill TargetPids with -1;
//  4. determine bytes_per_value via PackTraits<ST>::packValueCount;
//  5. call UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays;
//  6. overwrite every TargetPids entry that is still -1 with MyTargetPID.
//
// Each stage is wrapped in a Tpetra::Details::ProfilingRegion held by `tm`;
// assigning Teuchos::null to `tm` releases the current region object.
//
// NOTE(review): this listing is missing the function-name line, the names of
// several View parameters, the `#if` lines opening each `void, void` group,
// and a few statements in the body (flagged where they occur). The parameter
// names used in the body are import_lids_d, imports_d,
// num_packets_per_lid_d, permute_to_lids_d and permute_from_lids_d.
1351template <typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1354 const Kokkos::View<LocalOrdinal const*,
1355 Kokkos::Device<typename Node::device_type::execution_space,
1356 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1358 ,
1359 void, void
1360#endif
1361 >
1363 const Kokkos::View<const char*,
1364 Kokkos::Device<typename Node::device_type::execution_space,
1365 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1367 ,
1368 void, void
1369#endif
1370 >
1371 imports_d,
1372 const Kokkos::View<const size_t*,
1373 Kokkos::Device<typename Node::device_type::execution_space,
1374 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1376 ,
1377 void, void
1378#endif
1379 >
1381 const size_t numSameIDs,
1382 const Kokkos::View<LocalOrdinal const*,
1383 Kokkos::Device<typename Node::device_type::execution_space,
1384 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1386 ,
1387 void, void
1388#endif
1389 >
1391 const Kokkos::View<LocalOrdinal const*,
1392 Kokkos::Device<typename Node::device_type::execution_space,
1393 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1395 ,
1396 void, void
1397#endif
1398 >
1400 size_t TargetNumRows,
1401 const int MyTargetPID,
1402 Kokkos::View<size_t*, typename Node::device_type>& crs_rowptr_d,
1403 Kokkos::View<GlobalOrdinal*, typename Node::device_type>& crs_colind_d,
1404 Kokkos::View<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type*, typename Node::device_type>& crs_vals_d,
1405 const Teuchos::ArrayView<const int>& SourcePids,
1406 Kokkos::View<int*, typename Node::device_type>& TargetPids) {
1407 using execution_space = typename Node::execution_space;
1409
1410 using Kokkos::deep_copy;
1411 using Kokkos::View;
1412
1413 using Teuchos::ArrayView;
1414 using Teuchos::outArg;
1415 using Teuchos::REDUCE_MAX;
1416 using Teuchos::reduceAll;
1417
1418 typedef typename Node::device_type DT;
1419
// ST is the matrix's impl_scalar_type (the definition of matrix_type is on a
// line missing from this listing).
1421 typedef typename matrix_type::impl_scalar_type ST;
1422
// Prepended to every exception message raised below.
1423 const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1424 Teuchos::RCP<Tpetra::Details::ProfilingRegion> tm;
1425
1426 using Kokkos::MemoryUnmanaged;
1427
1428 TEUCHOS_TEST_FOR_EXCEPTION(permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1429 prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size() << " != "
1430 "permute_from_lids_d.size() = "
1431 << permute_from_lids_d.size() << ".");
1432 // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1433 // process, then the matrix is neither locally nor globally indexed.
1434 const bool locallyIndexed = sourceMatrix.isLocallyIndexed();
1435 TEUCHOS_TEST_FOR_EXCEPTION(!locallyIndexed, std::invalid_argument, prefix << "The input "
1436 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1437 TEUCHOS_TEST_FOR_EXCEPTION(((size_t)import_lids_d.size()) != num_packets_per_lid_d.size(), std::invalid_argument,
1438 prefix << "import_lids_d.size() = " << import_lids_d.size() << " != "
1439 "num_packets_per_lid_d.size() = "
1440 << num_packets_per_lid_d.size() << ".");
1441
1442 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1443
1444 // TargetNumNonzeros is number of nonzeros in local matrix.
1445 tm = Teuchos::rcp(new Tpetra::Details::ProfilingRegion("Tpetra::Details::unpackAndCombineIntoCrsArrays_new: unpackAndCombineWithOwningPIDsCount"));
1446 size_t TargetNumNonzeros =
1447 UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1448 local_matrix, permute_from_lids_d, imports_d,
1449 num_packets_per_lid_d, numSameIDs);
1450 tm = Teuchos::null;
1451
// Size the three CRS output views from the count just computed.
1452 tm = Teuchos::rcp(new Tpetra::Details::ProfilingRegion("Tpetra::Details::unpackAndCombineIntoCrsArrays_new: resize CRS pointers"));
1453 Kokkos::resize(crs_rowptr_d, TargetNumRows + 1);
1454 Kokkos::resize(crs_colind_d, TargetNumNonzeros);
1455 Kokkos::resize(crs_vals_d, TargetNumNonzeros);
1456 tm = Teuchos::null;
1457
// NOTE(review): the line opening this check is missing from the listing; the
// condition shown is the same permute-size comparison already tested at the
// top of the function, so this looks redundant -- confirm.
1459 permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1460 prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size()
1461 << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1462
// Size TargetPids to match and fill it with -1; entries still equal to -1
// after unpacking are replaced by MyTargetPID at the end of this function.
1463 if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1464 Kokkos::resize(TargetPids, TargetNumNonzeros);
1465 }
1466 Kokkos::deep_copy(execution_space(), TargetPids, -1);
1467
1468 // Grab pointers for sourceMatrix
1469 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1470
1471 tm = Teuchos::rcp(new Tpetra::Details::ProfilingRegion("Tpetra::Details::unpackAndCombineIntoCrsArrays_new: create mirror views from inputs"));
1472 // Convert input arrays to Kokkos::Views
1473 DT outputDevice;
1474
// NOTE(review): the call that builds src_pids_d from SourcePids is on a line
// missing from this listing; only its trailing arguments remain.
1475 auto src_pids_d =
1477 SourcePids.size(), true, "src_pids");
1478
1479 tm = Teuchos::null;
1480
// bytes_per_value is the PackTraits<ST>::packValueCount result for one value.
// NOTE(review): the `if` condition selecting between the two branches below
// is on a line missing from this listing.
1481 size_t bytes_per_value = 0;
1483 // assume that ST is default constructible
1484 bytes_per_value = PackTraits<ST>::packValueCount(ST());
1485 } else {
1486 // Since the packed data come from the source matrix, we can use the source
1487 // matrix to get the number of bytes per Scalar value stored in the matrix.
1488 // This assumes that all Scalar values in the source matrix require the same
1489 // number of bytes. If the source matrix has no entries on the calling
1490 // process, then we hope that some process does have some idea how big
1491 // a Scalar value is. Of course, if no processes have any entries, then no
1492 // values should be packed (though this does assume that in our packing
1493 // scheme, rows with zero entries take zero bytes).
1494 size_t bytes_per_value_l = 0;
1495 if (local_matrix.values.extent(0) > 0) {
1496 const ST& val = local_matrix.values(0);
1498 } else {
// NOTE(review): element 0 of crs_vals_d is read here from host code, in the
// case where the local matrix has no values. Confirm crs_vals_d is non-empty
// and host-accessible whenever this branch can run.
1499 const ST& val = crs_vals_d(0);
1501 }
// Max-reduce over sourceMatrix's communicator into bytes_per_value (the
// input-argument line of this call is missing from the listing).
1502 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1503 Teuchos::REDUCE_MAX,
1505 outArg(bytes_per_value));
1506 }
1507
// Fill the output arrays. NOTE(review): three argument lines of this call
// are missing from the listing.
1508 tm = Teuchos::rcp(new Tpetra::Details::ProfilingRegion("Tpetra::Details::unpackAndCombineIntoCrsArrays_new: unpackAndCombineIntoCrsArrays"));
1509 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1510 local_matrix, local_col_map, import_lids_d, imports_d,
1514 bytes_per_value);
1515 tm = Teuchos::null;
1516
// NOTE(review): the comment and region label below say "copy back to host",
// but the only code that follows is a parallel_for over TargetPids; no copy
// is visible in this overload.
1517 // Copy outputs back to host
1518 tm = Teuchos::rcp(new Tpetra::Details::ProfilingRegion("Tpetra::Details::unpackAndCombineIntoCrsArrays_new: copy back to host"));
1519
// Replace every TargetPids entry still holding the -1 fill value with
// MyTargetPID.
1520 Kokkos::parallel_for(
1521 "setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0, TargetPids.size()), KOKKOS_LAMBDA(const size_t i) {
1522 if (TargetPids(i) == -1) TargetPids(i) = MyTargetPID;
1523 });
1524
1525} // unpackAndCombineIntoCrsArrays
1526
// unpackAndCombineIntoCrsArrays, overload whose outputs are Teuchos arrays
// (CRS_rowptr, CRS_colind, CRS_vals as ArrayRCP; TargetPids as Array<int>).
//
// Steps visible in this body:
//  1. check the permute / import array sizes and that sourceMatrix is
//     locally indexed (std::invalid_argument otherwise);
//  2. obtain TargetNumNonzeros from
//     UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount;
//  3. resize CRS_rowptr to TargetNumRows + 1 and fill TargetPids with
//     TargetNumNonzeros copies of -1;
//  4. build views crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d and
//     tgt_pids_d from the host arrays;
//  5. determine bytes_per_value via PackTraits<ST>::packValueCount;
//  6. call UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays;
//  7. deep_copy the four output views back into the Teuchos arrays' storage.
//
// Each stage is wrapped in a Tpetra::Details::ProfilingRegion held by `tm`;
// assigning Teuchos::null to `tm` releases the current region object.
//
// NOTE(review): this listing is missing the function-name line, the names of
// several View parameters, the `#if` lines opening each `void, void` group,
// and a number of statements in the body (flagged where they occur). The
// parameter names used in the body are import_lids_d, imports_d,
// num_packets_per_lid_d, permute_to_lids_d and permute_from_lids_d.
1527template <typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1530 const Kokkos::View<LocalOrdinal const*,
1531 Kokkos::Device<typename Node::device_type::execution_space,
1532 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1534 ,
1535 void, void
1536#endif
1537 >
1539 const Kokkos::View<const char*,
1540 Kokkos::Device<typename Node::device_type::execution_space,
1541 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1543 ,
1544 void, void
1545#endif
1546 >
1547 imports_d,
1548 const Kokkos::View<const size_t*,
1549 Kokkos::Device<typename Node::device_type::execution_space,
1550 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1552 ,
1553 void, void
1554#endif
1555 >
1557 const size_t numSameIDs,
1558 const Kokkos::View<LocalOrdinal const*,
1559 Kokkos::Device<typename Node::device_type::execution_space,
1560 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1562 ,
1563 void, void
1564#endif
1565 >
1567 const Kokkos::View<LocalOrdinal const*,
1568 Kokkos::Device<typename Node::device_type::execution_space,
1569 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1571 ,
1572 void, void
1573#endif
1574 >
1576 size_t TargetNumRows,
1577 const int MyTargetPID,
1578 Teuchos::ArrayRCP<size_t>& CRS_rowptr,
1579 Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
1580 Teuchos::ArrayRCP<Scalar>& CRS_vals,
1581 const Teuchos::ArrayView<const int>& SourcePids,
1582 Teuchos::Array<int>& TargetPids) {
1583 using execution_space = typename Node::execution_space;
1585
1586 using Kokkos::deep_copy;
1587 using Kokkos::View;
1588
1589 using Teuchos::ArrayView;
1590 using Teuchos::outArg;
1591 using Teuchos::REDUCE_MAX;
1592 using Teuchos::reduceAll;
1593
1594 typedef typename Node::device_type DT;
1595
// ST is the matrix's impl_scalar_type (the definition of matrix_type is on a
// line missing from this listing).
1597 typedef typename matrix_type::impl_scalar_type ST;
1598
// Prepended to every exception message raised below.
1599 const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1600 Teuchos::RCP<Tpetra::Details::ProfilingRegion> tm;
1601
1602 using Kokkos::MemoryUnmanaged;
1603
1604 TEUCHOS_TEST_FOR_EXCEPTION(permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1605 prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size() << " != "
1606 "permute_from_lids_d.size() = "
1607 << permute_from_lids_d.size() << ".");
1608 // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1609 // process, then the matrix is neither locally nor globally indexed.
1610 const bool locallyIndexed = sourceMatrix.isLocallyIndexed();
1611 TEUCHOS_TEST_FOR_EXCEPTION(!locallyIndexed, std::invalid_argument, prefix << "The input "
1612 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1613 TEUCHOS_TEST_FOR_EXCEPTION(((size_t)import_lids_d.size()) != num_packets_per_lid_d.size(), std::invalid_argument,
1614 prefix << "import_lids_d.size() = " << import_lids_d.size() << " != "
1615 "num_packets_per_lid_d.size() = "
1616 << num_packets_per_lid_d.size() << ".");
1617
1618 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1619
1620 // TargetNumNonzeros is number of nonzeros in local matrix.
1621 tm = Teuchos::rcp(new Tpetra::Details::ProfilingRegion("Tpetra::Details::unpackAndCombineIntoCrsArrays_new: unpackAndCombineWithOwningPIDsCount"));
1622 size_t TargetNumNonzeros =
1623 UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1624 local_matrix, permute_from_lids_d, imports_d,
1625 num_packets_per_lid_d, numSameIDs);
1626 tm = Teuchos::null;
1627
// NOTE(review): two lines after the CRS_rowptr resize are missing from this
// listing; presumably they resize CRS_colind and CRS_vals -- confirm.
1628 tm = Teuchos::rcp(new Tpetra::Details::ProfilingRegion("Tpetra::Details::unpackAndCombineIntoCrsArrays_new: resize CRS pointers"));
1629 CRS_rowptr.resize(TargetNumRows + 1);
// Reinterpret CRS_vals (ArrayRCP<Scalar>) as an ArrayRCP of the internal
// scalar type ST; this alias is what the value view below is sized from.
1632 Teuchos::ArrayRCP<ST> const& CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast<ST>(CRS_vals);
1633 tm = Teuchos::null;
1634
// NOTE(review): the line opening this check is missing from the listing; the
// condition shown is the same permute-size comparison already tested at the
// top of the function, so this looks redundant -- confirm.
1636 permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1637 prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size()
1638 << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1639
1640 // Preseed TargetPids with -1 for local
1641 if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1643 }
1644 TargetPids.assign(TargetNumNonzeros, -1);
1645
1646 // Grab pointers for sourceMatrix
1647 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1648
1649 tm = Teuchos::rcp(new Tpetra::Details::ProfilingRegion("Tpetra::Details::unpackAndCombineIntoCrsArrays_new: create mirror views from inputs"));
1650 // Convert input arrays to Kokkos::Views
1651 DT outputDevice;
1652
// NOTE(review): for each view below, the call that builds it (and its first
// arguments) is on a line missing from this listing; only the size, a `true`
// flag and a label string remain.
1653 auto crs_rowptr_d =
1655 CRS_rowptr.size(), true, "crs_rowptr");
1656
1657 auto crs_colind_d =
1659 CRS_colind.size(), true, "crs_colidx");
// Compile-time guard (only when HAVE_TPETRA_INST_COMPLEX_DOUBLE is defined):
// the element type of the reinterpreted value array must not be
// std::complex<double>.
1660#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1661 static_assert(!std::is_same<
1662 typename std::remove_const<
1663 typename std::decay<
1664 decltype(CRS_vals_impl_scalar_type)>::type::value_type>::type,
1665 std::complex<double>>::value,
1666 "CRS_vals::value_type is std::complex<double>; this should never happen"
1667 ", since std::complex does not work in Kokkos::View objects.");
1668#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1669
1670 auto crs_vals_d =
1672 CRS_vals_impl_scalar_type.size(), true, "crs_vals");
1673
// Same guard, applied to the value type of the view just created.
1674#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1675 static_assert(!std::is_same<
1676 typename decltype(crs_vals_d)::non_const_value_type,
1677 std::complex<double>>::value,
1678 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1679 "never happen, since std::complex does not work in Kokkos::View objects.");
1680#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1681
1682 auto src_pids_d =
1684 SourcePids.size(), true, "src_pids");
1685
1686 auto tgt_pids_d =
1688 TargetPids.size(), true, "tgt_pids");
1689
1690 tm = Teuchos::null;
1691
// bytes_per_value is the PackTraits<ST>::packValueCount result for one value.
// NOTE(review): the `if` condition selecting between the two branches below
// is on a line missing from this listing.
1692 size_t bytes_per_value = 0;
1694 // assume that ST is default constructible
1695 bytes_per_value = PackTraits<ST>::packValueCount(ST());
1696 } else {
1697 // Since the packed data come from the source matrix, we can use the source
1698 // matrix to get the number of bytes per Scalar value stored in the matrix.
1699 // This assumes that all Scalar values in the source matrix require the same
1700 // number of bytes. If the source matrix has no entries on the calling
1701 // process, then we hope that some process does have some idea how big
1702 // a Scalar value is. Of course, if no processes have any entries, then no
1703 // values should be packed (though this does assume that in our packing
1704 // scheme, rows with zero entries take zero bytes).
1705 size_t bytes_per_value_l = 0;
1706 if (local_matrix.values.extent(0) > 0) {
1707 const ST& val = local_matrix.values(0);
1709 } else {
// NOTE(review): element 0 of crs_vals_d is read here from host code, in the
// case where the local matrix has no values. Confirm crs_vals_d is non-empty
// and host-accessible whenever this branch can run.
1710 const ST& val = crs_vals_d(0);
1712 }
// Max-reduce over sourceMatrix's communicator into bytes_per_value (the
// input-argument line of this call is missing from the listing).
1713 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1714 Teuchos::REDUCE_MAX,
1716 outArg(bytes_per_value));
1717 }
1718
// NOTE(review): this static_assert repeats, token for token, the one placed
// right after crs_vals_d is created above.
1719#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1720 static_assert(!std::is_same<
1721 typename decltype(crs_vals_d)::non_const_value_type,
1722 std::complex<double>>::value,
1723 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1724 "never happen, since std::complex does not work in Kokkos::View objects.");
1725#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1726
// Fill the output views. NOTE(review): three argument lines of this call are
// missing from the listing.
1727 tm = Teuchos::rcp(new Tpetra::Details::ProfilingRegion("Tpetra::Details::unpackAndCombineIntoCrsArrays_new: unpackAndCombineIntoCrsArrays"));
1728 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1729 local_matrix, local_col_map, import_lids_d, imports_d,
1733 bytes_per_value);
1734 tm = Teuchos::null;
1735
// Each output is copied back the same way: a host_mirror_type view is
// constructed directly over the Teuchos array's raw pointer and size, then
// deep_copy writes the *_d view's contents into it.
// NOTE(review): every deep_copy below is given an execution_space()
// instance and no fence is visible afterwards; confirm the copies are
// complete before callers read the host arrays.
1736 // Copy outputs back to host
1737 tm = Teuchos::rcp(new Tpetra::Details::ProfilingRegion("Tpetra::Details::unpackAndCombineIntoCrsArrays_new: copy back to host"));
1738 typename decltype(crs_rowptr_d)::host_mirror_type crs_rowptr_h(
1739 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1740 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1741 deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1742
1743 typename decltype(crs_colind_d)::host_mirror_type crs_colind_h(
1744 CRS_colind.getRawPtr(), CRS_colind.size());
1745 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1746 deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1747
// NOTE(review): the constructor-argument line for crs_vals_h is missing from
// this listing.
1748 typename decltype(crs_vals_d)::host_mirror_type crs_vals_h(
1750 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1751 deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1752
1753 typename decltype(tgt_pids_d)::host_mirror_type tgt_pids_h(
1754 TargetPids.getRawPtr(), TargetPids.size());
1755 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1756 deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
1757
1758} // unpackAndCombineIntoCrsArrays
1759
1760} // namespace Details
1761} // namespace Tpetra
1762
// Explicit template instantiations of the Details:: unpack/combine functions
// for one (ST, LO, GO, NT) combination, in the form used when
// KOKKOS_ENABLE_DEPRECATED_CODE_4 is defined: each buffer Kokkos::View
// parameter type carries two extra `void, void` template arguments.
//
// Instantiates, in order: unpackCrsMatrixAndCombine,
// unpackAndCombineWithOwningPIDsCount, unpackCrsMatrixAndCombineNew, and the
// two unpackAndCombineIntoCrsArrays overloads (Kokkos::View outputs, then
// Teuchos::ArrayRCP / Teuchos::Array outputs).
//
// Apart from the `void, void` arguments, the parameter lists here match the
// _OFF variant defined below; keep the two in step when a signature changes.
1763#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON(ST, LO, GO, NT) \
1764 template void \
1765 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT>( \
1766 const CrsMatrix<ST, LO, GO, NT>&, \
1767 const Teuchos::ArrayView<const char>&, \
1768 const Teuchos::ArrayView<const size_t>&, \
1769 const Teuchos::ArrayView<const LO>&, \
1770 size_t, \
1771 CombineMode); \
1772 template size_t \
1773 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT>( \
1774 const CrsMatrix<ST, LO, GO, NT>&, \
1775 const Teuchos::ArrayView<const LO>&, \
1776 const Teuchos::ArrayView<const char>&, \
1777 const Teuchos::ArrayView<const size_t>&, \
1778 size_t, \
1779 CombineMode, \
1780 size_t, \
1781 const Teuchos::ArrayView<const LO>&, \
1782 const Teuchos::ArrayView<const LO>&); \
1783 template void \
1784 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT>( \
1785 const CrsMatrix<ST, LO, GO, NT>&, \
1786 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1787 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1788 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1789 const size_t, \
1790 const CombineMode); \
1791 template void \
1792 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1793 const CrsMatrix<ST, LO, GO, NT>&, \
1794 const Kokkos::View<LO const*, \
1795 Kokkos::Device<typename NT::device_type::execution_space, \
1796 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1797 void, void>, \
1798 const Kokkos::View<const char*, \
1799 Kokkos::Device<typename NT::device_type::execution_space, \
1800 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1801 void, void>, \
1802 const Kokkos::View<const size_t*, \
1803 Kokkos::Device<typename NT::device_type::execution_space, \
1804 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1805 void, void>, \
1806 const size_t, \
1807 const Kokkos::View<LO const*, \
1808 Kokkos::Device<typename NT::device_type::execution_space, \
1809 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1810 void, void>, \
1811 const Kokkos::View<LO const*, \
1812 Kokkos::Device<typename NT::device_type::execution_space, \
1813 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1814 void, void>, \
1815 size_t, \
1816 const int, \
1817 Kokkos::View<size_t*, typename NT::device_type>&, \
1818 Kokkos::View<GO*, typename NT::device_type>&, \
1819 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*, typename NT::device_type>&, \
1820 const Teuchos::ArrayView<const int>&, \
1821 Kokkos::View<int*, typename NT::device_type>&); \
1822 template void \
1823 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1824 const CrsMatrix<ST, LO, GO, NT>&, \
1825 const Kokkos::View<LO const*, \
1826 Kokkos::Device<typename NT::device_type::execution_space, \
1827 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1828 void, void>, \
1829 const Kokkos::View<const char*, \
1830 Kokkos::Device<typename NT::device_type::execution_space, \
1831 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1832 void, void>, \
1833 const Kokkos::View<const size_t*, \
1834 Kokkos::Device<typename NT::device_type::execution_space, \
1835 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1836 void, void>, \
1837 const size_t, \
1838 const Kokkos::View<LO const*, \
1839 Kokkos::Device<typename NT::device_type::execution_space, \
1840 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1841 void, void>, \
1842 const Kokkos::View<LO const*, \
1843 Kokkos::Device<typename NT::device_type::execution_space, \
1844 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1845 void, void>, \
1846 size_t, \
1847 const int, \
1848 Teuchos::ArrayRCP<size_t>&, \
1849 Teuchos::ArrayRCP<GO>&, \
1850 Teuchos::ArrayRCP<ST>&, \
1851 const Teuchos::ArrayView<const int>&, \
1852 Teuchos::Array<int>&);
1853
// Explicit template instantiations of the Details:: unpack/combine functions
// for one (ST, LO, GO, NT) combination, in the form used when
// KOKKOS_ENABLE_DEPRECATED_CODE_4 is NOT defined: each buffer Kokkos::View
// parameter type has only its data type and Kokkos::Device arguments.
//
// Instantiates, in order: unpackCrsMatrixAndCombine,
// unpackAndCombineWithOwningPIDsCount, unpackCrsMatrixAndCombineNew, and the
// two unpackAndCombineIntoCrsArrays overloads (Kokkos::View outputs, then
// Teuchos::ArrayRCP / Teuchos::Array outputs).
//
// Apart from the missing `void, void` arguments, the parameter lists here
// match the _ON variant defined above; keep the two in step when a signature
// changes.
1854#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF(ST, LO, GO, NT) \
1855 template void \
1856 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT>( \
1857 const CrsMatrix<ST, LO, GO, NT>&, \
1858 const Teuchos::ArrayView<const char>&, \
1859 const Teuchos::ArrayView<const size_t>&, \
1860 const Teuchos::ArrayView<const LO>&, \
1861 size_t, \
1862 CombineMode); \
1863 template size_t \
1864 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT>( \
1865 const CrsMatrix<ST, LO, GO, NT>&, \
1866 const Teuchos::ArrayView<const LO>&, \
1867 const Teuchos::ArrayView<const char>&, \
1868 const Teuchos::ArrayView<const size_t>&, \
1869 size_t, \
1870 CombineMode, \
1871 size_t, \
1872 const Teuchos::ArrayView<const LO>&, \
1873 const Teuchos::ArrayView<const LO>&); \
1874 template void \
1875 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT>( \
1876 const CrsMatrix<ST, LO, GO, NT>&, \
1877 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1878 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1879 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1880 const size_t, \
1881 const CombineMode); \
1882 template void \
1883 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1884 const CrsMatrix<ST, LO, GO, NT>&, \
1885 const Kokkos::View<LO const*, \
1886 Kokkos::Device<typename NT::device_type::execution_space, \
1887 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1888 const Kokkos::View<const char*, \
1889 Kokkos::Device<typename NT::device_type::execution_space, \
1890 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1891 const Kokkos::View<const size_t*, \
1892 Kokkos::Device<typename NT::device_type::execution_space, \
1893 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1894 const size_t, \
1895 const Kokkos::View<LO const*, \
1896 Kokkos::Device<typename NT::device_type::execution_space, \
1897 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1898 const Kokkos::View<LO const*, \
1899 Kokkos::Device<typename NT::device_type::execution_space, \
1900 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1901 size_t, \
1902 const int, \
1903 Kokkos::View<size_t*, typename NT::device_type>&, \
1904 Kokkos::View<GO*, typename NT::device_type>&, \
1905 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*, typename NT::device_type>&, \
1906 const Teuchos::ArrayView<const int>&, \
1907 Kokkos::View<int*, typename NT::device_type>&); \
1908 template void \
1909 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1910 const CrsMatrix<ST, LO, GO, NT>&, \
1911 const Kokkos::View<LO const*, \
1912 Kokkos::Device<typename NT::device_type::execution_space, \
1913 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1914 const Kokkos::View<const char*, \
1915 Kokkos::Device<typename NT::device_type::execution_space, \
1916 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1917 const Kokkos::View<const size_t*, \
1918 Kokkos::Device<typename NT::device_type::execution_space, \
1919 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1920 const size_t, \
1921 const Kokkos::View<LO const*, \
1922 Kokkos::Device<typename NT::device_type::execution_space, \
1923 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1924 const Kokkos::View<LO const*, \
1925 Kokkos::Device<typename NT::device_type::execution_space, \
1926 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1927 size_t, \
1928 const int, \
1929 Teuchos::ArrayRCP<size_t>&, \
1930 Teuchos::ArrayRCP<GO>&, \
1931 Teuchos::ArrayRCP<ST>&, \
1932 const Teuchos::ArrayView<const int>&, \
1933 Teuchos::Array<int>&);
1934
// Public instantiation macro. It expands to the _ON variant when
// KOKKOS_ENABLE_DEPRECATED_CODE_4 is defined and to the _OFF variant
// otherwise, so callers use a single name regardless of that setting.
1935#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1936#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT(ST, LO, GO, NT) \
1937 TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON(ST, LO, GO, NT)
1938#else
1939#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT(ST, LO, GO, NT) \
1940 TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF(ST, LO, GO, NT)
1941#endif
1942
1943#endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Declaration of the Tpetra::CrsMatrix class.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
size_t compute_total_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Total number of entries in any row of the packed matrix.
void unpackAndCombineIntoCrsMatrix(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< const char *, BufferDeviceType > &imports, const Kokkos::View< const size_t *, BufferDeviceType > &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type import_lids, const Tpetra::CombineMode combine_mode)
Perform the unpack operation for the matrix.
size_t compute_maximum_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Maximum number of entries in any row of the packed matrix.
bool compute_batch_info(const View1 &batches_per_lid, View2 &batch_info)
Compute the index and batch number associated with each batch.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
Struct that holds views of the contents of a CrsMatrix.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
"Local" part of Map suitable for Kokkos kernels.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
DeviceType device_type
The device type.
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Implementation details of Tpetra.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
Traits class for packing / unpacking data of type T.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given input buffer.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.