Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
11#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
12
13#include <memory>
14#include <string>
15#include "TpetraCore_config.h"
16#include "Kokkos_Core.hpp"
17#include "Teuchos_Array.hpp"
18#include "Teuchos_ArrayView.hpp"
19#include "Teuchos_OrdinalTraits.hpp"
20#include "Teuchos_TimeMonitor.hpp"
28#include "Tpetra_Details_DefaultTypes.hpp"
30
51
52namespace Tpetra {
53
54//
55// Users must never rely on anything in the Details namespace.
56//
57namespace Details {
58
59namespace UnpackAndCombineCrsMatrixImpl {
60
70template <class ST, class LO, class GO>
71KOKKOS_FUNCTION int
75 const char imports[],
76 const size_t offset,
77 const size_t /* num_bytes */,
78 const size_t num_ent,
79 const size_t bytes_per_value) {
80 if (num_ent == 0) {
81 // Empty rows always take zero bytes, to ensure sparsity.
82 return 0;
83 }
84 bool unpack_pids = pids_out.size() > 0;
85
86 const size_t num_ent_beg = offset;
87 const size_t num_ent_len = PackTraits<LO>::packValueCount(LO(0));
88
89 const size_t gids_beg = num_ent_beg + num_ent_len;
90 const size_t gids_len =
92
93 const size_t pids_beg = gids_beg + gids_len;
94 const size_t pids_len = unpack_pids ? size_t(num_ent * PackTraits<int>::packValueCount(int(0))) : size_t(0);
95
96 const size_t vals_beg = gids_beg + gids_len + pids_len;
97 const size_t vals_len = num_ent * bytes_per_value;
98
99 const char* const num_ent_in = imports + num_ent_beg;
100 const char* const gids_in = imports + gids_beg;
101 const char* const pids_in = unpack_pids ? imports + pids_beg : nullptr;
102 const char* const vals_in = imports + vals_beg;
103
104 size_t num_bytes_out = 0;
105 LO num_ent_out;
107 if (static_cast<size_t>(num_ent_out) != num_ent) {
108 return 20; // error code
109 }
110
111 {
112 Kokkos::pair<int, size_t> p;
114 if (p.first != 0) {
115 return 21; // error code
116 }
117 num_bytes_out += p.second;
118
119 if (unpack_pids) {
121 if (p.first != 0) {
122 return 22; // error code
123 }
124 num_bytes_out += p.second;
125 }
126
128 if (p.first != 0) {
129 return 23; // error code
130 }
131 num_bytes_out += p.second;
132 }
133
136 return 24; // error code
137 }
138 return 0; // no errors
139} // unpackRow
140
151template <class LocalMatrix, class LocalMap, class BufferDeviceType>
154 typedef LocalMap local_map_type;
155
156 typedef typename local_matrix_type::value_type ST;
157 typedef typename local_map_type::local_ordinal_type LO;
159 typedef typename local_map_type::device_type DT;
160 typedef typename DT::execution_space XS;
161
162 typedef Kokkos::View<const size_t*, BufferDeviceType>
163 num_packets_per_lid_type;
164 typedef Kokkos::View<const size_t*, DT> offsets_type;
165 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
166 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
167
168 typedef Kokkos::View<int, DT> error_type;
169 using member_type = typename Kokkos::TeamPolicy<XS>::member_type;
170
171 static_assert(std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
172 "LocalMap::local_ordinal_type and "
173 "LocalMatrix::ordinal_type must be the same.");
174
175 local_matrix_type local_matrix;
176 local_map_type local_col_map;
177 input_buffer_type imports;
178 num_packets_per_lid_type num_packets_per_lid;
179 import_lids_type import_lids;
180 Kokkos::View<const LO* [2], DT> batch_info;
181 offsets_type offsets;
182 Tpetra::CombineMode combine_mode;
183 size_t batch_size;
184 size_t bytes_per_value;
185 bool atomic;
186 error_type error_code;
187
191 const input_buffer_type& imports_in,
192 const num_packets_per_lid_type& num_packets_per_lid_in,
193 const import_lids_type& import_lids_in,
194 const Kokkos::View<const LO* [2], DT>& batch_info_in,
195 const offsets_type& offsets_in,
197 const size_t batch_size_in,
198 const size_t bytes_per_value_in,
199 const bool atomic_in)
200 : local_matrix(local_matrix_in)
201 , local_col_map(local_col_map_in)
202 , imports(imports_in)
203 , num_packets_per_lid(num_packets_per_lid_in)
204 , import_lids(import_lids_in)
205 , batch_info(batch_info_in)
206 , offsets(offsets_in)
207 , combine_mode(combine_mode_in)
208 , batch_size(batch_size_in)
209 , bytes_per_value(bytes_per_value_in)
210 , atomic(atomic_in)
211 , error_code("error") {}
212
214 void operator()(member_type team_member) const {
215 using Kokkos::MemoryUnmanaged;
216 using Kokkos::subview;
217 using Kokkos::View;
218
219 const LO batch = team_member.league_rank();
220 const LO lid_no = batch_info(batch, 0);
221 const LO batch_no = batch_info(batch, 1);
222
223 const size_t num_bytes = num_packets_per_lid(lid_no);
224
225 // Only unpack data if there is a nonzero number of bytes.
226 if (num_bytes == 0)
227 return;
228
229 // there is actually something in the row
230 const LO import_lid = import_lids(lid_no);
231 const size_t buf_size = imports.size();
232 const size_t offset = offsets(lid_no);
233
234 // Get the number of entries to expect in the received data for this row.
235 LO num_ent_LO = 0;
236 const char* const in_buf = imports.data() + offset;
238 const size_t num_entries_in_row = static_cast<size_t>(num_ent_LO);
239
240 // Count the number of bytes expected to unpack
241 size_t expected_num_bytes = 0;
242 {
246 }
247
249 Kokkos::printf(
250 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
251 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
252 (int)lid_no, (int)expected_num_bytes, (int)num_bytes);
253
254 Kokkos::atomic_compare_exchange(error_code.data(), 0, 21);
255 return;
256 }
257
258 if (offset > buf_size || offset + num_bytes > buf_size) {
259 Kokkos::printf(
260 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
261 "At row %d, the offset (%d) > buffer size (%d)\n",
262 (int)lid_no, (int)offset, (int)buf_size);
263
264 Kokkos::atomic_compare_exchange(error_code.data(), 0, 22);
265 return;
266 }
267
268 // Determine the number of entries to unpack in this batch
269 size_t num_entries_in_batch = 0;
270 if (num_entries_in_row <= batch_size)
272 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
273 num_entries_in_batch = batch_size;
274 else
276
277 const size_t bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
278 const size_t num_ent_start = offset;
279 const size_t num_ent_end = num_ent_start + bytes_per_lid;
280
282 const size_t gids_start = num_ent_end;
284
285 const size_t vals_start = gids_end;
286
287 const size_t shift = batch_no * batch_size;
288 const char* const num_ent_in = imports.data() + num_ent_start;
289 const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
290 const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value;
291
292 LO num_ent_out;
294 if (static_cast<size_t>(num_ent_out) != num_entries_in_row) {
295 Kokkos::printf(
296 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
297 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
298 (int)lid_no, (int)num_entries_in_row, (int)num_ent_out);
299
300 Kokkos::atomic_compare_exchange(error_code.data(), 0, 23);
301 }
302
303 constexpr bool matrix_has_sorted_rows = true; // see #6282
304 // Note BMK 6-22: this lambda must use capture-by-value [=] and not capture-by-ref [&].
305 // By ref triggers compiler bug in CUDA 10.
306 Kokkos::parallel_for(
307 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
308 [=, *this](const LO& j) {
309 size_t distance = 0;
310
311 GO gid_out;
314 auto lid_out = local_col_map.getLocalElement(gid_out);
315
316 // Column indices come in as global indices, in case the
317 // source object's column Map differs from the target object's
318 // (this's) column Map, and must be converted local index values
319
320 // assume that ST is default constructible
321 ST val_out;
322 distance = j * bytes_per_value;
324
325 if (combine_mode == ADD) {
326 // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless
327 // different threads don't touch the same row (i.e., no
328 // duplicates in incoming LIDs list).
329 const bool use_atomic_updates = atomic;
330 (void)local_matrix.sumIntoValues(
332 &lid_out,
333 1,
334 &val_out,
337 } else if (combine_mode == REPLACE) {
338 // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE
339 // combine mode with multiple incoming rows that touch the same
340 // target matrix entries, so we never need atomic updates.
341 const bool use_atomic_updates = false;
342 (void)local_matrix.replaceValues(
344 &lid_out,
345 1,
346 &val_out,
349 } else {
350 // should never get here
351 Kokkos::printf(
352 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
353 "At row %d, an unknown error occurred during unpack\n",
354 (int)lid_no);
355 Kokkos::atomic_compare_exchange(error_code.data(), 0, 31);
356 }
357 });
358
359 team_member.team_barrier();
360 }
361
363 int error() const {
364 auto error_code_h = Kokkos::create_mirror_view_and_copy(
365 Kokkos::HostSpace(), error_code);
366 return error_code_h();
367 }
368
369}; // UnpackCrsMatrixAndCombineFunctor
370
371struct MaxNumEntTag {};
372struct TotNumEntTag {};
373
382template <class LO, class DT, class BDT>
384 public:
385 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
386 typedef Kokkos::View<const size_t*, DT> offsets_type;
387 typedef Kokkos::View<const char*, BDT> input_buffer_type;
388 // This needs to be public, since it appears in the argument list of
389 // public methods (see below). Otherwise, build errors may happen.
390 typedef size_t value_type;
391
392 private:
393 num_packets_per_lid_type num_packets_per_lid;
394 offsets_type offsets;
395 input_buffer_type imports;
396
397 public:
398 NumEntriesFunctor(const num_packets_per_lid_type num_packets_per_lid_in,
399 const offsets_type& offsets_in,
400 const input_buffer_type& imports_in)
401 : num_packets_per_lid(num_packets_per_lid_in)
402 , offsets(offsets_in)
403 , imports(imports_in) {}
404
406 operator()(const MaxNumEntTag, const LO i, value_type& update) const {
407 // Get how many entries to expect in the received data for this row.
408 const size_t num_bytes = num_packets_per_lid(i);
409 if (num_bytes > 0) {
410 LO num_ent_LO = 0; // output argument of unpackValue
411 const char* const in_buf = imports.data() + offsets(i);
413 const size_t num_ent = static_cast<size_t>(num_ent_LO);
414
415 update = (update < num_ent) ? num_ent : update;
416 }
417 }
418
420 join(const MaxNumEntTag,
421 value_type& dst,
422 const value_type& src) const {
423 if (dst < src) dst = src;
424 }
425
427 operator()(const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
428 // Get how many entries to expect in the received data for this row.
429 const size_t num_bytes = num_packets_per_lid(i);
430 if (num_bytes > 0) {
431 LO num_ent_LO = 0; // output argument of unpackValue
432 const char* const in_buf = imports.data() + offsets(i);
434 tot_num_ent += static_cast<size_t>(num_ent_LO);
435 }
436 }
437}; // NumEntriesFunctor
438
446template <class LO, class DT, class BDT>
447size_t
449 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
450 const Kokkos::View<const size_t*, DT>& offsets,
451 const Kokkos::View<const char*, BDT>& imports) {
452 typedef typename DT::execution_space XS;
453 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
454 MaxNumEntTag>
456
457 NumEntriesFunctor<LO, DT, BDT> functor(num_packets_per_lid, offsets,
458 imports);
459 const LO numRowsToUnpack =
460 static_cast<LO>(num_packets_per_lid.extent(0));
461 size_t max_num_ent = 0;
462 Kokkos::parallel_reduce("Max num entries in CRS",
464 functor, max_num_ent);
465 return max_num_ent;
466}
467
475template <class LO, class DT, class BDT>
476size_t
478 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
479 const Kokkos::View<const size_t*, DT>& offsets,
480 const Kokkos::View<const char*, BDT>& imports) {
481 typedef typename DT::execution_space XS;
482 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
483 size_t tot_num_ent = 0;
484 NumEntriesFunctor<LO, DT, BDT> functor(num_packets_per_lid, offsets,
485 imports);
486 const LO numRowsToUnpack =
487 static_cast<LO>(num_packets_per_lid.extent(0));
488 Kokkos::parallel_reduce("Total num entries in CRS to unpack",
491 return tot_num_ent;
492}
493
494template <class LO>
496 size_t
497 unpackRowCount(const char imports[],
498 const size_t offset,
499 const size_t num_bytes) {
500 using PT = PackTraits<LO>;
501
502 LO num_ent_LO = 0;
503 if (num_bytes > 0) {
504 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
505 if (p_num_bytes > num_bytes) {
507 }
508 const char* const in_buf = imports + offset;
509 (void)PT::unpackValue(num_ent_LO, in_buf);
510 }
511 return static_cast<size_t>(num_ent_LO);
512}
513
518template <class View1, class View2>
519inline bool
521 const View1& batches_per_lid,
522 View2& batch_info) {
523 using LO = typename View2::value_type;
524 size_t batch = 0;
525 for (size_t i = 0; i < batches_per_lid.extent(0); i++) {
526 for (size_t batch_no = 0; batch_no < batches_per_lid(i); batch_no++) {
527 batch_info(batch, 0) = static_cast<LO>(i);
528 batch_info(batch, 1) = batch_no;
529 batch++;
530 }
531 }
532 return batch == batch_info.extent(0);
533}
534
542template <class LocalMatrix, class LocalMap, class BufferDeviceType>
544 const LocalMatrix& local_matrix,
545 const LocalMap& local_map,
546 const Kokkos::View<const char*, BufferDeviceType>& imports,
547 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
549 const Tpetra::CombineMode combine_mode) {
550 using ST = typename LocalMatrix::value_type;
551 using LO = typename LocalMap::local_ordinal_type;
552 using DT = typename LocalMap::device_type;
553 using XS = typename DT::execution_space;
554 const char prefix[] =
555 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
556 "unpackAndCombineIntoCrsMatrix: ";
557
558 const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
559 if (num_import_lids == 0) {
560 // Nothing to unpack
561 return;
562 }
563
564 {
565 // Check for correct input
566 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
567 std::invalid_argument,
568 prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
569 "static graph (i.e., was constructed with the CrsMatrix constructor "
570 "that takes a const CrsGraph pointer).");
571
572 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
573 std::invalid_argument,
574 prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
575 "(i.e., was constructed with the CrsMatrix constructor that takes a "
576 "const CrsGraph pointer).");
577
578 // Unknown combine mode!
579 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
580 std::invalid_argument,
581 prefix << "Invalid combine mode; should never get "
582 "here! Please report this bug to the Tpetra developers.");
583
584 // Check that sizes of input objects are consistent.
586 num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
588 std::invalid_argument,
589 prefix << "importLIDs.size() (" << num_import_lids << ") != "
590 "numPacketsPerLID.size() ("
591 << num_packets_per_lid.extent(0) << ").");
592 } // end QA error checking
593
594 // Get the offsets
595 Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids + 1);
596 computeOffsetsFromCounts(offsets, num_packets_per_lid);
597
598 // Determine the sizes of the unpack batches
599 size_t max_num_ent = compute_maximum_num_entries<LO, DT>(num_packets_per_lid, offsets, imports);
601 const size_t batch_size = std::min(default_batch_size, max_num_ent);
602
603 // To achieve some balance amongst threads, unpack each row in equal size batches
604 size_t num_batches = 0;
605 Kokkos::View<LO* [2], DT> batch_info("", num_batches);
606 Kokkos::View<size_t*, DT> batches_per_lid("", num_import_lids);
607 // Compute meta data that allows batch unpacking
608 Kokkos::parallel_reduce(
609 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
610 KOKKOS_LAMBDA(const size_t i, size_t& batches) {
612 imports.data(), offsets(i), num_packets_per_lid(i));
614 (num_entries_in_row <= batch_size) ? 1 : num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
616 },
618 Kokkos::resize(batch_info, num_batches);
619
620 Kokkos::HostSpace host_space;
621 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
622 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
623 Kokkos::deep_copy(XS(), batches_per_lid_h, batches_per_lid);
624
625 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
626
628 // DEEP_COPY REVIEW - HOSTMIRROR-TO-DEVICE
629 Kokkos::deep_copy(XS(), batch_info, batch_info_h);
630
631 // FIXME (TJF SEP 2017)
632 // The scalar type is not necessarily default constructible
633 size_t bytes_per_value = PackTraits<ST>::packValueCount(ST());
634
635 // Now do the actual unpack!
636 const bool atomic = XS().concurrency() != 1;
638 functor f(
639 local_matrix,
640 local_map,
641 imports,
642 num_packets_per_lid,
643 import_lids,
644 batch_info,
645 offsets,
646 combine_mode,
647 batch_size,
648 bytes_per_value,
649 atomic);
650
651 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
653 if (!Spaces::is_gpu_exec_space<XS>() || team_size == Teuchos::OrdinalTraits<size_t>::invalid()) {
654 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
655 } else {
656 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
657 }
658
659 auto error_code = f.error();
661 error_code != 0,
662 std::runtime_error,
663 prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code);
664} // unpackAndCombineIntoCrsMatrix (Kokkos version)
665
666template <class LocalMatrix, class BufferDeviceType>
667size_t
669 const LocalMatrix& local_matrix,
672 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
673 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
674#else
675 const Kokkos::View<const char*, BufferDeviceType>& imports,
676 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
677#endif
678 const size_t num_same_ids) {
679 using Kokkos::parallel_reduce;
680 typedef typename LocalMatrix::ordinal_type LO;
681 typedef typename LocalMatrix::device_type device_type;
682 typedef typename device_type::execution_space XS;
683 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
684 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>> range_policy;
685 typedef BufferDeviceType BDT;
686
687 size_t count = 0;
688 LO num_items;
689
690 // Number of matrix entries to unpack (returned by this function).
691 num_items = static_cast<LO>(num_same_ids);
692 if (num_items) {
693 size_t kcnt = 0;
696 KOKKOS_LAMBDA(const LO lid, size_t& update) {
697 update += static_cast<size_t>(local_matrix.graph.row_map[lid + 1] - local_matrix.graph.row_map[lid]);
698 },
699 kcnt);
700 count += kcnt;
701 }
702
703 // Count entries copied directly from the source matrix with permuting.
704 num_items = static_cast<LO>(permute_from_lids.extent(0));
705 if (num_items) {
706 size_t kcnt = 0;
707 parallel_reduce(
708 range_policy(0, num_items),
709 KOKKOS_LAMBDA(const LO i, size_t& update) {
710 const LO lid = permute_from_lids(i);
711 update += static_cast<size_t>(local_matrix.graph.row_map[lid + 1] - local_matrix.graph.row_map[lid]);
712 },
713 kcnt);
714 count += kcnt;
715 }
716
717 {
718 // Count entries received from other MPI processes.
719 const size_type np = num_packets_per_lid.extent(0);
720 Kokkos::View<size_t*, device_type> offsets("offsets", np + 1);
721 computeOffsetsFromCounts(offsets, num_packets_per_lid);
722 count +=
723 compute_total_num_entries<LO, device_type, BDT>(num_packets_per_lid,
724 offsets, imports);
725 }
726
727 return count;
728} // unpackAndCombineWithOwningPIDsCount (Kokkos version)
729
731template <class LO, class DT, class BDT>
732int setupRowPointersForRemotes(
734 const typename PackTraits<LO>::input_array_type& import_lids,
735 const Kokkos::View<const char*, BDT>& imports,
736 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
737 const typename PackTraits<size_t>::input_array_type& offsets) {
738 using Kokkos::parallel_reduce;
739 typedef typename DT::execution_space XS;
740 typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
741 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
742
744 const size_type N = num_packets_per_lid.extent(0);
745
746 int errors = 0;
748 "Setup row pointers for remotes",
749 range_policy(0, N),
750 KOKKOS_LAMBDA(const size_t i, int& k_error) {
751 typedef typename std::remove_reference<decltype(tgt_rowptr(0))>::type atomic_incr_type;
752 const size_t num_bytes = num_packets_per_lid(i);
753 const size_t offset = offsets(i);
754 const size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
755 if (num_ent == InvalidNum) {
756 k_error += 1;
757 }
758 Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
759 },
760 errors);
761 return errors;
762}
763
764// Convert array of row lengths to a CRS pointer array
765template <class DT>
766void makeCrsRowPtrFromLengths(
768 const Kokkos::View<size_t*, DT>& new_start_row) {
769 using Kokkos::parallel_scan;
770 typedef typename DT::execution_space XS;
771 typedef typename Kokkos::View<size_t*, DT>::size_type size_type;
772 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
773 const size_type N = new_start_row.extent(0);
775 range_policy(0, N),
776 KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
777 auto cur_val = tgt_rowptr(i);
778 if (final) {
779 tgt_rowptr(i) = update;
781 }
782 update += cur_val;
783 });
784}
785
786template <class LocalMatrix, class LocalMap>
787void copyDataFromSameIDs(
789 const typename PackTraits<int>::output_array_type& tgt_pids,
791 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
792 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
793 const typename PackTraits<int>::input_array_type& src_pids,
794 const LocalMatrix& local_matrix,
795 const LocalMap& local_col_map,
796 const size_t num_same_ids,
797 const int my_pid) {
798 using Kokkos::parallel_for;
799 typedef typename LocalMap::device_type DT;
800 typedef typename LocalMap::local_ordinal_type LO;
801 typedef typename DT::execution_space XS;
802 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>> range_policy;
803
804 parallel_for(
805 range_policy(0, num_same_ids),
806 KOKKOS_LAMBDA(const size_t i) {
807 typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
808
809 const LO src_lid = static_cast<LO>(i);
810 size_t src_row = local_matrix.graph.row_map(src_lid);
811
812 const LO tgt_lid = static_cast<LO>(i);
813 const size_t tgt_row = tgt_rowptr(tgt_lid);
814
815 const size_t nsr = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
816 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
817
818 for (size_t j = local_matrix.graph.row_map(src_lid);
819 j < local_matrix.graph.row_map(src_lid + 1); ++j) {
820 LO src_col = local_matrix.graph.entries(j);
821 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
822 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
823 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
824 }
825 });
826}
827
828template <class LocalMatrix, class LocalMap>
829void copyDataFromPermuteIDs(
831 const typename PackTraits<int>::output_array_type& tgt_pids,
833 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
834 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
835 const typename PackTraits<int>::input_array_type& src_pids,
838 const LocalMatrix& local_matrix,
839 const LocalMap& local_col_map,
840 const int my_pid) {
841 using Kokkos::parallel_for;
842 typedef typename LocalMap::device_type DT;
843 typedef typename LocalMap::local_ordinal_type LO;
844 typedef typename DT::execution_space XS;
845 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
846 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
847
848 const size_type num_permute_to_lids = permute_to_lids.extent(0);
849
850 parallel_for(
851 range_policy(0, num_permute_to_lids),
852 KOKKOS_LAMBDA(const size_t i) {
853 typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
854
855 const LO src_lid = permute_from_lids(i);
856 const size_t src_row = local_matrix.graph.row_map(src_lid);
857
858 const LO tgt_lid = permute_to_lids(i);
859 const size_t tgt_row = tgt_rowptr(tgt_lid);
860
861 size_t nsr = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
862 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
863
864 for (size_t j = local_matrix.graph.row_map(src_lid);
865 j < local_matrix.graph.row_map(src_lid + 1); ++j) {
866 LO src_col = local_matrix.graph.entries(j);
867 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
868 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
869 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
870 }
871 });
872}
873
874template <typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
875int unpackAndCombineIntoCrsArrays2(
877 const typename PackTraits<int>::output_array_type& tgt_pids,
879 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
880 const typename PackTraits<size_t>::input_array_type& offsets,
882#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
883 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
884 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
885#else
886 const Kokkos::View<const char*, BufferDeviceType>& imports,
887 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
888#endif
889 const LocalMatrix& /* local_matrix */,
890 const LocalMap /*& local_col_map*/,
891 const int my_pid,
892 const size_t bytes_per_value) {
893 using Kokkos::atomic_fetch_add;
894 using Kokkos::MemoryUnmanaged;
895 using Kokkos::parallel_reduce;
896 using Kokkos::subview;
897 using Kokkos::View;
899 typedef typename LocalMap::device_type DT;
900 typedef typename LocalMap::local_ordinal_type LO;
901 typedef typename LocalMap::global_ordinal_type GO;
902 typedef typename LocalMatrix::value_type ST;
903 typedef typename DT::execution_space XS;
904 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
905 typedef typename Kokkos::pair<size_type, size_type> slice;
906 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
907
908 typedef View<int*, DT, MemoryUnmanaged> pids_out_type;
909 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
910 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
911
912 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
913
914 int errors = 0;
915 const size_type num_import_lids = import_lids.size();
916
917 // RemoteIDs: Loop structure following UnpackAndCombine
918 parallel_reduce(
919 "Unpack and combine into CRS",
920 range_policy(0, num_import_lids),
921 KOKKOS_LAMBDA(const size_t i, int& k_error) {
922 typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
923 const size_t num_bytes = num_packets_per_lid(i);
924 const size_t offset = offsets(i);
925 if (num_bytes == 0) {
926 // Empty buffer means that the row is empty.
927 return;
928 }
929 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
930 if (num_ent == InvalidNum) {
931 k_error += 1;
932 return;
933 }
934 const LO lcl_row = import_lids(i);
935 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
936 const size_t end_row = start_row + num_ent;
937
938 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
939 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
940 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
941
942 k_error += unpackRow<ST, LO, GO>(gids_out, pids_out, vals_out,
943 imports.data(), offset, num_bytes,
944 num_ent, bytes_per_value);
945
946 // Correct target PIDs.
947 for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
948 const int pid = pids_out(j);
949 pids_out(j) = (pid != my_pid) ? pid : -1;
950 }
951 },
952 errors);
953
954 return errors;
955}
956
957template <typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
959 const LocalMatrix& local_matrix,
960 const LocalMap& local_col_map,
962#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
963 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
964 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
965#else
966 const Kokkos::View<const char*, BufferDeviceType>& imports,
967 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
968#endif
971 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
974 const typename PackTraits<int>::input_array_type& src_pids,
975 const typename PackTraits<int>::output_array_type& tgt_pids,
976 const size_t num_same_ids,
977 const size_t tgt_num_rows,
978 const size_t tgt_num_nonzeros,
979 const int my_tgt_pid,
980 const size_t bytes_per_value) {
981 using Kokkos::MemoryUnmanaged;
982 using Kokkos::parallel_for;
983 using Kokkos::subview;
984 using Kokkos::View;
986 typedef typename LocalMap::device_type DT;
987 typedef typename LocalMap::local_ordinal_type LO;
988 typedef typename DT::execution_space XS;
989 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
990 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>> range_policy;
991 typedef BufferDeviceType BDT;
992
993 const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
994
995 const size_t N = tgt_num_rows;
996
997 // In the case of reduced communicators, the sourceMatrix won't have
998 // the right "my_pid", so thus we have to supply it.
999 const int my_pid = my_tgt_pid;
1000
1001 // Zero the rowptr
1002 parallel_for(
1003 range_policy(0, N + 1),
1004 KOKKOS_LAMBDA(const size_t i) {
1005 tgt_rowptr(i) = 0;
1006 });
1007
1008 // same IDs: Always first, always in the same place
1009 parallel_for(
1010 range_policy(0, num_same_ids),
1011 KOKKOS_LAMBDA(const size_t i) {
1012 const LO tgt_lid = static_cast<LO>(i);
1013 const LO src_lid = static_cast<LO>(i);
1014 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
1015 });
1016
1017 // Permute IDs: Still local, but reordered
1018 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1019 parallel_for(
1020 range_policy(0, num_permute_to_lids),
1021 KOKKOS_LAMBDA(const size_t i) {
1022 const LO tgt_lid = permute_to_lids(i);
1023 const LO src_lid = permute_from_lids(i);
1024 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
1025 });
1026
1027 // Get the offsets from the number of packets per LID
1028 const size_type num_import_lids = import_lids.extent(0);
1029 View<size_t*, DT> offsets("offsets", num_import_lids + 1);
1030 computeOffsetsFromCounts(offsets, num_packets_per_lid);
1031
1032#ifdef HAVE_TPETRA_DEBUG
1033 {
1034 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1035 const bool condition =
1036 nth_offset_h != static_cast<size_t>(imports.extent(0));
1037 TEUCHOS_TEST_FOR_EXCEPTION(condition, std::logic_error, prefix << "The final offset in bytes " << nth_offset_h << " != imports.size() = " << imports.extent(0) << ". Please report this bug to the Tpetra developers.");
1038 }
1039#endif // HAVE_TPETRA_DEBUG
1040
1041 // Setup row pointers for remotes
1042 int k_error =
1043 setupRowPointersForRemotes<LO, DT, BDT>(tgt_rowptr,
1044 import_lids, imports, num_packets_per_lid, offsets);
1045 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix << " Error transferring data to target row pointers. "
1046 "Please report this bug to the Tpetra developers.");
1047
1048 // If multiple processes contribute to the same row, we may need to
1049 // update row offsets. This tracks that.
1050 View<size_t*, DT> new_start_row("new_start_row", N + 1);
1051
1052 // Turn row length into a real CRS row pointer
1053 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1054
1055 // SameIDs: Copy the data over
1056 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1057 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1058
1059 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1060 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1061 local_matrix, local_col_map, my_pid);
1062
1063 if (imports.extent(0) <= 0) {
1064 return;
1065 }
1066
1067 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1068 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1069 local_matrix, local_col_map, my_pid, bytes_per_value);
1070 TEUCHOS_TEST_FOR_EXCEPTION(
1071 unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1072 "should never happen. Please report this bug to the Tpetra developers.");
1073
1074 return;
1075}
1076
1077} // namespace UnpackAndCombineCrsMatrixImpl
1078
1118template <typename ST, typename LO, typename GO, typename Node>
1121 const Teuchos::ArrayView<const char>& imports,
1122 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1123 const Teuchos::ArrayView<const LO>& importLIDs,
1124 size_t /* constantNumPackets */,
1126 using Kokkos::View;
1127 typedef typename Node::device_type device_type;
1128 typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_device_type local_matrix_device_type;
1129 static_assert(std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1130 "Node::device_type and LocalMatrix::device_type must be the same.");
1131
1132 // Convert all Teuchos::Array to Kokkos::View.
1133 device_type outputDevice;
1134
1135 // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1136 // them to device. Since unpacking is done directly in to the local matrix
1137 // (lclMatrix), no copying needs to be performed after unpacking.
1140 numPacketsPerLID.size(), true, "num_packets_per_lid");
1141
1142 auto import_lids_d =
1144 importLIDs.size(), true, "import_lids");
1145
1146 auto imports_d =
1148 imports.size(), true, "imports");
1149
1150 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1151 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1152
1153 // KDDKDD This loop doesn't appear to do anything; what is it?
1154 // KDDKDD for (int i=0; i<importLIDs.size(); i++)
1155 // KDDKDD {
1156 // KDDKDD auto lclRow = importLIDs[i];
1157 // KDDKDD Teuchos::ArrayView<const LO> A_indices;
1158 // KDDKDD Teuchos::ArrayView<const ST> A_values;
1159 // KDDKDD sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
1160 // KDDKDD }
1161 // Now do the actual unpack!
1162 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1163 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1165}
1166
1167template <typename ST, typename LO, typename GO, typename NT>
1168void unpackCrsMatrixAndCombineNew(
1170 Kokkos::DualView<char*,
1172 imports,
1173 Kokkos::DualView<size_t*,
1176 const Kokkos::DualView<const LO*,
1178 const size_t /* constantNumPackets */,
1179 const CombineMode combineMode) {
1180 using Kokkos::View;
1181 using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1182 using dist_object_type = DistObject<char, LO, GO, NT>;
1183 using device_type = typename crs_matrix_type::device_type;
1184 using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
1185 using buffer_device_type = typename dist_object_type::buffer_device_type;
1186
1187 static_assert(std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1188 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1189 "must be the same.");
1190
1191 if (numPacketsPerLID.need_sync_device()) {
1192 numPacketsPerLID.sync_device();
1193 }
1194 auto num_packets_per_lid_d = numPacketsPerLID.view_device();
1195
1196 TEUCHOS_ASSERT(!importLIDs.need_sync_device());
1197 auto import_lids_d = importLIDs.view_device();
1198
1199 if (imports.need_sync_device()) {
1200 imports.sync_device();
1201 }
1202 auto imports_d = imports.view_device();
1203
1204 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1205 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1206 typedef decltype(local_col_map) local_map_type;
1207
1208 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1209 local_matrix_device_type,
1210 local_map_type,
1211 buffer_device_type>(local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1212 import_lids_d, combineMode);
1213}
1214
1261//
1270template <typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1271size_t
1274 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1275 const Teuchos::ArrayView<const char>& imports,
1276 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1277 size_t /* constantNumPackets */,
1278 CombineMode /* combineMode */,
1279 size_t numSameIDs,
1280 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1281 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs) {
1282 using Kokkos::MemoryUnmanaged;
1283 using Kokkos::View;
1284 typedef typename Node::device_type DT;
1285 const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1286
1287 TEUCHOS_TEST_FOR_EXCEPTION(permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1288 prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != "
1289 "permuteFromLIDs.size() = "
1290 << permuteFromLIDs.size() << ".");
1291 // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1292 // process, then the matrix is neither locally nor globally indexed.
1293 const bool locallyIndexed = sourceMatrix.isLocallyIndexed();
1294 TEUCHOS_TEST_FOR_EXCEPTION(!locallyIndexed, std::invalid_argument, prefix << "The input "
1295 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1296 TEUCHOS_TEST_FOR_EXCEPTION(importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
1297 prefix << "importLIDs.size() = " << importLIDs.size() << " != "
1298 "numPacketsPerLID.size() = "
1299 << numPacketsPerLID.size() << ".");
1300
1301 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1302
1303 using kokkos_device_type = Kokkos::Device<typename Node::device_type::execution_space,
1304 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
1305
1306#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1307 Kokkos::View<LocalOrdinal const*, kokkos_device_type, void, void> permute_from_lids_d =
1308#else
1309 Kokkos::View<LocalOrdinal const*, kokkos_device_type> permute_from_lids_d =
1310#endif
1312 permuteFromLIDs.getRawPtr(),
1313 permuteFromLIDs.size(), true,
1314 "permute_from_lids");
1315
1316#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1317 Kokkos::View<const char*, kokkos_device_type, void, void> imports_d =
1318#else
1319 Kokkos::View<const char*, kokkos_device_type> imports_d =
1320#endif
1322 imports.getRawPtr(),
1323 imports.size(), true,
1324 "imports");
1325
1326#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1327 Kokkos::View<const size_t*, kokkos_device_type, void, void> num_packets_per_lid_d =
1328#else
1329 Kokkos::View<const size_t*, kokkos_device_type> num_packets_per_lid_d =
1330#endif
1332 numPacketsPerLID.getRawPtr(),
1333 numPacketsPerLID.size(), true,
1334 "num_packets_per_lid");
1335
1336 return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1337 local_matrix, permute_from_lids_d, imports_d,
1338 num_packets_per_lid_d, numSameIDs);
1339} // unpackAndCombineWithOwningPIDsCount (Teuchos::Array version)
1340
1355
1356template <typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1359 const Kokkos::View<LocalOrdinal const*,
1360 Kokkos::Device<typename Node::device_type::execution_space,
1361 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1363 ,
1364 void, void
1365#endif
1366 >
1368 const Kokkos::View<const char*,
1369 Kokkos::Device<typename Node::device_type::execution_space,
1370 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1372 ,
1373 void, void
1374#endif
1375 >
1376 imports_d,
1377 const Kokkos::View<const size_t*,
1378 Kokkos::Device<typename Node::device_type::execution_space,
1379 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1381 ,
1382 void, void
1383#endif
1384 >
1386 const size_t numSameIDs,
1387 const Kokkos::View<LocalOrdinal const*,
1388 Kokkos::Device<typename Node::device_type::execution_space,
1389 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1391 ,
1392 void, void
1393#endif
1394 >
1396 const Kokkos::View<LocalOrdinal const*,
1397 Kokkos::Device<typename Node::device_type::execution_space,
1398 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1400 ,
1401 void, void
1402#endif
1403 >
1405 size_t TargetNumRows,
1406 const int MyTargetPID,
1407 Kokkos::View<size_t*, typename Node::device_type>& crs_rowptr_d,
1408 Kokkos::View<GlobalOrdinal*, typename Node::device_type>& crs_colind_d,
1409 Kokkos::View<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type*, typename Node::device_type>& crs_vals_d,
1410 const Teuchos::ArrayView<const int>& SourcePids,
1411 Kokkos::View<int*, typename Node::device_type>& TargetPids) {
1412 using execution_space = typename Node::execution_space;
1414
1415 using Kokkos::deep_copy;
1416 using Kokkos::View;
1417
1418 using Teuchos::ArrayView;
1419 using Teuchos::outArg;
1420 using Teuchos::REDUCE_MAX;
1421 using Teuchos::reduceAll;
1422
1423 typedef typename Node::device_type DT;
1424
1426 typedef typename matrix_type::impl_scalar_type ST;
1427
1428 const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1429#ifdef HAVE_TPETRA_MMM_TIMINGS
1430 using Teuchos::TimeMonitor;
1431 Teuchos::RCP<TimeMonitor> tm;
1432#endif
1433
1434 using Kokkos::MemoryUnmanaged;
1435
1436 TEUCHOS_TEST_FOR_EXCEPTION(permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1437 prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size() << " != "
1438 "permute_from_lids_d.size() = "
1439 << permute_from_lids_d.size() << ".");
1440 // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1441 // process, then the matrix is neither locally nor globally indexed.
1442 const bool locallyIndexed = sourceMatrix.isLocallyIndexed();
1443 TEUCHOS_TEST_FOR_EXCEPTION(!locallyIndexed, std::invalid_argument, prefix << "The input "
1444 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1445 TEUCHOS_TEST_FOR_EXCEPTION(((size_t)import_lids_d.size()) != num_packets_per_lid_d.size(), std::invalid_argument,
1446 prefix << "import_lids_d.size() = " << import_lids_d.size() << " != "
1447 "num_packets_per_lid_d.size() = "
1448 << num_packets_per_lid_d.size() << ".");
1449
1450 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1451
1452 // TargetNumNonzeros is number of nonzeros in local matrix.
1453#ifdef HAVE_TPETRA_MMM_TIMINGS
1454 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount"))));
1455#endif
1456 size_t TargetNumNonzeros =
1457 UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1458 local_matrix, permute_from_lids_d, imports_d,
1459 num_packets_per_lid_d, numSameIDs);
1460#ifdef HAVE_TPETRA_MMM_TIMINGS
1461 tm = Teuchos::null;
1462#endif
1463
1464#ifdef HAVE_TPETRA_MMM_TIMINGS
1465 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers"))));
1466#endif
1467 Kokkos::resize(crs_rowptr_d, TargetNumRows + 1);
1468 Kokkos::resize(crs_colind_d, TargetNumNonzeros);
1469 Kokkos::resize(crs_vals_d, TargetNumNonzeros);
1470#ifdef HAVE_TPETRA_MMM_TIMINGS
1471 tm = Teuchos::null;
1472#endif
1473
1475 permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1476 prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size()
1477 << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1478
1479 if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1480 Kokkos::resize(TargetPids, TargetNumNonzeros);
1481 }
1482 Kokkos::deep_copy(execution_space(), TargetPids, -1);
1483
1484 // Grab pointers for sourceMatrix
1485 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1486
1487#ifdef HAVE_TPETRA_MMM_TIMINGS
1488 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs"))));
1489#endif
1490 // Convert input arrays to Kokkos::Views
1491 DT outputDevice;
1492
1493 auto src_pids_d =
1495 SourcePids.size(), true, "src_pids");
1496
1497#ifdef HAVE_TPETRA_MMM_TIMINGS
1498 tm = Teuchos::null;
1499#endif
1500
1501 size_t bytes_per_value = 0;
1503 // assume that ST is default constructible
1504 bytes_per_value = PackTraits<ST>::packValueCount(ST());
1505 } else {
1506 // Since the packed data come from the source matrix, we can use the source
1507 // matrix to get the number of bytes per Scalar value stored in the matrix.
1508 // This assumes that all Scalar values in the source matrix require the same
1509 // number of bytes. If the source matrix has no entries on the calling
1510 // process, then we hope that some process does have some idea how big
1511 // a Scalar value is. Of course, if no processes have any entries, then no
1512 // values should be packed (though this does assume that in our packing
1513 // scheme, rows with zero entries take zero bytes).
1514 size_t bytes_per_value_l = 0;
1515 if (local_matrix.values.extent(0) > 0) {
1516 const ST& val = local_matrix.values(0);
1518 } else {
1519 const ST& val = crs_vals_d(0);
1521 }
1522 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1523 Teuchos::REDUCE_MAX,
1525 outArg(bytes_per_value));
1526 }
1527
1528#ifdef HAVE_TPETRA_MMM_TIMINGS
1529 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays"))));
1530#endif
1531 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1532 local_matrix, local_col_map, import_lids_d, imports_d,
1536 bytes_per_value);
1537#ifdef HAVE_TPETRA_MMM_TIMINGS
1538 tm = Teuchos::null;
1539#endif
1540
1541 // Copy outputs back to host
1542#ifdef HAVE_TPETRA_MMM_TIMINGS
1543 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host"))));
1544#endif
1545
1546 Kokkos::parallel_for(
1547 "setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0, TargetPids.size()), KOKKOS_LAMBDA(const size_t i) {
1548 if (TargetPids(i) == -1) TargetPids(i) = MyTargetPID;
1549 });
1550
1551} // unpackAndCombineIntoCrsArrays
1552
1553template <typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1556 const Kokkos::View<LocalOrdinal const*,
1557 Kokkos::Device<typename Node::device_type::execution_space,
1558 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1560 ,
1561 void, void
1562#endif
1563 >
1565 const Kokkos::View<const char*,
1566 Kokkos::Device<typename Node::device_type::execution_space,
1567 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1569 ,
1570 void, void
1571#endif
1572 >
1573 imports_d,
1574 const Kokkos::View<const size_t*,
1575 Kokkos::Device<typename Node::device_type::execution_space,
1576 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1578 ,
1579 void, void
1580#endif
1581 >
1583 const size_t numSameIDs,
1584 const Kokkos::View<LocalOrdinal const*,
1585 Kokkos::Device<typename Node::device_type::execution_space,
1586 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1588 ,
1589 void, void
1590#endif
1591 >
1593 const Kokkos::View<LocalOrdinal const*,
1594 Kokkos::Device<typename Node::device_type::execution_space,
1595 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1597 ,
1598 void, void
1599#endif
1600 >
1602 size_t TargetNumRows,
1603 const int MyTargetPID,
1604 Teuchos::ArrayRCP<size_t>& CRS_rowptr,
1605 Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
1606 Teuchos::ArrayRCP<Scalar>& CRS_vals,
1607 const Teuchos::ArrayView<const int>& SourcePids,
1608 Teuchos::Array<int>& TargetPids) {
1609 using execution_space = typename Node::execution_space;
1611
1612 using Kokkos::deep_copy;
1613 using Kokkos::View;
1614
1615 using Teuchos::ArrayView;
1616 using Teuchos::outArg;
1617 using Teuchos::REDUCE_MAX;
1618 using Teuchos::reduceAll;
1619
1620 typedef typename Node::device_type DT;
1621
1623 typedef typename matrix_type::impl_scalar_type ST;
1624
1625 const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1626#ifdef HAVE_TPETRA_MMM_TIMINGS
1627 using Teuchos::TimeMonitor;
1628 Teuchos::RCP<TimeMonitor> tm;
1629#endif
1630
1631 using Kokkos::MemoryUnmanaged;
1632
1633 TEUCHOS_TEST_FOR_EXCEPTION(permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1634 prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size() << " != "
1635 "permute_from_lids_d.size() = "
1636 << permute_from_lids_d.size() << ".");
1637 // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1638 // process, then the matrix is neither locally nor globally indexed.
1639 const bool locallyIndexed = sourceMatrix.isLocallyIndexed();
1640 TEUCHOS_TEST_FOR_EXCEPTION(!locallyIndexed, std::invalid_argument, prefix << "The input "
1641 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1642 TEUCHOS_TEST_FOR_EXCEPTION(((size_t)import_lids_d.size()) != num_packets_per_lid_d.size(), std::invalid_argument,
1643 prefix << "import_lids_d.size() = " << import_lids_d.size() << " != "
1644 "num_packets_per_lid_d.size() = "
1645 << num_packets_per_lid_d.size() << ".");
1646
1647 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1648
1649 // TargetNumNonzeros is number of nonzeros in local matrix.
1650#ifdef HAVE_TPETRA_MMM_TIMINGS
1651 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount"))));
1652#endif
1653 size_t TargetNumNonzeros =
1654 UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1655 local_matrix, permute_from_lids_d, imports_d,
1656 num_packets_per_lid_d, numSameIDs);
1657#ifdef HAVE_TPETRA_MMM_TIMINGS
1658 tm = Teuchos::null;
1659#endif
1660
1661#ifdef HAVE_TPETRA_MMM_TIMINGS
1662 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers"))));
1663#endif
1664 CRS_rowptr.resize(TargetNumRows + 1);
1667 Teuchos::ArrayRCP<ST> const& CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast<ST>(CRS_vals);
1668#ifdef HAVE_TPETRA_MMM_TIMINGS
1669 tm = Teuchos::null;
1670#endif
1671
1673 permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1674 prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size()
1675 << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1676
1677 // Preseed TargetPids with -1 for local
1678 if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1680 }
1681 TargetPids.assign(TargetNumNonzeros, -1);
1682
1683 // Grab pointers for sourceMatrix
1684 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1685
1686#ifdef HAVE_TPETRA_MMM_TIMINGS
1687 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs"))));
1688#endif
1689 // Convert input arrays to Kokkos::Views
1690 DT outputDevice;
1691
1692 auto crs_rowptr_d =
1694 CRS_rowptr.size(), true, "crs_rowptr");
1695
1696 auto crs_colind_d =
1698 CRS_colind.size(), true, "crs_colidx");
1699#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1700 static_assert(!std::is_same<
1701 typename std::remove_const<
1702 typename std::decay<
1703 decltype(CRS_vals_impl_scalar_type)>::type::value_type>::type,
1704 std::complex<double>>::value,
1705 "CRS_vals::value_type is std::complex<double>; this should never happen"
1706 ", since std::complex does not work in Kokkos::View objects.");
1707#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1708
1709 auto crs_vals_d =
1711 CRS_vals_impl_scalar_type.size(), true, "crs_vals");
1712
1713#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1714 static_assert(!std::is_same<
1715 typename decltype(crs_vals_d)::non_const_value_type,
1716 std::complex<double>>::value,
1717 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1718 "never happen, since std::complex does not work in Kokkos::View objects.");
1719#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1720
1721 auto src_pids_d =
1723 SourcePids.size(), true, "src_pids");
1724
1725 auto tgt_pids_d =
1727 TargetPids.size(), true, "tgt_pids");
1728
1729#ifdef HAVE_TPETRA_MMM_TIMINGS
1730 tm = Teuchos::null;
1731#endif
1732
1733 size_t bytes_per_value = 0;
1735 // assume that ST is default constructible
1736 bytes_per_value = PackTraits<ST>::packValueCount(ST());
1737 } else {
1738 // Since the packed data come from the source matrix, we can use the source
1739 // matrix to get the number of bytes per Scalar value stored in the matrix.
1740 // This assumes that all Scalar values in the source matrix require the same
1741 // number of bytes. If the source matrix has no entries on the calling
1742 // process, then we hope that some process does have some idea how big
1743 // a Scalar value is. Of course, if no processes have any entries, then no
1744 // values should be packed (though this does assume that in our packing
1745 // scheme, rows with zero entries take zero bytes).
1746 size_t bytes_per_value_l = 0;
1747 if (local_matrix.values.extent(0) > 0) {
1748 const ST& val = local_matrix.values(0);
1750 } else {
1751 const ST& val = crs_vals_d(0);
1753 }
1754 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1755 Teuchos::REDUCE_MAX,
1757 outArg(bytes_per_value));
1758 }
1759
1760#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1761 static_assert(!std::is_same<
1762 typename decltype(crs_vals_d)::non_const_value_type,
1763 std::complex<double>>::value,
1764 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1765 "never happen, since std::complex does not work in Kokkos::View objects.");
1766#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1767
1768#ifdef HAVE_TPETRA_MMM_TIMINGS
1769 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays"))));
1770#endif
1771 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1772 local_matrix, local_col_map, import_lids_d, imports_d,
1776 bytes_per_value);
1777#ifdef HAVE_TPETRA_MMM_TIMINGS
1778 tm = Teuchos::null;
1779#endif
1780
1781 // Copy outputs back to host
1782#ifdef HAVE_TPETRA_MMM_TIMINGS
1783 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host"))));
1784#endif
1785 typename decltype(crs_rowptr_d)::host_mirror_type crs_rowptr_h(
1786 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1787 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1788 deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1789
1790 typename decltype(crs_colind_d)::host_mirror_type crs_colind_h(
1791 CRS_colind.getRawPtr(), CRS_colind.size());
1792 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1793 deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1794
1795 typename decltype(crs_vals_d)::host_mirror_type crs_vals_h(
1797 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1798 deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1799
1800 typename decltype(tgt_pids_d)::host_mirror_type tgt_pids_h(
1801 TargetPids.getRawPtr(), TargetPids.size());
1802 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1803 deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
1804
1805} // unpackAndCombineIntoCrsArrays
1806
1807} // namespace Details
1808} // namespace Tpetra
1809
1810#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON(ST, LO, GO, NT) \
1811 template void \
1812 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT>( \
1813 const CrsMatrix<ST, LO, GO, NT>&, \
1814 const Teuchos::ArrayView<const char>&, \
1815 const Teuchos::ArrayView<const size_t>&, \
1816 const Teuchos::ArrayView<const LO>&, \
1817 size_t, \
1818 CombineMode); \
1819 template size_t \
1820 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT>( \
1821 const CrsMatrix<ST, LO, GO, NT>&, \
1822 const Teuchos::ArrayView<const LO>&, \
1823 const Teuchos::ArrayView<const char>&, \
1824 const Teuchos::ArrayView<const size_t>&, \
1825 size_t, \
1826 CombineMode, \
1827 size_t, \
1828 const Teuchos::ArrayView<const LO>&, \
1829 const Teuchos::ArrayView<const LO>&); \
1830 template void \
1831 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT>( \
1832 const CrsMatrix<ST, LO, GO, NT>&, \
1833 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1834 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1835 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1836 const size_t, \
1837 const CombineMode); \
1838 template void \
1839 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1840 const CrsMatrix<ST, LO, GO, NT>&, \
1841 const Kokkos::View<LO const*, \
1842 Kokkos::Device<typename NT::device_type::execution_space, \
1843 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1844 void, void>, \
1845 const Kokkos::View<const char*, \
1846 Kokkos::Device<typename NT::device_type::execution_space, \
1847 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1848 void, void>, \
1849 const Kokkos::View<const size_t*, \
1850 Kokkos::Device<typename NT::device_type::execution_space, \
1851 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1852 void, void>, \
1853 const size_t, \
1854 const Kokkos::View<LO const*, \
1855 Kokkos::Device<typename NT::device_type::execution_space, \
1856 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1857 void, void>, \
1858 const Kokkos::View<LO const*, \
1859 Kokkos::Device<typename NT::device_type::execution_space, \
1860 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1861 void, void>, \
1862 size_t, \
1863 const int, \
1864 Kokkos::View<size_t*, typename NT::device_type>&, \
1865 Kokkos::View<GO*, typename NT::device_type>&, \
1866 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*, typename NT::device_type>&, \
1867 const Teuchos::ArrayView<const int>&, \
1868 Kokkos::View<int*, typename NT::device_type>&); \
1869 template void \
1870 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1871 const CrsMatrix<ST, LO, GO, NT>&, \
1872 const Kokkos::View<LO const*, \
1873 Kokkos::Device<typename NT::device_type::execution_space, \
1874 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1875 void, void>, \
1876 const Kokkos::View<const char*, \
1877 Kokkos::Device<typename NT::device_type::execution_space, \
1878 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1879 void, void>, \
1880 const Kokkos::View<const size_t*, \
1881 Kokkos::Device<typename NT::device_type::execution_space, \
1882 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1883 void, void>, \
1884 const size_t, \
1885 const Kokkos::View<LO const*, \
1886 Kokkos::Device<typename NT::device_type::execution_space, \
1887 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1888 void, void>, \
1889 const Kokkos::View<LO const*, \
1890 Kokkos::Device<typename NT::device_type::execution_space, \
1891 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1892 void, void>, \
1893 size_t, \
1894 const int, \
1895 Teuchos::ArrayRCP<size_t>&, \
1896 Teuchos::ArrayRCP<GO>&, \
1897 Teuchos::ArrayRCP<ST>&, \
1898 const Teuchos::ArrayView<const int>&, \
1899 Teuchos::Array<int>&);
1900
1901#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF(ST, LO, GO, NT) \
1902 template void \
1903 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT>( \
1904 const CrsMatrix<ST, LO, GO, NT>&, \
1905 const Teuchos::ArrayView<const char>&, \
1906 const Teuchos::ArrayView<const size_t>&, \
1907 const Teuchos::ArrayView<const LO>&, \
1908 size_t, \
1909 CombineMode); \
1910 template size_t \
1911 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT>( \
1912 const CrsMatrix<ST, LO, GO, NT>&, \
1913 const Teuchos::ArrayView<const LO>&, \
1914 const Teuchos::ArrayView<const char>&, \
1915 const Teuchos::ArrayView<const size_t>&, \
1916 size_t, \
1917 CombineMode, \
1918 size_t, \
1919 const Teuchos::ArrayView<const LO>&, \
1920 const Teuchos::ArrayView<const LO>&); \
1921 template void \
1922 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT>( \
1923 const CrsMatrix<ST, LO, GO, NT>&, \
1924 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1925 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1926 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1927 const size_t, \
1928 const CombineMode); \
1929 template void \
1930 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1931 const CrsMatrix<ST, LO, GO, NT>&, \
1932 const Kokkos::View<LO const*, \
1933 Kokkos::Device<typename NT::device_type::execution_space, \
1934 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1935 const Kokkos::View<const char*, \
1936 Kokkos::Device<typename NT::device_type::execution_space, \
1937 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1938 const Kokkos::View<const size_t*, \
1939 Kokkos::Device<typename NT::device_type::execution_space, \
1940 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1941 const size_t, \
1942 const Kokkos::View<LO const*, \
1943 Kokkos::Device<typename NT::device_type::execution_space, \
1944 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1945 const Kokkos::View<LO const*, \
1946 Kokkos::Device<typename NT::device_type::execution_space, \
1947 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1948 size_t, \
1949 const int, \
1950 Kokkos::View<size_t*, typename NT::device_type>&, \
1951 Kokkos::View<GO*, typename NT::device_type>&, \
1952 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*, typename NT::device_type>&, \
1953 const Teuchos::ArrayView<const int>&, \
1954 Kokkos::View<int*, typename NT::device_type>&); \
1955 template void \
1956 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1957 const CrsMatrix<ST, LO, GO, NT>&, \
1958 const Kokkos::View<LO const*, \
1959 Kokkos::Device<typename NT::device_type::execution_space, \
1960 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1961 const Kokkos::View<const char*, \
1962 Kokkos::Device<typename NT::device_type::execution_space, \
1963 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1964 const Kokkos::View<const size_t*, \
1965 Kokkos::Device<typename NT::device_type::execution_space, \
1966 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1967 const size_t, \
1968 const Kokkos::View<LO const*, \
1969 Kokkos::Device<typename NT::device_type::execution_space, \
1970 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1971 const Kokkos::View<LO const*, \
1972 Kokkos::Device<typename NT::device_type::execution_space, \
1973 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1974 size_t, \
1975 const int, \
1976 Teuchos::ArrayRCP<size_t>&, \
1977 Teuchos::ArrayRCP<GO>&, \
1978 Teuchos::ArrayRCP<ST>&, \
1979 const Teuchos::ArrayView<const int>&, \
1980 Teuchos::Array<int>&);
1981
1982#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1983#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT(ST, LO, GO, NT) \
1984 TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON(ST, LO, GO, NT)
1985#else
1986#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT(ST, LO, GO, NT) \
1987 TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF(ST, LO, GO, NT)
1988#endif
1989
1990#endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Declaration of the Tpetra::CrsMatrix class.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
size_t compute_total_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Total number of entries in any row of the packed matrix.
void unpackAndCombineIntoCrsMatrix(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< const char *, BufferDeviceType > &imports, const Kokkos::View< const size_t *, BufferDeviceType > &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type import_lids, const Tpetra::CombineMode combine_mode)
Perform the unpack operation for the matrix.
size_t compute_maximum_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Maximum number of entries in any row of the packed matrix.
bool compute_batch_info(const View1 &batches_per_lid, View2 &batch_info)
Compute the index and batch number associated with each batch.
Struct that holds views of the contents of a CrsMatrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
"Local" part of Map suitable for Kokkos kernels.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
DeviceType device_type
The device type.
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Implementation details of Tpetra.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
Traits class for packing / unpacking data of type T.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.