Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_packCrsMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
11#define TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
12
13#include "TpetraCore_config.h"
14#include "Teuchos_Array.hpp"
15#include "Teuchos_ArrayView.hpp"
24#include <memory>
25#include <sstream>
26#include <stdexcept>
27#include <string>
28
51
52namespace Tpetra {
53
54//
55// Users must never rely on anything in the Details namespace.
56//
57namespace Details {
58
59namespace PackCrsMatrixImpl {
67template <class OutputOffsetsViewType,
68 class CountsViewType,
69 class InputOffsetsViewType,
70 class InputLocalRowIndicesViewType,
71 class InputLocalRowPidsViewType,
72 const bool debug =
73#ifdef HAVE_TPETRA_DEBUG
74 true
75#else
76 false
77#endif // HAVE_TPETRA_DEBUG
78 >
80 public:
81 typedef typename OutputOffsetsViewType::non_const_value_type output_offset_type;
82 typedef typename CountsViewType::non_const_value_type count_type;
83 typedef typename InputOffsetsViewType::non_const_value_type input_offset_type;
84 typedef typename InputLocalRowIndicesViewType::non_const_value_type local_row_index_type;
85 typedef typename InputLocalRowPidsViewType::non_const_value_type local_row_pid_type;
86 // output Views drive where execution happens.
87 typedef typename OutputOffsetsViewType::device_type device_type;
88 static_assert(std::is_same<typename CountsViewType::device_type::execution_space,
89 typename device_type::execution_space>::value,
90 "OutputOffsetsViewType and CountsViewType must have the same execution space.");
91 static_assert(Kokkos::is_view<OutputOffsetsViewType>::value,
92 "OutputOffsetsViewType must be a Kokkos::View.");
93 static_assert(std::is_same<typename OutputOffsetsViewType::value_type, output_offset_type>::value,
94 "OutputOffsetsViewType must be a nonconst Kokkos::View.");
95 static_assert(std::is_integral<output_offset_type>::value,
96 "The type of each entry of OutputOffsetsViewType must be a built-in integer type.");
97 static_assert(Kokkos::is_view<CountsViewType>::value,
98 "CountsViewType must be a Kokkos::View.");
99 static_assert(std::is_same<typename CountsViewType::value_type, output_offset_type>::value,
100 "CountsViewType must be a nonconst Kokkos::View.");
101 static_assert(std::is_integral<count_type>::value,
102 "The type of each entry of CountsViewType must be a built-in integer type.");
103 static_assert(Kokkos::is_view<InputOffsetsViewType>::value,
104 "InputOffsetsViewType must be a Kokkos::View.");
105 static_assert(std::is_integral<input_offset_type>::value,
106 "The type of each entry of InputOffsetsViewType must be a built-in integer type.");
107 static_assert(Kokkos::is_view<InputLocalRowIndicesViewType>::value,
108 "InputLocalRowIndicesViewType must be a Kokkos::View.");
109 static_assert(std::is_integral<local_row_index_type>::value,
110 "The type of each entry of InputLocalRowIndicesViewType must be a built-in integer type.");
111
113 const CountsViewType& counts,
117 const count_type sizeOfLclCount,
118 const count_type sizeOfGblColInd,
119 const count_type sizeOfPid,
120 const count_type sizeOfValue)
121 : outputOffsets_(outputOffsets)
122 , counts_(counts)
123 , rowOffsets_(rowOffsets)
124 , lclRowInds_(lclRowInds)
125 , lclRowPids_(lclRowPids)
126 , sizeOfLclCount_(sizeOfLclCount)
127 , sizeOfGblColInd_(sizeOfGblColInd)
128 , sizeOfPid_(sizeOfPid)
129 , sizeOfValue_(sizeOfValue)
130 , error_("error") // don't forget this, or you'll get segfaults!
131 {
132 if (debug) {
133 const size_t numRowsToPack = static_cast<size_t>(lclRowInds_.extent(0));
134
135 if (numRowsToPack != static_cast<size_t>(counts_.extent(0))) {
136 std::ostringstream os;
137 os << "lclRowInds.extent(0) = " << numRowsToPack
138 << " != counts.extent(0) = " << counts_.extent(0)
139 << ".";
140 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str());
141 }
142 if (static_cast<size_t>(numRowsToPack + 1) !=
143 static_cast<size_t>(outputOffsets_.extent(0))) {
144 std::ostringstream os;
145 os << "lclRowInds.extent(0) + 1 = " << (numRowsToPack + 1)
146 << " != outputOffsets.extent(0) = " << outputOffsets_.extent(0)
147 << ".";
148 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str());
149 }
150 }
151 }
152
154 operator()(const local_row_index_type& curInd,
155 output_offset_type& update,
156 const bool final) const {
157 if (debug) {
158 if (curInd < static_cast<local_row_index_type>(0)) {
159 error_() = 1;
160 return;
161 }
162 }
163
164 if (final) {
165 if (debug) {
166 if (curInd >= static_cast<local_row_index_type>(outputOffsets_.extent(0))) {
167 error_() = 2;
168 return;
169 }
170 }
171 outputOffsets_(curInd) = update;
172 }
173
174 if (curInd < static_cast<local_row_index_type>(counts_.extent(0))) {
175 const auto lclRow = lclRowInds_(curInd);
176 if (static_cast<size_t>(lclRow + 1) >= static_cast<size_t>(rowOffsets_.extent(0)) ||
177 static_cast<local_row_index_type>(lclRow) < static_cast<local_row_index_type>(0)) {
178 error_() = 3;
179 return;
180 }
181 // count_type could differ from the type of each row offset.
182 // For example, row offsets might each be 64 bits, but if their
183 // difference always fits in 32 bits, we may then safely use a
184 // 32-bit count_type.
185 const count_type count =
186 static_cast<count_type>(rowOffsets_(lclRow + 1) - rowOffsets_(lclRow));
187
188 // We pack first the number of entries in the row, then that
189 // many global column indices, then that many pids (if any),
190 // then that many values. However, if the number of entries in
191 // the row is zero, we pack nothing.
192 const count_type numBytes = (count == 0) ? static_cast<count_type>(0) : sizeOfLclCount_ + count * (sizeOfGblColInd_ + (lclRowPids_.size() > 0 ? sizeOfPid_ : 0) + sizeOfValue_);
193
194 if (final) {
195 counts_(curInd) = numBytes;
196 }
197 update += numBytes;
198 }
199 }
200
201 // mfh 31 May 2017: Don't need init or join. If you have join, MUST
202 // have join both with and without volatile! Otherwise intrawarp
203 // joins are really slow on GPUs.
204
206 int getError() const {
207 auto error_h = Kokkos::create_mirror_view(error_);
208 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
209 // Note: In the UVM case, this would otherwise be a no-op
210 // and thus not fence, so the value might not be correct on return
211 // In the non-UVM case, create_mirror_view will block for the allocation
212 Kokkos::deep_copy(error_h, error_);
213 return error_h();
214 }
215
216 private:
217 OutputOffsetsViewType outputOffsets_;
218 CountsViewType counts_;
219 typename InputOffsetsViewType::const_type rowOffsets_;
220 typename InputLocalRowIndicesViewType::const_type lclRowInds_;
221 typename InputLocalRowPidsViewType::const_type lclRowPids_;
222 count_type sizeOfLclCount_;
223 count_type sizeOfGblColInd_;
224 count_type sizeOfPid_;
225 count_type sizeOfValue_;
226 Kokkos::View<int, device_type> error_;
227};
228
238template <class OutputOffsetsViewType,
239 class CountsViewType,
243typename CountsViewType::non_const_value_type
244computeNumPacketsAndOffsets(const OutputOffsetsViewType& outputOffsets,
245 const CountsViewType& counts,
249 const typename CountsViewType::non_const_value_type sizeOfLclCount,
250 const typename CountsViewType::non_const_value_type sizeOfGblColInd,
251 const typename CountsViewType::non_const_value_type sizeOfPid,
252 const typename CountsViewType::non_const_value_type sizeOfValue) {
254 CountsViewType, typename InputOffsetsViewType::const_type,
255 typename InputLocalRowIndicesViewType::const_type,
256 typename InputLocalRowPidsViewType::const_type>
258 typedef typename CountsViewType::non_const_value_type count_type;
259 typedef typename OutputOffsetsViewType::size_type size_type;
260 typedef typename OutputOffsetsViewType::execution_space execution_space;
261 typedef typename functor_type::local_row_index_type LO;
262 typedef Kokkos::RangePolicy<execution_space, LO> range_type;
263 const char prefix[] = "computeNumPacketsAndOffsets: ";
264
265 count_type count = 0;
266 const count_type numRowsToPack = lclRowInds.extent(0);
267
268 if (numRowsToPack == 0) {
269 return count;
270 } else {
271 TEUCHOS_TEST_FOR_EXCEPTION(rowOffsets.extent(0) <= static_cast<size_type>(1),
272 std::invalid_argument, prefix << "There is at least one row to pack, "
273 "but the matrix has no rows. lclRowInds.extent(0) = "
274 << numRowsToPack << ", but rowOffsets.extent(0) = " << rowOffsets.extent(0) << " <= 1.");
276 static_cast<size_type>(numRowsToPack + 1),
277 std::invalid_argument,
278 prefix << "Output dimension does not match number of rows to pack. "
279 << "outputOffsets.extent(0) = " << outputOffsets.extent(0)
280 << " != lclRowInds.extent(0) + 1 = "
281 << static_cast<size_type>(numRowsToPack + 1) << ".");
282 TEUCHOS_TEST_FOR_EXCEPTION(counts.extent(0) != numRowsToPack, std::invalid_argument,
283 prefix << "counts.extent(0) = " << counts.extent(0)
284 << " != numRowsToPack = " << numRowsToPack << ".");
285
289 Kokkos::parallel_scan(range_type(0, numRowsToPack + 1), f);
290
291 // At least in debug mode, this functor checks for errors.
292 const int errCode = f.getError();
293 TEUCHOS_TEST_FOR_EXCEPTION(errCode != 0, std::runtime_error, prefix << "parallel_scan error code " << errCode << " != 0.");
294
295#if 0
296 size_t total = 0;
297 for (LO k = 0; k < numRowsToPack; ++k) {
298 total += counts[k];
299 }
301 if (errStr.get () == NULL) {
302 errStr = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
303 }
304 std::ostringstream& os = *errStr;
305 os << prefix
306 << "outputOffsets(numRowsToPack=" << numRowsToPack << ") "
307 << outputOffsets(numRowsToPack) << " != sum of counts = "
308 << total << "." << std::endl;
309 if (numRowsToPack != 0) {
310 // Only print the array if it's not too long.
311 if (numRowsToPack < static_cast<LO> (10)) {
312 os << "outputOffsets: [";
313 for (LO i = 0; i <= numRowsToPack; ++i) {
314 os << outputOffsets(i);
315 if (static_cast<LO> (i + 1) <= numRowsToPack) {
316 os << ",";
317 }
318 }
319 os << "]" << std::endl;
320 os << "counts: [";
321 for (LO i = 0; i < numRowsToPack; ++i) {
322 os << counts(i);
323 if (static_cast<LO> (i + 1) < numRowsToPack) {
324 os << ",";
325 }
326 }
327 os << "]" << std::endl;
328 }
329 else {
330 os << "outputOffsets(" << (numRowsToPack-1) << ") = "
331 << outputOffsets(numRowsToPack-1) << "." << std::endl;
332 }
333 }
335 return {false, errStr};
336 }
337#endif // HAVE_TPETRA_DEBUG
338
339 // Get last entry of outputOffsets, which is the sum of the entries
340 // of counts. Don't assume UVM.
341 using Tpetra::Details::getEntryOnHost;
342 return static_cast<count_type>(getEntryOnHost(outputOffsets,
344 }
345}
346
362template <class ST, class ColumnMap, class BufferDeviceType>
364 Kokkos::pair<int, size_t>
366 const Kokkos::View<char*, BufferDeviceType>& exports,
370 const size_t offset,
371 const size_t num_ent,
372 const size_t num_bytes_per_value,
373 const bool pack_pids) {
374 using Kokkos::subview;
375 using LO = typename ColumnMap::local_ordinal_type;
376 using GO = typename ColumnMap::global_ordinal_type;
377 using return_type = Kokkos::pair<int, size_t>;
378
379 if (num_ent == 0) {
380 // Empty rows always take zero bytes, to ensure sparsity.
381 return return_type(0, 0);
382 }
383
384 const LO num_ent_LO = static_cast<LO>(num_ent); // packValueCount wants this
385 const size_t num_ent_beg = offset;
387
388 const size_t gids_beg = num_ent_beg + num_ent_len;
389 const size_t gids_len = num_ent * PackTraits<GO>::packValueCount(GO(0));
390
391 const size_t pids_beg = gids_beg + gids_len;
392 const size_t pids_len = pack_pids ? num_ent * PackTraits<int>::packValueCount(int(0)) : static_cast<size_t>(0);
393
394 const size_t vals_beg = gids_beg + gids_len + pids_len;
395 const size_t vals_len = num_ent * num_bytes_per_value;
396
397 char* const num_ent_out = exports.data() + num_ent_beg;
398 char* const gids_out = exports.data() + gids_beg;
399 char* const pids_out = pack_pids ? exports.data() + pids_beg : NULL;
400 char* const vals_out = exports.data() + vals_beg;
401
402 size_t num_bytes_out = 0;
403 int error_code = 0;
405
406 {
407 // Copy column indices one at a time, so that we don't need
408 // temporary storage.
409 for (size_t k = 0; k < num_ent; ++k) {
410 const LO lid = lids_in[k];
411 const GO gid = col_map.getGlobalElement(lid);
413 }
414 // Copy PIDs one at a time, so that we don't need temporary storage.
415 if (pack_pids) {
416 for (size_t k = 0; k < num_ent; ++k) {
417 const LO lid = lids_in[k];
418 const int pid = pids_in[lid];
420 }
421 }
422 const auto p =
424 error_code += p.first;
425 num_bytes_out += p.second;
426 }
427
428 if (error_code != 0) {
429 return return_type(10, num_bytes_out);
430 }
431
432 const size_t expected_num_bytes =
435 return return_type(11, num_bytes_out);
436 }
437 return return_type(0, num_bytes_out);
438}
439
440template <class LocalMatrix, class LocalMap, class BufferDeviceType>
441struct PackCrsMatrixFunctor {
442 typedef LocalMatrix local_matrix_device_type;
443 typedef LocalMap local_map_type;
444 typedef typename local_matrix_device_type::value_type ST;
445 typedef typename local_map_type::local_ordinal_type LO;
446 typedef typename local_map_type::global_ordinal_type GO;
447 typedef typename local_matrix_device_type::device_type DT;
448
449 typedef Kokkos::View<const size_t*, BufferDeviceType>
450 num_packets_per_lid_view_type;
451 typedef Kokkos::View<const size_t*, BufferDeviceType> offsets_view_type;
452 typedef Kokkos::View<char*, BufferDeviceType> exports_view_type;
453 using export_lids_view_type = typename PackTraits<LO>::input_array_type;
454 using source_pids_view_type = typename PackTraits<int>::input_array_type;
455
456 typedef typename num_packets_per_lid_view_type::non_const_value_type
457 count_type;
458 typedef typename offsets_view_type::non_const_value_type
459 offset_type;
460 typedef Kokkos::pair<int, LO> value_type;
461
462 static_assert(std::is_same<LO, typename local_matrix_device_type::ordinal_type>::value,
463 "local_map_type::local_ordinal_type and "
464 "local_matrix_device_type::ordinal_type must be the same.");
465
466 local_matrix_device_type local_matrix;
467 local_map_type local_col_map;
468 exports_view_type exports;
469 num_packets_per_lid_view_type num_packets_per_lid;
470 export_lids_view_type export_lids;
471 source_pids_view_type source_pids;
472 offsets_view_type offsets;
473 size_t num_bytes_per_value;
474 bool pack_pids;
475
476 PackCrsMatrixFunctor(const local_matrix_device_type& local_matrix_in,
477 const local_map_type& local_col_map_in,
478 const exports_view_type& exports_in,
479 const num_packets_per_lid_view_type& num_packets_per_lid_in,
480 const export_lids_view_type& export_lids_in,
481 const source_pids_view_type& source_pids_in,
482 const offsets_view_type& offsets_in,
483 const size_t num_bytes_per_value_in,
484 const bool pack_pids_in)
485 : local_matrix(local_matrix_in)
486 , local_col_map(local_col_map_in)
487 , exports(exports_in)
488 , num_packets_per_lid(num_packets_per_lid_in)
489 , export_lids(export_lids_in)
490 , source_pids(source_pids_in)
491 , offsets(offsets_in)
492 , num_bytes_per_value(num_bytes_per_value_in)
493 , pack_pids(pack_pids_in) {
494 const LO numRows = local_matrix_in.numRows();
495 const LO rowMapDim =
496 static_cast<LO>(local_matrix.graph.row_map.extent(0));
497 TEUCHOS_TEST_FOR_EXCEPTION(numRows != 0 && rowMapDim != numRows + static_cast<LO>(1),
498 std::logic_error, "local_matrix.graph.row_map.extent(0) = " << rowMapDim << " != numRows (= " << numRows << " ) + 1.");
499 }
500
501 KOKKOS_INLINE_FUNCTION void init(value_type& dst) const {
502 using ::Tpetra::Details::OrdinalTraits;
503 dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
504 }
505
506 KOKKOS_INLINE_FUNCTION void
507 join(value_type& dst, const value_type& src) const {
508 // `dst` should reflect the first (least) bad index and all other
509 // associated error codes and data, so prefer keeping it.
510 if (src.first != 0 && dst.first == 0) {
511 dst = src;
512 }
513 }
514
515 KOKKOS_INLINE_FUNCTION
516 void operator()(const LO i, value_type& dst) const {
517 const size_t offset = offsets[i];
518 const LO export_lid = export_lids[i];
519 const size_t buf_size = exports.size();
520 const size_t num_bytes = num_packets_per_lid(i);
521 const size_t num_ent =
522 static_cast<size_t>(local_matrix.graph.row_map[export_lid + 1] - local_matrix.graph.row_map[export_lid]);
523
524 // Only pack this row's data if it has a nonzero number of
525 // entries. We can do this because receiving processes get the
526 // number of packets, and will know that zero packets means zero
527 // entries.
528 if (num_ent == 0) {
529 return;
530 }
531
532 if (export_lid >= local_matrix.numRows()) {
533 if (dst.first != 0) { // keep only the first error
534 dst = Kokkos::make_pair(1, i); // invalid row
535 }
536 return;
537 } else if ((offset > buf_size || offset + num_bytes > buf_size)) {
538 if (dst.first != 0) { // keep only the first error
539 dst = Kokkos::make_pair(2, i); // out of bounds
540 }
541 return;
542 }
543
544 // We can now pack this row
545
546 // Since the matrix is locally indexed on the calling process, we
547 // have to use its column Map (which it _must_ have in this case)
548 // to convert to global indices.
549 const auto row_beg = local_matrix.graph.row_map[export_lid];
550 const auto row_end = local_matrix.graph.row_map[export_lid + 1];
551 auto vals_in = subview(local_matrix.values,
552 Kokkos::make_pair(row_beg, row_end));
553 auto lids_in = subview(local_matrix.graph.entries,
554 Kokkos::make_pair(row_beg, row_end));
555 typedef local_map_type LMT;
556 typedef BufferDeviceType BDT;
557 auto p = packCrsMatrixRow<ST, LMT, BDT>(local_col_map, exports, lids_in,
558 source_pids, vals_in, offset,
559 num_ent, num_bytes_per_value,
560 pack_pids);
561 int error_code_this_row = p.first;
562 size_t num_bytes_packed_this_row = p.second;
563 if (error_code_this_row != 0) {
564 if (dst.first != 0) { // keep only the first error
565 dst = Kokkos::make_pair(error_code_this_row, i); // bad pack
566 }
567 } else if (num_bytes_packed_this_row != num_bytes) {
568 if (dst.first != 0) { // keep only the first error
569 dst = Kokkos::make_pair(3, i);
570 }
571 }
572 }
573};
574
582template <class LocalMatrix, class LocalMap, class BufferDeviceType>
583void do_pack(const LocalMatrix& local_matrix,
584 const LocalMap& local_map,
585 const Kokkos::View<char*, BufferDeviceType>& exports,
586 const typename PackTraits<size_t>::input_array_type& num_packets_per_lid,
588 const typename PackTraits<int>::input_array_type& source_pids,
589 const Kokkos::View<const size_t*, BufferDeviceType>& offsets,
590 const size_t num_bytes_per_value,
591 const bool pack_pids) {
592 using LO = typename LocalMap::local_ordinal_type;
593 using DT = typename LocalMatrix::device_type;
594 using range_type = Kokkos::RangePolicy<typename DT::execution_space, LO>;
595 const char prefix[] = "Tpetra::Details::do_pack: ";
596
597 if (export_lids.extent(0) != 0) {
598 TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(offsets.extent(0)) !=
599 static_cast<size_t>(export_lids.extent(0) + 1),
600 std::invalid_argument, prefix << "offsets.extent(0) = " << offsets.extent(0) << " != export_lids.extent(0) (= " << export_lids.extent(0) << ") + 1.");
601 TEUCHOS_TEST_FOR_EXCEPTION(export_lids.extent(0) != num_packets_per_lid.extent(0),
602 std::invalid_argument, prefix << "export_lids.extent(0) = " << export_lids.extent(0) << " != num_packets_per_lid.extent(0) = " << num_packets_per_lid.extent(0) << ".");
603 // If exports has nonzero length at this point, then the matrix
604 // has at least one entry to pack. Thus, if packing process
605 // ranks, we had better have at least one process rank to pack.
606 TEUCHOS_TEST_FOR_EXCEPTION(pack_pids && exports.extent(0) != 0 &&
607 source_pids.extent(0) == 0,
608 std::invalid_argument, prefix << "pack_pids is true, and exports.extent(0) = " << exports.extent(0) << " != 0, meaning that we need to pack at "
609 "least one matrix entry, but source_pids.extent(0) = 0.");
610 }
611
612 using pack_functor_type =
614 pack_functor_type f(local_matrix, local_map, exports,
615 num_packets_per_lid, export_lids,
616 source_pids, offsets, num_bytes_per_value,
617 pack_pids);
618
619 typename pack_functor_type::value_type result;
620 range_type range(0, num_packets_per_lid.extent(0));
621 Kokkos::parallel_reduce(range, f, result);
622
623 if (result.first != 0) {
624 // We can't deep_copy from AnonymousSpace Views, so we can't print
625 // out any information from them in case of error.
626 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, prefix << "PackCrsMatrixFunctor "
627 "reported error code "
628 << result.first << " for the first "
629 "bad row "
630 << result.second << ".");
631 }
632}
633
663template <typename ST, typename LO, typename GO, typename NT, typename BufferDeviceType>
665 Kokkos::DualView<char*, BufferDeviceType>& exports,
666 const Kokkos::View<size_t*, BufferDeviceType>& num_packets_per_lid,
667 const Kokkos::View<const LO*, BufferDeviceType>& export_lids,
668 const Kokkos::View<const int*, typename NT::device_type>& export_pids,
669 size_t& constant_num_packets,
670 const bool pack_pids) {
672 "Tpetra::Details::PackCrsMatrixImpl::packCrsMatrix",
673 "Import/Export");
674 using Kokkos::View;
675 typedef BufferDeviceType DT;
676 typedef Kokkos::DualView<char*, BufferDeviceType> exports_view_type;
677 const char prefix[] = "Tpetra::Details::PackCrsMatrixImpl::packCrsMatrix: ";
678 constexpr bool debug = false;
679
680 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
681 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
682
683 // Setting this to zero tells the caller to expect a possibly
684 // different ("nonconstant") number of packets per local index
685 // (i.e., a possibly different number of entries per row).
687
688 const size_t num_export_lids =
689 static_cast<size_t>(export_lids.extent(0));
691 static_cast<size_t>(num_packets_per_lid.extent(0)),
692 std::invalid_argument, prefix << "num_export_lids.extent(0) = " << num_export_lids << " != num_packets_per_lid.extent(0) = " << num_packets_per_lid.extent(0) << ".");
693 if (num_export_lids != 0) {
694 TEUCHOS_TEST_FOR_EXCEPTION(num_packets_per_lid.data() == NULL, std::invalid_argument,
695 prefix << "num_export_lids = " << num_export_lids << " != 0, but "
696 "num_packets_per_lid.data() = "
697 << num_packets_per_lid.data() << " == NULL.");
698 }
699
703
704 size_t num_bytes_per_value = 0;
706 // Assume ST is default constructible; packValueCount wants an instance.
707 num_bytes_per_value = PackTraits<ST>::packValueCount(ST());
708 } else {
709 // Since the packed data come from the source matrix, we can use
710 // the source matrix to get the number of bytes per Scalar value
711 // stored in the matrix. This assumes that all Scalar values in
712 // the source matrix require the same number of bytes. If the
713 // source matrix has no entries on the calling process, then we
714 // hope that some process does have some idea how big a Scalar
715 // value is. Of course, if no processes have any entries, then no
716 // values should be packed (though this does assume that in our
717 // packing scheme, rows with zero entries take zero bytes).
718 size_t num_bytes_per_value_l = 0;
719 if (local_matrix.values.extent(0) > 0) {
720 const ST& val = local_matrix.values(0);
722 }
723 using Teuchos::reduceAll;
725 Teuchos::REDUCE_MAX,
727 Teuchos::outArg(num_bytes_per_value));
728 }
729
730 if (num_export_lids == 0) {
731 exports = exports_view_type("exports", 0);
732 return;
733 }
734
735 // Array of offsets into the pack buffer.
736 Kokkos::View<size_t*, DT> offsets("offsets", num_export_lids + 1);
737
738 // Compute number of packets per LID (row to send), as well as
739 // corresponding offsets (the prefix sum of the packet counts).
740 const size_t count =
741 computeNumPacketsAndOffsets(offsets, num_packets_per_lid,
742 local_matrix.graph.row_map, export_lids,
745 num_bytes_per_pid, num_bytes_per_value);
746
747 // Resize the output pack buffer if needed.
748 if (count > static_cast<size_t>(exports.extent(0))) {
749 exports = exports_view_type("exports", count);
750 if (debug) {
751 std::ostringstream os;
752 os << "*** exports resized to " << count << std::endl;
753 std::cerr << os.str();
754 }
755 }
756 if (debug) {
757 std::ostringstream os;
758 os << "*** count: " << count << ", exports.extent(0): "
759 << exports.extent(0) << std::endl;
760 std::cerr << os.str();
761 }
762
763 // If exports has nonzero length at this point, then the matrix has
764 // at least one entry to pack. Thus, if packing process ranks, we
765 // had better have at least one process rank to pack.
766 TEUCHOS_TEST_FOR_EXCEPTION(pack_pids && exports.extent(0) != 0 &&
767 export_pids.extent(0) == 0,
768 std::invalid_argument, prefix << "pack_pids is true, and exports.extent(0) = " << exports.extent(0) << " != 0, meaning that we need to pack at least "
769 "one matrix entry, but export_pids.extent(0) = 0.");
770
771 typedef typename std::decay<decltype(local_matrix)>::type
772 local_matrix_device_type;
773 typedef typename std::decay<decltype(local_col_map)>::type
774 local_map_type;
775
776 exports.modify_device();
777 auto exports_d = exports.view_device();
778 do_pack<local_matrix_device_type, local_map_type, DT>(local_matrix, local_col_map, exports_d, num_packets_per_lid,
779 export_lids, export_pids, offsets, num_bytes_per_value,
780 pack_pids);
781 // If we got this far, we succeeded.
782}
783
784} // namespace PackCrsMatrixImpl
785
786template <typename ST, typename LO, typename GO, typename NT>
788 Teuchos::Array<char>& exports,
789 const Teuchos::ArrayView<size_t>& numPacketsPerLID,
790 const Teuchos::ArrayView<const LO>& exportLIDs,
791 size_t& constantNumPackets) {
793 using buffer_device_type = typename DistObject<char, LO, GO, NT>::buffer_device_type;
794 using host_exec_space = typename Kokkos::View<size_t*, device_type>::host_mirror_type::execution_space;
795 using device_exec_space = typename device_type::execution_space;
796 using host_dev_type = Kokkos::Device<host_exec_space, Kokkos::HostSpace>;
797
798 // Convert all Teuchos::Array to Kokkos::View
799
800 // This is an output array, so we don't have to copy to device here.
801 // However, we'll have to remember to copy back to host when done.
802 Kokkos::View<size_t*, buffer_device_type> num_packets_per_lid_d =
803 create_mirror_view_from_raw_host_array(buffer_device_type(),
804 numPacketsPerLID.getRawPtr(),
805 numPacketsPerLID.size(), false,
806 "num_packets_per_lid");
807 // FIXME (mfh 05 Feb 2019) We should just pass the exportLIDs
808 // DualView through here, instead of recreating a device View from a
809 // host ArrayView that itself came from a DualView.
810 //
811 // This is an input array, so we have to copy to device here.
812 // However, we never need to copy it back to host.
813 Kokkos::View<const LO*, buffer_device_type> export_lids_d =
814 create_mirror_view_from_raw_host_array(buffer_device_type(),
815 exportLIDs.getRawPtr(),
816 exportLIDs.size(), true,
817 "export_lids");
818
819 Kokkos::View<int*, device_type> export_pids_d; // output arg
820 Kokkos::DualView<char*, buffer_device_type> exports_dv; // output arg
821 constexpr bool pack_pids = false;
822 PackCrsMatrixImpl::packCrsMatrix<ST, LO, GO, NT, buffer_device_type>(
825
826 // The counts are an output of PackCrsMatrixImpl::packCrsMatrix, so we have to
827 // copy them back to host.
828 Kokkos::View<size_t*, host_dev_type> num_packets_per_lid_h(numPacketsPerLID.getRawPtr(),
829 numPacketsPerLID.size());
830 // DEEP_COPY REVIEW - DEVICE-TO-HOST
832
833 // FIXME (mfh 23 Aug 2017) If we're forced to use a DualView for
834 // exports_dv above, then we have two host copies for exports_h.
835
836 // The exports are an output of PackCrsMatrixImpl::packCrsMatrix, so we have
837 // to copy them back to host.
838 if (static_cast<size_t>(exports.size()) !=
839 static_cast<size_t>(exports_dv.extent(0))) {
840 exports.resize(exports_dv.extent(0));
841 }
842 Kokkos::View<char*, host_dev_type> exports_h(exports.getRawPtr(),
843 exports.size());
844 // DEEP_COPY REVIEW - DEVICE-TO-HOST
845 Kokkos::deep_copy(device_exec_space(), exports_h, exports_dv.view_device());
846}
847
848template <typename ST, typename LO, typename GO, typename NT>
851 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exports,
852 const Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& numPacketsPerLID,
853 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exportLIDs,
854 size_t& constantNumPackets) {
855 using device_type = typename CrsMatrix<ST, LO, GO, NT>::device_type;
856 using buffer_device_type = typename DistObject<char, LO, GO, NT>::buffer_device_type;
857
858 // Create an empty array of PIDs, since the interface needs it.
859 Kokkos::View<int*, device_type> exportPIDs_d("exportPIDs", 0);
860 constexpr bool pack_pids = false;
861
862 // Write-only device access
863 auto numPacketsPerLID_nc = numPacketsPerLID; // const DV& -> DV
864 numPacketsPerLID_nc.clear_sync_state();
865 numPacketsPerLID_nc.modify_device();
866 auto numPacketsPerLID_d = numPacketsPerLID.view_device();
867
868 // Read-only device access
869 TEUCHOS_ASSERT(!exportLIDs.need_sync_device());
870 auto exportLIDs_d = exportLIDs.view_device();
871
873 "Tpetra::Details::packCrsMatrixNew",
874 "Import/Export");
875 PackCrsMatrixImpl::packCrsMatrix<ST, LO, GO, NT, buffer_device_type>(
877 exportPIDs_d, constantNumPackets, pack_pids);
878}
879
880template <typename ST, typename LO, typename GO, typename NT>
882 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exports_dv,
883 const Teuchos::ArrayView<size_t>& numPacketsPerLID,
884 const Teuchos::ArrayView<const LO>& exportLIDs,
885 const Teuchos::ArrayView<const int>& sourcePIDs,
886 size_t& constantNumPackets) {
887 typedef typename CrsMatrix<ST, LO, GO, NT>::local_matrix_device_type local_matrix_device_type;
888 typedef typename DistObject<char, LO, GO, NT>::buffer_device_type buffer_device_type;
889 typedef typename Kokkos::DualView<char*, buffer_device_type>::t_host::execution_space host_exec_space;
890 typedef Kokkos::Device<host_exec_space, Kokkos::HostSpace> host_dev_type;
891
892 typename local_matrix_device_type::device_type outputDevice;
893 typedef typename NT::execution_space execution_space;
894
895 const bool verbose = ::Tpetra::Details::Behavior::verbose();
896 std::unique_ptr<std::string> prefix;
897 if (verbose) {
898 const int myRank = [&]() {
899 auto map = sourceMatrix.getMap();
900 if (map.get() == nullptr) {
901 return -1;
902 }
903 auto comm = map->getComm();
904 if (comm.get() == nullptr) {
905 return -2;
906 }
907 return comm->getRank();
908 }();
909 std::ostringstream os;
910 os << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs: ";
911 prefix = std::unique_ptr<std::string>(new std::string(os.str()));
912
913 std::ostringstream os2;
914 os2 << *prefix << "start" << std::endl;
915 std::cerr << os2.str();
916 }
917
918 // Convert all Teuchos::Array to Kokkos::View
919
920 // This is an output array, so we don't have to copy to device here.
921 // However, we'll have to remember to copy back to host when done.
923 create_mirror_view_from_raw_host_array(buffer_device_type(),
924 numPacketsPerLID.getRawPtr(),
925 numPacketsPerLID.size(), false,
926 "num_packets_per_lid");
927
928 // This is an input array, so we have to copy to device here.
929 // However, we never need to copy it back to host.
930 auto export_lids_d =
931 create_mirror_view_from_raw_host_array(buffer_device_type(),
932 exportLIDs.getRawPtr(),
933 exportLIDs.size(), true,
934 "export_lids");
935 // This is an input array, so we have to copy to device here.
936 // However, we never need to copy it back to host.
937 auto export_pids_d =
939 sourcePIDs.getRawPtr(),
940 sourcePIDs.size(), true,
941 "export_pids");
942 constexpr bool pack_pids = true;
943 try {
944 PackCrsMatrixImpl::packCrsMatrix(sourceMatrix, exports_dv, num_packets_per_lid_d, export_lids_d,
946 } catch (std::exception& e) {
947 if (verbose) {
948 std::ostringstream os;
949 os << *prefix << "PackCrsMatrixImpl::packCrsMatrix threw: "
950 << e.what() << std::endl;
951 std::cerr << os.str();
952 }
953 throw;
954 } catch (...) {
955 if (verbose) {
956 std::ostringstream os;
957 os << *prefix << "PackCrsMatrixImpl::packCrsMatrix threw an exception "
958 "not a subclass of std::exception"
959 << std::endl;
960 std::cerr << os.str();
961 }
962 throw;
963 }
964
965 if (numPacketsPerLID.size() != 0) {
966 try {
967 // The counts are an output of PackCrsMatrixImpl::packCrsMatrix,
968 // so we have to copy them back to host.
969 Kokkos::View<size_t*, host_dev_type> num_packets_per_lid_h(numPacketsPerLID.getRawPtr(), numPacketsPerLID.size());
970 // DEEP_COPY REVIEW - DEVICE-TO-HOST
971 Kokkos::deep_copy(execution_space(), num_packets_per_lid_h, num_packets_per_lid_d);
972 } catch (std::exception& e) {
973 if (verbose) {
974 std::ostringstream os;
975 os << *prefix << "Kokkos::deep_copy threw: " << e.what() << std::endl;
976 std::cerr << os.str();
977 }
978 throw;
979 } catch (...) {
980 if (verbose) {
981 std::ostringstream os;
982 os << *prefix << "Kokkos::deep_copy threw an exception not a subclass "
983 "of std::exception"
984 << std::endl;
985 std::cerr << os.str();
986 }
987 throw;
988 }
989 }
990
991 if (verbose) {
992 std::ostringstream os;
993 os << *prefix << "done" << std::endl;
994 std::cerr << os.str();
995 }
996}
997
998} // namespace Details
999} // namespace Tpetra
1000
1001#define TPETRA_DETAILS_PACKCRSMATRIX_INSTANT(ST, LO, GO, NT) \
1002 template void \
1003 Details::packCrsMatrix<ST, LO, GO, NT>(const CrsMatrix<ST, LO, GO, NT>&, \
1004 Teuchos::Array<char>&, \
1005 const Teuchos::ArrayView<size_t>&, \
1006 const Teuchos::ArrayView<const LO>&, \
1007 size_t&); \
1008 template void \
1009 Details::packCrsMatrixNew<ST, LO, GO, NT>(const CrsMatrix<ST, LO, GO, NT>&, \
1010 Kokkos::DualView<char*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1011 const Kokkos::DualView<size_t*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1012 const Kokkos::DualView<const LO*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1013 size_t&); \
1014 template void \
1015 Details::packCrsMatrixWithOwningPIDs<ST, LO, GO, NT>(const CrsMatrix<ST, LO, GO, NT>&, \
1016 Kokkos::DualView<char*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1017 const Teuchos::ArrayView<size_t>&, \
1018 const Teuchos::ArrayView<const LO>&, \
1019 const Teuchos::ArrayView<const int>&, \
1020 size_t&);
1021
1022#endif // TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
Declaration of the Tpetra::CrsMatrix class.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
KOKKOS_FUNCTION Kokkos::pair< int, size_t > packCrsMatrixRow(const ColumnMap &col_map, const Kokkos::View< char *, BufferDeviceType > &exports, const typename PackTraits< typename ColumnMap::local_ordinal_type >::input_array_type &lids_in, const typename PackTraits< int >::input_array_type &pids_in, const typename PackTraits< ST >::input_array_type &vals_in, const size_t offset, const size_t num_ent, const size_t num_bytes_per_value, const bool pack_pids)
Packs a single row of the CrsMatrix.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
typename Node::device_type device_type
The Kokkos device type.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Struct that holds views of the contents of a CrsMatrix.
static bool verbose()
Whether Tpetra is in verbose mode.
"Local" part of Map suitable for Kokkos kernels.
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
Compute the number of packets and offsets for the pack procedure.
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Implementation details of Tpetra.
void packCrsMatrix(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Teuchos::Array< char > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
void packCrsMatrixNew(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports, const Kokkos::DualView< size_t *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &numPacketsPerLID, const Kokkos::DualView< const LO *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication, for "new" DistObject inter...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Traits class for packing / unpacking data of type T.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > packArray(char outBuf[], const value_type inBuf[], const size_t numEnt)
Pack the first numEnt entries of the given input buffer of value_type, into the output buffer of byte...
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const T &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.