Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_packCrsMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
11#define TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
12
13#include "TpetraCore_config.h"
14#include "Teuchos_Array.hpp"
15#include "Teuchos_ArrayView.hpp"
24#include <memory>
25#include <sstream>
26#include <stdexcept>
27#include <string>
28
51
52namespace Tpetra {
53
54//
55// Users must never rely on anything in the Details namespace.
56//
57namespace Details {
58
59namespace PackCrsMatrixImpl {
67template <class OutputOffsetsViewType,
68 class CountsViewType,
69 class InputOffsetsViewType,
70 class InputLocalRowIndicesViewType,
71 class InputLocalRowPidsViewType,
72 const bool debug =
73#ifdef HAVE_TPETRA_DEBUG
74 true
75#else
76 false
77#endif // HAVE_TPETRA_DEBUG
78 >
80 public:
81 typedef typename OutputOffsetsViewType::non_const_value_type output_offset_type;
82 typedef typename CountsViewType::non_const_value_type count_type;
83 typedef typename InputOffsetsViewType::non_const_value_type input_offset_type;
84 typedef typename InputLocalRowIndicesViewType::non_const_value_type local_row_index_type;
85 typedef typename InputLocalRowPidsViewType::non_const_value_type local_row_pid_type;
86 // output Views drive where execution happens.
87 typedef typename OutputOffsetsViewType::device_type device_type;
88 static_assert(std::is_same<typename CountsViewType::device_type::execution_space,
89 typename device_type::execution_space>::value,
90 "OutputOffsetsViewType and CountsViewType must have the same execution space.");
91 static_assert(Kokkos::is_view<OutputOffsetsViewType>::value,
92 "OutputOffsetsViewType must be a Kokkos::View.");
93 static_assert(std::is_same<typename OutputOffsetsViewType::value_type, output_offset_type>::value,
94 "OutputOffsetsViewType must be a nonconst Kokkos::View.");
95 static_assert(std::is_integral<output_offset_type>::value,
96 "The type of each entry of OutputOffsetsViewType must be a built-in integer type.");
97 static_assert(Kokkos::is_view<CountsViewType>::value,
98 "CountsViewType must be a Kokkos::View.");
99 static_assert(std::is_same<typename CountsViewType::value_type, output_offset_type>::value,
100 "CountsViewType must be a nonconst Kokkos::View.");
101 static_assert(std::is_integral<count_type>::value,
102 "The type of each entry of CountsViewType must be a built-in integer type.");
103 static_assert(Kokkos::is_view<InputOffsetsViewType>::value,
104 "InputOffsetsViewType must be a Kokkos::View.");
105 static_assert(std::is_integral<input_offset_type>::value,
106 "The type of each entry of InputOffsetsViewType must be a built-in integer type.");
107 static_assert(Kokkos::is_view<InputLocalRowIndicesViewType>::value,
108 "InputLocalRowIndicesViewType must be a Kokkos::View.");
109 static_assert(std::is_integral<local_row_index_type>::value,
110 "The type of each entry of InputLocalRowIndicesViewType must be a built-in integer type.");
111
113 const CountsViewType& counts,
117 const count_type sizeOfLclCount,
118 const count_type sizeOfGblColInd,
119 const count_type sizeOfPid,
120 const count_type sizeOfValue)
121 : outputOffsets_(outputOffsets)
122 , counts_(counts)
123 , rowOffsets_(rowOffsets)
124 , lclRowInds_(lclRowInds)
125 , lclRowPids_(lclRowPids)
126 , sizeOfLclCount_(sizeOfLclCount)
127 , sizeOfGblColInd_(sizeOfGblColInd)
128 , sizeOfPid_(sizeOfPid)
129 , sizeOfValue_(sizeOfValue)
130 , error_("error") // don't forget this, or you'll get segfaults!
131 {
132 if (debug) {
133 const size_t numRowsToPack = static_cast<size_t>(lclRowInds_.extent(0));
134
135 if (numRowsToPack != static_cast<size_t>(counts_.extent(0))) {
136 std::ostringstream os;
137 os << "lclRowInds.extent(0) = " << numRowsToPack
138 << " != counts.extent(0) = " << counts_.extent(0)
139 << ".";
140 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str());
141 }
142 if (static_cast<size_t>(numRowsToPack + 1) !=
143 static_cast<size_t>(outputOffsets_.extent(0))) {
144 std::ostringstream os;
145 os << "lclRowInds.extent(0) + 1 = " << (numRowsToPack + 1)
146 << " != outputOffsets.extent(0) = " << outputOffsets_.extent(0)
147 << ".";
148 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str());
149 }
150 }
151 }
152
154 operator()(const local_row_index_type& curInd,
155 output_offset_type& update,
156 const bool final) const {
157 if (debug) {
158 if (curInd < static_cast<local_row_index_type>(0)) {
159 error_() = 1;
160 return;
161 }
162 }
163
164 if (final) {
165 if (debug) {
166 if (curInd >= static_cast<local_row_index_type>(outputOffsets_.extent(0))) {
167 error_() = 2;
168 return;
169 }
170 }
171 outputOffsets_(curInd) = update;
172 }
173
174 if (curInd < static_cast<local_row_index_type>(counts_.extent(0))) {
175 const auto lclRow = lclRowInds_(curInd);
176 if (static_cast<size_t>(lclRow + 1) >= static_cast<size_t>(rowOffsets_.extent(0)) ||
177 static_cast<local_row_index_type>(lclRow) < static_cast<local_row_index_type>(0)) {
178 error_() = 3;
179 return;
180 }
181 // count_type could differ from the type of each row offset.
182 // For example, row offsets might each be 64 bits, but if their
183 // difference always fits in 32 bits, we may then safely use a
184 // 32-bit count_type.
185 const count_type count =
186 static_cast<count_type>(rowOffsets_(lclRow + 1) - rowOffsets_(lclRow));
187
188 // We pack first the number of entries in the row, then that
189 // many global column indices, then that many pids (if any),
190 // then that many values. However, if the number of entries in
191 // the row is zero, we pack nothing.
192 const count_type numBytes = (count == 0) ? static_cast<count_type>(0) : sizeOfLclCount_ + count * (sizeOfGblColInd_ + (lclRowPids_.size() > 0 ? sizeOfPid_ : 0) + sizeOfValue_);
193
194 if (final) {
195 counts_(curInd) = numBytes;
196 }
197 update += numBytes;
198 }
199 }
200
201 // mfh 31 May 2017: Don't need init or join. If you have join, MUST
202 // have join both with and without volatile! Otherwise intrawarp
203 // joins are really slow on GPUs.
204
206 int getError() const {
207 auto error_h = Kokkos::create_mirror_view(error_);
208 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
209 // Note: In the UVM case, this would otherwise be a no-op
210 // and thus not fence, so the value might not be correct on return
211 // In the non-UVM case, create_mirror_view will block for the allocation
212 Kokkos::deep_copy(error_h, error_);
213 return error_h();
214 }
215
216 private:
217 OutputOffsetsViewType outputOffsets_;
218 CountsViewType counts_;
219 typename InputOffsetsViewType::const_type rowOffsets_;
220 typename InputLocalRowIndicesViewType::const_type lclRowInds_;
221 typename InputLocalRowPidsViewType::const_type lclRowPids_;
222 count_type sizeOfLclCount_;
223 count_type sizeOfGblColInd_;
224 count_type sizeOfPid_;
225 count_type sizeOfValue_;
226 Kokkos::View<int, device_type> error_;
227};
228
238template <class OutputOffsetsViewType,
239 class CountsViewType,
243typename CountsViewType::non_const_value_type
244computeNumPacketsAndOffsets(const OutputOffsetsViewType& outputOffsets,
245 const CountsViewType& counts,
249 const typename CountsViewType::non_const_value_type sizeOfLclCount,
250 const typename CountsViewType::non_const_value_type sizeOfGblColInd,
251 const typename CountsViewType::non_const_value_type sizeOfPid,
252 const typename CountsViewType::non_const_value_type sizeOfValue) {
254 CountsViewType, typename InputOffsetsViewType::const_type,
255 typename InputLocalRowIndicesViewType::const_type,
256 typename InputLocalRowPidsViewType::const_type>
258 typedef typename CountsViewType::non_const_value_type count_type;
259 typedef typename OutputOffsetsViewType::size_type size_type;
260 typedef typename OutputOffsetsViewType::execution_space execution_space;
261 typedef typename functor_type::local_row_index_type LO;
262 typedef Kokkos::RangePolicy<execution_space, LO> range_type;
263 const char prefix[] = "computeNumPacketsAndOffsets: ";
264
265 count_type count = 0;
266 const count_type numRowsToPack = lclRowInds.extent(0);
267
268 if (numRowsToPack == 0) {
269 return count;
270 } else {
271 TEUCHOS_TEST_FOR_EXCEPTION(rowOffsets.extent(0) <= static_cast<size_type>(1),
272 std::invalid_argument, prefix << "There is at least one row to pack, "
273 "but the matrix has no rows. lclRowInds.extent(0) = "
274 << numRowsToPack << ", but rowOffsets.extent(0) = " << rowOffsets.extent(0) << " <= 1.");
276 static_cast<size_type>(numRowsToPack + 1),
277 std::invalid_argument,
278 prefix << "Output dimension does not match number of rows to pack. "
279 << "outputOffsets.extent(0) = " << outputOffsets.extent(0)
280 << " != lclRowInds.extent(0) + 1 = "
281 << static_cast<size_type>(numRowsToPack + 1) << ".");
282 TEUCHOS_TEST_FOR_EXCEPTION(counts.extent(0) != numRowsToPack, std::invalid_argument,
283 prefix << "counts.extent(0) = " << counts.extent(0)
284 << " != numRowsToPack = " << numRowsToPack << ".");
285
289 Kokkos::parallel_scan(range_type(0, numRowsToPack + 1), f);
290
291 // At least in debug mode, this functor checks for errors.
292 const int errCode = f.getError();
293 TEUCHOS_TEST_FOR_EXCEPTION(errCode != 0, std::runtime_error, prefix << "parallel_scan error code " << errCode << " != 0.");
294
295#if 0
296 size_t total = 0;
297 for (LO k = 0; k < numRowsToPack; ++k) {
298 total += counts[k];
299 }
301 if (errStr.get () == NULL) {
302 errStr = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
303 }
304 std::ostringstream& os = *errStr;
305 os << prefix
306 << "outputOffsets(numRowsToPack=" << numRowsToPack << ") "
307 << outputOffsets(numRowsToPack) << " != sum of counts = "
308 << total << "." << std::endl;
309 if (numRowsToPack != 0) {
310 // Only print the array if it's not too long.
311 if (numRowsToPack < static_cast<LO> (10)) {
312 os << "outputOffsets: [";
313 for (LO i = 0; i <= numRowsToPack; ++i) {
314 os << outputOffsets(i);
315 if (static_cast<LO> (i + 1) <= numRowsToPack) {
316 os << ",";
317 }
318 }
319 os << "]" << std::endl;
320 os << "counts: [";
321 for (LO i = 0; i < numRowsToPack; ++i) {
322 os << counts(i);
323 if (static_cast<LO> (i + 1) < numRowsToPack) {
324 os << ",";
325 }
326 }
327 os << "]" << std::endl;
328 }
329 else {
330 os << "outputOffsets(" << (numRowsToPack-1) << ") = "
331 << outputOffsets(numRowsToPack-1) << "." << std::endl;
332 }
333 }
335 return {false, errStr};
336 }
337#endif // HAVE_TPETRA_DEBUG
338
339 // Get last entry of outputOffsets, which is the sum of the entries
340 // of counts. Don't assume UVM.
341 using Tpetra::Details::getEntryOnHost;
342 return static_cast<count_type>(getEntryOnHost(outputOffsets,
344 }
345}
346
362template <class ST, class ColumnMap, class BufferDeviceType>
364 Kokkos::pair<int, size_t>
366 const Kokkos::View<char*, BufferDeviceType>& exports,
370 const size_t offset,
371 const size_t num_ent,
372 const size_t num_bytes_per_value,
373 const bool pack_pids) {
374 using Kokkos::subview;
375 using LO = typename ColumnMap::local_ordinal_type;
376 using GO = typename ColumnMap::global_ordinal_type;
377 using return_type = Kokkos::pair<int, size_t>;
378
379 if (num_ent == 0) {
380 // Empty rows always take zero bytes, to ensure sparsity.
381 return return_type(0, 0);
382 }
383
384 const LO num_ent_LO = static_cast<LO>(num_ent); // packValueCount wants this
385 const size_t num_ent_beg = offset;
387
388 const size_t gids_beg = num_ent_beg + num_ent_len;
389 const size_t gids_len = num_ent * PackTraits<GO>::packValueCount(GO(0));
390
391 const size_t pids_beg = gids_beg + gids_len;
392 const size_t pids_len = pack_pids ? num_ent * PackTraits<int>::packValueCount(int(0)) : static_cast<size_t>(0);
393
394 const size_t vals_beg = gids_beg + gids_len + pids_len;
395 const size_t vals_len = num_ent * num_bytes_per_value;
396
397 char* const num_ent_out = exports.data() + num_ent_beg;
398 char* const gids_out = exports.data() + gids_beg;
399 char* const pids_out = pack_pids ? exports.data() + pids_beg : NULL;
400 char* const vals_out = exports.data() + vals_beg;
401
402 size_t num_bytes_out = 0;
403 int error_code = 0;
405
406 {
407 // Copy column indices one at a time, so that we don't need
408 // temporary storage.
409 for (size_t k = 0; k < num_ent; ++k) {
410 const LO lid = lids_in[k];
411 const GO gid = col_map.getGlobalElement(lid);
413 }
414 // Copy PIDs one at a time, so that we don't need temporary storage.
415 if (pack_pids) {
416 for (size_t k = 0; k < num_ent; ++k) {
417 const LO lid = lids_in[k];
418 const int pid = pids_in[lid];
420 }
421 }
422 const auto p =
424 error_code += p.first;
425 num_bytes_out += p.second;
426 }
427
428 if (error_code != 0) {
429 return return_type(10, num_bytes_out);
430 }
431
432 const size_t expected_num_bytes =
435 return return_type(11, num_bytes_out);
436 }
437 return return_type(0, num_bytes_out);
438}
439
440template <class LocalMatrix, class LocalMap, class BufferDeviceType>
441struct PackCrsMatrixFunctor {
442 typedef LocalMatrix local_matrix_device_type;
443 typedef LocalMap local_map_type;
444 typedef typename local_matrix_device_type::value_type ST;
445 typedef typename local_map_type::local_ordinal_type LO;
446 typedef typename local_map_type::global_ordinal_type GO;
447 typedef typename local_matrix_device_type::device_type DT;
448
449 typedef Kokkos::View<const size_t*, BufferDeviceType>
450 num_packets_per_lid_view_type;
451 typedef Kokkos::View<const size_t*, BufferDeviceType> offsets_view_type;
452 typedef Kokkos::View<char*, BufferDeviceType> exports_view_type;
453 using export_lids_view_type = typename PackTraits<LO>::input_array_type;
454 using source_pids_view_type = typename PackTraits<int>::input_array_type;
455
456 typedef typename num_packets_per_lid_view_type::non_const_value_type
457 count_type;
458 typedef typename offsets_view_type::non_const_value_type
459 offset_type;
460 typedef Kokkos::pair<int, LO> value_type;
461
462 static_assert(std::is_same<LO, typename local_matrix_device_type::ordinal_type>::value,
463 "local_map_type::local_ordinal_type and "
464 "local_matrix_device_type::ordinal_type must be the same.");
465
466 local_matrix_device_type local_matrix;
467 local_map_type local_col_map;
468 exports_view_type exports;
469 num_packets_per_lid_view_type num_packets_per_lid;
470 export_lids_view_type export_lids;
471 source_pids_view_type source_pids;
472 offsets_view_type offsets;
473 size_t num_bytes_per_value;
474 bool pack_pids;
475
476 PackCrsMatrixFunctor(const local_matrix_device_type& local_matrix_in,
477 const local_map_type& local_col_map_in,
478 const exports_view_type& exports_in,
479 const num_packets_per_lid_view_type& num_packets_per_lid_in,
480 const export_lids_view_type& export_lids_in,
481 const source_pids_view_type& source_pids_in,
482 const offsets_view_type& offsets_in,
483 const size_t num_bytes_per_value_in,
484 const bool pack_pids_in)
485 : local_matrix(local_matrix_in)
486 , local_col_map(local_col_map_in)
487 , exports(exports_in)
488 , num_packets_per_lid(num_packets_per_lid_in)
489 , export_lids(export_lids_in)
490 , source_pids(source_pids_in)
491 , offsets(offsets_in)
492 , num_bytes_per_value(num_bytes_per_value_in)
493 , pack_pids(pack_pids_in) {
494 const LO numRows = local_matrix_in.numRows();
495 const LO rowMapDim =
496 static_cast<LO>(local_matrix.graph.row_map.extent(0));
497 TEUCHOS_TEST_FOR_EXCEPTION(numRows != 0 && rowMapDim != numRows + static_cast<LO>(1),
498 std::logic_error, "local_matrix.graph.row_map.extent(0) = " << rowMapDim << " != numRows (= " << numRows << " ) + 1.");
499 }
500
501 KOKKOS_INLINE_FUNCTION void init(value_type& dst) const {
502 using ::Tpetra::Details::OrdinalTraits;
503 dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
504 }
505
506 KOKKOS_INLINE_FUNCTION void
507 join(value_type& dst, const value_type& src) const {
508 // `dst` should reflect the first (least) bad index and all other
509 // associated error codes and data, so prefer keeping it.
510 if (src.first != 0 && dst.first == 0) {
511 dst = src;
512 }
513 }
514
515 KOKKOS_INLINE_FUNCTION
516 void operator()(const LO i, value_type& dst) const {
517 const size_t offset = offsets[i];
518 const LO export_lid = export_lids[i];
519 const size_t buf_size = exports.size();
520 const size_t num_bytes = num_packets_per_lid(i);
521 const size_t num_ent =
522 static_cast<size_t>(local_matrix.graph.row_map[export_lid + 1] - local_matrix.graph.row_map[export_lid]);
523
524 // Only pack this row's data if it has a nonzero number of
525 // entries. We can do this because receiving processes get the
526 // number of packets, and will know that zero packets means zero
527 // entries.
528 if (num_ent == 0) {
529 return;
530 }
531
532 if (export_lid >= local_matrix.numRows()) {
533 if (dst.first != 0) { // keep only the first error
534 dst = Kokkos::make_pair(1, i); // invalid row
535 }
536 return;
537 } else if ((offset > buf_size || offset + num_bytes > buf_size)) {
538 if (dst.first != 0) { // keep only the first error
539 dst = Kokkos::make_pair(2, i); // out of bounds
540 }
541 return;
542 }
543
544 // We can now pack this row
545
546 // Since the matrix is locally indexed on the calling process, we
547 // have to use its column Map (which it _must_ have in this case)
548 // to convert to global indices.
549 const auto row_beg = local_matrix.graph.row_map[export_lid];
550 const auto row_end = local_matrix.graph.row_map[export_lid + 1];
551 auto vals_in = subview(local_matrix.values,
552 Kokkos::make_pair(row_beg, row_end));
553 auto lids_in = subview(local_matrix.graph.entries,
554 Kokkos::make_pair(row_beg, row_end));
555 typedef local_map_type LMT;
556 typedef BufferDeviceType BDT;
557 auto p = packCrsMatrixRow<ST, LMT, BDT>(local_col_map, exports, lids_in,
558 source_pids, vals_in, offset,
559 num_ent, num_bytes_per_value,
560 pack_pids);
561 int error_code_this_row = p.first;
562 size_t num_bytes_packed_this_row = p.second;
563 if (error_code_this_row != 0) {
564 if (dst.first != 0) { // keep only the first error
565 dst = Kokkos::make_pair(error_code_this_row, i); // bad pack
566 }
567 } else if (num_bytes_packed_this_row != num_bytes) {
568 if (dst.first != 0) { // keep only the first error
569 dst = Kokkos::make_pair(3, i);
570 }
571 }
572 }
573};
574
582template <class LocalMatrix, class LocalMap, class BufferDeviceType>
583void do_pack(const LocalMatrix& local_matrix,
584 const LocalMap& local_map,
585 const Kokkos::View<char*, BufferDeviceType>& exports,
586 const typename PackTraits<size_t>::input_array_type& num_packets_per_lid,
588 const typename PackTraits<int>::input_array_type& source_pids,
589 const Kokkos::View<const size_t*, BufferDeviceType>& offsets,
590 const size_t num_bytes_per_value,
591 const bool pack_pids) {
592 using LO = typename LocalMap::local_ordinal_type;
593 using DT = typename LocalMatrix::device_type;
594 using range_type = Kokkos::RangePolicy<typename DT::execution_space, LO>;
595 const char prefix[] = "Tpetra::Details::do_pack: ";
596
597 if (export_lids.extent(0) != 0) {
598 TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(offsets.extent(0)) !=
599 static_cast<size_t>(export_lids.extent(0) + 1),
600 std::invalid_argument, prefix << "offsets.extent(0) = " << offsets.extent(0) << " != export_lids.extent(0) (= " << export_lids.extent(0) << ") + 1.");
601 TEUCHOS_TEST_FOR_EXCEPTION(export_lids.extent(0) != num_packets_per_lid.extent(0),
602 std::invalid_argument, prefix << "export_lids.extent(0) = " << export_lids.extent(0) << " != num_packets_per_lid.extent(0) = " << num_packets_per_lid.extent(0) << ".");
603 // If exports has nonzero length at this point, then the matrix
604 // has at least one entry to pack. Thus, if packing process
605 // ranks, we had better have at least one process rank to pack.
606 TEUCHOS_TEST_FOR_EXCEPTION(pack_pids && exports.extent(0) != 0 &&
607 source_pids.extent(0) == 0,
608 std::invalid_argument, prefix << "pack_pids is true, and exports.extent(0) = " << exports.extent(0) << " != 0, meaning that we need to pack at "
609 "least one matrix entry, but source_pids.extent(0) = 0.");
610 }
611
612 using pack_functor_type =
614 pack_functor_type f(local_matrix, local_map, exports,
615 num_packets_per_lid, export_lids,
616 source_pids, offsets, num_bytes_per_value,
617 pack_pids);
618
619 typename pack_functor_type::value_type result;
620 range_type range(0, num_packets_per_lid.extent(0));
621 Kokkos::parallel_reduce(range, f, result);
622
623 if (result.first != 0) {
624 // We can't deep_copy from AnonymousSpace Views, so we can't print
625 // out any information from them in case of error.
626 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, prefix << "PackCrsMatrixFunctor "
627 "reported error code "
628 << result.first << " for the first "
629 "bad row "
630 << result.second << ".");
631 }
632}
633
663template <typename ST, typename LO, typename GO, typename NT, typename BufferDeviceType>
665 Kokkos::DualView<char*, BufferDeviceType>& exports,
666 const Kokkos::View<size_t*, BufferDeviceType>& num_packets_per_lid,
667 const Kokkos::View<const LO*, BufferDeviceType>& export_lids,
668 const Kokkos::View<const int*, typename NT::device_type>& export_pids,
669 size_t& constant_num_packets,
670 const bool pack_pids) {
672 "Tpetra::Details::PackCrsMatrixImpl::packCrsMatrix",
673 "Import/Export");
674 using Kokkos::View;
675 typedef BufferDeviceType DT;
676 typedef Kokkos::DualView<char*, BufferDeviceType> exports_view_type;
677 const char prefix[] = "Tpetra::Details::PackCrsMatrixImpl::packCrsMatrix: ";
678 constexpr bool debug = false;
679
680 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
681 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
682
683 // Setting this to zero tells the caller to expect a possibly
684 // different ("nonconstant") number of packets per local index
685 // (i.e., a possibly different number of entries per row).
687
688 const size_t num_export_lids =
689 static_cast<size_t>(export_lids.extent(0));
691 static_cast<size_t>(num_packets_per_lid.extent(0)),
692 std::invalid_argument, prefix << "num_export_lids.extent(0) = " << num_export_lids << " != num_packets_per_lid.extent(0) = " << num_packets_per_lid.extent(0) << ".");
693 if (num_export_lids != 0) {
694 TEUCHOS_TEST_FOR_EXCEPTION(num_packets_per_lid.data() == NULL, std::invalid_argument,
695 prefix << "num_export_lids = " << num_export_lids << " != 0, but "
696 "num_packets_per_lid.data() = "
697 << num_packets_per_lid.data() << " == NULL.");
698 }
699
703
704 size_t num_bytes_per_value = 0;
706 // Assume ST is default constructible; packValueCount wants an instance.
707 num_bytes_per_value = PackTraits<ST>::packValueCount(ST());
708 } else {
709 // Since the packed data come from the source matrix, we can use
710 // the source matrix to get the number of bytes per Scalar value
711 // stored in the matrix. This assumes that all Scalar values in
712 // the source matrix require the same number of bytes. If the
713 // source matrix has no entries on the calling process, then we
714 // hope that some process does have some idea how big a Scalar
715 // value is. Of course, if no processes have any entries, then no
716 // values should be packed (though this does assume that in our
717 // packing scheme, rows with zero entries take zero bytes).
718 size_t num_bytes_per_value_l = 0;
719 if (local_matrix.values.extent(0) > 0) {
720 const ST& val = local_matrix.values(0);
722 }
723 using Teuchos::reduceAll;
725 Teuchos::REDUCE_MAX,
727 Teuchos::outArg(num_bytes_per_value));
728 }
729
730 if (num_export_lids == 0) {
731 exports = exports_view_type("exports", 0);
732 return;
733 }
734
735 // Array of offsets into the pack buffer.
736 Kokkos::View<size_t*, DT> offsets("offsets", num_export_lids + 1);
737
738 // Compute number of packets per LID (row to send), as well as
739 // corresponding offsets (the prefix sum of the packet counts).
740 const size_t count =
741 computeNumPacketsAndOffsets(offsets, num_packets_per_lid,
742 local_matrix.graph.row_map, export_lids,
745 num_bytes_per_pid, num_bytes_per_value);
746
747 // Resize the output pack buffer if needed.
748 if (count > static_cast<size_t>(exports.extent(0))) {
749 exports = exports_view_type("exports", count);
750 if (debug) {
751 std::ostringstream os;
752 os << "*** exports resized to " << count << std::endl;
753 std::cerr << os.str();
754 }
755 }
756 if (debug) {
757 std::ostringstream os;
758 os << "*** count: " << count << ", exports.extent(0): "
759 << exports.extent(0) << std::endl;
760 std::cerr << os.str();
761 }
762
763 // If exports has nonzero length at this point, then the matrix has
764 // at least one entry to pack. Thus, if packing process ranks, we
765 // had better have at least one process rank to pack.
766 TEUCHOS_TEST_FOR_EXCEPTION(pack_pids && exports.extent(0) != 0 &&
767 export_pids.extent(0) == 0,
768 std::invalid_argument, prefix << "pack_pids is true, and exports.extent(0) = " << exports.extent(0) << " != 0, meaning that we need to pack at least "
769 "one matrix entry, but export_pids.extent(0) = 0.");
770
771 typedef typename std::decay<decltype(local_matrix)>::type
772 local_matrix_device_type;
773 typedef typename std::decay<decltype(local_col_map)>::type
774 local_map_type;
775
776 exports.modify_device();
777 auto exports_d = exports.view_device();
778 do_pack<local_matrix_device_type, local_map_type, DT>(local_matrix, local_col_map, exports_d, num_packets_per_lid,
779 export_lids, export_pids, offsets, num_bytes_per_value,
780 pack_pids);
781 // If we got this far, we succeeded.
782}
783
784} // namespace PackCrsMatrixImpl
785
786template <typename ST, typename LO, typename GO, typename NT>
788 Teuchos::Array<char>& exports,
789 const Teuchos::ArrayView<size_t>& numPacketsPerLID,
790 const Teuchos::ArrayView<const LO>& exportLIDs,
791 size_t& constantNumPackets) {
793 using buffer_device_type = typename DistObject<char, LO, GO, NT>::buffer_device_type;
794 using host_exec_space = typename Kokkos::View<size_t*, device_type>::host_mirror_type::execution_space;
795 using device_exec_space = typename device_type::execution_space;
796 using host_dev_type = Kokkos::Device<host_exec_space, Kokkos::HostSpace>;
797
798 // Convert all Teuchos::Array to Kokkos::View
799
800 // This is an output array, so we don't have to copy to device here.
801 // However, we'll have to remember to copy back to host when done.
802 Kokkos::View<size_t*, buffer_device_type> num_packets_per_lid_d =
803 create_mirror_view_from_raw_host_array(buffer_device_type(),
804 numPacketsPerLID.getRawPtr(),
805 numPacketsPerLID.size(), false,
806 "num_packets_per_lid");
807 // FIXME (mfh 05 Feb 2019) We should just pass the exportLIDs
808 // DualView through here, instead of recreating a device View from a
809 // host ArrayView that itself came from a DualView.
810 //
811 // This is an input array, so we have to copy to device here.
812 // However, we never need to copy it back to host.
813 Kokkos::View<const LO*, buffer_device_type> export_lids_d =
814 create_mirror_view_from_raw_host_array(buffer_device_type(),
815 exportLIDs.getRawPtr(),
816 exportLIDs.size(), true,
817 "export_lids");
818
819 Kokkos::View<int*, device_type> export_pids_d; // output arg
820 Kokkos::DualView<char*, buffer_device_type> exports_dv; // output arg
821 constexpr bool pack_pids = false;
822 PackCrsMatrixImpl::packCrsMatrix<ST, LO, GO, NT, buffer_device_type>(
825
826 // The counts are an output of PackCrsMatrixImpl::packCrsMatrix, so we have to
827 // copy them back to host.
828 Kokkos::View<size_t*, host_dev_type> num_packets_per_lid_h(numPacketsPerLID.getRawPtr(),
829 numPacketsPerLID.size());
830 // DEEP_COPY REVIEW - DEVICE-TO-HOST
832
833 // FIXME (mfh 23 Aug 2017) If we're forced to use a DualView for
834 // exports_dv above, then we have two host copies for exports_h.
835
836 // The exports are an output of PackCrsMatrixImpl::packCrsMatrix, so we have
837 // to copy them back to host.
838 if (static_cast<size_t>(exports.size()) !=
839 static_cast<size_t>(exports_dv.extent(0))) {
840 exports.resize(exports_dv.extent(0));
841 }
842 Kokkos::View<char*, host_dev_type> exports_h(exports.getRawPtr(),
843 exports.size());
844 // DEEP_COPY REVIEW - DEVICE-TO-HOST
845 Kokkos::deep_copy(device_exec_space(), exports_h, exports_dv.view_device());
846}
847
848template <typename ST, typename LO, typename GO, typename NT>
851 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exports,
852 const Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& numPacketsPerLID,
853 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exportLIDs,
854 size_t& constantNumPackets) {
855 using device_type = typename CrsMatrix<ST, LO, GO, NT>::device_type;
856 using buffer_device_type = typename DistObject<char, LO, GO, NT>::buffer_device_type;
857
858 // Create an empty array of PIDs, since the interface needs it.
859 Kokkos::View<int*, device_type> exportPIDs_d("exportPIDs", 0);
860 constexpr bool pack_pids = false;
861
862 // Write-only device access
863 auto numPacketsPerLID_nc = numPacketsPerLID; // const DV& -> DV
864 numPacketsPerLID_nc.clear_sync_state();
865 numPacketsPerLID_nc.modify_device();
866 auto numPacketsPerLID_d = numPacketsPerLID.view_device();
867
868 // Read-only device access
869 TEUCHOS_ASSERT(!exportLIDs.need_sync_device());
870 auto exportLIDs_d = exportLIDs.view_device();
871
873 "Tpetra::Details::packCrsMatrixNew",
874 "Import/Export");
875 PackCrsMatrixImpl::packCrsMatrix<ST, LO, GO, NT, buffer_device_type>(
877 exportPIDs_d, constantNumPackets, pack_pids);
878}
879
880template <typename ST, typename LO, typename GO, typename NT>
882 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exports_dv,
883 const Teuchos::ArrayView<size_t>& numPacketsPerLID,
884 const Teuchos::ArrayView<const LO>& exportLIDs,
885 const Teuchos::ArrayView<const int>& sourcePIDs,
886 size_t& constantNumPackets) {
887 typedef typename CrsMatrix<ST, LO, GO, NT>::local_matrix_device_type local_matrix_device_type;
888 typedef typename DistObject<char, LO, GO, NT>::buffer_device_type buffer_device_type;
889 typedef typename Kokkos::DualView<char*, buffer_device_type>::t_host::execution_space host_exec_space;
890 typedef Kokkos::Device<host_exec_space, Kokkos::HostSpace> host_dev_type;
891
892 typename local_matrix_device_type::device_type outputDevice;
893 typedef typename NT::execution_space execution_space;
894
895 const bool verbose = ::Tpetra::Details::Behavior::verbose();
896 std::unique_ptr<std::string> prefix;
897 if (verbose) {
898 const int myRank = [&]() {
899 auto map = sourceMatrix.getMap();
900 if (map.get() == nullptr) {
901 return -1;
902 }
903 auto comm = map->getComm();
904 if (comm.get() == nullptr) {
905 return -2;
906 }
907 return comm->getRank();
908 }();
909 std::ostringstream os;
910 os << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs: ";
911 prefix = std::unique_ptr<std::string>(new std::string(os.str()));
912
913 std::ostringstream os2;
914 os2 << *prefix << "start" << std::endl;
915 std::cerr << os2.str();
916 }
917
918 // Convert all Teuchos::Array to Kokkos::View
919
920 // This is an output array, so we don't have to copy to device here.
921 // However, we'll have to remember to copy back to host when done.
923 create_mirror_view_from_raw_host_array(buffer_device_type(),
924 numPacketsPerLID.getRawPtr(),
925 numPacketsPerLID.size(), false,
926 "num_packets_per_lid");
927
928 // This is an input array, so we have to copy to device here.
929 // However, we never need to copy it back to host.
930 auto export_lids_d =
931 create_mirror_view_from_raw_host_array(buffer_device_type(),
932 exportLIDs.getRawPtr(),
933 exportLIDs.size(), true,
934 "export_lids");
935 // This is an input array, so we have to copy to device here.
936 // However, we never need to copy it back to host.
937 auto export_pids_d =
939 sourcePIDs.getRawPtr(),
940 sourcePIDs.size(), true,
941 "export_pids");
942 constexpr bool pack_pids = true;
943 try {
944 PackCrsMatrixImpl::packCrsMatrix(sourceMatrix, exports_dv, num_packets_per_lid_d, export_lids_d,
946 } catch (std::exception& e) {
947 if (verbose) {
948 std::ostringstream os;
949 os << *prefix << "PackCrsMatrixImpl::packCrsMatrix threw: "
950 << e.what() << std::endl;
951 std::cerr << os.str();
952 }
953 throw;
954 } catch (...) {
955 if (verbose) {
956 std::ostringstream os;
957 os << *prefix << "PackCrsMatrixImpl::packCrsMatrix threw an exception "
958 "not a subclass of std::exception"
959 << std::endl;
960 std::cerr << os.str();
961 }
962 throw;
963 }
964
965 if (numPacketsPerLID.size() != 0) {
966 try {
967 // The counts are an output of PackCrsMatrixImpl::packCrsMatrix,
968 // so we have to copy them back to host.
969 Kokkos::View<size_t*, host_dev_type> num_packets_per_lid_h(numPacketsPerLID.getRawPtr(), numPacketsPerLID.size());
970 // DEEP_COPY REVIEW - DEVICE-TO-HOST
971 Kokkos::deep_copy(execution_space(), num_packets_per_lid_h, num_packets_per_lid_d);
972 } catch (std::exception& e) {
973 if (verbose) {
974 std::ostringstream os;
975 os << *prefix << "Kokkos::deep_copy threw: " << e.what() << std::endl;
976 std::cerr << os.str();
977 }
978 throw;
979 } catch (...) {
980 if (verbose) {
981 std::ostringstream os;
982 os << *prefix << "Kokkos::deep_copy threw an exception not a subclass "
983 "of std::exception"
984 << std::endl;
985 std::cerr << os.str();
986 }
987 throw;
988 }
989 }
990
991 if (verbose) {
992 std::ostringstream os;
993 os << *prefix << "done" << std::endl;
994 std::cerr << os.str();
995 }
996}
997
998} // namespace Details
999} // namespace Tpetra
1000
1001#define TPETRA_DETAILS_PACKCRSMATRIX_INSTANT(ST, LO, GO, NT) \
1002 template void \
1003 Details::packCrsMatrix<ST, LO, GO, NT>(const CrsMatrix<ST, LO, GO, NT>&, \
1004 Teuchos::Array<char>&, \
1005 const Teuchos::ArrayView<size_t>&, \
1006 const Teuchos::ArrayView<const LO>&, \
1007 size_t&); \
1008 template void \
1009 Details::packCrsMatrixNew<ST, LO, GO, NT>(const CrsMatrix<ST, LO, GO, NT>&, \
1010 Kokkos::DualView<char*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1011 const Kokkos::DualView<size_t*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1012 const Kokkos::DualView<const LO*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1013 size_t&); \
1014 template void \
1015 Details::packCrsMatrixWithOwningPIDs<ST, LO, GO, NT>(const CrsMatrix<ST, LO, GO, NT>&, \
1016 Kokkos::DualView<char*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1017 const Teuchos::ArrayView<size_t>&, \
1018 const Teuchos::ArrayView<const LO>&, \
1019 const Teuchos::ArrayView<const int>&, \
1020 size_t&);
1021
1022#endif // TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
Declaration of the Tpetra::CrsMatrix class.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
KOKKOS_FUNCTION Kokkos::pair< int, size_t > packCrsMatrixRow(const ColumnMap &col_map, const Kokkos::View< char *, BufferDeviceType > &exports, const typename PackTraits< typename ColumnMap::local_ordinal_type >::input_array_type &lids_in, const typename PackTraits< int >::input_array_type &pids_in, const typename PackTraits< ST >::input_array_type &vals_in, const size_t offset, const size_t num_ent, const size_t num_bytes_per_value, const bool pack_pids)
Packs a single row of the CrsMatrix.
Struct that holds views of the contents of a CrsMatrix.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
typename Node::device_type device_type
The Kokkos device type.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static bool verbose()
Whether Tpetra is in verbose mode.
"Local" part of Map suitable for Kokkos kernels.
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
Compute the number of packets and offsets for the pack procedure.
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Implementation details of Tpetra.
void packCrsMatrix(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Teuchos::Array< char > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
void packCrsMatrixNew(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports, const Kokkos::DualView< size_t *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &numPacketsPerLID, const Kokkos::DualView< const LO *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication, for "new" DistObject inter...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Traits class for packing / unpacking data of type T.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > packArray(char outBuf[], const value_type inBuf[], const size_t numEnt)
Pack the first numEnt entries of the given input buffer of value_type, into the output buffer of byte...
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const T &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.