Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_CrsMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_CRSMATRIX_DEF_HPP
11#define TPETRA_CRSMATRIX_DEF_HPP
12
20
23#include "Tpetra_RowMatrix.hpp"
24#include "Tpetra_LocalCrsMatrixOperator.hpp"
25#include "Tpetra_computeRowAndColumnOneNorms.hpp"
27
34#include "Tpetra_Details_getDiagCopyWithoutOffsets.hpp"
42#include "Tpetra_Details_packCrsMatrix.hpp"
43#include "Tpetra_Details_unpackCrsMatrixAndCombine.hpp"
45#include "Teuchos_FancyOStream.hpp"
46#include "Teuchos_RCP.hpp"
47#include "Teuchos_DataAccess.hpp"
48#include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
49#include "KokkosBlas1_scal.hpp"
50#include "KokkosSparse_getDiagCopy.hpp"
51#include "KokkosSparse_spmv.hpp"
53
54#include <memory>
55#include <sstream>
56#include <typeinfo>
57#include <utility>
58#include <vector>
59
60namespace Tpetra {
61
62namespace { // (anonymous)
63
64template <class T, class BinaryFunction>
65T atomic_binary_function_update(T* const dest,
66 const T& inputVal,
67 BinaryFunction f) {
68 T oldVal = *dest;
69 T assume;
70
71 // NOTE (mfh 30 Nov 2015) I do NOT need a fence here for IBM
72 // POWER architectures, because 'newval' depends on 'assume',
73 // which depends on 'oldVal', which depends on '*dest'. This
74 // sets up a chain of read dependencies that should ensure
75 // correct behavior given a sane memory model.
76 do {
77 assume = oldVal;
78 T newVal = f(assume, inputVal);
79 oldVal = Kokkos::atomic_compare_exchange(dest, assume, newVal);
80 } while (assume != oldVal);
81
82 return oldVal;
83}
84} // namespace
85
86//
87// Users must never rely on anything in the Details namespace.
88//
89namespace Details {
90
100template <class Scalar>
101struct AbsMax {
103 Scalar operator()(const Scalar& x, const Scalar& y) {
104 typedef Teuchos::ScalarTraits<Scalar> STS;
105 return std::max(STS::magnitude(x), STS::magnitude(y));
106 }
107};
108
109} // namespace Details
110} // namespace Tpetra
111
112namespace Tpetra {
113
114template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
116 CrsMatrix(const Teuchos::RCP<const map_type>& rowMap,
117 size_t maxNumEntriesPerRow,
118 const Teuchos::RCP<Teuchos::ParameterList>& params)
119 : dist_object_type(rowMap) {
120 const char tfecfFuncName[] =
121 "CrsMatrix(RCP<const Map>, size_t "
122 "[, RCP<ParameterList>]): ";
123 Teuchos::RCP<crs_graph_type> graph;
124 try {
125 graph = Teuchos::rcp(new crs_graph_type(rowMap, maxNumEntriesPerRow,
126 params));
127 } catch (std::exception& e) {
128 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
129 "CrsGraph constructor (RCP<const Map>, "
130 "size_t [, RCP<ParameterList>]) threw an exception: "
131 << e.what());
132 }
133 // myGraph_ not null means that the matrix owns the graph. That's
134 // different than the const CrsGraph constructor, where the matrix
135 // does _not_ own the graph.
136 myGraph_ = graph;
137 staticGraph_ = myGraph_;
140}
141
142template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
144 CrsMatrix(const Teuchos::RCP<const map_type>& rowMap,
145 const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
146 const Teuchos::RCP<Teuchos::ParameterList>& params)
147 : dist_object_type(rowMap) {
148 const char tfecfFuncName[] =
149 "CrsMatrix(RCP<const Map>, "
150 "ArrayView<const size_t>[, RCP<ParameterList>]): ";
151 Teuchos::RCP<crs_graph_type> graph;
152 try {
153 using Teuchos::rcp;
155 params));
156 } catch (std::exception& e) {
157 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
158 "CrsGraph constructor "
159 "(RCP<const Map>, ArrayView<const size_t>"
160 "[, RCP<ParameterList>]) threw an exception: "
161 << e.what());
162 }
163 // myGraph_ not null means that the matrix owns the graph. That's
164 // different than the const CrsGraph constructor, where the matrix
165 // does _not_ own the graph.
166 myGraph_ = graph;
167 staticGraph_ = graph;
170}
171
172template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
174 CrsMatrix(const Teuchos::RCP<const map_type>& rowMap,
175 const Teuchos::RCP<const map_type>& colMap,
176 const size_t maxNumEntPerRow,
177 const Teuchos::RCP<Teuchos::ParameterList>& params)
178 : dist_object_type(rowMap) {
179 const char tfecfFuncName[] =
180 "CrsMatrix(RCP<const Map>, "
181 "RCP<const Map>, size_t[, RCP<ParameterList>]): ";
182 const char suffix[] =
183 " Please report this bug to the Tpetra developers.";
184
185 // An artifact of debugging something a while back.
186 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!staticGraph_.is_null(), std::logic_error,
187 "staticGraph_ is not null at the beginning of the constructor."
188 << suffix);
189 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!myGraph_.is_null(), std::logic_error,
190 "myGraph_ is not null at the beginning of the constructor."
191 << suffix);
192 Teuchos::RCP<crs_graph_type> graph;
193 try {
194 graph = Teuchos::rcp(new crs_graph_type(rowMap, colMap,
196 params));
197 } catch (std::exception& e) {
198 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
199 "CrsGraph constructor (RCP<const Map>, "
200 "RCP<const Map>, size_t[, RCP<ParameterList>]) threw an "
201 "exception: "
202 << e.what());
203 }
204 // myGraph_ not null means that the matrix owns the graph. That's
205 // different than the const CrsGraph constructor, where the matrix
206 // does _not_ own the graph.
207 myGraph_ = graph;
208 staticGraph_ = myGraph_;
211}
212
213template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
215 CrsMatrix(const Teuchos::RCP<const map_type>& rowMap,
216 const Teuchos::RCP<const map_type>& colMap,
217 const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
218 const Teuchos::RCP<Teuchos::ParameterList>& params)
219 : dist_object_type(rowMap) {
220 const char tfecfFuncName[] =
221 "CrsMatrix(RCP<const Map>, RCP<const Map>, "
222 "ArrayView<const size_t>[, RCP<ParameterList>]): ";
223 Teuchos::RCP<crs_graph_type> graph;
224 try {
225 graph = Teuchos::rcp(new crs_graph_type(rowMap, colMap,
227 params));
228 } catch (std::exception& e) {
229 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
230 "CrsGraph constructor (RCP<const Map>, "
231 "RCP<const Map>, ArrayView<const size_t>[, "
232 "RCP<ParameterList>]) threw an exception: "
233 << e.what());
234 }
235 // myGraph_ not null means that the matrix owns the graph. That's
236 // different than the const CrsGraph constructor, where the matrix
237 // does _not_ own the graph.
238 myGraph_ = graph;
239 staticGraph_ = graph;
242}
243
244template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
246 CrsMatrix(const Teuchos::RCP<const crs_graph_type>& graph,
247 const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
248 : dist_object_type(graph->getRowMap())
249 , staticGraph_(graph)
250 , storageStatus_(Details::STORAGE_1D_PACKED) {
251 using std::endl;
252 typedef typename local_matrix_device_type::values_type values_type;
253 const char tfecfFuncName[] =
254 "CrsMatrix(RCP<const CrsGraph>[, "
255 "RCP<ParameterList>]): ";
256 const bool verbose = Details::Behavior::verbose("CrsMatrix");
257
258 std::unique_ptr<std::string> prefix;
259 if (verbose) {
260 prefix = this->createPrefix("CrsMatrix", "CrsMatrix(graph,params)");
261 std::ostringstream os;
262 os << *prefix << "Start" << endl;
263 std::cerr << os.str();
264 }
265
266 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(graph.is_null(), std::runtime_error, "Input graph is null.");
267 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!graph->isFillComplete(), std::runtime_error,
268 "Input graph "
269 "is not fill complete. You must call fillComplete on the "
270 "graph before using it to construct a CrsMatrix. Note that "
271 "calling resumeFill on the graph makes it not fill complete, "
272 "even if you had previously called fillComplete. In that "
273 "case, you must call fillComplete on the graph again.");
274
275 // The graph is fill complete, so it is locally indexed and has a
276 // fixed structure. This means we can allocate the (1-D) array of
277 // values and build the local matrix right now. Note that the
278 // local matrix's number of columns comes from the column Map, not
279 // the domain Map.
280
281 const size_t numEnt = graph->lclIndsPacked_wdv.extent(0);
282 if (verbose) {
283 std::ostringstream os;
284 os << *prefix << "Allocate values: " << numEnt << endl;
285 std::cerr << os.str();
286 }
287
288 values_type val("Tpetra::CrsMatrix::values", numEnt);
289 valuesPacked_wdv = values_wdv_type(val);
290 valuesUnpacked_wdv = valuesPacked_wdv;
291
293
294 if (verbose) {
295 std::ostringstream os;
296 os << *prefix << "Done" << endl;
297 std::cerr << os.str();
298 }
299}
300
301template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
304 const Teuchos::RCP<const crs_graph_type>& graph,
305 const Teuchos::RCP<Teuchos::ParameterList>& params)
306 : dist_object_type(graph->getRowMap())
307 , staticGraph_(graph)
308 , storageStatus_(matrix.storageStatus_) {
309 const char tfecfFuncName[] =
310 "CrsMatrix(RCP<const CrsGraph>, "
311 "local_matrix_device_type::values_type, "
312 "[,RCP<ParameterList>]): ";
313 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(graph.is_null(), std::runtime_error, "Input graph is null.");
314 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!graph->isFillComplete(), std::runtime_error,
315 "Input graph "
316 "is not fill complete. You must call fillComplete on the "
317 "graph before using it to construct a CrsMatrix. Note that "
318 "calling resumeFill on the graph makes it not fill complete, "
319 "even if you had previously called fillComplete. In that "
320 "case, you must call fillComplete on the graph again.");
321
322 size_t numValuesPacked = graph->lclIndsPacked_wdv.extent(0);
323 valuesPacked_wdv = values_wdv_type(matrix.valuesPacked_wdv, 0, numValuesPacked);
324
325 size_t numValuesUnpacked = graph->lclIndsUnpacked_wdv.extent(0);
326 valuesUnpacked_wdv = values_wdv_type(matrix.valuesUnpacked_wdv, 0, numValuesUnpacked);
327
329}
330
331template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
333 CrsMatrix(const Teuchos::RCP<const crs_graph_type>& graph,
334 const typename local_matrix_device_type::values_type& values,
335 const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
336 : dist_object_type(graph->getRowMap())
337 , staticGraph_(graph)
338 , storageStatus_(Details::STORAGE_1D_PACKED) {
339 const char tfecfFuncName[] =
340 "CrsMatrix(RCP<const CrsGraph>, "
341 "local_matrix_device_type::values_type, "
342 "[,RCP<ParameterList>]): ";
343 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(graph.is_null(), std::runtime_error, "Input graph is null.");
344 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!graph->isFillComplete(), std::runtime_error,
345 "Input graph "
346 "is not fill complete. You must call fillComplete on the "
347 "graph before using it to construct a CrsMatrix. Note that "
348 "calling resumeFill on the graph makes it not fill complete, "
349 "even if you had previously called fillComplete. In that "
350 "case, you must call fillComplete on the graph again.");
351
352 // The graph is fill complete, so it is locally indexed and has a
353 // fixed structure. This means we can allocate the (1-D) array of
354 // values and build the local matrix right now. Note that the
355 // local matrix's number of columns comes from the column Map, not
356 // the domain Map.
357
358 valuesPacked_wdv = values_wdv_type(values);
359 valuesUnpacked_wdv = valuesPacked_wdv;
360
361 // FIXME (22 Jun 2016) I would very much like to get rid of
362 // k_values1D_ at some point. I find it confusing to have all
363 // these extra references lying around.
364 // KDDKDD ALMOST THERE, MARK!
365 // k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
366
368}
369
370template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
372 CrsMatrix(const Teuchos::RCP<const map_type>& rowMap,
373 const Teuchos::RCP<const map_type>& colMap,
374 const typename local_graph_device_type::row_map_type& rowPointers,
375 const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
376 const typename local_matrix_device_type::values_type& values,
377 const Teuchos::RCP<Teuchos::ParameterList>& params)
378 : dist_object_type(rowMap)
379 , storageStatus_(Details::STORAGE_1D_PACKED) {
380 using Details::getEntryOnHost;
381 using std::endl;
382 using Teuchos::RCP;
383 const char tfecfFuncName[] =
384 "Tpetra::CrsMatrix(RCP<const Map>, "
385 "RCP<const Map>, ptr, ind, val[, params]): ";
386 const char suffix[] =
387 ". Please report this bug to the Tpetra developers.";
388 const bool debug = Details::Behavior::debug("CrsMatrix");
389 const bool verbose = Details::Behavior::verbose("CrsMatrix");
390
391 std::unique_ptr<std::string> prefix;
392 if (verbose) {
393 prefix = this->createPrefix(
394 "CrsMatrix", "CrsMatrix(rowMap,colMap,ptr,ind,val[,params])");
395 std::ostringstream os;
396 os << *prefix << "Start" << endl;
397 std::cerr << os.str();
398 }
399
400 // Check the user's input. Note that this might throw only on
401 // some processes but not others, causing deadlock. We prefer
402 // deadlock due to exceptions to segfaults, because users can
403 // catch exceptions.
405 std::invalid_argument, "values.extent(0)=" << values.extent(0) << " != columnIndices.extent(0) = " << columnIndices.extent(0) << ".");
406 if (debug && rowPointers.extent(0) != 0) {
407 const size_t numEnt =
408 getEntryOnHost(rowPointers, rowPointers.extent(0) - 1);
410 numEnt != size_t(values.extent(0)),
411 std::invalid_argument,
412 "Last entry of rowPointers says that "
413 "the matrix has "
414 << numEnt << " entr"
415 << (numEnt != 1 ? "ies" : "y") << ", but the dimensions of "
416 "columnIndices and values don't match this. "
417 "columnIndices.extent(0)="
418 << columnIndices.extent(0)
419 << " and values.extent(0)=" << values.extent(0) << ".");
420 }
421
423 try {
424 graph = Teuchos::rcp(new crs_graph_type(rowMap, colMap, rowPointers,
426 } catch (std::exception& e) {
427 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
428 "CrsGraph constructor (RCP<const Map>, "
429 "RCP<const Map>, ptr, ind[, params]) threw an exception: "
430 << e.what());
431 }
432 // The newly created CrsGraph _must_ have a local graph at this
433 // point. We don't really care whether CrsGraph's constructor
434 // deep-copies or shallow-copies the input, but the dimensions
435 // have to be right. That's how we tell whether the CrsGraph has
436 // a local graph.
437 auto lclGraph = graph->getLocalGraphDevice();
438 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclGraph.row_map.extent(0) != rowPointers.extent(0) ||
439 lclGraph.entries.extent(0) != columnIndices.extent(0),
440 std::logic_error,
441 "CrsGraph's constructor (rowMap, colMap, ptr, "
442 "ind[, params]) did not set the local graph correctly."
443 << suffix);
444 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclGraph.entries.extent(0) != values.extent(0),
445 std::logic_error,
446 "CrsGraph's constructor (rowMap, colMap, ptr, ind[, "
447 "params]) did not set the local graph correctly. "
448 "lclGraph.entries.extent(0) = "
449 << lclGraph.entries.extent(0)
450 << " != values.extent(0) = " << values.extent(0) << suffix);
451
452 // myGraph_ not null means that the matrix owns the graph. This
453 // is true because the column indices come in as nonconst,
454 // implying shared ownership.
455 myGraph_ = graph;
456 staticGraph_ = graph;
457
458 // The graph may not be fill complete yet. However, it is locally
459 // indexed (since we have a column Map) and has a fixed structure
460 // (due to the input arrays). This means we can allocate the
461 // (1-D) array of values and build the local matrix right now.
462 // Note that the local matrix's number of columns comes from the
463 // column Map, not the domain Map.
464
465 valuesPacked_wdv = values_wdv_type(values);
466 valuesUnpacked_wdv = valuesPacked_wdv;
467
468 // FIXME (22 Jun 2016) I would very much like to get rid of
469 // k_values1D_ at some point. I find it confusing to have all
470 // these extra references lying around.
471 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
472
474 if (verbose) {
475 std::ostringstream os;
476 os << *prefix << "Done" << endl;
477 std::cerr << os.str();
478 }
479}
480
481template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
483 CrsMatrix(const Teuchos::RCP<const map_type>& rowMap,
484 const Teuchos::RCP<const map_type>& colMap,
485 const Teuchos::ArrayRCP<size_t>& ptr,
486 const Teuchos::ArrayRCP<LocalOrdinal>& ind,
487 const Teuchos::ArrayRCP<Scalar>& val,
488 const Teuchos::RCP<Teuchos::ParameterList>& params)
489 : dist_object_type(rowMap)
490 , storageStatus_(Details::STORAGE_1D_PACKED) {
491 using Kokkos::Compat::getKokkosViewDeepCopy;
492 using Teuchos::av_reinterpret_cast;
493 using Teuchos::RCP;
494 using values_type = typename local_matrix_device_type::values_type;
495 using IST = impl_scalar_type;
496 const char tfecfFuncName[] =
497 "Tpetra::CrsMatrix(RCP<const Map>, "
498 "RCP<const Map>, ptr, ind, val[, params]): ";
499
501 try {
502 graph = Teuchos::rcp(new crs_graph_type(rowMap, colMap, ptr,
503 ind, params));
504 } catch (std::exception& e) {
505 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
506 "CrsGraph constructor (RCP<const Map>, "
507 "RCP<const Map>, ArrayRCP<size_t>, ArrayRCP<LocalOrdinal>[, "
508 "RCP<ParameterList>]) threw an exception: "
509 << e.what());
510 }
511 // myGraph_ not null means that the matrix owns the graph. This
512 // is true because the column indices come in as nonconst,
513 // implying shared ownership.
514 myGraph_ = graph;
515 staticGraph_ = graph;
516
517 // The graph may not be fill complete yet. However, it is locally
518 // indexed (since we have a column Map) and has a fixed structure
519 // (due to the input arrays). This means we can allocate the
520 // (1-D) array of values and build the local matrix right now.
521 // Note that the local matrix's number of columns comes from the
522 // column Map, not the domain Map.
523
524 // The graph _must_ have a local graph at this point. We don't
525 // really care whether CrsGraph's constructor deep-copies or
526 // shallow-copies the input, but the dimensions have to be right.
527 // That's how we tell whether the CrsGraph has a local graph.
528 auto lclGraph = staticGraph_->getLocalGraphDevice();
529 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(size_t(lclGraph.row_map.extent(0)) != size_t(ptr.size()) ||
530 size_t(lclGraph.entries.extent(0)) != size_t(ind.size()),
531 std::logic_error,
532 "CrsGraph's constructor (rowMap, colMap, "
533 "ptr, ind[, params]) did not set the local graph correctly. "
534 "Please report this bug to the Tpetra developers.");
535
536 values_type valIn =
538 valuesPacked_wdv = values_wdv_type(valIn);
539 valuesUnpacked_wdv = valuesPacked_wdv;
540
541 // FIXME (22 Jun 2016) I would very much like to get rid of
542 // k_values1D_ at some point. I find it confusing to have all
543 // these extra references lying around.
544 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
545
547}
548
549template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
551 CrsMatrix(const Teuchos::RCP<const map_type>& rowMap,
552 const Teuchos::RCP<const map_type>& colMap,
554 const Teuchos::RCP<Teuchos::ParameterList>& params)
555 : dist_object_type(rowMap)
556 , storageStatus_(Details::STORAGE_1D_PACKED)
557 , fillComplete_(true) {
558 const char tfecfFuncName[] =
559 "Tpetra::CrsMatrix(RCP<const Map>, "
560 "RCP<const Map>, local_matrix_device_type[, RCP<ParameterList>]): ";
561 const char suffix[] =
562 " Please report this bug to the Tpetra developers.";
563
564 Teuchos::RCP<crs_graph_type> graph;
565 try {
566 graph = Teuchos::rcp(new crs_graph_type(rowMap, colMap,
567 lclMatrix.graph, params));
568 } catch (std::exception& e) {
569 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
570 "CrsGraph constructor (RCP<const Map>, "
571 "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) threw an "
572 "exception: "
573 << e.what());
574 }
575 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!graph->isFillComplete(), std::logic_error,
576 "CrsGraph constructor (RCP"
577 "<const Map>, RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) "
578 "did not produce a fill-complete graph. Please report this bug to the "
579 "Tpetra developers.");
580 // myGraph_ not null means that the matrix owns the graph. This
581 // is true because the column indices come in as nonconst through
582 // the matrix, implying shared ownership.
583 myGraph_ = graph;
584 staticGraph_ = graph;
585
586 valuesPacked_wdv = values_wdv_type(lclMatrix.values);
587 valuesUnpacked_wdv = valuesPacked_wdv;
588
590 "At the end of a CrsMatrix constructor that should produce "
591 "a fillComplete matrix, isFillActive() is true."
592 << suffix);
594 "At the end of a "
595 "CrsMatrix constructor that should produce a fillComplete "
596 "matrix, isFillComplete() is false."
597 << suffix);
599}
600
601template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
604 const Teuchos::RCP<const map_type>& rowMap,
605 const Teuchos::RCP<const map_type>& colMap,
606 const Teuchos::RCP<const map_type>& domainMap,
607 const Teuchos::RCP<const map_type>& rangeMap,
608 const Teuchos::RCP<Teuchos::ParameterList>& params)
609 : dist_object_type(rowMap)
610 , storageStatus_(Details::STORAGE_1D_PACKED)
611 , fillComplete_(true) {
612 const char tfecfFuncName[] =
613 "Tpetra::CrsMatrix(RCP<const Map>, "
614 "RCP<const Map>, RCP<const Map>, RCP<const Map>, "
615 "local_matrix_device_type[, RCP<ParameterList>]): ";
616 const char suffix[] =
617 " Please report this bug to the Tpetra developers.";
618
619 Teuchos::RCP<crs_graph_type> graph;
620 try {
621 graph = Teuchos::rcp(new crs_graph_type(lclMatrix.graph, rowMap, colMap,
622 domainMap, rangeMap, params));
623 } catch (std::exception& e) {
624 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
625 "CrsGraph constructor (RCP<const Map>, "
626 "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_device_type[, "
627 "RCP<ParameterList>]) threw an exception: "
628 << e.what());
629 }
630 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!graph->isFillComplete(), std::logic_error,
631 "CrsGraph "
632 "constructor (RCP<const Map>, RCP<const Map>, RCP<const Map>, "
633 "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) did "
634 "not produce a fillComplete graph."
635 << suffix);
636 // myGraph_ not null means that the matrix owns the graph. This
637 // is true because the column indices come in as nonconst through
638 // the matrix, implying shared ownership.
639 myGraph_ = graph;
640 staticGraph_ = graph;
641
642 valuesPacked_wdv = values_wdv_type(lclMatrix.values);
643 valuesUnpacked_wdv = valuesPacked_wdv;
644
645 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
646 "At the end of a CrsMatrix constructor that should produce "
647 "a fillComplete matrix, isFillActive() is true."
648 << suffix);
649 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!isFillComplete(), std::logic_error,
650 "At the end of a "
651 "CrsMatrix constructor that should produce a fillComplete "
652 "matrix, isFillComplete() is false."
653 << suffix);
654 checkInternalState();
655}
656
657template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
660 const Teuchos::RCP<const map_type>& rowMap,
661 const Teuchos::RCP<const map_type>& colMap,
662 const Teuchos::RCP<const map_type>& domainMap,
663 const Teuchos::RCP<const map_type>& rangeMap,
664 const Teuchos::RCP<const import_type>& importer,
665 const Teuchos::RCP<const export_type>& exporter,
666 const Teuchos::RCP<Teuchos::ParameterList>& params)
667 : dist_object_type(rowMap)
668 , storageStatus_(Details::STORAGE_1D_PACKED)
669 , fillComplete_(true) {
670 using Teuchos::rcp;
671 const char tfecfFuncName[] =
672 "Tpetra::CrsMatrix"
673 "(lclMat,Map,Map,Map,Map,Import,Export,params): ";
674 const char suffix[] =
675 " Please report this bug to the Tpetra developers.";
676
677 Teuchos::RCP<crs_graph_type> graph;
678 try {
679 graph = rcp(new crs_graph_type(lclMatrix.graph, rowMap, colMap,
680 domainMap, rangeMap, importer,
681 exporter, params));
682 } catch (std::exception& e) {
683 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
684 "CrsGraph constructor "
685 "(local_graph_device_type, Map, Map, Map, Map, Import, Export, "
686 "params) threw: "
687 << e.what());
688 }
689 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!graph->isFillComplete(), std::logic_error,
690 "CrsGraph "
691 "constructor (local_graph_device_type, Map, Map, Map, Map, Import, "
692 "Export, params) did not produce a fill-complete graph. "
693 "Please report this bug to the Tpetra developers.");
694 // myGraph_ not null means that the matrix owns the graph. This
695 // is true because the column indices come in as nonconst through
696 // the matrix, implying shared ownership.
697 myGraph_ = graph;
698 staticGraph_ = graph;
700 valuesPacked_wdv = values_wdv_type(lclMatrix.values);
701 valuesUnpacked_wdv = valuesPacked_wdv;
702
703 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
704 "At the end of a CrsMatrix constructor that should produce "
705 "a fillComplete matrix, isFillActive() is true."
706 << suffix);
707 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!isFillComplete(), std::logic_error,
708 "At the end of a "
709 "CrsMatrix constructor that should produce a fillComplete "
710 "matrix, isFillComplete() is false."
711 << suffix);
712 checkInternalState();
713}
714
715template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
718 const Teuchos::DataAccess copyOrView)
719 : dist_object_type(source.getCrsGraph()->getRowMap())
720 , staticGraph_(source.getCrsGraph())
721 , storageStatus_(source.storageStatus_) {
722 const char tfecfFuncName[] =
723 "Tpetra::CrsMatrix("
724 "const CrsMatrix&, const Teuchos::DataAccess): ";
725 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!source.isFillComplete(), std::invalid_argument,
726 "Source graph must be fillComplete().");
727
728 if (copyOrView == Teuchos::Copy) {
729 using values_type = typename local_matrix_device_type::values_type;
730 auto vals = source.getLocalValuesDevice(Access::ReadOnly);
731 using Kokkos::view_alloc;
732 using Kokkos::WithoutInitializing;
733 values_type newvals(view_alloc("val", WithoutInitializing),
734 vals.extent(0));
735 // DEEP_COPY REVIEW - DEVICE-TO_DEVICE
736 Kokkos::deep_copy(newvals, vals);
737 valuesPacked_wdv = values_wdv_type(newvals);
738 valuesUnpacked_wdv = valuesPacked_wdv;
739 fillComplete(source.getDomainMap(), source.getRangeMap());
740 } else if (copyOrView == Teuchos::View) {
741 valuesPacked_wdv = values_wdv_type(source.valuesPacked_wdv);
742 valuesUnpacked_wdv = values_wdv_type(source.valuesUnpacked_wdv);
743 fillComplete(source.getDomainMap(), source.getRangeMap());
744 } else {
745 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::invalid_argument,
746 "Second argument 'copyOrView' "
747 "has an invalid value "
748 << copyOrView << ". Valid values "
749 "include Teuchos::Copy = "
750 << Teuchos::Copy << " and "
751 "Teuchos::View = "
752 << Teuchos::View << ".");
753 }
755}
756
757template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
760 std::swap(crs_matrix.importMV_, this->importMV_);
761 std::swap(crs_matrix.exportMV_, this->exportMV_);
762 std::swap(crs_matrix.staticGraph_, this->staticGraph_);
763 std::swap(crs_matrix.myGraph_, this->myGraph_);
764 std::swap(crs_matrix.valuesPacked_wdv, this->valuesPacked_wdv);
765 std::swap(crs_matrix.valuesUnpacked_wdv, this->valuesUnpacked_wdv);
766 std::swap(crs_matrix.storageStatus_, this->storageStatus_);
767 std::swap(crs_matrix.fillComplete_, this->fillComplete_);
768 std::swap(crs_matrix.nonlocals_, this->nonlocals_);
769}
770
771template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
772Teuchos::RCP<const Teuchos::Comm<int>>
774 getComm() const {
775 return getCrsGraphRef().getComm();
776}
777
778template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
780 isFillComplete() const {
781 return fillComplete_;
782}
783
784template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
786 isFillActive() const {
787 return !fillComplete_;
788}
789
790template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
792 isStorageOptimized() const {
793 return this->getCrsGraphRef().isStorageOptimized();
794}
795
796template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
798 isLocallyIndexed() const {
799 return getCrsGraphRef().isLocallyIndexed();
800}
801
802template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
804 isGloballyIndexed() const {
805 return getCrsGraphRef().isGloballyIndexed();
806}
807
808template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
810 hasColMap() const {
811 return getCrsGraphRef().hasColMap();
812}
813
814template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
817 getGlobalNumEntries() const {
818 return getCrsGraphRef().getGlobalNumEntries();
819}
820
821template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
822size_t
824 getLocalNumEntries() const {
825 return getCrsGraphRef().getLocalNumEntries();
826}
827
828template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
831 getGlobalNumRows() const {
832 return getCrsGraphRef().getGlobalNumRows();
833}
834
835template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
838 getGlobalNumCols() const {
839 return getCrsGraphRef().getGlobalNumCols();
840}
841
842template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
843size_t
845 getLocalNumRows() const {
846 return getCrsGraphRef().getLocalNumRows();
847}
848
849template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
850size_t
852 getLocalNumCols() const {
853 return getCrsGraphRef().getLocalNumCols();
854}
855
856template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
857size_t
860 return getCrsGraphRef().getNumEntriesInGlobalRow(globalRow);
861}
862
863template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
864size_t
867 return getCrsGraphRef().getNumEntriesInLocalRow(localRow);
868}
869
870template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
871size_t
874 return getCrsGraphRef().getGlobalMaxNumRowEntries();
875}
876
877template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
878size_t
881 return getCrsGraphRef().getLocalMaxNumRowEntries();
882}
883
884template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
887 getIndexBase() const {
888 return getRowMap()->getIndexBase();
889}
890
891template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
892Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>
894 getRowMap() const {
895 return getCrsGraphRef().getRowMap();
896}
897
898template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
899Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>
901 getColMap() const {
902 return getCrsGraphRef().getColMap();
903}
904
905template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
906Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>
908 getDomainMap() const {
909 return getCrsGraphRef().getDomainMap();
910}
911
// Returns the range Map (RCP to const Map) held by the graph obtained from
// getCrsGraphRef().
// NOTE(review): the class-qualifier line is missing from this listing.
912template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
913Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>
915 getRangeMap() const {
916 return getCrsGraphRef().getRangeMap();
917}
918
// Returns the matrix's graph as an RCP to const RowGraph: staticGraph_ when it
// is non-null, otherwise myGraph_ (which may itself be null; no check is made
// here).
// NOTE(review): the class-qualifier line is missing from this listing.
919template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
920Teuchos::RCP<const RowGraph<LocalOrdinal, GlobalOrdinal, Node>>
922 getGraph() const {
923 if (staticGraph_ != Teuchos::null) {
924 return staticGraph_;
925 }
926 return myGraph_;
927}
928
// Same selection as getGraph(), but typed as an RCP to const CrsGraph:
// staticGraph_ when it is non-null, otherwise myGraph_ (possibly null; not
// checked here).
// NOTE(review): the class-qualifier line is missing from this listing.
929template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
930Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node>>
932 getCrsGraph() const {
933 if (staticGraph_ != Teuchos::null) {
934 return staticGraph_;
935 }
936 return myGraph_;
937}
938
// Returns a reference to the graph backing this matrix: *staticGraph_ when
// that pointer is non-null, otherwise *myGraph_.
//
// The "both null" case is only diagnosed when HAVE_TPETRA_DEBUG is defined
// (the macro below is then given std::logic_error as the exception type).
// Without that define, `debug` is a compile-time false and myGraph_ is
// dereferenced unchecked.
// NOTE(review): the return-type and class-qualifier lines are missing from
// this listing.
939template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
942 getCrsGraphRef() const {
943#ifdef HAVE_TPETRA_DEBUG
944 constexpr bool debug = true;
945#else
946 constexpr bool debug = false;
947#endif // HAVE_TPETRA_DEBUG
948
949 if (!this->staticGraph_.is_null()) {
950 return *(this->staticGraph_);
951 } else {
952 if (debug) {
953 const char tfecfFuncName[] = "getCrsGraphRef: ";
954 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->myGraph_.is_null(), std::logic_error,
955 "Both staticGraph_ and myGraph_ are null. "
956 "Please report this bug to the Tpetra developers.");
957 }
958 return *(this->myGraph_);
959 }
960}
961
// Constructs and returns a local_matrix_device_type on every call (nothing is
// cached in this function). It is assembled from:
//   - the label "Tpetra::CrsMatrix::lclMatrixDevice",
//   - numCols = local element count of staticGraph_'s column Map,
//   - the device view of valuesPacked_wdv, requested with Access::ReadWrite
//     even though this method is const,
//   - staticGraph_->getLocalGraphDevice().
// staticGraph_ and its column Map are dereferenced without null checks.
// NOTE(review): the return-type and class-qualifier lines are missing from
// this listing.
962template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
965 getLocalMatrixDevice() const {
966 auto numCols = staticGraph_->getColMap()->getLocalNumElements();
967 return local_matrix_device_type("Tpetra::CrsMatrix::lclMatrixDevice",
968 numCols,
969 valuesPacked_wdv.getDeviceView(Access::ReadWrite),
970 staticGraph_->getLocalGraphDevice());
971}
972
// Host counterpart of getLocalMatrixDevice(): constructs and returns a
// local_matrix_host_type on every call from
//   - the label "Tpetra::CrsMatrix::lclMatrixHost",
//   - numCols = local element count of staticGraph_'s column Map,
//   - the host view of valuesPacked_wdv, requested with Access::ReadWrite,
//   - staticGraph_->getLocalGraphHost().
// staticGraph_ and its column Map are dereferenced without null checks.
// NOTE(review): the class-qualifier line is missing from this listing.
973template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
974typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type
976 getLocalMatrixHost() const {
977 auto numCols = staticGraph_->getColMap()->getLocalNumElements();
978 return local_matrix_host_type("Tpetra::CrsMatrix::lclMatrixHost", numCols,
979 valuesPacked_wdv.getHostView(Access::ReadWrite),
980 staticGraph_->getLocalGraphHost());
981}
982
// Returns true exactly when myGraph_ (the nonconst graph pointer) is null.
// Other code in this file treats that as "the matrix does not own its graph".
// NOTE(review): the return-type and class-qualifier lines are missing from
// this listing.
983template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
985 isStaticGraph() const {
986 return myGraph_.is_null();
987}
988
989template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
994
995template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1000
// Allocates the matrix's unpacked values array and, if requested, first has
// the matrix-owned graph allocate its indices.
//
// Parameters:
//   lg      - forwarded to myGraph_->allocateIndices() and echoed in the
//             verbose trace as "Local"/"Global" indices.
//   gas     - the caller's assertion about whether the graph's indices are
//             already allocated. GraphNotYetAllocated triggers
//             myGraph_->allocateIndices(lg, verbose).
//   verbose - when true, progress messages are written to std::cerr.
//
// Effects: valuesUnpacked_wdv is replaced by a freshly constructed view of
// length staticGraph_->getLocalAllocationSize().
//
// Errors: when Behavior::debug("CrsMatrix") is true, consistency checks are
// performed with std::logic_error as the exception type. Any exception
// escaping allocateIndices() is re-reported as std::runtime_error.
//
// NOTE(review): this listing omits several lines of the definition (the
// return type / class qualifier after the template header, a line after
// `using Details::Behavior;` that presumably brings ProfilingRegion into
// scope, and a line before the `gas == GraphNotYetAllocated` test).
1001template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1003 allocateValues(ELocalGlobal lg, GraphAllocationStatus gas,
1004 const bool verbose) {
1005 using Details::Behavior;
1007 using std::endl;
1008 const char tfecfFuncName[] = "allocateValues: ";
1009 const char suffix[] =
1010 " Please report this bug to the Tpetra developers.";
1011 ProfilingRegion region("Tpetra::CrsMatrix::allocateValues");
1012
1013 std::unique_ptr<std::string> prefix;
1014 if (verbose) {
1015 prefix = this->createPrefix("CrsMatrix", "allocateValues");
1016 std::ostringstream os;
1017 os << *prefix << "lg: "
1018 << (lg == LocalIndices ? "Local" : "Global") << "Indices"
1019 << ", gas: Graph"
1020 << (gas == GraphAlreadyAllocated ? "Already" : "NotYet")
1021 << "Allocated" << endl;
1022 std::cerr << os.str();
1023 }
1024
1025 const bool debug = Behavior::debug("CrsMatrix");
1026 if (debug) {
1027 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->staticGraph_.is_null(), std::logic_error,
1028 "staticGraph_ is null." << suffix);
1029
1030 // If the graph indices are already allocated, then gas should be
1031 // GraphAlreadyAllocated. Otherwise, gas should be
1032 // GraphNotYetAllocated.
1033 if ((gas == GraphAlreadyAllocated) !=
1034 staticGraph_->indicesAreAllocated()) {
  // err1/err2/err3 are message fragments; the two checks below splice "not "
  // into different positions to describe which side of the mismatch occurred.
1035 const char err1[] =
1036 "The caller has asserted that the graph "
1037 "is ";
1038 const char err2[] =
1039 "already allocated, but the static graph "
1040 "says that its indices are ";
1041 const char err3[] = "already allocated. ";
1042 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(gas == GraphAlreadyAllocated &&
1043 !staticGraph_->indicesAreAllocated(),
1044 std::logic_error,
1045 err1 << err2 << "not " << err3 << suffix);
1046 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(gas != GraphAlreadyAllocated &&
1047 staticGraph_->indicesAreAllocated(),
1048 std::logic_error,
1049 err1 << "not " << err2 << err3 << suffix);
1050 }
1051
1052 // If the graph is unallocated, then it had better be a
1053 // matrix-owned graph. ("Matrix-owned graph" means that the
1054 // matrix gets to define the graph structure. If the CrsMatrix
1055 // constructor that takes an RCP<const CrsGraph> was used, then
1056 // the matrix does _not_ own the graph.)
1057 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!this->staticGraph_->indicesAreAllocated() &&
1058 this->myGraph_.is_null(),
1059 std::logic_error,
1060 "The static graph says that its indices are not allocated, "
1061 "but the graph is not owned by the matrix."
1062 << suffix);
1063 }
1065 if (gas == GraphNotYetAllocated) {
1066 if (debug) {
1067 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->myGraph_.is_null(), std::logic_error,
1068 "gas = GraphNotYetAllocated, but myGraph_ is null." << suffix);
1069 }
  // Exceptions from the graph are reported again as std::runtime_error with
  // this function's name prefix.
1070 try {
1071 this->myGraph_->allocateIndices(lg, verbose);
1072 } catch (std::exception& e) {
1073 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
1074 "CrsGraph::allocateIndices "
1075 "threw an exception: "
1076 << e.what());
1077 } catch (...) {
1078 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
1079 "CrsGraph::allocateIndices "
1080 "threw an exception not a subclass of std::exception.");
1081 }
1082 }
1083
1084 // Allocate matrix values.
1085 const size_t lclTotalNumEntries = this->staticGraph_->getLocalAllocationSize();
1086 if (debug) {
  // Debug-only check: the last unpacked row offset must equal the
  // allocation size used for the values array.
1087 const size_t lclNumRows = this->staticGraph_->getLocalNumRows();
1088 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->staticGraph_->getRowPtrsUnpackedHost()(lclNumRows) != lclTotalNumEntries, std::logic_error,
1089 "length of staticGraph's lclIndsUnpacked does not match final entry of rowPtrsUnapcked_host." << suffix);
1090 }
1091
1092 // Allocate the array of unpacked matrix values (assigned to valuesUnpacked_wdv below).
1093 using values_type = typename local_matrix_device_type::values_type;
1094 if (verbose) {
1095 std::ostringstream os;
1096 os << *prefix << "Allocate values_wdv: Pre "
1097 << valuesUnpacked_wdv.extent(0) << ", post "
1098 << lclTotalNumEntries << endl;
1099 std::cerr << os.str();
1100 }
1101 // this->k_values1D_ =
1102 valuesUnpacked_wdv = values_wdv_type(
1103 values_type("Tpetra::CrsMatrix::values",
1104 lclTotalNumEntries));
1105}
1106
// Finalizes the local (per-process) storage of both the matrix-owned graph
// (myGraph_) and the matrix values.
//
// Outline of what the visible code does:
//   1. If myGraph_->getLocalNumEntries() differs from
//      myGraph_->getLocalAllocationSize(), the storage is "unpacked":
//        - packed row offsets are computed from myGraph_->k_numRowEntries_
//          with computeOffsetsFromCounts();
//        - packed column-index and value arrays of that total length are
//          allocated;
//        - two Kokkos::parallel_for launches over [0, lclNumRows) run
//          pack_functor instances (one for indices, one for values);
//        - the results are installed as the graph's packed row offsets,
//          myGraph_->lclIndsPacked_wdv and valuesPacked_wdv.
//      Otherwise the packed members are simply assigned from the unpacked
//      ones.
//   2. "Optimize Storage" is read from `params` (default: defaultOptStorage).
//      In the optimizing branch k_numRowEntries_ is reset, the unpacked
//      members are assigned from the packed ones, and both storageStatus_
//      fields become Details::STORAGE_1D_PACKED.
//
// params - may be null; only the "Optimize Storage" entry is consulted here.
//
// Checks guarded by `debug` (Details::Behavior::debug("CrsMatrix")) use
// std::logic_error; `verbose` enables std::cerr tracing.
//
// NOTE(review): this listing omits a number of lines of the definition,
// among them the return type / class qualifier, the declarations of k_inds
// and k_ptrs_const, the construction of valsPacker, the opening line of
// several TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC invocations, and the `if`
// condition that opens the "Free the old, unpacked" branch. Consult the
// real source before editing any of those spots.
1107template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1109 fillLocalGraphAndMatrix(const Teuchos::RCP<Teuchos::ParameterList>& params) {
1110 using std::endl;
1111 using Teuchos::arcp_const_cast;
1112 using Teuchos::Array;
1113 using Teuchos::ArrayRCP;
1114 using Teuchos::null;
1115 using Teuchos::RCP;
1116 using Teuchos::rcp;
1117 using ::Tpetra::Details::computeOffsetsFromCounts;
1118 using ::Tpetra::Details::getEntryOnHost;
1119 using row_map_type = typename local_graph_device_type::row_map_type;
1120 using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
1121 using values_type = typename local_matrix_device_type::values_type;
1122 Details::ProfilingRegion regionFLGAM("Tpetra::CrsMatrix::fillLocalGraphAndMatrix");
1123
1124 const char tfecfFuncName[] =
1125 "fillLocalGraphAndMatrix (called from "
1126 "fillComplete or expertStaticFillComplete): ";
1127 const char suffix[] =
1128 " Please report this bug to the Tpetra developers.";
1129 const bool debug = Details::Behavior::debug("CrsMatrix");
1130 const bool verbose = Details::Behavior::verbose("CrsMatrix");
1131
1132 std::unique_ptr<std::string> prefix;
1133 if (verbose) {
1134 prefix = this->createPrefix("CrsMatrix", "fillLocalGraphAndMatrix");
1135 std::ostringstream os;
1137 std::cerr << os.str();
1138 }
1139
1140 if (debug) {
1141 // fillComplete() only calls fillLocalGraphAndMatrix() if the
1142 // matrix owns the graph, which means myGraph_ is not null.
1143 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(myGraph_.is_null(), std::logic_error,
1144 "The nonconst graph "
1145 "(myGraph_) is null. This means that the matrix has a "
1146 "const (a.k.a. \"static\") graph. fillComplete or "
1147 "expertStaticFillComplete should never call "
1148 "fillLocalGraphAndMatrix in that case."
1149 << suffix);
1150 }
1151
1152 const size_t lclNumRows = this->getLocalNumRows();
1154 // This method's goal is to fill in the three arrays (compressed
1155 // sparse row format) that define the sparse graph's and matrix's
1156 // structure, and the sparse matrix's values.
1157 //
1158 // Get references to the data in myGraph_, so we can modify them
1159 // as well. Note that we only call fillLocalGraphAndMatrix() if
1160 // the matrix owns the graph, which means myGraph_ is not null.
1161
1162 // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1163 // See: https://github.com/trilinos/Trilinos/issues/12237
1164 // using row_entries_type = decltype (myGraph_->k_numRowEntries_);
1165 using row_entries_type = typename crs_graph_type::num_row_entries_type;
1166
1167 typename Graph::local_graph_device_type::row_map_type curRowOffsets =
1168 myGraph_->rowPtrsUnpacked_dev_;
1169
1170 if (debug) {
1171 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(curRowOffsets.extent(0) == 0, std::logic_error,
1172 "curRowOffsets.extent(0) == 0.");
1173 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(curRowOffsets.extent(0) != lclNumRows + 1, std::logic_error,
1174 "curRowOffsets.extent(0) = "
1175 << curRowOffsets.extent(0) << " != lclNumRows + 1 = "
1176 << (lclNumRows + 1) << ".");
1177 const size_t numOffsets = curRowOffsets.extent(0);
1178 const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1180 myGraph_->lclIndsUnpacked_wdv.extent(0) != valToCheck,
1181 std::logic_error, "numOffsets = " << numOffsets << " != 0 and myGraph_->lclIndsUnpacked_wdv.extent(0) = " << myGraph_->lclIndsUnpacked_wdv.extent(0) << " != curRowOffsets(" << numOffsets << ") = " << valToCheck << ".");
1182 }
1183
1184 if (myGraph_->getLocalNumEntries() !=
1185 myGraph_->getLocalAllocationSize()) {
1186 // Use the nonconst version of row_map_type for k_ptrs,
1187 // because row_map_type is const and we need to modify k_ptrs here.
1188 typename row_map_type::non_const_type k_ptrs;
1191 values_type k_vals;
1192
1193 if (verbose) {
1194 std::ostringstream os;
1195 const auto numEnt = myGraph_->getLocalNumEntries();
1196 const auto allocSize = myGraph_->getLocalAllocationSize();
1197 os << *prefix << "Unpacked 1-D storage: numEnt=" << numEnt
1198 << ", allocSize=" << allocSize << endl;
1199 std::cerr << os.str();
1200 }
1201 // The matrix's current 1-D storage is "unpacked." This means
1202 // the row offsets may differ from what the final row offsets
1203 // should be. This could happen, for example, if the user
1204 // set an upper
1205 // bound on the number of entries per row, but didn't fill all
1206 // those entries.
1207 if (debug && curRowOffsets.extent(0) != 0) {
1208 const size_t numOffsets =
1209 static_cast<size_t>(curRowOffsets.extent(0));
1210 const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1212 static_cast<size_t>(valuesUnpacked_wdv.extent(0)),
1213 std::logic_error,
1214 "(unpacked branch) Before "
1215 "allocating or packing, curRowOffsets("
1216 << (numOffsets - 1)
1217 << ") = " << valToCheck << " != valuesUnpacked_wdv.extent(0)"
1218 " = "
1219 << valuesUnpacked_wdv.extent(0) << ".");
1221 static_cast<size_t>(myGraph_->lclIndsUnpacked_wdv.extent(0)),
1222 std::logic_error,
1223 "(unpacked branch) Before "
1224 "allocating or packing, curRowOffsets("
1225 << (numOffsets - 1)
1226 << ") = " << valToCheck
1227 << " != myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1228 << myGraph_->lclIndsUnpacked_wdv.extent(0) << ".");
1229 }
1230 // Pack the row offsets into k_ptrs, by doing a sum-scan of
1231 // the array of valid entry counts per row.
1232
1233 // Total number of entries in the matrix on the calling
1234 // process. We will compute this in the loop below. It's
1235 // cheap to compute and useful as a sanity check.
1236 size_t lclTotalNumEntries = 0;
1237 {
1238 // Allocate the packed row offsets array. We use a nonconst
1239 // temporary (packedRowOffsets) here, because k_ptrs is
1240 // const. We will assign packedRowOffsets to k_ptrs below.
1241 if (verbose) {
1242 std::ostringstream os;
1243 os << *prefix << "Allocate packed row offsets: "
1244 << (lclNumRows + 1) << endl;
1245 std::cerr << os.str();
1246 }
1247 typename row_map_type::non_const_type
1248 packedRowOffsets("Tpetra::CrsGraph::ptr", lclNumRows + 1);
1249 typename row_entries_type::const_type numRowEnt_h =
1250 myGraph_->k_numRowEntries_;
1251 // We're computing offsets on device. This function can
1252 // handle numRowEnt_h being a host View.
1253 lclTotalNumEntries =
1254 computeOffsetsFromCounts(packedRowOffsets, numRowEnt_h);
1255 // packedRowOffsets is modifiable; k_ptrs isn't, so we have
1256 // to use packedRowOffsets in the loop above and assign here.
1257 k_ptrs = packedRowOffsets;
1258 k_ptrs_const = k_ptrs;
1259 }
1260
1261 if (debug) {
1262 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(k_ptrs.extent(0)) != lclNumRows + 1,
1263 std::logic_error,
1264 "(unpacked branch) After packing k_ptrs, "
1265 "k_ptrs.extent(0) = "
1266 << k_ptrs.extent(0) << " != "
1267 "lclNumRows+1 = "
1268 << (lclNumRows + 1) << ".");
1269 const auto valToCheck = getEntryOnHost(k_ptrs, lclNumRows);
1271 "(unpacked branch) After filling k_ptrs, "
1272 "k_ptrs(lclNumRows="
1273 << lclNumRows << ") = " << valToCheck
1274 << " != total number of entries on the calling process = "
1275 << lclTotalNumEntries << ".");
1276 }
1277
1278 // Allocate the arrays of packed column indices and values.
1279 if (verbose) {
1280 std::ostringstream os;
1281 os << *prefix << "Allocate packed local column indices: "
1283 std::cerr << os.str();
1284 }
1285 k_inds = lclinds_1d_type("Tpetra::CrsGraph::lclInds", lclTotalNumEntries);
1286 if (verbose) {
1287 std::ostringstream os;
1288 os << *prefix << "Allocate packed values: "
1289 << lclTotalNumEntries << endl;
1290 std::cerr << os.str();
1291 }
1292 k_vals = values_type("Tpetra::CrsMatrix::values", lclTotalNumEntries);
1293
1294 // curRowOffsets (myGraph_->rowPtrsUnpacked_) (???), lclIndsUnpacked_wdv,
1295 // and valuesUnpacked_wdv are currently unpacked. Pack them, using
1296 // the packed row offsets array k_ptrs that we created above.
1297 //
1298 // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
1299 // need to keep around the unpacked row offsets, column
1300 // indices, and values arrays.
1301
1302 // Pack the column indices from unpacked lclIndsUnpacked_wdv into
1303 // packed k_inds. We will replace lclIndsUnpacked_wdv below.
1304 using inds_packer_type = pack_functor<
1305 typename Graph::local_graph_device_type::entries_type::non_const_type,
1306 typename Graph::local_inds_dualv_type::t_dev::const_type,
1307 typename Graph::local_graph_device_type::row_map_type::non_const_type,
1308 typename Graph::local_graph_device_type::row_map_type>;
1309 inds_packer_type indsPacker(
1310 k_inds,
1311 myGraph_->lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
1312 k_ptrs, curRowOffsets);
1313 using exec_space = typename decltype(k_inds)::execution_space;
1314 using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1315 Kokkos::parallel_for("Tpetra::CrsMatrix pack column indices",
1316 range_type(0, lclNumRows), indsPacker);
1317
1318 // Pack the values from unpacked valuesUnpacked_wdv into packed
1319 // k_vals. We will replace valuesPacked_wdv below.
1320 using vals_packer_type = pack_functor<
1321 typename values_type::non_const_type,
1322 typename values_type::const_type,
1323 typename row_map_type::non_const_type,
1324 typename row_map_type::const_type>;
1326 k_vals,
1327 this->valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1329 Kokkos::parallel_for("Tpetra::CrsMatrix pack values",
1330 range_type(0, lclNumRows), valsPacker);
1331
1332 if (debug) {
1333 const char myPrefix[] =
1334 "(\"Optimize Storage\""
1335 "=true branch) After packing, ";
1336 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(k_ptrs.extent(0) == 0, std::logic_error, myPrefix << "k_ptrs.extent(0) = 0. This probably means that "
1337 "rowPtrsUnpacked_ was never allocated.");
1338 if (k_ptrs.extent(0) != 0) {
1339 const size_t numOffsets(k_ptrs.extent(0));
1340 const auto valToCheck =
1341 getEntryOnHost(k_ptrs, numOffsets - 1);
1343 std::logic_error, myPrefix << "k_ptrs(" << (numOffsets - 1) << ") = " << valToCheck << " != k_vals.extent(0) = " << k_vals.extent(0) << ".");
1345 std::logic_error, myPrefix << "k_ptrs(" << (numOffsets - 1) << ") = " << valToCheck << " != k_inds.extent(0) = " << k_inds.extent(0) << ".");
1346 }
1347 }
1348 // Build the local graph.
1349 myGraph_->setRowPtrsPacked(k_ptrs_const);
1350 myGraph_->lclIndsPacked_wdv =
1351 typename crs_graph_type::local_inds_wdv_type(k_inds);
1352 valuesPacked_wdv = values_wdv_type(k_vals);
1353 } else { // We don't have to pack, so just set the pointers.
1354 // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1355 // FIXME? This is already done in the graph fill call - need to avoid the memcpy to host
1356 myGraph_->rowPtrsPacked_dev_ = myGraph_->rowPtrsUnpacked_dev_;
1357 myGraph_->rowPtrsPacked_host_ = myGraph_->rowPtrsUnpacked_host_;
1358 myGraph_->packedUnpackedRowPtrsMatch_ = true;
1359 myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
1360 valuesPacked_wdv = valuesUnpacked_wdv;
1361
1362 if (verbose) {
1363 std::ostringstream os;
1364 os << *prefix << "Storage already packed: rowPtrsUnpacked_: "
1365 << myGraph_->getRowPtrsUnpackedHost().extent(0) << ", lclIndsUnpacked_wdv: "
1366 << myGraph_->lclIndsUnpacked_wdv.extent(0) << ", valuesUnpacked_wdv: "
1367 << valuesUnpacked_wdv.extent(0) << endl;
1368 std::cerr << os.str();
1369 }
1370
1371 if (debug) {
1372 const char myPrefix[] =
1373 "(\"Optimize Storage\"=false branch) ";
1374 auto rowPtrsUnpackedHost = myGraph_->getRowPtrsUnpackedHost();
1375 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(myGraph_->rowPtrsUnpacked_dev_.extent(0) == 0, std::logic_error, myPrefix << "myGraph->rowPtrsUnpacked_dev_.extent(0) = 0. This probably means "
1376 "that rowPtrsUnpacked_ was never allocated.");
1377 if (myGraph_->rowPtrsUnpacked_dev_.extent(0) != 0) {
1378 const size_t numOffsets = rowPtrsUnpackedHost.extent(0);
1379 const auto valToCheck = rowPtrsUnpackedHost(numOffsets - 1);
1380 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(size_t(valToCheck) != valuesPacked_wdv.extent(0),
1381 std::logic_error, myPrefix << "k_ptrs_const(" << (numOffsets - 1) << ") = " << valToCheck << " != valuesPacked_wdv.extent(0) = " << valuesPacked_wdv.extent(0) << ".");
1382 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(size_t(valToCheck) != myGraph_->lclIndsPacked_wdv.extent(0),
1383 std::logic_error, myPrefix << "k_ptrs_const(" << (numOffsets - 1) << ") = " << valToCheck << " != myGraph_->lclIndsPacked.extent(0) = " << myGraph_->lclIndsPacked_wdv.extent(0) << ".");
1384 }
1385 }
1386 }
1387
1388 if (debug) {
1389 const char myPrefix[] = "After packing, ";
1390 auto rowPtrsPackedHost = myGraph_->getRowPtrsPackedHost();
1391 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(size_t(rowPtrsPackedHost.extent(0)) != size_t(lclNumRows + 1),
1392 std::logic_error, myPrefix << "myGraph_->rowPtrsPacked_host_.extent(0) = " << rowPtrsPackedHost.extent(0) << " != lclNumRows+1 = " << (lclNumRows + 1) << ".");
1393 if (rowPtrsPackedHost.extent(0) != 0) {
1394 const size_t numOffsets(rowPtrsPackedHost.extent(0));
1395 const size_t valToCheck = rowPtrsPackedHost(numOffsets - 1);
1396 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(valToCheck != size_t(valuesPacked_wdv.extent(0)),
1397 std::logic_error, myPrefix << "k_ptrs_const(" << (numOffsets - 1) << ") = " << valToCheck << " != valuesPacked_wdv.extent(0) = " << valuesPacked_wdv.extent(0) << ".");
1398 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(valToCheck != size_t(myGraph_->lclIndsPacked_wdv.extent(0)),
1399 std::logic_error, myPrefix << "k_ptrs_const(" << (numOffsets - 1) << ") = " << valToCheck << " != myGraph_->lclIndsPacked_wdvk_inds.extent(0) = " << myGraph_->lclIndsPacked_wdv.extent(0) << ".");
1400 }
1401 }
1402
1403 // May we ditch the old allocations for the packed (and otherwise
1404 // "optimized") allocations, later in this routine? Optimize
1405 // storage if the graph is not static, or if the graph already has
1406 // optimized storage.
1407 const bool defaultOptStorage =
1408 !isStaticGraph() || staticGraph_->isStorageOptimized();
1409 const bool requestOptimizedStorage =
1410 (!params.is_null() &&
1411 params->get("Optimize Storage", defaultOptStorage)) ||
1412 (params.is_null() && defaultOptStorage);
1413
1414 // The graph has optimized storage when indices are allocated,
1415 // myGraph_->k_numRowEntries_ is empty, and there are more than
1416 // zero rows on this process.
1418 // Free the old, unpacked, unoptimized allocations.
1419 // Free graph data structures that are only needed for
1420 // unpacked 1-D storage.
1421 if (verbose) {
1422 std::ostringstream os;
1423 os << *prefix << "Optimizing storage: free k_numRowEntries_: "
1424 << myGraph_->k_numRowEntries_.extent(0) << endl;
1425 std::cerr << os.str();
1426 }
1427
1428 myGraph_->k_numRowEntries_ = row_entries_type();
1429
1430 // Keep the new 1-D packed allocations.
1431 // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1432 // We directly set the memory spaces to avoid a memcpy from device to host
1433 myGraph_->rowPtrsUnpacked_dev_ = myGraph_->rowPtrsPacked_dev_;
1434 myGraph_->rowPtrsUnpacked_host_ = myGraph_->rowPtrsPacked_host_;
1435 myGraph_->packedUnpackedRowPtrsMatch_ = true;
1436 myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
1437 valuesUnpacked_wdv = valuesPacked_wdv;
1438
1439 myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
1440 this->storageStatus_ = Details::STORAGE_1D_PACKED;
1441 } else {
1442 if (verbose) {
1443 std::ostringstream os;
1444 os << *prefix << "User requested NOT to optimize storage"
1445 << endl;
1446 std::cerr << os.str();
1447 }
1448 }
1449}
1450
// Finalizes the matrix's local value storage when the graph structure comes
// from staticGraph_ (this function never modifies the graph).
//
// Outline of what the visible code does:
//   - Reads the entry count, allocation size and packed row offsets from
//     staticGraph_.
//   - If entry count != allocation size, computes temporary packed row
//     offsets from staticGraph_->k_numRowEntries_ with
//     Details::computeOffsetsFromCounts(), allocates a values array of that
//     total length, fills it with a pack_functor launched through
//     Kokkos::parallel_for over [0, lclNumRows), and stores it in
//     valuesPacked_wdv. Otherwise valuesPacked_wdv is assigned from
//     valuesUnpacked_wdv.
//   - If optimized storage is (still) requested, valuesUnpacked_wdv is
//     assigned from valuesPacked_wdv and storageStatus_ becomes
//     Details::STORAGE_1D_PACKED.
//
// params - may be null; only the "Optimize Storage" entry is consulted. When
//          optimization is requested but staticGraph_->isStorageOptimized()
//          is false, TPETRA_ABUSE_WARNING is invoked and the request is
//          dropped.
//
// NOTE(review): this listing omits the return type / class qualifier line
// and the body of the `if` that tests "Optimize Storage" (presumably it
// clears requestOptimizedStorage -- confirm against the real source).
1451template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1453 fillLocalMatrix(const Teuchos::RCP<Teuchos::ParameterList>& params) {
1454 using std::endl;
1455 using Teuchos::Array;
1456 using Teuchos::ArrayRCP;
1457 using Teuchos::null;
1458 using Teuchos::RCP;
1459 using Teuchos::rcp;
1460 using ::Tpetra::Details::ProfilingRegion;
1461 using row_map_type = typename Graph::local_graph_device_type::row_map_type;
1462 using non_const_row_map_type = typename row_map_type::non_const_type;
1463 using values_type = typename local_matrix_device_type::values_type;
1464 ProfilingRegion regionFLM("Tpetra::CrsMatrix::fillLocalMatrix");
1465 const size_t lclNumRows = getLocalNumRows();
1466
1467 const bool verbose = Details::Behavior::verbose("CrsMatrix");
1468 std::unique_ptr<std::string> prefix;
1469 if (verbose) {
1470 prefix = this->createPrefix("CrsMatrix", "fillLocalMatrix");
1471 std::ostringstream os;
1472 os << *prefix << "lclNumRows: " << lclNumRows << endl;
1473 std::cerr << os.str();
1474 }
1475
1476 // The goals of this routine are first, to allocate and fill
1477 // packed 1-D storage (see below for an explanation) in the vals
1478 // array, and second, to give vals to the local matrix and
1479 // finalize the local matrix. We only need k_ptrs, the packed 1-D
1480 // row offsets, within the scope of this routine, since we're only
1481 // filling the local matrix here (use fillLocalGraphAndMatrix() to
1482 // fill both the graph and the matrix at the same time).
1483
1484 // get data from staticGraph_
1485 size_t nodeNumEntries = staticGraph_->getLocalNumEntries();
1486 size_t nodeNumAllocated = staticGraph_->getLocalAllocationSize();
1487 row_map_type k_rowPtrs = staticGraph_->rowPtrsPacked_dev_;
1488
1489 row_map_type k_ptrs; // "packed" row offsets array
1490 values_type k_vals; // "packed" values array
1491
1492 // May we ditch the old allocations for the packed (and otherwise
1493 // "optimized") allocations, later in this routine? Request
1494 // optimized storage by default.
1495 bool requestOptimizedStorage = true;
1496 const bool default_OptimizeStorage =
1497 !isStaticGraph() || staticGraph_->isStorageOptimized();
1498 if (!params.is_null() &&
1499 !params->get("Optimize Storage", default_OptimizeStorage)) {
1501 }
1502 // If we're not allowed to change a static graph, then we can't
1503 // change the storage of the matrix, either. This means that if
1504 // the graph's storage isn't already optimized, we can't optimize
1505 // the matrix's storage either. Check and give warning, as
1506 // appropriate.
1507 if (!staticGraph_->isStorageOptimized() &&
1508 requestOptimizedStorage) {
1509 TPETRA_ABUSE_WARNING(true, std::runtime_error,
1510 "You requested optimized storage "
1511 "by setting the \"Optimize Storage\" flag to \"true\" in "
1512 "the ParameterList, or by virtue of default behavior. "
1513 "However, the associated CrsGraph was filled separately and "
1514 "requested not to optimize storage. Therefore, the "
1515 "CrsMatrix cannot optimize storage.");
1516 requestOptimizedStorage = false;
1517 }
1518
1519 // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1520 // See: https://github.com/trilinos/Trilinos/issues/12237
1521 // using row_entries_type = decltype (staticGraph_->k_numRowEntries_);
1522 using row_entries_type = typename crs_graph_type::num_row_entries_type;
1523
1524 // The matrix's values are currently
1525 // stored in a 1-D format. However, this format is "unpacked";
1526 // it doesn't necessarily have the same row offsets as indicated
1527 // by the ptrs array returned by allocRowPtrs. This could
1528 // happen, for example, if the user
1529 // fixed the number of matrix entries in
1530 // each row, but didn't fill all those entries.
1531 //
1532 // As above, we don't need to keep the "packed" row offsets
1533 // array ptrs here, but we do need it here temporarily, so we
1534 // have to allocate it. We'll free ptrs later in this method.
1535 //
1536 // Note that this routine checks whether storage has already
1537 // been packed. This is a common case for solution of nonlinear
1538 // PDEs using the finite element method, as long as the
1539 // structure of the sparse matrix does not change between linear
1540 // solves.
1541 if (nodeNumEntries != nodeNumAllocated) {
1542 if (verbose) {
1543 std::ostringstream os;
1544 os << *prefix << "Unpacked 1-D storage: numEnt="
1545 << nodeNumEntries << ", allocSize=" << nodeNumAllocated
1546 << endl;
1547 std::cerr << os.str();
1548 }
1549 // We have to pack the 1-D storage, since the user didn't fill
1550 // up all requested storage.
1551 if (verbose) {
1552 std::ostringstream os;
1553 os << *prefix << "Allocate packed row offsets: "
1554 << (lclNumRows + 1) << endl;
1555 std::cerr << os.str();
1556 }
1557 non_const_row_map_type tmpk_ptrs("Tpetra::CrsGraph::ptr",
1558 lclNumRows + 1);
1559 // Total number of entries in the matrix on the calling
1560 // process. We will compute this in the loop below. It's
1561 // cheap to compute and useful as a sanity check.
1562 size_t lclTotalNumEntries = 0;
1563 k_ptrs = tmpk_ptrs;
1564 {
1565 typename row_entries_type::const_type numRowEnt_h =
1566 staticGraph_->k_numRowEntries_;
1567 // This function can handle the counts being a host View.
1568 lclTotalNumEntries =
1569 Details::computeOffsetsFromCounts(tmpk_ptrs, numRowEnt_h);
1570 }
1571
1572 // Allocate the "packed" values array.
1573 // It has exactly the right number of entries.
1574 if (verbose) {
1575 std::ostringstream os;
1576 os << *prefix << "Allocate packed values: "
1577 << lclTotalNumEntries << endl;
1578 std::cerr << os.str();
1579 }
1580 k_vals = values_type("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1581
1582 // Pack values_wdv into k_vals. We will replace values_wdv below.
  // NOTE(review): the packer reads valuesUnpacked_wdv but is handed k_rowPtrs
  // (= staticGraph_->rowPtrsPacked_dev_) as its last argument -- confirm that
  // the packed offsets are the intended source offsets in this branch.
1583 pack_functor<
1584 typename values_type::non_const_type,
1585 typename values_type::const_type,
1586 typename row_map_type::non_const_type,
1587 typename row_map_type::const_type>
1588 valsPacker(k_vals, valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1589 tmpk_ptrs, k_rowPtrs);
1590
1591 using exec_space = typename decltype(k_vals)::execution_space;
1592 using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1593 Kokkos::parallel_for("Tpetra::CrsMatrix pack values",
1594 range_type(0, lclNumRows), valsPacker);
1595 valuesPacked_wdv = values_wdv_type(k_vals);
1596 } else { // We don't have to pack, so just set the pointer.
1597 valuesPacked_wdv = valuesUnpacked_wdv;
1598 if (verbose) {
1599 std::ostringstream os;
1600 os << *prefix << "Storage already packed: "
1601 << "valuesUnpacked_wdv: " << valuesUnpacked_wdv.extent(0) << endl;
1602 std::cerr << os.str();
1603 }
1604 }
1605
1606 // May we ditch the old allocations for the packed one?
1607 if (requestOptimizedStorage) {
1608 // The user requested optimized storage, so we can dump the
1609 // unpacked 1-D storage, and keep the packed storage.
1610 valuesUnpacked_wdv = valuesPacked_wdv;
1611 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
1612 this->storageStatus_ = Details::STORAGE_1D_PACKED;
1613 }
1614}
1615
1616template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1617void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1618 insertIndicesAndValues(crs_graph_type& graph,
1619 RowInfo& rowInfo,
1620 const typename crs_graph_type::SLocalGlobalViews& newInds,
1621 const Teuchos::ArrayView<impl_scalar_type>& oldRowVals,
1622 const Teuchos::ArrayView<const impl_scalar_type>& newRowVals,
1623 const ELocalGlobal lg,
1624 const ELocalGlobal I) {
1625 const size_t oldNumEnt = rowInfo.numEntries;
1626 const size_t numInserted = graph.insertIndices(rowInfo, newInds, lg, I);
1627
1628 // Use of memcpy here works around an issue with GCC >= 4.9.0,
1629 // that probably relates to scalar_type vs. impl_scalar_type
1630 // aliasing. See history of Tpetra_CrsGraph_def.hpp for
1631 // details; look for GCC_WORKAROUND macro definition.
1632 if (numInserted > 0) {
1633 const size_t startOffset = oldNumEnt;
1634 memcpy((void*)&oldRowVals[startOffset], &newRowVals[0],
1635 numInserted * sizeof(impl_scalar_type));
1636 }
1637}
1638
// Inserts values into one locally owned row, addressed by local column
// indices. For CombineMode ADD each value is added to the row-storage slot
// reported by the graph; for INSERT it overwrites that slot. Any other
// CombineMode raises std::invalid_argument.
//
// Raises std::runtime_error (through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC)
// when fill is not active, the graph is static, the graph has no column
// Map, the graph is globally indexed, or values.size() != indices.size().
//
// NOTE(review): this listing is missing the lines that carry the return
// type, the qualified function name and the lclRow parameter, as well as
// the opening line of the row-ownership check; tfecfFuncName below
// identifies the function as insertLocalValues.
1639template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1642 const Teuchos::ArrayView<const LocalOrdinal>& indices,
1643 const Teuchos::ArrayView<const Scalar>& values,
1644 const CombineMode CM) {
1645 using std::endl;
1646 const char tfecfFuncName[] = "insertLocalValues: ";
1647
1648 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!this->isFillActive(), std::runtime_error,
1649 "Fill is not active. After calling fillComplete, you must call "
1650 "resumeFill before you may insert entries into the matrix again.");
1651 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->isStaticGraph(), std::runtime_error,
1652 "Cannot insert indices with static graph; use replaceLocalValues() "
1653 "instead.");
1654 // At this point, we know that myGraph_ is nonnull.
1655 crs_graph_type& graph = *(this->myGraph_);
1656 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(graph.colMap_.is_null(), std::runtime_error,
1657 "Cannot insert local indices without a column map.");
1658 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(graph.isGloballyIndexed(),
1659 std::runtime_error,
1660 "Graph indices are global; use "
1661 "insertGlobalValues().");
1662 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(values.size() != indices.size(), std::runtime_error,
1663 "values.size() = " << values.size()
1664 << " != indices.size() = " << indices.size() << ".");
1666 !graph.rowMap_->isNodeLocalElement(lclRow), std::runtime_error,
1667 "Local row index " << lclRow << " does not belong to this process.");
1668
1669 if (!graph.indicesAreAllocated()) {
1670 // We only allocate values at most once per process, so it's OK
1671 // to check TPETRA_VERBOSE here.
1672 const bool verbose = Details::Behavior::verbose("CrsMatrix");
1673 this->allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
1674 }
1675
1676#ifdef HAVE_TPETRA_DEBUG
1677 const size_t numEntriesToAdd = static_cast<size_t>(indices.size());
1678 // In a debug build, test whether any of the given column indices
1679 // are not in the column Map. Keep track of the invalid column
1680 // indices so we can tell the user about them.
1681 {
1682 using Teuchos::toString;
1683
1684 const map_type& colMap = *(graph.colMap_);
1685 Teuchos::Array<LocalOrdinal> badColInds;
1686 bool allInColMap = true;
1687 for (size_t k = 0; k < numEntriesToAdd; ++k) {
1688 if (!colMap.isNodeLocalElement(indices[k])) {
1689 allInColMap = false;
1690 badColInds.push_back(indices[k]);
1691 }
1692 }
1693 if (!allInColMap) {
1694 std::ostringstream os;
1695 os << "You attempted to insert entries in owned row " << lclRow
1696 << ", at the following column indices: " << toString(indices)
1697 << "." << endl;
1698 os << "Of those, the following indices are not in the column Map on "
1699 "this process: "
1700 << toString(badColInds) << "." << endl
1701 << "Since "
1702 "the matrix has a column Map already, it is invalid to insert "
1703 "entries at those locations.";
1704 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::invalid_argument, os.str());
1705 }
1706 }
1707#endif // HAVE_TPETRA_DEBUG
1708
1709 RowInfo rowInfo = graph.getRowInfo(lclRow);
1710
// The graph performs the index insertion and invokes the callback with the
// input position k and the destination offset inside this row's values
// view; the callback is the only place where matrix values are written.
1711 auto valsView = this->getValuesViewHostNonConst(rowInfo);
1712 if (CM == ADD) {
1713 auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) { valsView[offset] += values[k]; };
1714 std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1715 graph.insertLocalIndicesImpl(lclRow, indices, cb);
1716 } else if (CM == INSERT) {
1717 auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) { valsView[offset] = values[k]; };
1718 std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1719 graph.insertLocalIndicesImpl(lclRow, indices, cb);
1720 } else {
// Unsupported CombineMode. The leading space in " but" keeps the mode name
// from running into the rest of the sentence.
1721 std::ostringstream os;
1722 os << "You attempted to use insertLocalValues with CombineMode " << combineModeToString(CM)
1723 << " but this has not been implemented." << endl;
1724 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::invalid_argument, os.str());
1725 }
1726}
1727
// Raw-array convenience overload of insertLocalValues: wraps cols[] and
// vals[] (numEnt entries each) in Teuchos::ArrayView objects and forwards to
// the ArrayView overload with the same CombineMode.
// NOTE(review): the line carrying the return type and class qualifier is
// absent from this listing.
1728template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1730 insertLocalValues(const LocalOrdinal localRow,
1731 const LocalOrdinal numEnt,
1732 const Scalar vals[],
1733 const LocalOrdinal cols[],
1734 const CombineMode CM) {
1735 Teuchos::ArrayView<const LocalOrdinal> colsT(cols, numEnt);
1736 Teuchos::ArrayView<const Scalar> valsT(vals, numEnt);
1737 this->insertLocalValues(localRow, colsT, valsT, CM);
1738}
1739
// Inserts numInputEnt entries, addressed by global column index, into the
// row described by rowInfo. Index insertion is delegated to
// graph.insertGlobalIndicesImpl; the callback adds vals[k] to the slot at
// the offset the graph reports. If the graph's indices are not yet
// allocated, values are allocated first and rowInfo is refreshed.
//
// In HAVE_TPETRA_DEBUG builds the function then checks that the row's entry
// count equals the previous count plus the number inserted, and raises
// std::logic_error with a dump of the inputs and the row otherwise.
//
// NOTE(review): this listing is missing the lines that carry the return
// type, the qualified function name and the leading parameters (graph and
// rowInfo are used below), plus one line inside the gblColInds print loop;
// tfecfFuncName identifies the function as insertGlobalValuesImpl.
1740template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1744 const GlobalOrdinal gblColInds[],
1745 const impl_scalar_type vals[],
1746 const size_t numInputEnt) {
1747#ifdef HAVE_TPETRA_DEBUG
1748 const char tfecfFuncName[] = "insertGlobalValuesImpl: ";
1749 const size_t origNumEnt = graph.getNumEntriesInLocalRow(rowInfo.localRow);
1750 const size_t curNumEnt = rowInfo.numEntries;
1751#endif // HAVE_TPETRA_DEBUG
1752
1753 if (!graph.indicesAreAllocated()) {
1754 // We only allocate values at most once per process, so it's OK
1755 // to check TPETRA_VERBOSE here.
1756 using ::Tpetra::Details::Behavior;
1757 const bool verbose = Behavior::verbose("CrsMatrix");
1758 this->allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
1759 // mfh 23 Jul 2017: allocateValues invalidates existing
1760 // getRowInfo results. Once we get rid of lazy graph
1761 // allocation, we'll be able to move the getRowInfo call outside
1762 // of this method.
1763 rowInfo = graph.getRowInfo(rowInfo.localRow);
1764 }
1765
// The callback accumulates (+=) rather than assigns, so the result depends
// on what the destination slot already holds.
1766 auto valsView = this->getValuesViewHostNonConst(rowInfo);
1767 auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1768 valsView[offset] += vals[k];
1769 };
1770 std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1771#ifdef HAVE_TPETRA_DEBUG
1772 // numInserted is only used inside the debug code below.
1773 auto numInserted =
1774#endif
1775 graph.insertGlobalIndicesImpl(rowInfo, gblColInds, numInputEnt, cb);
1776
1777#ifdef HAVE_TPETRA_DEBUG
// Consistency check: compare the expected new entry count with what the
// graph now reports for this row.
1778 size_t newNumEnt = curNumEnt + numInserted;
1779 const size_t chkNewNumEnt =
1780 graph.getNumEntriesInLocalRow(rowInfo.localRow);
1781 if (chkNewNumEnt != newNumEnt) {
1782 std::ostringstream os;
1783 os << std::endl
1784 << "newNumEnt = " << newNumEnt
1785 << " != graph.getNumEntriesInLocalRow(" << rowInfo.localRow
1786 << ") = " << chkNewNumEnt << "." << std::endl
1787 << "\torigNumEnt: " << origNumEnt << std::endl
1788 << "\tnumInputEnt: " << numInputEnt << std::endl
1789 << "\tgblColInds: [";
1790 for (size_t k = 0; k < numInputEnt; ++k) {
1792 if (k + size_t(1) < numInputEnt) {
1793 os << ",";
1794 }
1795 }
1796 os << "]" << std::endl
1797 << "\tvals: [";
1798 for (size_t k = 0; k < numInputEnt; ++k) {
1799 os << vals[k];
1800 if (k + size_t(1) < numInputEnt) {
1801 os << ",";
1802 }
1803 }
1804 os << "]" << std::endl;
1805
// When row views are available, also print the row's current contents,
// using global or local indices depending on how the matrix is indexed.
1806 if (this->supportsRowViews()) {
1807 values_host_view_type vals2;
1808 if (this->isGloballyIndexed()) {
1809 global_inds_host_view_type gblColInds2;
1810 const GlobalOrdinal gblRow =
1811 graph.rowMap_->getGlobalElement(rowInfo.localRow);
1812 if (gblRow ==
1813 Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid()) {
1814 os << "Local row index " << rowInfo.localRow << " is invalid!"
1815 << std::endl;
1816 } else {
// getGlobalRowView may itself throw; record that in the report instead of
// letting it hide the original problem.
1817 bool getViewThrew = false;
1818 try {
1819 this->getGlobalRowView(gblRow, gblColInds2, vals2);
1820 } catch (std::exception& e) {
1821 getViewThrew = true;
1822 os << "getGlobalRowView threw exception:" << std::endl
1823 << e.what() << std::endl;
1824 }
1825 if (!getViewThrew) {
1826 os << "\tNew global column indices: ";
1827 for (size_t jjj = 0; jjj < gblColInds2.extent(0); jjj++)
1828 os << gblColInds2[jjj] << " ";
1829 os << std::endl;
1830 os << "\tNew values: ";
1831 for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1832 os << vals2[jjj] << " ";
1833 os << std::endl;
1834 }
1835 }
1836 } else if (this->isLocallyIndexed()) {
1837 local_inds_host_view_type lclColInds2;
1838 this->getLocalRowView(rowInfo.localRow, lclColInds2, vals2);
1839 os << "\tNew local column indices: ";
1840 for (size_t jjj = 0; jjj < lclColInds2.extent(0); jjj++)
1841 os << lclColInds2[jjj] << " ";
1842 os << std::endl;
1843 os << "\tNew values: ";
1844 for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1845 os << vals2[jjj] << " ";
1846 os << std::endl;
1847 }
1848 }
1849
1850 os << "Please report this bug to the Tpetra developers.";
1851 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, os.str());
1852 }
1853#endif // HAVE_TPETRA_DEBUG
1854}
1855
// Inserts values at the given global column indices of global row gblRow.
//  - If gblRow is not in the row Map on this process, the entries are handed
//    to insertNonownedGlobalValues.
//  - If gblRow is owned and the graph is static, std::runtime_error is
//    raised.
//  - If gblRow is owned and the graph has a column Map, every index must be
//    in that Map; otherwise std::invalid_argument is raised.
//  - Otherwise the entries are passed to insertGlobalValuesImpl.
// The values/indices size check runs only in HAVE_TPETRA_DEBUG builds.
//
// NOTE(review): the lines carrying the return type, the qualified function
// name and the gblRow parameter are absent from this listing; tfecfFuncName
// identifies the function as insertGlobalValues.
1856template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1859 const Teuchos::ArrayView<const GlobalOrdinal>& indices,
1860 const Teuchos::ArrayView<const Scalar>& values) {
1861 using std::endl;
1862 using Teuchos::toString;
1863 typedef impl_scalar_type IST;
1864 typedef LocalOrdinal LO;
1865 typedef GlobalOrdinal GO;
1866 typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
1867 typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
1868 const char tfecfFuncName[] = "insertGlobalValues: ";
1869
1870#ifdef HAVE_TPETRA_DEBUG
1871 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(values.size() != indices.size(), std::runtime_error,
1872 "values.size() = " << values.size() << " != indices.size() = "
1873 << indices.size() << ".");
1874#endif // HAVE_TPETRA_DEBUG
1875
1876 // getRowMap() is not thread safe, because it increments RCP's
1877 // reference count. getCrsGraphRef() is thread safe.
1878 const map_type& rowMap = *(this->getCrsGraphRef().rowMap_);
1879 const LO lclRow = rowMap.getLocalElement(gblRow);
1880
1881 if (lclRow == OTLO::invalid()) {
1882 // Input row is _not_ owned by the calling process.
1883 //
1884 // See a note (now deleted) from mfh 14 Dec 2012: If input row
1885 // is not in the row Map, it doesn't matter whether or not the
1886 // graph is static; the data just get stashed for later use by
1887 // globalAssemble().
1888 this->insertNonownedGlobalValues(gblRow, indices, values);
1889 } else { // Input row _is_ owned by the calling process
1890 if (this->isStaticGraph()) {
1891 // Uh oh! Not allowed to insert into owned rows in that case.
1892 const int myRank = rowMap.getComm()->getRank();
1893 const int numProcs = rowMap.getComm()->getSize();
1894 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
1895 "The matrix was constructed with a constant (\"static\") graph, "
1896 "yet the given global row index "
1897 << gblRow << " is in the row "
1898 "Map on the calling process (with rank "
1899 << myRank << ", of " << numProcs << " process(es)). In this case, you may not insert "
1900 "new entries into rows owned by the calling process.");
1901 }
1902
// The input values are reinterpreted as impl_scalar_type, the type the
// Impl routine takes.
1903 crs_graph_type& graph = *(this->myGraph_);
1904 const IST* const inputVals =
1905 reinterpret_cast<const IST*>(values.getRawPtr());
1906 const GO* const inputGblColInds = indices.getRawPtr();
1907 const size_t numInputEnt = indices.size();
1908 RowInfo rowInfo = graph.getRowInfo(lclRow);
1909
1910 // If the matrix has a column Map, check at this point whether
1911 // the column indices belong to the column Map.
1912 //
1913 // FIXME (mfh 16 May 2013) We may want to consider deferring the
1914 // test to the CrsGraph method, since it may have to do this
1915 // anyway.
1916 if (!graph.colMap_.is_null()) {
1917 const map_type& colMap = *(graph.colMap_);
1918 // In a debug build, keep track of the nonowned ("bad") column
1919 // indices, so that we can display them in the exception
1920 // message. In a release build, just ditch the loop early if
1921 // we encounter a nonowned column index.
1922#ifdef HAVE_TPETRA_DEBUG
1923 Teuchos::Array<GO> badColInds;
1924#endif // HAVE_TPETRA_DEBUG
1925 const size_type numEntriesToInsert = indices.size();
1926 bool allInColMap = true;
1927 for (size_type k = 0; k < numEntriesToInsert; ++k) {
1928 if (!colMap.isNodeGlobalElement(indices[k])) {
1929 allInColMap = false;
1930#ifdef HAVE_TPETRA_DEBUG
1931 badColInds.push_back(indices[k]);
1932#else
1933 break;
1934#endif // HAVE_TPETRA_DEBUG
1935 }
1936 }
// Build the error text; the debug build lists the offending indices, the
// release build only states that at least one exists.
1937 if (!allInColMap) {
1938 std::ostringstream os;
1939 os << "You attempted to insert entries in owned row " << gblRow
1940 << ", at the following column indices: " << toString(indices)
1941 << "." << endl;
1942#ifdef HAVE_TPETRA_DEBUG
1943 os << "Of those, the following indices are not in the column Map "
1944 "on this process: "
1945 << toString(badColInds) << "." << endl
1946 << "Since the matrix has a column Map already, it is invalid "
1947 "to insert entries at those locations.";
1948#else
1949 os << "At least one of those indices is not in the column Map "
1950 "on this process."
1951 << endl
1952 << "It is invalid to insert into "
1953 "columns not in the column Map on the process that owns the "
1954 "row.";
1955#endif // HAVE_TPETRA_DEBUG
1956 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::invalid_argument, os.str());
1957 }
1958 }
1959
1960 this->insertGlobalValuesImpl(graph, rowInfo, inputGblColInds,
1961 inputVals, numInputEnt);
1962 }
1963}
1964
// Raw-array convenience overload: wraps inds[] and vals[] (numEnt entries
// each) in Teuchos::ArrayView objects and forwards to the ArrayView overload
// of insertGlobalValues.
// NOTE(review): the lines carrying the return type, the qualified function
// name and the globalRow parameter are absent from this listing.
1965template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1968 const LocalOrdinal numEnt,
1969 const Scalar vals[],
1970 const GlobalOrdinal inds[]) {
1971 Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numEnt);
1972 Teuchos::ArrayView<const Scalar> valsT(vals, numEnt);
1973 this->insertGlobalValues(globalRow, indsT, valsT);
1974}
1975
// Like insertGlobalValues, but when the graph has a column Map, input
// entries whose global column index is not in that Map are skipped rather
// than treated as an error. Rows not owned by this process are handed to
// insertNonownedGlobalValues; owned rows of a static graph raise
// std::runtime_error.
//
// For owned rows there are three cases:
//  1. column Map present and graph locally indexed: runs of consecutive
//     in-Map indices are converted to local indices and passed to
//     insertLocalValues;
//  2. column Map present otherwise: runs of consecutive in-Map indices are
//     passed to insertGlobalValuesImpl;
//  3. no column Map: everything is passed to insertGlobalValuesImpl.
// With debug == true, extra size and loop-invariant checks are performed.
//
// NOTE(review): this listing is missing several lines of this function: the
// return type / qualified name, the closing brace of the first `if (debug)`,
// the declaration of numIndInSeq in the first loop, and two argument lines
// of the insertGlobalValuesImpl call in the second loop.
1976template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1979 const GlobalOrdinal gblRow,
1980 const Teuchos::ArrayView<const GlobalOrdinal>& indices,
1981 const Teuchos::ArrayView<const Scalar>& values,
1982 const bool debug) {
1983 typedef impl_scalar_type IST;
1984 typedef LocalOrdinal LO;
1985 typedef GlobalOrdinal GO;
1986 typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
1987 const char tfecfFuncName[] = "insertGlobalValuesFiltered: ";
1988
1989 if (debug) {
1990 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(values.size() != indices.size(), std::runtime_error,
1991 "values.size() = " << values.size() << " != indices.size() = "
1992 << indices.size() << ".");
1994
1995 // getRowMap() is not thread safe, because it increments RCP's
1996 // reference count. getCrsGraphRef() is thread safe.
1997 const map_type& rowMap = *(this->getCrsGraphRef().rowMap_);
1998 const LO lclRow = rowMap.getLocalElement(gblRow);
1999 if (lclRow == OTLO::invalid()) {
2000 // Input row is _not_ owned by the calling process.
2001 //
2002 // See a note (now deleted) from mfh 14 Dec 2012: If input row
2003 // is not in the row Map, it doesn't matter whether or not the
2004 // graph is static; the data just get stashed for later use by
2005 // globalAssemble().
2006 this->insertNonownedGlobalValues(gblRow, indices, values);
2007 } else { // Input row _is_ owned by the calling process
2008 if (this->isStaticGraph()) {
2009 // Uh oh! Not allowed to insert into owned rows in that case.
2010 const int myRank = rowMap.getComm()->getRank();
2011 const int numProcs = rowMap.getComm()->getSize();
2012 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
2013 "The matrix was constructed with a constant (\"static\") graph, "
2014 "yet the given global row index "
2015 << gblRow << " is in the row "
2016 "Map on the calling process (with rank "
2017 << myRank << ", of " << numProcs << " process(es)). In this case, you may not insert "
2018 "new entries into rows owned by the calling process.");
2019 }
2020
2021 crs_graph_type& graph = *(this->myGraph_);
2022 const IST* const inputVals =
2023 reinterpret_cast<const IST*>(values.getRawPtr());
2024 const GO* const inputGblColInds = indices.getRawPtr();
2025 const size_t numInputEnt = indices.size();
2026 RowInfo rowInfo = graph.getRowInfo(lclRow);
2027
2028 if (!graph.colMap_.is_null() && graph.isLocallyIndexed()) {
2029 // This branch is similar in function to the following branch, but for
2030 // the special case that the target graph is locally indexed.
2031 // In this case, we cannot simply filter
2032 // out global indices that don't exist on the receiving process and
2033 // insert the remaining (global) indices, but we must convert them (the
2034 // remaining global indices) to local and call `insertLocalValues`.
2035 const map_type& colMap = *(graph.colMap_);
2036 size_t curOffset = 0;
2037 while (curOffset < numInputEnt) {
2038 // Find a sequence of input indices that are in the column Map on the
2039 // calling process. Doing a sequence at a time, instead of one at a
2040 // time, amortizes some overhead.
2041 Teuchos::Array<LO> lclIndices;
2042 size_t endOffset = curOffset;
2043 for (; endOffset < numInputEnt; ++endOffset) {
2044 auto lclIndex = colMap.getLocalElement(inputGblColInds[endOffset]);
2045 if (lclIndex != OTLO::invalid())
2046 lclIndices.push_back(lclIndex);
2047 else
2048 break;
2049 }
2050 // curOffset, endOffset: half-exclusive range of indices in the column
2051 // Map on the calling process. If endOffset == curOffset, the range is
2052 // empty.
2054 if (numIndInSeq != 0) {
2055 this->insertLocalValues(lclRow, lclIndices(), values(curOffset, numIndInSeq));
2056 }
2057 // Invariant before the increment line: Either endOffset ==
2058 // numInputEnt, or inputGblColInds[endOffset] is not in the column Map
2059 // on the calling process.
2060 if (debug) {
2061 const bool invariant = endOffset == numInputEnt ||
2062 colMap.getLocalElement(inputGblColInds[endOffset]) == OTLO::invalid();
2063 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!invariant, std::logic_error, std::endl
2064 << "Invariant failed!");
2065 }
// Step past the index that ended the run (it is not in the column Map), or
// past the end of the input, which terminates the while loop.
2066 curOffset = endOffset + 1;
2067 }
2068 } else if (!graph.colMap_.is_null()) { // We have a column Map.
2069 const map_type& colMap = *(graph.colMap_);
2070 size_t curOffset = 0;
2071 while (curOffset < numInputEnt) {
2072 // Find a sequence of input indices that are in the column
2073 // Map on the calling process. Doing a sequence at a time,
2074 // instead of one at a time, amortizes some overhead.
2075 size_t endOffset = curOffset;
2076 for (; endOffset < numInputEnt &&
2077 colMap.getLocalElement(inputGblColInds[endOffset]) != OTLO::invalid();
2078 ++endOffset) {
2079 }
2080 // curOffset, endOffset: half-exclusive range of indices in
2081 // the column Map on the calling process. If endOffset ==
2082 // curOffset, the range is empty.
2083 const LO numIndInSeq = (endOffset - curOffset);
2084 if (numIndInSeq != 0) {
2085 rowInfo = graph.getRowInfo(lclRow); // KDD 5/19 Need fresh RowInfo in each loop iteration
2086 this->insertGlobalValuesImpl(graph, rowInfo,
2089 numIndInSeq);
2090 }
2091 // Invariant before the increment line: Either endOffset ==
2092 // numInputEnt, or inputGblColInds[endOffset] is not in the
2093 // column Map on the calling process.
2094 if (debug) {
2095 const bool invariant = endOffset == numInputEnt ||
2096 colMap.getLocalElement(inputGblColInds[endOffset]) == OTLO::invalid();
2097 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!invariant, std::logic_error, std::endl
2098 << "Invariant failed!");
2099 }
// Same skip-one-and-continue step as in the locally indexed branch above.
2100 curOffset = endOffset + 1;
2101 }
2102 } else { // we don't have a column Map.
2103 this->insertGlobalValuesImpl(graph, rowInfo, inputGblColInds,
2104 inputVals, numInputEnt);
2105 }
2106 }
2107}
2108
// Calls insertGlobalValuesFiltered(gblRow, indices, values, debug) and, if
// that call throws a std::exception, rethrows as std::runtime_error with
// extra context.
//
// prefix  - C string placed at the start of the verbose message; a null
//           pointer is treated as an empty prefix.
// debug   - forwarded unchanged to insertGlobalValuesFiltered.
// verbose - when true, the message also lists the global row index, the
//           column indices and the values (via verbosePrintArray).
//
// NOTE(review): two lines of this function are absent from this listing: one
// before `using std::endl;` and the right-hand side of the maxNumToPrint
// initialization.
2109template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2110void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2111 insertGlobalValuesFilteredChecked(
2112 const GlobalOrdinal gblRow,
2113 const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2114 const Teuchos::ArrayView<const Scalar>& values,
2115 const char* const prefix,
2116 const bool debug,
2117 const bool verbose) {
2119 using std::endl;
2120
2121 try {
2122 insertGlobalValuesFiltered(gblRow, indices, values, debug);
2123 } catch (const std::exception& e) {
2124 std::ostringstream os;
2125 if (verbose) {
// Stream the whole prefix string. Writing `*prefix` would emit only its
// first character, and a null prefix must not be dereferenced or streamed.
2126 const size_t maxNumToPrint =
2128 os << (prefix != nullptr ? prefix : "") << ": insertGlobalValuesFiltered threw an "
2129 "exception: "
2130 << e.what() << endl
2131 << "Global row index: " << gblRow << endl;
2132 verbosePrintArray(os, indices, "Global column indices",
2133 maxNumToPrint);
2134 os << endl;
2135 verbosePrintArray(os, values, "Values", maxNumToPrint);
2136 os << endl;
2137 } else {
2138 os << ": insertGlobalValuesFiltered threw an exception: "
2139 << e.what();
2140 }
2141 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, os.str());
2142 }
2143}
2144
// Overwrites existing entries of one row. For each of the numElts input
// local column indices that is found in the row's stored indices, the
// matching slot of rowVals is set to the corresponding newVals entry.
//
// Returns the number of input indices that were found. If the graph is
// globally indexed but has no column Map, returns
// Teuchos::OrdinalTraits<LO>::invalid(). If the graph is neither locally nor
// globally indexed, nothing is replaced and 0 is returned.
//
// NOTE(review): this listing is missing the lines that carry the qualified
// function name and the rowVals parameter, the trailing arguments of both
// findRelOffset calls, and one closing brace.
2145template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2146LocalOrdinal
2149 const crs_graph_type& graph,
2150 const RowInfo& rowInfo,
2151 const LocalOrdinal inds[],
2152 const impl_scalar_type newVals[],
2153 const LocalOrdinal numElts) {
2154 typedef LocalOrdinal LO;
2155 typedef GlobalOrdinal GO;
2156 const bool sorted = graph.isSorted();
2157
2158 size_t hint = 0; // Guess for the current index k into rowVals
2159 LO numValid = 0; // number of valid local column indices
2160
2161 if (graph.isLocallyIndexed()) {
2162 // Get a view of the column indices in the row. This amortizes
2163 // the cost of getting the view over all the entries of inds.
2164 auto colInds = graph.getLocalIndsViewHost(rowInfo);
2165
2166 for (LO j = 0; j < numElts; ++j) {
2167 const LO lclColInd = inds[j];
2168 const size_t offset =
2169 KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
// An offset equal to rowInfo.numEntries means "not found".
2171 if (offset != rowInfo.numEntries) {
2172 rowVals[offset] = newVals[j];
2173 hint = offset + 1;
2174 ++numValid;
2175 }
2177 } else if (graph.isGloballyIndexed()) {
2178 if (graph.colMap_.is_null()) {
2179 return Teuchos::OrdinalTraits<LO>::invalid();
2180 }
// Bind by reference: only read access to the column Map is needed here, so
// copying the Map object is unnecessary.
2181 const map_type& colMap = *(graph.colMap_);
2182
2183 // Get a view of the column indices in the row. This amortizes
2184 // the cost of getting the view over all the entries of inds.
2185 auto colInds = graph.getGlobalIndsViewHost(rowInfo);
2186
2187 for (LO j = 0; j < numElts; ++j) {
// The row stores global indices, so translate each input local index first.
2188 const GO gblColInd = colMap.getGlobalElement(inds[j]);
2189 if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid()) {
2190 const size_t offset =
2191 KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2193 if (offset != rowInfo.numEntries) {
2194 rowVals[offset] = newVals[j];
2195 hint = offset + 1;
2196 ++numValid;
2197 }
2198 }
2199 }
2200 }
2201 // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2202 // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2203 // to be neither locally nor globally indexed on a process.
2204 // This means that the graph or matrix has no entries on that
2205 // process. Epetra also works like this. It's related to lazy
2206 // allocation (on first insertion, not at graph / matrix
2207 // construction). Lazy allocation will go away because it is
2208 // not thread scalable.
2209
2210 return numValid;
2211}
2212
// ArrayView convenience overload of replaceLocalValues. Returns
// Teuchos::OrdinalTraits<LO>::invalid() when lclCols and vals differ in
// size; otherwise forwards the raw pointers and the entry count to another
// replaceLocalValues overload and returns its result.
// NOTE(review): the lines carrying the return type and class qualifier, and
// the final argument line of the forwarding call, are absent from this
// listing.
2213template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2216 replaceLocalValues(const LocalOrdinal localRow,
2217 const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
2218 const Teuchos::ArrayView<const Scalar>& vals) {
2219 typedef LocalOrdinal LO;
2220
2221 const LO numInputEnt = static_cast<LO>(lclCols.size());
2222 if (static_cast<LO>(vals.size()) != numInputEnt) {
2223 return Teuchos::OrdinalTraits<LO>::invalid();
2224 }
2225 const LO* const inputInds = lclCols.getRawPtr();
2226 const Scalar* const inputVals = vals.getRawPtr();
2227 return this->replaceLocalValues(localRow, numInputEnt,
2229}
2230
// Kokkos::View convenience overload of replaceLocalValues. Returns
// Teuchos::OrdinalTraits<LO>::invalid() when inputInds and inputVals differ
// in extent; otherwise reinterprets the impl_scalar_type values as Scalar
// and forwards to the raw-array overload, returning its result.
// NOTE(review): the lines carrying the qualified function name and the
// closing brace of this function are absent from this listing.
2231template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2232typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2233 local_ordinal_type
2236 const local_ordinal_type localRow,
2237 const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2238 const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals) {
2239 using LO = local_ordinal_type;
// extent(0) is converted explicitly to LO rather than narrowed implicitly,
// matching the cast already applied to inputVals.extent(0) below.
2240 const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2241 if (numInputEnt != static_cast<LO>(inputVals.extent(0))) {
2242 return Teuchos::OrdinalTraits<LO>::invalid();
2243 }
2244 const Scalar* const inVals =
2245 reinterpret_cast<const Scalar*>(inputVals.data());
2246 return this->replaceLocalValues(localRow, numInputEnt,
2247 inVals, inputInds.data());
2249
// Raw-array overload of replaceLocalValues: replaces values in local row
// localRow at the given local column indices by delegating to
// replaceLocalValuesImpl.
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if fill is not active or
// staticGraph_ is null, 0 if getRowInfo reports an invalid local row, and
// otherwise whatever replaceLocalValuesImpl returns.
// NOTE(review): the lines carrying the return type and class qualifier, and
// the inputVals parameter declaration, are absent from this listing.
2250template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2253 replaceLocalValues(const LocalOrdinal localRow,
2254 const LocalOrdinal numEnt,
2256 const LocalOrdinal inputCols[]) {
2257 typedef impl_scalar_type IST;
2258 typedef LocalOrdinal LO;
2259
2260 if (!this->isFillActive() || this->staticGraph_.is_null()) {
2261 // Fill must be active and the "nonconst" graph must exist.
2262 return Teuchos::OrdinalTraits<LO>::invalid();
2263 }
2264 const crs_graph_type& graph = *(this->staticGraph_);
2265 const RowInfo rowInfo = graph.getRowInfo(localRow);
2266
2267 if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid()) {
2268 // The calling process does not own this row, so it is not
2269 // allowed to modify its values.
2270 return static_cast<LO>(0);
2271 }
// The input values are reinterpreted as impl_scalar_type, the type the Impl
// routine takes.
2272 auto curRowVals = this->getValuesViewHostNonConst(rowInfo);
2273 const IST* const inVals = reinterpret_cast<const IST*>(inputVals);
2274 return this->replaceLocalValuesImpl(curRowVals.data(), graph, rowInfo,
2275 inputCols, inVals, numEnt);
2276}
2277
// Overwrites existing entries of one row, addressed by global column index.
// The lookup is delegated to graph.findGlobalIndices; for each match it
// invokes the callback, which stores newVals[k] at rowVals[offset]. The
// return value is whatever findGlobalIndices returns.
// NOTE(review): the lines carrying the qualified function name and the
// rowVals parameter are absent from this listing.
2278template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2279LocalOrdinal
2282 const crs_graph_type& graph,
2283 const RowInfo& rowInfo,
2284 const GlobalOrdinal inds[],
2285 const impl_scalar_type newVals[],
2286 const LocalOrdinal numElts) {
2287 Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
2288 auto fun =
2289 [&](size_t const k, size_t const /*start*/, size_t const offset) {
2290 rowVals[offset] = newVals[k];
2291 };
2292 std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2293 return graph.findGlobalIndices(rowInfo, indsT, cb);
2294}
2295
// ArrayView convenience overload of replaceGlobalValues. Returns
// Teuchos::OrdinalTraits<LO>::invalid() when the two views differ in size;
// otherwise forwards the raw pointers and the entry count to the raw-array
// overload and returns its result.
// NOTE(review): the lines carrying the return type, the qualified function
// name and the globalRow parameter are absent from this listing.
2296template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2300 const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2301 const Teuchos::ArrayView<const Scalar>& inputVals) {
2302 typedef LocalOrdinal LO;
2303
2304 const LO numInputEnt = static_cast<LO>(inputGblColInds.size());
2305 if (static_cast<LO>(inputVals.size()) != numInputEnt) {
2306 return Teuchos::OrdinalTraits<LO>::invalid();
2307 }
2308 return this->replaceGlobalValues(globalRow, numInputEnt,
2309 inputVals.getRawPtr(),
2310 inputGblColInds.getRawPtr());
2311}
2312
// Raw-array overload of replaceGlobalValues: replaces values in the row with
// global index globalRow by delegating to replaceGlobalValuesImpl.
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if fill is not active or
// staticGraph_ is null, 0 if the row lookup yields an invalid local row, and
// otherwise whatever replaceGlobalValuesImpl returns.
// NOTE(review): this listing is missing the lines that carry the return
// type, the qualified function name, the globalRow and column-index
// parameters, the closing braces of both `if` statements, and the trailing
// arguments of the final call.
2313template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2317 const LocalOrdinal numEnt,
2318 const Scalar inputVals[],
2320 typedef impl_scalar_type IST;
2321 typedef LocalOrdinal LO;
2322
2323 if (!this->isFillActive() || this->staticGraph_.is_null()) {
2324 // Fill must be active and the "nonconst" graph must exist.
2325 return Teuchos::OrdinalTraits<LO>::invalid();
2327 const crs_graph_type& graph = *(this->staticGraph_);
2328
2329 const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex(globalRow);
2330 if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid()) {
2331 // The input row is invalid on the calling process, so no
2332 // entries are replaced and 0 is returned.
2333 return static_cast<LO>(0);
2335
2336 auto curRowVals = this->getValuesViewHostNonConst(rowInfo);
2337 const IST* const inVals = reinterpret_cast<const IST*>(inputVals);
2338 return this->replaceGlobalValuesImpl(curRowVals.data(), graph, rowInfo,
2340}
2341
// Kokkos::View convenience overload of replaceGlobalValues. Returns
// Teuchos::OrdinalTraits<LO>::invalid() when inputInds and inputVals differ
// in extent; otherwise reinterprets the impl_scalar_type values as Scalar
// and forwards to the raw-array overload, returning its result.
// NOTE(review): this listing is missing the lines that carry the class
// qualifier, the function name, the globalRow parameter, and the line
// between the comment block and the numInputEnt declaration (presumably the
// LO alias — confirm against the original file).
2342template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2344 local_ordinal_type
2348 const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2349 const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals) {
2350 // We use static_assert here to check the template parameters,
2351 // rather than std::enable_if (e.g., on the return value, to
2352 // enable compilation only if the template parameters match the
2353 // desired attributes). This turns obscure link errors into
2354 // clear compilation errors. It also makes the return value a
2355 // lot easier to see.
2357 const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2358 if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2359 return Teuchos::OrdinalTraits<LO>::invalid();
2360 }
2361 const Scalar* const inVals =
2362 reinterpret_cast<const Scalar*>(inputVals.data());
2363 return this->replaceGlobalValues(globalRow, numInputEnt, inVals,
2364 inputInds.data());
2365}
2366
// Adds newVals[j] to the existing entry of one row for each of the numElts
// input global column indices that is found in the row. When `atomic` is
// true the update uses Kokkos::atomic_add; otherwise it is a plain `+=`.
//
// Returns the number of input indices that were found. Returns 0 when the
// graph is locally indexed but has no column Map, or when the graph is
// neither locally nor globally indexed.
//
// NOTE(review): this listing is missing the lines that carry the return
// type, the qualified function name, the rowVals and inds parameters, and
// the trailing arguments of the first findRelOffset call.
2367template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2371 const crs_graph_type& graph,
2372 const RowInfo& rowInfo,
2374 const impl_scalar_type newVals[],
2375 const LocalOrdinal numElts,
2376 const bool atomic) {
2377 typedef LocalOrdinal LO;
2378 typedef GlobalOrdinal GO;
2379
2380 const bool sorted = graph.isSorted();
2381
2382 size_t hint = 0; // guess at the index's relative offset in the row
2383 LO numValid = 0; // number of valid input column indices
2384
2385 if (graph.isLocallyIndexed()) {
2386 // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2387 // pointer does NOT change its reference count. Thus, this
2388 // code is still thread safe.
2389 if (graph.colMap_.is_null()) {
2390 // NO input column indices are valid in this case, since if
2391 // the column Map is null on the calling process, then the
2392 // calling process owns no graph entries.
2393 return numValid;
2394 }
2395 const map_type& colMap = *(graph.colMap_);
2396
2397 // Get a view of the column indices in the row. This amortizes
2398 // the cost of getting the view over all the entries of inds.
2399 auto colInds = graph.getLocalIndsViewHost(rowInfo);
2400 const LO LINV = Teuchos::OrdinalTraits<LO>::invalid();
2401
2402 for (LO j = 0; j < numElts; ++j) {
// The row stores local indices, so translate each input global index first;
// indices not in the column Map are skipped.
2403 const LO lclColInd = colMap.getLocalElement(inds[j]);
2404 if (lclColInd != LINV) {
2405 const size_t offset =
2406 KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
// An offset equal to rowInfo.numEntries means "not found".
2408 if (offset != rowInfo.numEntries) {
2409 if (atomic) {
2410 Kokkos::atomic_add(&rowVals[offset], newVals[j]);
2411 } else {
2412 rowVals[offset] += newVals[j];
2413 }
2414 hint = offset + 1;
2415 numValid++;
2416 }
2417 }
2418 }
2419 } else if (graph.isGloballyIndexed()) {
2420 // Get a view of the column indices in the row. This amortizes
2421 // the cost of getting the view over all the entries of inds.
2422 auto colInds = graph.getGlobalIndsViewHost(rowInfo);
2423
2424 for (LO j = 0; j < numElts; ++j) {
2425 const GO gblColInd = inds[j];
2426 const size_t offset =
2427 KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2428 gblColInd, hint, sorted);
2429 if (offset != rowInfo.numEntries) {
2430 if (atomic) {
2431 Kokkos::atomic_add(&rowVals[offset], newVals[j]);
2432 } else {
2433 rowVals[offset] += newVals[j];
2434 }
2435 hint = offset + 1;
2436 numValid++;
2437 }
2438 }
2439 }
2440 // If the graph is neither locally nor globally indexed on the
2441 // calling process, that means the calling process has no graph
2442 // entries. Thus, none of the input column indices are valid.
2443
2444 return numValid;
2445}
2446
// ArrayView convenience overload of sumIntoGlobalValues. Returns
// Teuchos::OrdinalTraits<LO>::invalid() when the two views differ in size;
// otherwise forwards the raw pointers, the entry count and the `atomic` flag
// to the raw-array overload and returns its result.
// NOTE(review): the lines carrying the qualified function name and the
// gblRow parameter are absent from this listing.
2447template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2448LocalOrdinal
2451 const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2452 const Teuchos::ArrayView<const Scalar>& inputVals,
2453 const bool atomic) {
2454 typedef LocalOrdinal LO;
2455
2456 const LO numInputEnt = static_cast<LO>(inputGblColInds.size());
2457 if (static_cast<LO>(inputVals.size()) != numInputEnt) {
2458 return Teuchos::OrdinalTraits<LO>::invalid();
2459 }
2460 return this->sumIntoGlobalValues(gblRow, numInputEnt,
2461 inputVals.getRawPtr(),
2462 inputGblColInds.getRawPtr(),
2463 atomic);
2464}
2465
// Raw-array overload of sumIntoGlobalValues.
//  - Returns Teuchos::OrdinalTraits<LO>::invalid() if fill is not active or
//    staticGraph_ is null.
//  - If gblRow is not in the row Map on this process, the entries are handed
//    to insertNonownedGlobalValues and numInputEnt is returned.
//  - Otherwise the sum is performed by sumIntoGlobalValuesImpl and its
//    result is returned.
// NOTE(review): this listing is missing the lines that carry the qualified
// function name, the gblRow, numInputEnt and column-index parameters, the
// opening lines of both ArrayView declarations, the trailing argument of the
// insertNonownedGlobalValues call, and one argument line of the
// sumIntoGlobalValuesImpl call.
2466template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2467LocalOrdinal
2471 const Scalar inputVals[],
2473 const bool atomic) {
2474 typedef impl_scalar_type IST;
2475 typedef LocalOrdinal LO;
2476 typedef GlobalOrdinal GO;
2477
2478 if (!this->isFillActive() || this->staticGraph_.is_null()) {
2479 // Fill must be active and the "nonconst" graph must exist.
2480 return Teuchos::OrdinalTraits<LO>::invalid();
2481 }
2482 const crs_graph_type& graph = *(this->staticGraph_);
2483
2484 const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex(gblRow);
2485 if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid()) {
2486 // mfh 23 Mar 2017, 26 Jul 2017: This branch may not be
2487 // thread safe in a debug build, in part because it uses
2488 // Teuchos::ArrayView, and in part because of the data structure
2489 // used to stash outgoing entries.
2490 using Teuchos::ArrayView;
2493 numInputEnt);
2495 numInputEnt == 0 ? nullptr : inputVals, numInputEnt);
2496 // gblRow is not in the row Map on the calling process, so stash
2497 // the given entries away in a separate data structure.
2498 // globalAssemble() (called during fillComplete()) will exchange
2499 // that data and sum it in using sumIntoGlobalValues().
2500 this->insertNonownedGlobalValues(gblRow, inputGblColInds_av,
2502 // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
2503 // since we won't know whether the given indices were valid
2504 // until globalAssemble (called in fillComplete) is called.
2505 // That's why insertNonownedGlobalValues doesn't return
2506 // anything. Just for consistency, I'll return the number of
2507 // entries that the user gave us.
2508 return numInputEnt;
2509 } else { // input row is in the row Map on the calling process
2510 auto curRowVals = this->getValuesViewHostNonConst(rowInfo);
2511 const IST* const inVals = reinterpret_cast<const IST*>(inputVals);
2512 return this->sumIntoGlobalValuesImpl(curRowVals.data(), graph, rowInfo,
2514 numInputEnt, atomic);
2515 }
2516}
2517
// Row-index entry point for transformLocalValues: looks up the row for
// local index lclRow and forwards the inputs, the binary function f and the
// atomic flag to the raw-array transformLocalValues overload.
//
// Returns:
//  - Teuchos::OrdinalTraits<LO>::invalid() if fill is not active or
//    staticGraph_ is null;
//  - 0 if getRowInfo() reports an invalid local row (row not owned here);
//  - otherwise the result of the raw-array overload.
//
// NOTE(review): the numInputEnt / inputVals parameters are declared on
// listing lines that are not visible here.
2518template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2519LocalOrdinal
2520CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2521 transformLocalValues(const LocalOrdinal lclRow,
2524 const LocalOrdinal inputCols[],
2525 std::function<impl_scalar_type(const impl_scalar_type&, const impl_scalar_type&)> f,
2526 const bool atomic) {
2527 using Tpetra::Details::OrdinalTraits;
2528 typedef LocalOrdinal LO;
2529
2530 if (!this->isFillActive() || this->staticGraph_.is_null()) {
2531 // Fill must be active and the "nonconst" graph must exist.
2532 return Teuchos::OrdinalTraits<LO>::invalid();
2533 }
2534 const crs_graph_type& graph = *(this->staticGraph_);
2535 const RowInfo rowInfo = graph.getRowInfo(lclRow);
2536
2537 if (rowInfo.localRow == OrdinalTraits<size_t>::invalid()) {
2538 // The calling process does not own this row, so it is not
2539 // allowed to modify its values.
2540 return static_cast<LO>(0);
2541 }
// Host view of the row's current values; its raw pointer is what the
// raw-array overload receives as the values to modify.
2542 auto curRowVals = this->getValuesViewHostNonConst(rowInfo);
2543 return this->transformLocalValues(curRowVals.data(), graph,
2545 numInputEnt, f, atomic);
2546}
2547
// Row-index entry point for the global-index transform: looks up the row for
// global index gblRow and forwards the inputs, the binary function f and the
// atomic flag to the raw-array transformGlobalValues overload.
// NOTE(review): the return type, method name and leading parameters are on
// listing lines that are not visible here; the name is inferred from the
// overload this block calls -- confirm.
//
// Returns 0 if getRowInfoFromGlobalRowIndex() reports an invalid local row
// (row not owned here); otherwise the result of the raw-array overload.
2548template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2553 const impl_scalar_type inputVals[],
2554 const GlobalOrdinal inputCols[],
2555 std::function<impl_scalar_type(const impl_scalar_type&, const impl_scalar_type&)> f,
2556 const bool atomic) {
2557 using Tpetra::Details::OrdinalTraits;
2558 typedef LocalOrdinal LO;
2559
2560 if (!this->isFillActive() || this->staticGraph_.is_null()) {
2561 // Fill must be active and the "nonconst" graph must exist.
// NOTE(review): the statement inside this branch (listing line 2562) is not
// visible here; sibling overloads return OrdinalTraits<LO>::invalid().
2563 }
2564 const crs_graph_type& graph = *(this->staticGraph_);
2565 const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex(gblRow);
2566
2567 if (rowInfo.localRow == OrdinalTraits<size_t>::invalid()) {
2568 // The calling process does not own this row, so it is not
2569 // allowed to modify its values.
2570 return static_cast<LO>(0);
2571 }
2572 auto curRowVals = this->getValuesViewHostNonConst(rowInfo);
2573 return this->transformGlobalValues(curRowVals.data(), graph,
2574 rowInfo, inputCols, inputVals,
2575 numInputEnt, f, atomic);
2576}
2577
// Raw-array worker behind transformLocalValues: walks numElts input local
// column indices and, for each one found in the row described by rowInfo,
// updates the matching slot of rowVals.
//
// Returns numValid, the number of input indices that were found in the row.
// That count is 0 when the graph is globally indexed but has no column Map,
// and when the graph is neither locally nor globally indexed.
//
// NOTE(review): several continuation lines of this listing are not visible
// (the newVals parameter, the trailing findRelOffset arguments, and the
// statements that perform the update), so the comments here describe only
// what is shown.
2578template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2579LocalOrdinal
2580CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2581 transformLocalValues(impl_scalar_type rowVals[],
2582 const crs_graph_type& graph,
2583 const RowInfo& rowInfo,
2584 const LocalOrdinal inds[],
2586 const LocalOrdinal numElts,
2587 std::function<impl_scalar_type(const impl_scalar_type&, const impl_scalar_type&)> f,
2588 const bool atomic) {
2589 typedef impl_scalar_type ST;
2590 typedef LocalOrdinal LO;
2591 typedef GlobalOrdinal GO;
2592
2593 // if (newVals.extent (0) != inds.extent (0)) {
2594 // The sizes of the input arrays must match.
2595 // return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2596 // }
2597 // const LO numElts = static_cast<LO> (inds.extent (0));
2598 const bool sorted = graph.isSorted();
2599
2600 LO numValid = 0; // number of valid input column indices
2601 size_t hint = 0; // Guess for the current index k into rowVals
2602
2603 if (graph.isLocallyIndexed()) {
2604 // Get a view of the column indices in the row. This amortizes
2605 // the cost of getting the view over all the entries of inds.
2606 auto colInds = graph.getLocalIndsViewHost(rowInfo);
2607
2608 for (LO j = 0; j < numElts; ++j) {
2609 const LO lclColInd = inds[j];
2610 const size_t offset =
2611 KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
// An offset equal to rowInfo.numEntries is treated as "not found in this
// row"; such inputs are skipped and not counted.
2613 if (offset != rowInfo.numEntries) {
2614 if (atomic) {
2615 // NOTE (mfh 30 Nov 2015) The commented-out code is
2616 // wrong because another thread may have changed
2617 // rowVals[offset] between those two lines of code.
2618 //
2619 // const ST newVal = f (rowVals[offset], newVals[j]);
2620 // Kokkos::atomic_assign (&rowVals[offset], newVal);
2621
2622 ST* const dest = &rowVals[offset];
// NOTE(review): the statement that performs the atomic update through
// dest (listing line 2623) is not visible here.
2624 } else {
2625 // use binary function f
2627 }
2628 hint = offset + 1;
2629 ++numValid;
2630 }
2631 }
2632 } else if (graph.isGloballyIndexed()) {
2633 // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2634 // pointer does NOT change its reference count. Thus, this
2635 // code is still thread safe.
2636 if (graph.colMap_.is_null()) {
2637 // NO input column indices are valid in this case. Either
2638 // the column Map hasn't been set yet (so local indices
2639 // don't exist yet), or the calling process owns no graph
2640 // entries.
2641 return numValid;
2642 }
2643 const map_type& colMap = *(graph.colMap_);
2644 // Get a view of the column indices in the row. This amortizes
2645 // the cost of getting the view over all the entries of inds.
2646 auto colInds = graph.getGlobalIndsViewHost(rowInfo);
2647
// Each input local index is translated to a global index through the
// column Map; inputs that translate to GINV are skipped.
2648 const GO GINV = Teuchos::OrdinalTraits<GO>::invalid();
2649 for (LO j = 0; j < numElts; ++j) {
2650 const GO gblColInd = colMap.getGlobalElement(inds[j]);
2651 if (gblColInd != GINV) {
2652 const size_t offset =
2653 KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2655 if (offset != rowInfo.numEntries) {
2656 if (atomic) {
2657 // NOTE (mfh 30 Nov 2015) The commented-out code is
2658 // wrong because another thread may have changed
2659 // rowVals[offset] between those two lines of code.
2661 // const ST newVal = f (rowVals[offset], newVals[j]);
2662 // Kokkos::atomic_assign (&rowVals[offset], newVal);
2663
2664 ST* const dest = &rowVals[offset];
2666 } else {
2667 // use binary function f
2669 }
2670 hint = offset + 1;
2671 numValid++;
2672 }
2673 }
2674 }
2675 }
2676 // If the graph is neither locally nor globally indexed on the
2677 // calling process, that means the calling process has no graph
2678 // entries. Thus, none of the input column indices are valid.
2679
2680 return numValid;
2681}
2682
// Raw-array worker behind transformGlobalValues: walks numElts input global
// column indices and, for each one found in the row described by rowInfo,
// updates the matching slot of rowVals.
//
// Returns numValid, the number of input indices that were found in the row.
// That count is 0 when the graph is locally indexed but has no column Map,
// and when the graph is neither locally nor globally indexed.
//
// NOTE(review): several continuation lines of this listing are not visible
// (the trailing findRelOffset arguments and the statements that perform the
// update), so the comments here describe only what is shown.
2683template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2684LocalOrdinal
2685CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2686 transformGlobalValues(impl_scalar_type rowVals[],
2687 const crs_graph_type& graph,
2688 const RowInfo& rowInfo,
2689 const GlobalOrdinal inds[],
2690 const impl_scalar_type newVals[],
2691 const LocalOrdinal numElts,
2692 std::function<impl_scalar_type(const impl_scalar_type&, const impl_scalar_type&)> f,
2693 const bool atomic) {
2694 typedef impl_scalar_type ST;
2695 typedef LocalOrdinal LO;
2696 typedef GlobalOrdinal GO;
2697
2698 // if (newVals.extent (0) != inds.extent (0)) {
2699 // The sizes of the input arrays must match.
2700 // return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2701 // }
2702 // const LO numElts = static_cast<LO> (inds.extent (0));
2703 const bool sorted = graph.isSorted();
2704
2705 LO numValid = 0; // number of valid input column indices
2706 size_t hint = 0; // Guess for the current index k into rowVals
2707
2708 if (graph.isGloballyIndexed()) {
2709 // Get a view of the column indices in the row. This amortizes
2710 // the cost of getting the view over all the entries of inds.
2711 auto colInds = graph.getGlobalIndsViewHost(rowInfo);
2712
2713 for (LO j = 0; j < numElts; ++j) {
2714 const GO gblColInd = inds[j];
2715 const size_t offset =
2716 KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
// An offset equal to rowInfo.numEntries is treated as "not found in this
// row"; such inputs are skipped and not counted.
2718 if (offset != rowInfo.numEntries) {
2719 if (atomic) {
2720 // NOTE (mfh 30 Nov 2015) The commented-out code is
2721 // wrong because another thread may have changed
2722 // rowVals[offset] between those two lines of code.
2723 //
2724 // const ST newVal = f (rowVals[offset], newVals[j]);
2725 // Kokkos::atomic_assign (&rowVals[offset], newVal);
2726
2727 ST* const dest = &rowVals[offset];
// NOTE(review): the statement that performs the atomic update through
// dest (listing line 2728) is not visible here.
2729 } else {
2730 // use binary function f
2732 }
2733 hint = offset + 1;
2734 ++numValid;
2735 }
2736 }
2737 } else if (graph.isLocallyIndexed()) {
2738 // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2739 // pointer does NOT change its reference count. Thus, this
2740 // code is still thread safe.
2741 if (graph.colMap_.is_null()) {
2742 // NO input column indices are valid in this case. Either the
2743 // column Map hasn't been set yet (so local indices don't
2744 // exist yet), or the calling process owns no graph entries.
2745 return numValid;
2746 }
2747 const map_type& colMap = *(graph.colMap_);
2748 // Get a view of the column indices in the row. This amortizes
2749 // the cost of getting the view over all the entries of inds.
2750 auto colInds = graph.getLocalIndsViewHost(rowInfo);
2751
// Each input global index is translated to a local index through the
// column Map; inputs that translate to LINV are skipped.
2752 const LO LINV = Teuchos::OrdinalTraits<LO>::invalid();
2753 for (LO j = 0; j < numElts; ++j) {
2754 const LO lclColInd = colMap.getLocalElement(inds[j]);
2755 if (lclColInd != LINV) {
2756 const size_t offset =
2757 KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2759 if (offset != rowInfo.numEntries) {
2760 if (atomic) {
2761 // NOTE (mfh 30 Nov 2015) The commented-out code is
2762 // wrong because another thread may have changed
2763 // rowVals[offset] between those two lines of code.
2765 // const ST newVal = f (rowVals[offset], newVals[j]);
2766 // Kokkos::atomic_assign (&rowVals[offset], newVal);
2767
2768 ST* const dest = &rowVals[offset];
2770 } else {
2771 // use binary function f
2773 }
2774 hint = offset + 1;
2775 numValid++;
2776 }
2777 }
2778 }
2779 }
2780 // If the graph is neither locally nor globally indexed on the
2781 // calling process, that means the calling process has no graph
2782 // entries. Thus, none of the input column indices are valid.
2783
2784 return numValid;
2785}
2786
// Raw-array worker that adds newVals[j] into the slot of rowVals matching
// local column index inds[j], for each of the numElts inputs that is found
// in the row described by rowInfo. With atomic == true the addition uses
// Kokkos::atomic_add; otherwise a plain +=.
// NOTE(review): the method name and first parameter are on listing lines that
// are not visible here; the name is inferred from the sumIntoLocalValuesImpl
// call made by the raw-array sumIntoLocalValues overload -- confirm.
//
// Returns numValid, the number of inputs found in the row, except that a
// globally indexed graph with a null column Map yields
// Teuchos::OrdinalTraits<LO>::invalid().
// NOTE(review): the transform* workers return numValid (0) in that same
// situation; confirm the difference is intended.
2787template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2788LocalOrdinal
2791 const crs_graph_type& graph,
2792 const RowInfo& rowInfo,
2793 const LocalOrdinal inds[],
2794 const impl_scalar_type newVals[],
2795 const LocalOrdinal numElts,
2796 const bool atomic) {
2797 typedef LocalOrdinal LO;
2798 typedef GlobalOrdinal GO;
2799
2800 const bool sorted = graph.isSorted();
2801
2802 size_t hint = 0; // Guess for the current index k into rowVals
2803 LO numValid = 0; // number of valid local column indices
2804
2805 if (graph.isLocallyIndexed()) {
2806 // Get a view of the column indices in the row. This amortizes
2807 // the cost of getting the view over all the entries of inds.
2808 auto colInds = graph.getLocalIndsViewHost(rowInfo);
2809
2810 for (LO j = 0; j < numElts; ++j) {
2811 const LO lclColInd = inds[j];
2812 const size_t offset =
2813 KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
// An offset equal to rowInfo.numEntries is treated as "not found in this
// row"; such inputs are skipped and not counted.
2815 if (offset != rowInfo.numEntries) {
2816 if (atomic) {
2817 Kokkos::atomic_add(&rowVals[offset], newVals[j]);
2818 } else {
2819 rowVals[offset] += newVals[j];
2820 }
2821 hint = offset + 1;
2822 ++numValid;
2823 }
2824 }
2825 } else if (graph.isGloballyIndexed()) {
2826 if (graph.colMap_.is_null()) {
2827 return Teuchos::OrdinalTraits<LO>::invalid();
2828 }
// NOTE(review): colMap is declared by value here, which copies the Map;
// the transform* workers bind `const map_type&` instead and remark that
// merely dereferencing the RCP keeps the code thread safe. Confirm whether
// the copy is intended.
2829 const map_type colMap = *(graph.colMap_);
2830
2831 // Get a view of the column indices in the row. This amortizes
2832 // the cost of getting the view over all the entries of inds.
2833 auto colInds = graph.getGlobalIndsViewHost(rowInfo);
2834
2835 for (LO j = 0; j < numElts; ++j) {
2836 const GO gblColInd = colMap.getGlobalElement(inds[j]);
2837 if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid()) {
2838 const size_t offset =
2839 KokkosSparse::findRelOffset(colInds, rowInfo.numEntries,
2841 if (offset != rowInfo.numEntries) {
2842 if (atomic) {
2843 Kokkos::atomic_add(&rowVals[offset], newVals[j]);
2844 } else {
2845 rowVals[offset] += newVals[j];
2846 }
2847 hint = offset + 1;
2848 ++numValid;
2850 }
2851 }
2852 }
2853 // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2854 // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2855 // to be neither locally nor globally indexed on a process.
2856 // This means that the graph or matrix has no entries on that
2857 // process. Epetra also works like this. It's related to lazy
2858 // allocation (on first insertion, not at graph / matrix
2859 // construction). Lazy allocation will go away because it is
2860 // not thread scalable.
2861
2862 return numValid;
2863}
2864
// Teuchos::ArrayView convenience overload of sumIntoLocalValues.
// Returns Teuchos::OrdinalTraits<LO>::invalid() when indices and values
// differ in length; otherwise forwards the raw pointers and the entry count
// to the (row, count, values, indices, atomic) overload and returns its
// result.
// NOTE(review): the return type and class qualifier are on listing lines
// that are not visible here.
2865template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2868 sumIntoLocalValues(const LocalOrdinal localRow,
2869 const Teuchos::ArrayView<const LocalOrdinal>& indices,
2870 const Teuchos::ArrayView<const Scalar>& values,
2871 const bool atomic) {
2872 using LO = local_ordinal_type;
2873 const LO numInputEnt = static_cast<LO>(indices.size());
2874 if (static_cast<LO>(values.size()) != numInputEnt) {
2875 return Teuchos::OrdinalTraits<LO>::invalid();
2876 }
2877 const LO* const inputInds = indices.getRawPtr();
2878 const scalar_type* const inputVals = values.getRawPtr();
2879 return this->sumIntoLocalValues(localRow, numInputEnt,
2880 inputVals, inputInds, atomic);
2881}
// Kokkos::View overload that forwards to the raw-array sumIntoLocalValues.
// NOTE(review): the method name is on a listing line that is not visible
// here; it is inferred from the overload this block calls -- confirm.
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() when inputInds and inputVals
// have different extents; otherwise returns the raw-array overload's result.
2883template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2885 local_ordinal_type
2888 const local_ordinal_type localRow,
2889 const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2890 const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
2891 const bool atomic) {
2892 using LO = local_ordinal_type;
2893 const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2894 if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2895 return Teuchos::OrdinalTraits<LO>::invalid();
2896 }
// The raw-array overload takes scalar_type, so the impl_scalar_type data is
// reinterpreted; presumably the two types are bit-compatible -- confirm.
2897 const scalar_type* inVals =
2898 reinterpret_cast<const scalar_type*>(inputVals.data());
2899 return this->sumIntoLocalValues(localRow, numInputEnt, inVals,
2900 inputInds.data(), atomic);
2901}
2902
// Raw-array overload of sumIntoLocalValues: looks up local row localRow and
// hands the row's current values plus the inputs to sumIntoLocalValuesImpl().
//
// Returns:
//  - Teuchos::OrdinalTraits<LO>::invalid() if fill is not active or
//    staticGraph_ is null;
//  - 0 if getRowInfo() reports an invalid local row (row not owned here);
//  - otherwise the result of sumIntoLocalValuesImpl().
//
// NOTE(review): the return type, class qualifier and the numEnt parameter
// are on listing lines that are not visible here.
2903template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2906 sumIntoLocalValues(const LocalOrdinal localRow,
2908 const Scalar vals[],
2909 const LocalOrdinal cols[],
2910 const bool atomic) {
2911 typedef impl_scalar_type IST;
2912 typedef LocalOrdinal LO;
2913
2914 if (!this->isFillActive() || this->staticGraph_.is_null()) {
2915 // Fill must be active and the "nonconst" graph must exist.
2916 return Teuchos::OrdinalTraits<LO>::invalid();
2917 }
2918 const crs_graph_type& graph = *(this->staticGraph_);
2919 const RowInfo rowInfo = graph.getRowInfo(localRow);
2920
2921 if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid()) {
2922 // The calling process does not own this row, so it is not
2923 // allowed to modify its values.
2924 return static_cast<LO>(0);
2925 }
2926 auto curRowVals = this->getValuesViewHostNonConst(rowInfo);
// The Scalar input is reinterpreted as IST (impl_scalar_type); presumably
// the two types are bit-compatible -- confirm.
2927 const IST* const inputVals = reinterpret_cast<const IST*>(vals);
2928 return this->sumIntoLocalValuesImpl(curRowVals.data(), graph, rowInfo,
2929 cols, inputVals, numEnt, atomic);
2930}
2931
// Returns a read-only host subview of valuesUnpacked_wdv covering the row
// described by rowinfo: it starts at rowinfo.offset1D and spans
// rowinfo.allocSize entries. A default-constructed (empty) view is returned
// when the row has no allocation or the values array has extent 0.
2932template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2933typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2934 values_dualv_type::t_host::const_type
2936 getValuesViewHost(const RowInfo& rowinfo) const {
2937 if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
2938 return typename values_dualv_type::t_host::const_type();
2939 else
2940 return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
2941 rowinfo.allocSize,
2942 Access::ReadOnly);
2943}
2944
// Returns a read-write host subview of valuesUnpacked_wdv covering the row
// described by rowinfo (offset rowinfo.offset1D, length rowinfo.allocSize),
// or a default-constructed view when the row has no allocation or the values
// array has extent 0.
// NOTE(review): the method name and parameter list are on listing lines that
// are not visible here; presumably this is getValuesViewHostNonConst, the
// accessor the sumInto*/transform* entry points call -- confirm.
2945template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2947 values_dualv_type::t_host
2950 if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
2951 return typename values_dualv_type::t_host();
2952 else
2953 return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
2954 rowinfo.allocSize,
2955 Access::ReadWrite);
2956}
2957
// Returns a read-only device subview of valuesUnpacked_wdv covering the row
// described by rowinfo (offset rowinfo.offset1D, length rowinfo.allocSize),
// or a default-constructed view when the row has no allocation or the values
// array has extent 0.
2958template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2960 values_dualv_type::t_dev::const_type
2962 getValuesViewDevice(const RowInfo& rowinfo) const {
2963 if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
2964 return typename values_dualv_type::t_dev::const_type();
2965 else
2966 return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
2967 rowinfo.allocSize,
2968 Access::ReadOnly);
2969}
// Returns a read-write device subview of valuesUnpacked_wdv covering the row
// described by rowinfo (offset rowinfo.offset1D, length rowinfo.allocSize),
// or a default-constructed view when the row has no allocation or the values
// array has extent 0.
// NOTE(review): the method name and parameter list are on listing lines that
// are not visible here; presumably this is the non-const counterpart of
// getValuesViewDevice -- confirm.
2971template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2973 values_dualv_type::t_dev
2976 if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
2977 return typename values_dualv_type::t_dev();
2978 else
2979 return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
2980 rowinfo.allocSize,
2981 Access::ReadWrite);
2982}
2983
// Copies the entries of one local row into caller-provided host views,
// reporting column indices as local indices.
//
// Outputs: numEntries receives the row's entry count; the first numEntries
// slots of indices and values are filled when the row is valid on this
// process and the graph is locally or globally indexed.
//
// Raises std::runtime_error if the matrix has no column Map, or if indices
// or values is shorter than the row's entry count.
//
// NOTE(review): the return type, method name and the localRow parameter are
// on listing lines that are not visible here.
2984template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2987 nonconst_local_inds_host_view_type& indices,
2988 nonconst_values_host_view_type& values,
2989 size_t& numEntries) const {
2990 using Teuchos::ArrayView;
2991 using Teuchos::av_reinterpret_cast;
2992 const char tfecfFuncName[] = "getLocalRowCopy: ";
2993
2994 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!this->hasColMap(), std::runtime_error,
2995 "The matrix does not have a column Map yet. This means we don't have "
2996 "local indices for columns yet, so it doesn't make sense to call this "
2997 "method. If the matrix doesn't have a column Map yet, you should call "
2998 "fillComplete on it first.");
2999
3000 const RowInfo rowinfo = staticGraph_->getRowInfo(localRow);
3001 const size_t theNumEntries = rowinfo.numEntries;
3002 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(indices.size()) < theNumEntries ||
3003 static_cast<size_t>(values.size()) < theNumEntries,
3004 std::runtime_error, "Row with local index " << localRow << " has " << theNumEntries << " entry/ies, but indices.size() = " << indices.size() << " and values.size() = " << values.size() << ".");
3005 numEntries = theNumEntries; // first side effect
3006
3007 if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid()) {
3008 if (staticGraph_->isLocallyIndexed()) {
// Stored indices are already local: copy them through unchanged.
3009 auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3010 auto curVals = getValuesViewHost(rowinfo);
3011
3012 for (size_t j = 0; j < theNumEntries; ++j) {
3013 values[j] = curVals[j];
3014 indices[j] = curLclInds(j);
3015 }
3016 } else if (staticGraph_->isGloballyIndexed()) {
3017 // Don't call getColMap(), because it touches RCP's reference count.
3018 const map_type& colMap = *(staticGraph_->colMap_);
3019 auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3020 auto curVals = getValuesViewHost(rowinfo);
3021
// Stored indices are global: translate each one through the column Map.
3022 for (size_t j = 0; j < theNumEntries; ++j) {
3023 values[j] = curVals[j];
3024 indices[j] = colMap.getLocalElement(curGblInds(j));
3025 }
3026 }
3027 }
3028}
// Copies the entries of one global row into caller-provided host views,
// reporting column indices as global indices.
//
// Outputs: numEntries receives the row's entry count; the first numEntries
// slots of indices and values are filled when the row is valid on this
// process and the graph is locally or globally indexed.
//
// NOTE(review): the return type, method name, the globalRow parameter and
// the opening of the size-check macro are on listing lines that are not
// visible here. The visible arguments indicate a std::runtime_error when
// indices or values is shorter than the row's entry count.
3030template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3033 nonconst_global_inds_host_view_type& indices,
3034 nonconst_values_host_view_type& values,
3035 size_t& numEntries) const {
3036 using Teuchos::ArrayView;
3037 using Teuchos::av_reinterpret_cast;
3038 const char tfecfFuncName[] = "getGlobalRowCopy: ";
3039
3040 const RowInfo rowinfo =
3041 staticGraph_->getRowInfoFromGlobalRowIndex(globalRow);
3042 const size_t theNumEntries = rowinfo.numEntries;
3044 static_cast<size_t>(indices.size()) < theNumEntries ||
3045 static_cast<size_t>(values.size()) < theNumEntries,
3046 std::runtime_error, "Row with global index " << globalRow << " has " << theNumEntries << " entry/ies, but indices.size() = " << indices.size() << " and values.size() = " << values.size() << ".");
3047 numEntries = theNumEntries; // first side effect
3048
3049 if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid()) {
3050 if (staticGraph_->isLocallyIndexed()) {
// Stored indices are local: translate each one through the column Map.
3051 const map_type& colMap = *(staticGraph_->colMap_);
3052 auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3053 auto curVals = getValuesViewHost(rowinfo);
3054
3055 for (size_t j = 0; j < theNumEntries; ++j) {
3056 values[j] = curVals[j];
3057 indices[j] = colMap.getGlobalElement(curLclInds(j));
3058 }
3059 } else if (staticGraph_->isGloballyIndexed()) {
// Stored indices are already global: copy them through unchanged.
3060 auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3061 auto curVals = getValuesViewHost(rowinfo);
3062
3063 for (size_t j = 0; j < theNumEntries; ++j) {
3064 values[j] = curVals[j];
3065 indices[j] = curGblInds(j);
3066 }
3067 }
3068 }
3069}
3070
// Points indices and values at read-only host subviews of one local row
// (local column indices from staticGraph_->lclIndsUnpacked_wdv, values from
// valuesUnpacked_wdv), each of length rowInfo.numEntries. If the row is
// invalid on this process or has no entries, both outputs are set to
// default-constructed (empty) views.
//
// NOTE(review): the return type, method name, the localRow parameter and
// the opening of the first check macro are on listing lines that are not
// visible here. The visible arguments indicate a std::runtime_error when
// the matrix is globally indexed.
3071template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3074 local_inds_host_view_type& indices,
3075 values_host_view_type& values) const {
3076 const char tfecfFuncName[] = "getLocalRowView: ";
3077
3079 isGloballyIndexed(), std::runtime_error,
3080 "The matrix currently stores "
3081 "its indices as global indices, so you cannot get a view with local "
3082 "column indices. If the matrix has a column Map, you may call "
3083 "getLocalRowCopy() to get local column indices; otherwise, you may get "
3084 "a view with global column indices by calling getGlobalRowCopy().");
3085
3086 const RowInfo rowInfo = staticGraph_->getRowInfo(localRow);
3087 if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid() &&
3088 rowInfo.numEntries > 0) {
3089 indices = staticGraph_->lclIndsUnpacked_wdv.getHostSubview(
3090 rowInfo.offset1D,
3091 rowInfo.numEntries,
3092 Access::ReadOnly);
3093 values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3094 rowInfo.numEntries,
3095 Access::ReadOnly);
3096 } else {
3097 // This does the right thing (reports an empty row) if the input
3098 // row is invalid.
3099 indices = local_inds_host_view_type();
3100 values = values_host_view_type();
3101 }
3102
// Debug-only consistency checks on the sizes of the returned views; each
// failure is reported as std::logic_error.
3103#ifdef HAVE_TPETRA_DEBUG
3104 const char suffix[] =
3105 ". This should never happen. Please report this "
3106 "bug to the Tpetra developers.";
3107 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(indices.size()) !=
3108 static_cast<size_t>(values.size()),
3109 std::logic_error,
3110 "At the end of this method, for local row " << localRow << ", "
3111 "indices.size() = "
3112 << indices.size() << " != values.size () = "
3113 << values.size() << suffix);
3114 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(indices.size()) !=
3115 static_cast<size_t>(rowInfo.numEntries),
3116 std::logic_error,
3117 "At the end of this method, for local row " << localRow << ", "
3118 "indices.size() = "
3119 << indices.size() << " != rowInfo.numEntries = "
3120 << rowInfo.numEntries << suffix);
3121 const size_t expectedNumEntries = getNumEntriesInLocalRow(localRow);
// NOTE(review): the opening of the third check (listing line 3122) is not
// visible here; only its message arguments are shown.
3123 "At the end "
3124 "of this method, for local row "
3125 << localRow << ", rowInfo.numEntries = "
3126 << rowInfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " << expectedNumEntries << suffix);
3127#endif // HAVE_TPETRA_DEBUG
3128}
3129
// Points indices and values at read-only host subviews of one global row
// (global column indices from staticGraph_->gblInds_wdv, values from
// valuesUnpacked_wdv), each of length rowInfo.numEntries. If the row is
// invalid on this process or has no entries, both outputs are set to
// default-constructed (empty) views.
//
// NOTE(review): the return type, method name, the globalRow parameter and
// the opening of the first check macro are on listing lines that are not
// visible here. The visible arguments indicate a std::runtime_error when
// the matrix is locally indexed.
3130template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3133 global_inds_host_view_type& indices,
3134 values_host_view_type& values) const {
3135 const char tfecfFuncName[] = "getGlobalRowView: ";
3136
3138 isLocallyIndexed(), std::runtime_error,
3139 "The matrix is locally indexed, so we cannot return a view of the row "
3140 "with global column indices. Use getGlobalRowCopy() instead.");
3141
3142 // This does the right thing (reports an empty row) if the input
3143 // row is invalid.
3144 const RowInfo rowInfo =
3145 staticGraph_->getRowInfoFromGlobalRowIndex(globalRow);
3146 if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid() &&
3147 rowInfo.numEntries > 0) {
3148 indices = staticGraph_->gblInds_wdv.getHostSubview(rowInfo.offset1D,
3149 rowInfo.numEntries,
3150 Access::ReadOnly);
3151 values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3152 rowInfo.numEntries,
3153 Access::ReadOnly);
3154 } else {
3155 indices = global_inds_host_view_type();
3156 values = values_host_view_type();
3157 }
3158
// Debug-only consistency checks on the sizes of the returned views; each
// failure is reported as std::logic_error.
3159#ifdef HAVE_TPETRA_DEBUG
3160 const char suffix[] =
3161 ". This should never happen. Please report this "
3162 "bug to the Tpetra developers.";
3163 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(indices.size()) !=
3164 static_cast<size_t>(values.size()),
3165 std::logic_error,
3166 "At the end of this method, for global row " << globalRow << ", "
3167 "indices.size() = "
3168 << indices.size() << " != values.size () = "
3169 << values.size() << suffix);
3170 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(static_cast<size_t>(indices.size()) !=
3171 static_cast<size_t>(rowInfo.numEntries),
3172 std::logic_error,
3173 "At the end of this method, for global row " << globalRow << ", "
3174 "indices.size() = "
3175 << indices.size() << " != rowInfo.numEntries = "
3176 << rowInfo.numEntries << suffix);
3177 const size_t expectedNumEntries = getNumEntriesInGlobalRow(globalRow);
3178 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowInfo.numEntries != expectedNumEntries, std::logic_error,
3179 "At the end "
3180 "of this method, for global row "
3181 << globalRow << ", rowInfo.numEntries "
3182 "= "
3183 << rowInfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
3184 " "
3185 << expectedNumEntries << suffix);
3186#endif // HAVE_TPETRA_DEBUG
3187}
3188
// Scales the matrix values in place by alpha: takes a read-write device view
// of valuesPacked_wdv and calls KokkosBlas::scal with that view as both
// output and input. Does nothing when the graph's indices are not allocated,
// or when the process has no local rows or no local entries.
// NOTE(review): the return type and class qualifier are on a listing line
// that is not visible here.
3189template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3191 scale(const Scalar& alpha) {
3192 const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
3193
3194 const size_t nlrs = staticGraph_->getLocalNumRows();
3195 const size_t numEntries = staticGraph_->getLocalNumEntries();
3196 if (!staticGraph_->indicesAreAllocated() ||
3197 nlrs == 0 || numEntries == 0) {
3198 // do nothing
3199 } else {
3200 auto vals = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
3201 KokkosBlas::scal(vals, theAlpha, vals);
3202 }
3203}
3204
// Overwrites every allocated value with alpha by deep-copying the scalar
// into the device view of valuesUnpacked_wdv, then fences. Does nothing when
// the graph's indices are not allocated or there are no local entries.
// NOTE(review): the method name and parameter list are on listing lines that
// are not visible here; the name is inferred from the fence label
// "CrsMatrix::setAllToScalar" -- confirm.
3205template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3208 const impl_scalar_type theAlpha = static_cast<impl_scalar_type>(alpha);
3209
3210 // Replace all values in the matrix.
3211 // It is easiest to replace all allocated values, instead of replacing only the ones with valid entries.
3212 // However, if there are no valid entries, we can short-circuit.
3213 // Furthermore, if the values aren't allocated, we can short-circuit (no entries have been inserted so far).
3214 const size_t numEntries = staticGraph_->getLocalNumEntries();
3215 if (!staticGraph_->indicesAreAllocated() || numEntries == 0) {
3216 // do nothing
3217 } else {
3218 // DEEP_COPY REVIEW - VALUE-TO-DEVICE
3219 Kokkos::deep_copy(execution_space(), valuesUnpacked_wdv.getDeviceView(Access::OverwriteAll),
3220 theAlpha);
3221 // CAG: This fence was found to be required on Cuda with UVM=on.
3222 Kokkos::fence("CrsMatrix::setAllToScalar");
3223 }
3224}
3225
// Installs a complete local CRS structure from device views: row offsets and
// column indices go to myGraph_->setAllIndices(), and values becomes the
// matrix's packed and unpacked value storage. Storage status is set to
// STORAGE_1D_PACKED and checkInternalState() is run at the end.
//
// Raises:
//  - std::invalid_argument if columnIndices and values differ in size;
//  - std::runtime_error if myGraph_ is null, or if setAllIndices() throws a
//    std::exception (its message is appended to the new error);
//  - std::logic_error if the resulting local graph's dimensions do not match
//    the inputs.
//
// NOTE(review): the return type and class qualifier are on a listing line
// that is not visible here.
3226template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3228 setAllValues(const typename local_graph_device_type::row_map_type& rowPointers,
3229 const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
3230 const typename local_matrix_device_type::values_type& values) {
3231 using ProfilingRegion = Details::ProfilingRegion;
3232 ProfilingRegion region("Tpetra::CrsMatrix::setAllValues");
3233 const char tfecfFuncName[] = "setAllValues: ";
3234 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(columnIndices.size() != values.size(), std::invalid_argument,
3235 "columnIndices.size() = " << columnIndices.size() << " != values.size()"
3236 " = "
3237 << values.size() << ".");
3238 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(myGraph_.is_null(), std::runtime_error, "myGraph_ must not be null.");
3239
3240 try {
3241 myGraph_->setAllIndices(rowPointers, columnIndices);
3242 } catch (std::exception& e) {
3243 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
3244 "myGraph_->setAllIndices() threw an "
3245 "exception: "
3246 << e.what());
3247 }
3248
3249 // Make sure that myGraph_ now has a local graph. It may not be
3250 // fillComplete yet, so it's important to check. We don't care
3251 // whether setAllIndices() did a shallow copy or a deep copy, so a
3252 // good way to check is to compare dimensions.
3253 auto lclGraph = myGraph_->getLocalGraphDevice();
3254 const size_t numEnt = lclGraph.entries.extent(0);
3255 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclGraph.row_map.extent(0) != rowPointers.extent(0) ||
3256 numEnt != static_cast<size_t>(columnIndices.extent(0)),
3257 std::logic_error,
3258 "myGraph_->setAllIndices() did not correctly create "
3259 "local graph. Please report this bug to the Tpetra developers.");
3260
// The unpacked storage is assigned from the packed storage built from
// `values`, so both members are set from the same input.
3261 valuesPacked_wdv = values_wdv_type(values);
3262 valuesUnpacked_wdv = valuesPacked_wdv;
3263
3264 // Storage MUST be packed, since the interface doesn't give any
3265 // way to indicate any extra space at the end of each row.
3266 this->storageStatus_ = Details::STORAGE_1D_PACKED;
3267
3268 checkInternalState();
3269}
3270
// Convenience overload: pulls the row map, column entries and values out of
// localDeviceMatrix and forwards them to the three-view setAllValues
// overload.
// NOTE(review): the method name and parameter list are on listing lines that
// are not visible here; the name is inferred from the profiling-region label
// and the call below -- confirm.
3271template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3274 using ProfilingRegion = Details::ProfilingRegion;
3275 ProfilingRegion region("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix");
3276
3277 auto graph = localDeviceMatrix.graph;
3278 // FIXME how to check whether graph is allocated
3279
3280 auto rows = graph.row_map;
3281 auto columns = graph.entries;
3282 auto values = localDeviceMatrix.values;
3283
3284 setAllValues(rows, columns, values);
3285}
3286
// Teuchos::ArrayRCP overload of setAllValues: builds a row-offset view of
// the native offset type (ptrNative) and forwards it with indIn and valIn
// to the view-based setAllValues overload.
//
// NOTE(review): the copy from ptrSizeT into ptrNative, the opening of the
// size-check macro, and the construction of indIn / valIn from ind / val are
// on listing lines that are not visible here.
3287template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3289 setAllValues(const Teuchos::ArrayRCP<size_t>& ptr,
3290 const Teuchos::ArrayRCP<LocalOrdinal>& ind,
3291 const Teuchos::ArrayRCP<Scalar>& val) {
3292 using Kokkos::Compat::getKokkosViewDeepCopy;
3293 using Teuchos::ArrayRCP;
3294 using Teuchos::av_reinterpret_cast;
3295 typedef device_type DT;
3296 typedef impl_scalar_type IST;
3297 typedef typename local_graph_device_type::row_map_type row_map_type;
3298 // typedef typename row_map_type::non_const_value_type row_offset_type;
3299 const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
3300
3301 // The row offset type may depend on the execution space. It may
3302 // not necessarily be size_t. If it's not, we need to make a deep
3303 // copy. We need to make a deep copy anyway so that Kokkos can
3304 // own the memory. Regardless, ptrIn gets the copy.
3305 typename row_map_type::non_const_type ptrNative("ptr", ptr.size());
// Unmanaged host view wrapping the caller's size_t offsets without taking
// ownership of them.
3306 Kokkos::View<const size_t*,
3307 typename row_map_type::array_layout,
3308 Kokkos::HostSpace,
3309 Kokkos::MemoryUnmanaged>
3310 ptrSizeT(ptr.getRawPtr(), ptr.size());
3312
3314 std::logic_error, "ptrNative.extent(0) = " << ptrNative.extent(0) << " != ptrSizeT.extent(0) = " << ptrSizeT.extent(0) << ". Please report this bug to the "
3315 "Tpetra developers.");
3316
3319 this->setAllValues(ptrNative, indIn, valIn);
3320}
3321
// Fills offsets with the graph's local diagonal offsets, one per local row.
// offsets is grown to getLocalNumRows() entries if it is shorter. When the
// device memory space is Kokkos::HostSpace the graph writes straight into
// the caller's buffer; otherwise the result is computed into a temporary
// device view and copied back to the host buffer.
//
// Raises std::runtime_error if the matrix has no graph.
//
// NOTE(review): the return type and class qualifier, and the closing names
// of the two local output_type typedefs, are on listing lines that are not
// visible here.
3322template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3324 getLocalDiagOffsets(Teuchos::ArrayRCP<size_t>& offsets) const {
3325 const char tfecfFuncName[] = "getLocalDiagOffsets: ";
3326 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(staticGraph_.is_null(), std::runtime_error, "The matrix has no graph.");
3327
3328 // mfh 11 May 2016: We plan to deprecate the ArrayRCP version of
3329 // this method in CrsGraph too, so don't call it (otherwise build
3330 // warnings will show up and annoy users). Instead, copy results
3331 // in and out, if the memory space requires it.
3332
3333 const size_t lclNumRows = staticGraph_->getLocalNumRows();
3334 if (static_cast<size_t>(offsets.size()) < lclNumRows) {
3335 offsets.resize(lclNumRows);
3336 }
3337
3338 // The input ArrayRCP must always be a host pointer. Thus, if
3339 // device_type::memory_space is Kokkos::HostSpace, it's OK for us
3340 // to write to that allocation directly as a Kokkos::View.
3341 if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
3342 // It is always syntactically correct to assign a raw host
3343 // pointer to a device View, so this code will compile correctly
3344 // even if this branch never runs.
3345 typedef Kokkos::View<size_t*, device_type,
3346 Kokkos::MemoryUnmanaged>
3348 output_type offsetsOut(offsets.getRawPtr(), lclNumRows);
3349 staticGraph_->getLocalDiagOffsets(offsetsOut);
3350 } else {
3351 Kokkos::View<size_t*, device_type> offsetsTmp("diagOffsets", lclNumRows);
3352 staticGraph_->getLocalDiagOffsets(offsetsTmp);
3353 typedef Kokkos::View<size_t*, Kokkos::HostSpace,
3354 Kokkos::MemoryUnmanaged>
3356 output_type offsetsOut(offsets.getRawPtr(), lclNumRows);
3357 // DEEP_COPY REVIEW - DEVICE-TO-HOST
// NOTE(review): this deep_copy is given an execution-space instance and no
// fence is visible afterwards; confirm the copy is complete before callers
// read offsets.
3358 Kokkos::deep_copy(execution_space(), offsetsOut, offsetsTmp);
3359 }
3360}
3361
// One-argument getLocalDiagCopy (see tfecfFuncName below): writes the
// matrix's local diagonal into the first column of `diag`'s device view
// without precomputed offsets, via Details::getDiagCopyWithoutOffsets.
// (`diag` is declared on a signature line that this listing does not show.)
//
// Throws std::runtime_error if the matrix has no graph or no column Map.
// Returns without doing anything on a process whose row Map, or the row
// Map's communicator, is null.
3362 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3365 using Teuchos::ArrayRCP;
3366 using Teuchos::ArrayView;
3367 using Teuchos::av_reinterpret_cast;
3368 const char tfecfFuncName[] = "getLocalDiagCopy (1-arg): ";
3369 typedef local_ordinal_type LO;
3370
3372 staticGraph_.is_null(), std::runtime_error,
3373 "This method requires that the matrix have a graph.");
3374 auto rowMapPtr = this->getRowMap();
3375 if (rowMapPtr.is_null() || rowMapPtr->getComm().is_null()) {
3376 // Processes on which the row Map or its communicator is null
3377 // don't participate. Users shouldn't even call this method on
3378 // those processes.
3379 return;
3380 }
3381 auto colMapPtr = this->getColMap();
3382 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!this->hasColMap() || colMapPtr.is_null(), std::runtime_error,
3383 "This method requires that the matrix have a column Map.");
3384 const map_type& rowMap = *rowMapPtr;
3385 const map_type& colMap = *colMapPtr;
3386 const LO myNumRows = static_cast<LO>(this->getLocalNumRows());
3387
3388#ifdef HAVE_TPETRA_DEBUG
3389 // isCompatible() requires an all-reduce, and thus this check
3390 // should only be done in debug mode.
3392 !diag.getMap()->isCompatible(rowMap), std::runtime_error,
3393 "The input Vector's Map must be compatible with the CrsMatrix's row "
3394 "Map. You may check this by using Map's isCompatible method: "
3395 "diag.getMap ()->isCompatible (A.getRowMap ());");
3396#endif // HAVE_TPETRA_DEBUG
3397
// OverwriteAll: the view is requested for writing without preserving
// its previous contents.
3398 const auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
3399 // 1-D subview of the first (and only) column of D_lcl.
3400 const auto D_lcl_1d =
3401 Kokkos::subview(D_lcl, Kokkos::make_pair(LO(0), myNumRows), 0);
3402
3403 const auto lclRowMap = rowMap.getLocalMap();
3404 const auto lclColMap = colMap.getLocalMap();
3405 using ::Tpetra::Details::getDiagCopyWithoutOffsets;
// The return value of getDiagCopyWithoutOffsets is deliberately discarded.
3406 (void)getDiagCopyWithoutOffsets(D_lcl_1d, lclRowMap,
3407 lclColMap,
3408 getLocalMatrixDevice());
3409}
3410
// getLocalDiagCopy overload taking precomputed diagonal offsets as an
// unmanaged device View: writes the diagonal into the first column of
// `diag`'s device view by calling KokkosSparse::getDiagCopy on the local
// device matrix.  (`diag` is declared on a signature line that this listing
// does not show.)
3411 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3414 const Kokkos::View<const size_t*, device_type,
3415 Kokkos::MemoryUnmanaged>& offsets) const {
3416 typedef LocalOrdinal LO;
3417
3418#ifdef HAVE_TPETRA_DEBUG
3419 const char tfecfFuncName[] = "getLocalDiagCopy: ";
3420 const map_type& rowMap = *(this->getRowMap());
3421 // isCompatible() requires an all-reduce, and thus this check
3422 // should only be done in debug mode.
3424 !diag.getMap()->isCompatible(rowMap), std::runtime_error,
3425 "The input Vector's Map must be compatible with (in the sense of Map::"
3426 "isCompatible) the CrsMatrix's row Map.");
3427#endif // HAVE_TPETRA_DEBUG
3428
3429 // Fill the Vector through its device view (requested below with
3430 // Access::OverwriteAll) by calling KokkosSparse::getDiagCopy on the
3431 // local device matrix.
3432 //
3433 // NOTE(review): an earlier comment here (mfh 21 Jan 2016) described a
3434 // host-side fill that assumed UVM; the code below does not fill on host.
3435
3436 auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
3437 const LO myNumRows = static_cast<LO>(this->getLocalNumRows());
3438 // Get 1-D subview of the first (and only) column of D_lcl.
3439 auto D_lcl_1d =
3440 Kokkos::subview(D_lcl, Kokkos::make_pair(LO(0), myNumRows), 0);
3441
3442 KokkosSparse::getDiagCopy(D_lcl_1d, offsets,
3443 getLocalMatrixDevice());
3444}
3445
// getLocalDiagCopy overload taking precomputed diagonal offsets as a
// Teuchos::ArrayView (host memory).  Works on `diag`'s host view: each local
// row is first set to zero, and rows whose offset is not the "invalid"
// marker then receive a value read from the packed host values array.
// (`diag` is declared on a signature line that this listing does not show.)
3446 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3449 const Teuchos::ArrayView<const size_t>& offsets) const {
3450 using LO = LocalOrdinal;
3451 using host_execution_space = Kokkos::DefaultHostExecutionSpace;
3452 using IST = impl_scalar_type;
3453
3454#ifdef HAVE_TPETRA_DEBUG
3455 const char tfecfFuncName[] = "getLocalDiagCopy: ";
3456 const map_type& rowMap = *(this->getRowMap());
3457 // isCompatible() requires an all-reduce, and thus this check
3458 // should only be done in debug mode.
3460 !diag.getMap()->isCompatible(rowMap), std::runtime_error,
3461 "The input Vector's Map must be compatible with (in the sense of Map::"
3462 "isCompatible) the CrsMatrix's row Map.");
3463#endif // HAVE_TPETRA_DEBUG
3464
3465 // See #1510. In case diag has already been marked modified on
3466 // device, we need to clear that flag, since the code below works
3467 // on host.
3468 // diag.clear_sync_state ();
3469
3470 // For now, we fill the Vector on the host and sync to device.
3471 // Later, we may write a parallel kernel that works entirely on
3472 // device.
3473 auto lclVecHost = diag.getLocalViewHost(Access::OverwriteAll);
3474 // 1-D subview of the first (and only) column of lclVecHost.
3475 auto lclVecHost1d = Kokkos::subview(lclVecHost, Kokkos::ALL(), 0);
3476
3478 Kokkos::View<const size_t*, Kokkos::HostSpace,
3479 Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
// Wrap the caller's offsets array in an unmanaged host View (no copy).
3480 host_offsets_view_type h_offsets(offsets.getRawPtr(), offsets.size());
3481 // Find the diagonal entries and put them in lclVecHost1d.
3482 using range_type = Kokkos::RangePolicy<host_execution_space, LO>;
3483 const LO myNumRows = static_cast<LO>(this->getLocalNumRows());
// INV marks a row that has no diagonal entry (see the test in the lambda).
3484 const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid();
3485
3486 auto rowPtrsPackedHost = staticGraph_->getRowPtrsPackedHost();
3487 auto valuesPackedHost = valuesPacked_wdv.getHostView(Access::ReadOnly);
3488 Kokkos::parallel_for("Tpetra::CrsMatrix::getLocalDiagCopy",
3489 range_type(0, myNumRows),
3490 [&, INV, h_offsets](const LO lclRow) { // Value capture is a workaround for cuda + gcc-7.2 compiler bug w/c++14
3491 lclVecHost1d(lclRow) = STS::zero(); // default value if no diag entry
3492 if (h_offsets[lclRow] != INV) {
// NOTE(review): curRowOffset is defined on a line this listing omits;
// presumably it is the row's start in rowPtrsPackedHost -- confirm.
3495 static_cast<IST>(valuesPackedHost(curRowOffset + h_offsets[lclRow]));
3496 }
3497 });
3498 // diag.sync_device ();
3499}
3500
// leftScale: scales the local matrix by the entries of the vector `x`
// through Details::leftScaleLocalCrsMatrix.  (`x` and the local `xp` are
// declared on lines that this listing does not show.)
//
// `x`'s Map must be the same as the matrix's range Map or its row Map;
// otherwise std::invalid_argument is thrown.  The matrix must be fill
// complete; otherwise std::runtime_error is thrown.
3501 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3504 using Teuchos::ArrayRCP;
3505 using Teuchos::ArrayView;
3506 using Teuchos::null;
3507 using Teuchos::RCP;
3508 using Teuchos::rcp;
3509 using Teuchos::rcpFromRef;
3510 using ::Tpetra::Details::ProfilingRegion;
3512 const char tfecfFuncName[] = "leftScale: ";
3513
3514 ProfilingRegion region("Tpetra::CrsMatrix::leftScale");
3515
3517 if (this->getRangeMap()->isSameAs(*(x.getMap()))) {
3518 // Taken from Epetra: If we have a non-trivial exporter, we must
3519 // import elements that are permuted or are on other processors.
3520 auto exporter = this->getCrsGraphRef().getExporter();
3521 if (exporter.get() != nullptr) {
// Bring x into a temporary vector on the row Map, using the
// graph's Export object with doImport.
3522 RCP<vec_type> tempVec(new vec_type(this->getRowMap()));
3523 tempVec->doImport(x, *exporter, REPLACE); // reverse mode
3524 xp = tempVec;
3525 } else {
3526 xp = rcpFromRef(x);
3527 }
3528 } else if (this->getRowMap()->isSameAs(*(x.getMap()))) {
// x is already on the row Map; use it directly without copying.
3529 xp = rcpFromRef(x);
3530 } else {
3531 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::invalid_argument,
3532 "x's Map must be the same as "
3533 "either the row Map or the range Map of the CrsMatrix.");
3534 }
3535
3536 if (this->isFillComplete()) {
3537 auto x_lcl = xp->getLocalViewDevice(Access::ReadOnly);
3538 auto x_lcl_1d = Kokkos::subview(x_lcl, Kokkos::ALL(), 0);
3539 using ::Tpetra::Details::leftScaleLocalCrsMatrix;
// NOTE(review): the meaning of the two trailing `false` arguments is
// defined by leftScaleLocalCrsMatrix -- confirm against its declaration.
3540 leftScaleLocalCrsMatrix(getLocalMatrixDevice(),
3541 x_lcl_1d, false, false);
3542 } else {
3543 // 6/2020 Disallow leftScale of non-fillComplete matrices #7446
3544 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
3545 "CrsMatrix::leftScale requires matrix to be"
3546 " fillComplete");
3547 }
3548}
3549
// rightScale: scales the local matrix by the entries of the vector `x`
// through Details::rightScaleLocalCrsMatrix.  (`x` and the local `xp` are
// declared on lines that this listing does not show.)
//
// `x`'s Map must be the same as the matrix's domain Map or its column Map,
// and the matrix must be fill complete; std::runtime_error is thrown
// otherwise.
3550 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3553 using Teuchos::ArrayRCP;
3554 using Teuchos::ArrayView;
3555 using Teuchos::null;
3556 using Teuchos::RCP;
3557 using Teuchos::rcp;
3558 using Teuchos::rcpFromRef;
3559 using ::Tpetra::Details::ProfilingRegion;
3561 const char tfecfFuncName[] = "rightScale: ";
3562
3563 ProfilingRegion region("Tpetra::CrsMatrix::rightScale");
3564
3566 if (this->getDomainMap()->isSameAs(*(x.getMap()))) {
3567 // Taken from Epetra: If we have a non-trivial importer, we must
3568 // import elements that are permuted or are on other processors.
3569 auto importer = this->getCrsGraphRef().getImporter();
3570 if (importer.get() != nullptr) {
// Bring x into a temporary vector on the column Map, using the
// graph's Import object.
3571 RCP<vec_type> tempVec(new vec_type(this->getColMap()));
3572 tempVec->doImport(x, *importer, REPLACE);
3573 xp = tempVec;
3574 } else {
3575 xp = rcpFromRef(x);
3576 }
3577 } else if (this->getColMap()->isSameAs(*(x.getMap()))) {
// x is already on the column Map; use it directly without copying.
3578 xp = rcpFromRef(x);
3579 } else {
3580 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
3581 "x's Map must be the same as "
3582 "either the domain Map or the column Map of the CrsMatrix.");
3583 }
3584
3585 if (this->isFillComplete()) {
3586 auto x_lcl = xp->getLocalViewDevice(Access::ReadOnly);
3587 auto x_lcl_1d = Kokkos::subview(x_lcl, Kokkos::ALL(), 0);
3588 using ::Tpetra::Details::rightScaleLocalCrsMatrix;
// NOTE(review): the meaning of the two trailing `false` arguments is
// defined by rightScaleLocalCrsMatrix -- confirm against its declaration.
3589 rightScaleLocalCrsMatrix(getLocalMatrixDevice(),
3590 x_lcl_1d, false, false);
3591 } else {
3592 // 6/2020 Disallow rightScale of non-fillComplete matrices #7446
3593 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
3594 "CrsMatrix::rightScale requires matrix to be"
3595 " fillComplete");
3596 }
3597}
3598
// getNormInf (name taken from the kernel label and the call in getNorm1):
// computes row one-norms with computeRowOneNorms, reduces them locally with
// a Kokkos::Max reducer, then takes the maximum over all processes in
// getComm() and returns it.
// (The signature, the declaration of myMax, and the lambda header are on
// lines that this listing does not show.)
3599 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3602 auto equilInfo = computeRowOneNorms(*this);
3604 using range_type = Kokkos::RangePolicy<execution_space, local_ordinal_type>;
// Local reduction over the entries of equilInfo.rowNorms.
3605 Kokkos::parallel_reduce(
3606 "getNormInf", range_type(0, equilInfo.rowNorms.extent(0)),
3608 max = equilInfo.rowNorms(i);
3609 },
3610 Kokkos::Max<mag_type>(myMax));
// Global reduction: maximum of the per-process maxima.
3611 mag_type totalMax = STM::zero();
3612 Teuchos::reduceAll<int, mag_type>(*(getComm()), Teuchos::REDUCE_MAX, myMax,
3613 Teuchos::outArg(totalMax));
3614 return totalMax;
3615}
3616
// getNorm1: when assumeSymmetric is true, returns getNormInf() instead of
// computing column norms.  Otherwise computes column one-norms with
// computeRowAndColumnOneNorms, reduces them locally with a Kokkos::Max
// reducer, then takes the maximum over all processes in getComm().
// (The declaration of myMax and the lambda header are on lines that this
// listing does not show.)
3617 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3620 getNorm1(const bool assumeSymmetric) const {
3621 if (assumeSymmetric)
3622 return getNormInf();
// NOTE(review): the meaning of the second argument (`false`) is defined by
// computeRowAndColumnOneNorms -- confirm against its declaration.
3623 auto equilInfo = computeRowAndColumnOneNorms(*this, false);
3625 using range_type = Kokkos::RangePolicy<execution_space, local_ordinal_type>;
// Local reduction over the entries of equilInfo.colNorms.
3626 Kokkos::parallel_reduce(
3627 "getNorm1", range_type(0, equilInfo.colNorms.extent(0)),
3629 max = equilInfo.colNorms(i);
3630 },
3631 Kokkos::Max<mag_type>(myMax));
// Global reduction: maximum of the per-process maxima.
3632 mag_type totalMax = STM::zero();
3633 Teuchos::reduceAll<int, mag_type>(*(getComm()), Teuchos::REDUCE_MAX, myMax,
3634 Teuchos::outArg(totalMax));
3635 return totalMax;
3636}
3637
// getFrobeniusNorm: accumulates on the host the sum of squared magnitudes of
// the locally stored values, then returns STM::sqrt(totalSum).
// NOTE(review): the statement that fills totalSum from mySum is on lines this
// listing omits; presumably a REDUCE_SUM all-reduce, given the
// using-declarations below -- confirm.
3638 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3641 getFrobeniusNorm() const {
3642 using Teuchos::ArrayView;
3643 using Teuchos::outArg;
3644 using Teuchos::REDUCE_SUM;
3645 using Teuchos::reduceAll;
3646
3647 // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
3648 // local part of this computation. It could make sense to put
3649 // this operation in the Kokkos::CrsMatrix.
3650
3651 // Accumulate this process' sum of squared magnitudes.
3652 mag_type mySum = STM::zero();
3653 if (getLocalNumEntries() > 0) {
3654 if (isStorageOptimized()) {
3655 // "Optimized" storage is packed storage. That means we can
3656 // iterate in one pass through the 1-D values array.
3657 const size_t numEntries = getLocalNumEntries();
3658 auto values = valuesPacked_wdv.getHostView(Access::ReadOnly);
3659 for (size_t k = 0; k < numEntries; ++k) {
3660 auto val = values[k];
3661 // Note (etp 06 Jan 2015) We need abs() here for composite types
3662 // (in general, if mag_type is on the left-hand-side, we need
3663 // abs() on the right-hand-side)
3664 const mag_type val_abs = STS::abs(val);
3665 mySum += val_abs * val_abs;
3666 }
3667 } else {
// Storage is not packed: walk each row using the row's own entry
// count from the graph's RowInfo.
// NOTE(review): this branch dereferences myGraph_ -- confirm it
// cannot be null when storage is not optimized.
3668 const LocalOrdinal numRows =
3669 static_cast<LocalOrdinal>(this->getLocalNumRows());
3670 for (LocalOrdinal r = 0; r < numRows; ++r) {
3671 const RowInfo rowInfo = myGraph_->getRowInfo(r);
3672 const size_t numEntries = rowInfo.numEntries;
3673 auto A_r = this->getValuesViewHost(rowInfo);
3674 for (size_t k = 0; k < numEntries; ++k) {
3675 const impl_scalar_type val = A_r[k];
3676 const mag_type val_abs = STS::abs(val);
3677 mySum += val_abs * val_abs;
3678 }
3679 }
3680 }
3681 }
3682 mag_type totalSum = STM::zero();
3685 return STM::sqrt(totalSum);
3686}
3687
// replaceColMap: forwards newColMap to myGraph_->replaceColMap.
// Throws std::runtime_error if myGraph_ is null, i.e. the matrix was given a
// const graph that it may not modify.
3688 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3690 replaceColMap(const Teuchos::RCP<const map_type>& newColMap) {
3691 const char tfecfFuncName[] = "replaceColMap: ";
3692 // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
3693 // Then replacing the column Map might mean that we need to
3694 // reindex the column indices.
3696 myGraph_.is_null(), std::runtime_error,
3697 "This method does not work if the matrix has a const graph. The whole "
3698 "idea of a const graph is that you are not allowed to change it, but "
3699 "this method necessarily must modify the graph, since the graph owns "
3700 "the matrix's column Map.");
3701 myGraph_->replaceColMap(newColMap);
3702}
3703
// reindexColumns: reindexes the columns of a graph against newColMap and
// newImport and, if requested, sorts each row's column indices together with
// the matching matrix values.
//
// The graph operated on is `graph` if it is non-null, otherwise the matrix's
// own myGraph_.  (`graph` is declared on a signature line that this listing
// does not show.)  Throws std::invalid_argument if `graph` is null and the
// matrix does not own its graph.
3704 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3707 const Teuchos::RCP<const map_type>& newColMap,
3708 const Teuchos::RCP<const import_type>& newImport,
3709 const bool sortEachRow) {
3710 const char tfecfFuncName[] = "reindexColumns: ";
3712 graph == nullptr && myGraph_.is_null(), std::invalid_argument,
3713 "The input graph is null, but the matrix does not own its graph.");
3714
3715 crs_graph_type& theGraph = (graph == nullptr) ? *myGraph_ : *graph;
3716 const bool sortGraph = false; // we'll sort graph & matrix together below
3717
3718 theGraph.reindexColumns(newColMap, newImport, sortGraph);
3719
// Sort only if asked to, and only when the graph is locally indexed and
// not already sorted.
3720 if (sortEachRow && theGraph.isLocallyIndexed() && !theGraph.isSorted()) {
3721 const LocalOrdinal lclNumRows =
3722 static_cast<LocalOrdinal>(theGraph.getLocalNumRows());
3723
3724 for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
3725 const RowInfo rowInfo = theGraph.getRowInfo(row);
3726 auto lclColInds = theGraph.getLocalIndsViewHostNonConst(rowInfo);
3727 auto vals = this->getValuesViewHostNonConst(rowInfo);
3728
// Sort the row's first rowInfo.numEntries column indices and apply
// the same permutation to the values.
3729 sort2(lclColInds.data(),
3730 lclColInds.data() + rowInfo.numEntries,
3731 vals.data());
3732 }
// Record on the graph that its indices are now sorted.
3733 theGraph.indicesAreSorted_ = true;
3734 }
3735}
3736
// replaceDomainMap: forwards newDomainMap to myGraph_->replaceDomainMap.
// Throws std::runtime_error if myGraph_ is null, i.e. the matrix was given a
// const graph that it may not modify.
3737 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3739 replaceDomainMap(const Teuchos::RCP<const map_type>& newDomainMap) {
3740 const char tfecfFuncName[] = "replaceDomainMap: ";
3742 myGraph_.is_null(), std::runtime_error,
3743 "This method does not work if the matrix has a const graph. The whole "
3744 "idea of a const graph is that you are not allowed to change it, but this"
3745 " method necessarily must modify the graph, since the graph owns the "
3746 "matrix's domain Map and Import objects.");
3747 myGraph_->replaceDomainMap(newDomainMap);
3748}
3749
// replaceDomainMapAndImporter: forwards newDomainMap and newImporter to
// myGraph_->replaceDomainMapAndImporter.
// Throws std::runtime_error if myGraph_ is null, i.e. the matrix was given a
// const graph that it may not modify.
3750 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3752 replaceDomainMapAndImporter(const Teuchos::RCP<const map_type>& newDomainMap,
3753 Teuchos::RCP<const import_type>& newImporter) {
3754 const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
3756 myGraph_.is_null(), std::runtime_error,
3757 "This method does not work if the matrix has a const graph. The whole "
3758 "idea of a const graph is that you are not allowed to change it, but this"
3759 " method necessarily must modify the graph, since the graph owns the "
3760 "matrix's domain Map and Import objects.");
3761 myGraph_->replaceDomainMapAndImporter(newDomainMap, newImporter);
3762}
3763
// replaceRangeMap: forwards newRangeMap to myGraph_->replaceRangeMap.
// Throws std::runtime_error if myGraph_ is null, i.e. the matrix was given a
// const graph that it may not modify.
3764 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3766 replaceRangeMap(const Teuchos::RCP<const map_type>& newRangeMap) {
3767 const char tfecfFuncName[] = "replaceRangeMap: ";
// The message names the range Map and Export objects, which are what this
// method replaces (it previously repeated the domain-Map wording).
3769 myGraph_.is_null(), std::runtime_error,
3770 "This method does not work if the matrix has a const graph. The whole "
3771 "idea of a const graph is that you are not allowed to change it, but this"
3772 " method necessarily must modify the graph, since the graph owns the "
3773 "matrix's range Map and Export objects.");
3774 myGraph_->replaceRangeMap(newRangeMap);
3775}
3776
// replaceRangeMapAndExporter: forwards newRangeMap and newExporter to
// myGraph_->replaceRangeMapAndExporter.
// Throws std::runtime_error if myGraph_ is null, i.e. the matrix was given a
// const graph that it may not modify.
3777 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3779 replaceRangeMapAndExporter(const Teuchos::RCP<const map_type>& newRangeMap,
3780 Teuchos::RCP<const export_type>& newExporter) {
3781 const char tfecfFuncName[] = "replaceRangeMapAndExporter: ";
// The message names the range Map and Export objects, which are what this
// method replaces (it previously repeated the domain-Map wording).
3783 myGraph_.is_null(), std::runtime_error,
3784 "This method does not work if the matrix has a const graph. The whole "
3785 "idea of a const graph is that you are not allowed to change it, but this"
3786 " method necessarily must modify the graph, since the graph owns the "
3787 "matrix's range Map and Export objects.");
3788 myGraph_->replaceRangeMapAndExporter(newRangeMap, newExporter);
3789}
3790
// Appends (column index, value) pairs for global row `globalRow` to the
// nonlocals_ map.  No deduplication or sorting is done here.
// (`globalRow` and `curRowVals` are declared on lines that this listing does
// not show; curRowVals is presumably curRow.second -- confirm.)
//
// indices: global column indices to append.
// values:  values to append; values[k] is read for every k < indices.size().
3791 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3794 const Teuchos::ArrayView<const GlobalOrdinal>& indices,
3795 const Teuchos::ArrayView<const Scalar>& values) {
3796 using Teuchos::Array;
3797 typedef GlobalOrdinal GO;
3798 typedef typename Array<GO>::size_type size_type;
3799
3800 const size_type numToInsert = indices.size();
3801 // Add the new data to the list of nonlocals.
3802 // This creates the arrays if they don't exist yet.
3803 std::pair<Array<GO>, Array<Scalar>>& curRow = nonlocals_[globalRow];
3804 Array<GO>& curRowInds = curRow.first;
// Reserve once up front so the push_back loop below does not have to
// reallocate repeatedly.
3806 const size_type newCapacity = curRowInds.size() + numToInsert;
3807 curRowInds.reserve(newCapacity);
3808 curRowVals.reserve(newCapacity);
3809 for (size_type k = 0; k < numToInsert; ++k) {
3810 curRowInds.push_back(indices[k]);
3811 curRowVals.push_back(values[k]);
3812 }
3813}
3814
// globalAssemble (see tfecfFuncName below): moves the entries accumulated in
// nonlocals_ into *this.
//
// Outline, as visible in this listing (several original lines are omitted
// from it, including the signature and some declarations):
//   - Throws std::runtime_error unless fill is active.
//   - Returns early when someoneHasNonlocalRows == 0.
//   - Sorts and merges each nonlocal row's (column, value) arrays, builds a
//     Map of the nonlocal rows, and inserts the rows into a temporary matrix.
//   - Either exports that matrix into *this with Tpetra::ADD, or, when the
//     original row Map is not one-to-one, exports into a temporary
//     one-to-one matrix and then imports from it into *this.
//   - Clears nonlocals_, then throws std::runtime_error if the MIN all-reduce
//     of isLocallyComplete is not 1.
3815 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3818 using Details::Behavior;
3820 using std::endl;
3821 using Teuchos::Comm;
3822 using Teuchos::outArg;
3823 using Teuchos::RCP;
3824 using Teuchos::rcp;
3825 using Teuchos::REDUCE_MAX;
3826 using Teuchos::REDUCE_MIN;
3827 using Teuchos::reduceAll;
3829 // typedef LocalOrdinal LO;
3830 typedef GlobalOrdinal GO;
3831 typedef typename Teuchos::Array<GO>::size_type size_type;
3832 const char tfecfFuncName[] = "globalAssemble: "; // for exception macro
3833 ProfilingRegion regionGlobalAssemble("Tpetra::CrsMatrix::globalAssemble");
3834
// Verbose output is built in a local ostringstream and written to
// std::cerr with a single insertion per message.
3835 const bool verbose = Behavior::verbose("CrsMatrix");
3836 std::unique_ptr<std::string> prefix;
3837 if (verbose) {
3838 prefix = this->createPrefix("CrsMatrix", "globalAssemble");
3839 std::ostringstream os;
3840 os << *prefix << "nonlocals_.size()=" << nonlocals_.size()
3841 << endl;
3842 std::cerr << os.str();
3843 }
3844 RCP<const Comm<int>> comm = getComm();
3845
3846 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!isFillActive(), std::runtime_error,
3847 "Fill must be active before "
3848 "you may call this method.");
3849
3850 const size_t myNumNonlocalRows = nonlocals_.size();
3851
3852 // If no processes have nonlocal rows, then we don't have to do
3853 // anything. Checking this is probably cheaper than constructing
3854 // the Map of nonlocal rows (see below) and noticing that it has
3855 // zero global entries.
3856 {
3857 const int iHaveNonlocalRows = (myNumNonlocalRows == 0) ? 0 : 1;
3861 if (someoneHasNonlocalRows == 0) {
3862 return; // no process has nonlocal rows, so nothing to do
3863 }
3864 }
3866 // 1. Create a list of the "nonlocal" rows on each process. This
3867 // requires iterating over nonlocals_, so while we do this,
3868 // deduplicate the entries and get a count for each nonlocal
3869 // row on this process.
3870 // 2. Construct a new row Map corresponding to those rows. This
3871 // Map is likely overlapping. We know that the Map is not
3872 // empty on all processes, because the above all-reduce and
3873 // return exclude that case.
3874
3877 {
3878 Teuchos::Array<GO> myNonlocalGblRows(myNumNonlocalRows);
3879 size_type curPos = 0;
3880 for (auto mapIter = nonlocals_.begin(); mapIter != nonlocals_.end();
3883 // Get the values and column indices by reference, since we
3884 // intend to change them in place (that's what "erase" does).
3885 Teuchos::Array<GO>& gblCols = (mapIter->second).first;
3886 Teuchos::Array<Scalar>& vals = (mapIter->second).second;
3887
3888 // Sort both arrays jointly, using the column indices as keys,
3889 // then merge them jointly. "Merge" here adds values
3890 // corresponding to the same column indices. The first 2 args
3891 // of merge2 are output arguments that work just like the
3892 // return value of std::unique.
3893 sort2(gblCols.begin(), gblCols.end(), vals.begin());
3894 typename Teuchos::Array<GO>::iterator gblCols_newEnd;
3895 typename Teuchos::Array<Scalar>::iterator vals_newEnd;
3897 gblCols.begin(), gblCols.end(),
3898 vals.begin(), vals.end());
// Drop the tail left over after merging duplicates.
3899 gblCols.erase(gblCols_newEnd, gblCols.end());
3900 vals.erase(vals_newEnd, vals.end());
3902 }
3903
3904 // Currently, Map requires that its indexBase be the global min
3905 // of all its global indices. Map won't compute this for us, so
3906 // we must do it. If our process has no nonlocal rows, set the
3907 // "min" to the max possible GO value. This ensures that if
3908 // some process has at least one nonlocal row, then it will pick
3909 // that up as the min. We know that at least one process has a
3910 // nonlocal row, since the all-reduce and return at the top of
3911 // this method excluded that case.
3912 GO myMinNonlocalGblRow = std::numeric_limits<GO>::max();
3913 {
3914 auto iter = std::min_element(myNonlocalGblRows.begin(),
3915 myNonlocalGblRows.end());
3916 if (iter != myNonlocalGblRows.end()) {
3918 }
3919 }
3920 GO gblMinNonlocalGblRow = 0;
3921 reduceAll<int, GO>(*comm, REDUCE_MIN, myMinNonlocalGblRow,
3922 outArg(gblMinNonlocalGblRow));
3923 const GO indexBase = gblMinNonlocalGblRow;
3924 const global_size_t INV = Teuchos::OrdinalTraits<global_size_t>::invalid();
3925 nonlocalRowMap = rcp(new map_type(INV, myNonlocalGblRows(), indexBase, comm));
3926 }
3927
3928 // 3. Use the values and column indices for each nonlocal row, as
3929 // stored in nonlocals_, to construct a CrsMatrix corresponding
3930 // to nonlocal rows. We have
3931 // exact counts of the number of entries in each nonlocal row.
3932
3933 if (verbose) {
3934 std::ostringstream os;
3935 os << *prefix << "Create nonlocal matrix" << endl;
3936 std::cerr << os.str();
3937 }
3938 RCP<crs_matrix_type> nonlocalMatrix =
3939 rcp(new crs_matrix_type(nonlocalRowMap, numEntPerNonlocalRow()));
3940 {
3941 size_type curPos = 0;
3942 for (auto mapIter = nonlocals_.begin(); mapIter != nonlocals_.end();
3944 const GO gblRow = mapIter->first;
3945 // Get values & column indices by ref, just to avoid copy.
3946 Teuchos::Array<GO>& gblCols = (mapIter->second).first;
3947 Teuchos::Array<Scalar>& vals = (mapIter->second).second;
3948 // const LO numEnt = static_cast<LO> (numEntPerNonlocalRow[curPos]);
3949 nonlocalMatrix->insertGlobalValues(gblRow, gblCols(), vals());
3950 }
3952 // There's no need to fill-complete the nonlocals matrix.
3953 // We just use it as a temporary container for the Export.
3954
3955 // 4. If the original row Map is one to one, then we can Export
3956 // directly from nonlocalMatrix into this. Otherwise, we have
3957 // to create a temporary matrix with a one-to-one row Map,
3958 // Export into that, then Import from the temporary matrix into
3959 // *this.
3960
3961 auto origRowMap = this->getRowMap();
3962 const bool origRowMapIsOneToOne = origRowMap->isOneToOne();
3963
// Set to 0 below if the Export built for this process reports that it is
// not locally complete; checked globally at the end of the method.
3964 int isLocallyComplete = 1; // true by default
3965
3967 if (verbose) {
3968 std::ostringstream os;
3969 os << *prefix << "Original row Map is 1-to-1" << endl;
3970 std::cerr << os.str();
3971 }
3973 if (!exportToOrig.isLocallyComplete()) {
3974 isLocallyComplete = 0;
3975 }
3976 if (verbose) {
3977 std::ostringstream os;
3978 os << *prefix << "doExport from nonlocalMatrix" << endl;
3979 std::cerr << os.str();
3980 }
3981 this->doExport(*nonlocalMatrix, exportToOrig, Tpetra::ADD);
3982 // We're done at this point!
3983 } else {
3984 if (verbose) {
3985 std::ostringstream os;
3986 os << *prefix << "Original row Map is NOT 1-to-1" << endl;
3987 std::cerr << os.str();
3988 }
3989 // If you ask a Map whether it is one to one, it does some
3990 // communication and stashes intermediate results for later use
3991 // by createOneToOne. Thus, calling createOneToOne doesn't cost
3992 // much more than the original cost of calling isOneToOne.
3993 auto oneToOneRowMap = Tpetra::createOneToOne(origRowMap);
3994 export_type exportToOneToOne(nonlocalRowMap, oneToOneRowMap);
3995 if (!exportToOneToOne.isLocallyComplete()) {
3996 isLocallyComplete = 0;
3997 }
3998
3999 // Create a temporary matrix with the one-to-one row Map.
4000 //
4001 // TODO (mfh 09 Sep 2016, 12 Sep 2016) Estimate # entries in
4002 // each row, to avoid reallocation during the Export operation.
4003 if (verbose) {
4004 std::ostringstream os;
4005 os << *prefix << "Create & doExport into 1-to-1 matrix"
4006 << endl;
4007 std::cerr << os.str();
4008 }
4009 crs_matrix_type oneToOneMatrix(oneToOneRowMap, 0);
4010 // Export from matrix of nonlocals into the temp one-to-one matrix.
4011 oneToOneMatrix.doExport(*nonlocalMatrix, exportToOneToOne,
4012 Tpetra::ADD);
4013
4014 // We don't need the matrix of nonlocals anymore, so get rid of
4015 // it, to keep the memory high-water mark down.
4016 if (verbose) {
4017 std::ostringstream os;
4018 os << *prefix << "Free nonlocalMatrix" << endl;
4019 std::cerr << os.str();
4020 }
4021 nonlocalMatrix = Teuchos::null;
4022
4023 // Import from the one-to-one matrix to the original matrix.
4024 if (verbose) {
4025 std::ostringstream os;
4026 os << *prefix << "doImport from 1-to-1 matrix" << endl;
4027 std::cerr << os.str();
4028 }
4029 import_type importToOrig(oneToOneRowMap, origRowMap);
4030 this->doImport(oneToOneMatrix, importToOrig, Tpetra::ADD);
4031 }
4032
4033 // It's safe now to clear out nonlocals_, since we've already
4034 // committed side effects to *this. The standard idiom for
4035 // clearing a Container like std::map, is to swap it with an empty
4036 // Container and let the swapped Container fall out of scope.
4037 if (verbose) {
4038 std::ostringstream os;
4039 os << *prefix << "Free nonlocals_ (std::map)" << endl;
4040 std::cerr << os.str();
4041 }
4042 decltype(nonlocals_) newNonlocals;
4043 std::swap(nonlocals_, newNonlocals);
4044
4045 // FIXME (mfh 12 Sep 2016) I don't like this all-reduce, and I
4046 // don't like throwing an exception here. A local return value
4047 // would likely be more useful to users. However, if users find
4048 // themselves exercising nonlocal inserts often, then they are
4049 // probably novice users who need the help. See GitHub Issues
4050 // #603 and #601 (esp. the latter) for discussion.
4051
4052 int isGloballyComplete = 0; // output argument of reduceAll
4053 reduceAll<int, int>(*comm, REDUCE_MIN, isLocallyComplete,
4054 outArg(isGloballyComplete));
4055 TEUCHOS_TEST_FOR_EXCEPTION(isGloballyComplete != 1, std::runtime_error,
4056 "On at least one process, "
4057 "you called insertGlobalValues with a global row index which is not in "
4058 "the matrix's row Map on any process in its communicator.");
4059}
4060
// resumeFill: marks the matrix as no longer fill complete.  If the matrix
// does not have a static graph, `params` is forwarded to
// myGraph_->resumeFill.  The apply helper is reset in either case.
4061 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4063 resumeFill(const Teuchos::RCP<Teuchos::ParameterList>& params) {
4064 if (!isStaticGraph()) { // Don't resume fill of a nonowned graph.
4065 myGraph_->resumeFill(params);
4066 }
4067 // Delete the apply helper (if it exists)
4068 applyHelper.reset();
4069 fillComplete_ = false;
4070}
4071
// haveGlobalConstants: returns what the matrix's graph reports from its own
// haveGlobalConstants().
4072 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4074 haveGlobalConstants() const {
4075 return getCrsGraphRef().haveGlobalConstants();
4076}
4077
// fillComplete(params): chooses domain and range Maps, then delegates to the
// three-argument fillComplete(domainMap, rangeMap, params).
//
// With a static graph that is already fill complete, the graph's own domain
// and range Maps are used.  Otherwise the graph's row Map serves as both.
// Throws std::logic_error if getCrsGraph() returns null.
4078 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4080 fillComplete(const Teuchos::RCP<Teuchos::ParameterList>& params) {
4081 const char tfecfFuncName[] = "fillComplete(params): ";
4082
4083 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->getCrsGraph().is_null(), std::logic_error,
4084 "getCrsGraph() returns null. This should not happen at this point. "
4085 "Please report this bug to the Tpetra developers.");
4086
4087 const crs_graph_type& graph = this->getCrsGraphRef();
4088 if (this->isStaticGraph() && graph.isFillComplete()) {
4089 // If this matrix's graph is fill complete and the user did not
4090 // supply a domain or range Map, use the graph's domain and
4091 // range Maps.
4092 this->fillComplete(graph.getDomainMap(), graph.getRangeMap(), params);
4093 } else { // assume that user's row Map is the domain and range Map
4094 Teuchos::RCP<const map_type> rangeMap = graph.getRowMap();
4095 Teuchos::RCP<const map_type> domainMap = rangeMap;
4096 this->fillComplete(domainMap, rangeMap, params);
4097 }
4098}
4099
4100template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4102 fillComplete(const Teuchos::RCP<const map_type>& domainMap,
4103 const Teuchos::RCP<const map_type>& rangeMap,
4104 const Teuchos::RCP<Teuchos::ParameterList>& params) {
4105 using Details::Behavior;
4107 using std::endl;
4108 using Teuchos::ArrayRCP;
4109 using Teuchos::RCP;
4110 using Teuchos::rcp;
4111 const char tfecfFuncName[] = "fillComplete: ";
4112 ProfilingRegion regionFillComplete("Tpetra::CrsMatrix::fillComplete");
4113 const bool verbose = Behavior::verbose("CrsMatrix");
4114 std::unique_ptr<std::string> prefix;
4115 if (verbose) {
4116 prefix = this->createPrefix("CrsMatrix", "fillComplete(dom,ran,p)");
4117 std::ostringstream os;
4118 os << *prefix << endl;
4119 std::cerr << os.str();
4120 }
4122 "Tpetra::CrsMatrix::fillCompete",
4123 "fillCompete");
4124
4125 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!this->isFillActive() || this->isFillComplete(), std::runtime_error,
4126 "Matrix fill state must be active (isFillActive() "
4127 "must be true) before you may call fillComplete().");
4128 const int numProcs = this->getComm()->getSize();
4129
4130 //
4131 // Read parameters from the input ParameterList.
4132 //
4133 {
4134 Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");
4135
4136 // If true, the caller promises that no process did nonlocal
4137 // changes since the last call to fillComplete.
4138 bool assertNoNonlocalInserts = false;
4139 // If true, makeColMap sorts remote GIDs (within each remote
4140 // process' group).
4141 bool sortGhosts = true;
4142
4143 if (!params.is_null()) {
4144 assertNoNonlocalInserts = params->get("No Nonlocal Changes",
4146 if (params->isParameter("sort column map ghost gids")) {
4147 sortGhosts = params->get("sort column map ghost gids", sortGhosts);
4148 } else if (params->isParameter("Sort column Map ghost GIDs")) {
4149 sortGhosts = params->get("Sort column Map ghost GIDs", sortGhosts);
4150 }
4151 }
4152 // We also don't need to do global assembly if there is only one
4153 // process in the communicator.
4155 // This parameter only matters if this matrix owns its graph.
4156 if (!this->myGraph_.is_null()) {
4157 this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
4158 }
4159
4160 if (!this->getCrsGraphRef().indicesAreAllocated()) {
4161 if (this->hasColMap()) { // use local indices
4162 allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
4163 } else { // no column Map, so use global indices
4164 allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
4165 }
4166 }
4167 // Global assemble, if we need to. This call only costs a single
4168 // all-reduce if we didn't need global assembly after all.
4169 if (needGlobalAssemble) {
4170 this->globalAssemble();
4171 } else {
4172 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numProcs == 1 && nonlocals_.size() > 0,
4173 std::runtime_error,
4174 "Cannot have nonlocal entries on a serial run. "
4175 "An invalid entry (i.e., with row index not in the row Map) must have "
4176 "been submitted to the CrsMatrix.");
4177 }
4178 }
4179 if (this->isStaticGraph()) {
4180 Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
4181 // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
4182 // checks below only in debug mode. It would be nicer to do a
4183 // local check, then propagate the error state in a deferred
4184 // way, whenever communication happens. That would reduce the
4185 // cost of checking, to the point where it may make sense to
4186 // enable it even in release mode.
4187#ifdef HAVE_TPETRA_DEBUG
4188 // FIXME (mfh 18 Jun 2014) This check for correctness of the
4189 // input Maps incurs a penalty of two all-reduces for the
4190 // otherwise optimal const graph case.
4191 //
4192 // We could turn these (max) 2 all-reduces into (max) 1, by
4193 // fusing them. We could do this by adding a "locallySameAs"
4194 // method to Map, which would return one of four states:
4195 //
4196 // a. Certainly globally the same
4197 // b. Certainly globally not the same
4198 // c. Locally the same
4199 // d. Locally not the same
4200 //
4201 // The first two states don't require further communication.
4202 // The latter two states require an all-reduce to communicate
4203 // globally, but we only need one all-reduce, since we only need
4204 // to check whether at least one of the Maps is wrong.
4205 const bool domainMapsMatch =
4206 this->staticGraph_->getDomainMap()->isSameAs(*domainMap);
4207 const bool rangeMapsMatch =
4208 this->staticGraph_->getRangeMap()->isSameAs(*rangeMap);
4209
4211 "The CrsMatrix's domain Map does not match the graph's domain Map. "
4212 "The graph cannot be changed because it was given to the CrsMatrix "
4213 "constructor as const. You can fix this by passing in the graph's "
4214 "domain Map and range Map to the matrix's fillComplete call.");
4215
4217 "The CrsMatrix's range Map does not match the graph's range Map. "
4218 "The graph cannot be changed because it was given to the CrsMatrix "
4219 "constructor as const. You can fix this by passing in the graph's "
4220 "domain Map and range Map to the matrix's fillComplete call.");
4221#endif // HAVE_TPETRA_DEBUG
4222
4223 // The matrix does _not_ own the graph, and the graph's
4224 // structure is already fixed, so just fill the local matrix.
4225 this->fillLocalMatrix(params);
4226 } else {
4227 Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
4228 // Set the graph's domain and range Maps. This will clear the
4229 // Import if the domain Map has changed (is a different
4230 // pointer), and the Export if the range Map has changed (is a
4231 // different pointer).
4232 this->myGraph_->setDomainRangeMaps(domainMap, rangeMap);
4233
4234 // Make the graph's column Map, if necessary.
4235 Teuchos::Array<int> remotePIDs(0);
4236 const bool mustBuildColMap = !this->hasColMap();
4237 if (mustBuildColMap) {
4238 this->myGraph_->makeColMap(remotePIDs);
4239 }
4240
4241 // Make indices local, if necessary. The method won't do
4242 // anything if the graph is already locally indexed.
4243 const std::pair<size_t, std::string> makeIndicesLocalResult =
4244 this->myGraph_->makeIndicesLocal(verbose);
4245 // TODO (mfh 20 Jul 2017) Instead of throwing here, pass along
4246 // the error state to makeImportExport
4247 // which may do all-reduces and thus may
4248 // have the opportunity to communicate that error state.
4250 makeIndicesLocalResult.second);
4251
4252 const bool sorted = this->myGraph_->isSorted();
4253 const bool merged = this->myGraph_->isMerged();
4254 this->sortAndMergeIndicesAndValues(sorted, merged);
4255
4256 // Make Import and Export objects, if they haven't been made
4257 // already. If we made a column Map above, reuse information
4258 // from that process to avoid communication in the Import setup.
4259 this->myGraph_->makeImportExport(remotePIDs, mustBuildColMap);
4260
4261 // The matrix _does_ own the graph, so fill the local graph at
4262 // the same time as the local matrix.
4263 this->fillLocalGraphAndMatrix(params);
4264
4265 const bool callGraphComputeGlobalConstants = params.get() == nullptr ||
4266 params->get("compute global constants", true);
4268 this->myGraph_->computeGlobalConstants();
4269 } else {
4270 this->myGraph_->computeLocalConstants();
4271 }
4272 this->myGraph_->fillComplete_ = true;
4273 this->myGraph_->checkInternalState();
4274 }
4275
4276 // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4277
4278 this->fillComplete_ = true; // Now we're fill complete!
4279 {
4281 "Tpetra::CrsMatrix::fillCompete", "checkInternalState");
4282 this->checkInternalState();
4283 }
4284} // fillComplete(domainMap, rangeMap, params)
4285
// Expert-mode fill completion for a matrix that owns its graph.
//
// Preconditions checked below: fill must be active and the matrix must not
// already be fill complete (std::runtime_error), and myGraph_ must be nonnull
// (std::logic_error).
//
// The supplied domain Map, range Map, importer, exporter and params are
// forwarded to the graph's own expertStaticFillComplete; then the local graph
// and matrix are filled, fillComplete_ is set, and checkInternalState() runs.
// No globalAssemble() call is made here (see the in-body comment).
//
// When HAVE_TPETRA_MMM_TIMINGS is defined, the optional "Timer Label" entry of
// params is used to build the names of the timers created below.
4286template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4288 expertStaticFillComplete(const Teuchos::RCP<const map_type>& domainMap,
4289 const Teuchos::RCP<const map_type>& rangeMap,
4290 const Teuchos::RCP<const import_type>& importer,
4291 const Teuchos::RCP<const export_type>& exporter,
4292 const Teuchos::RCP<Teuchos::ParameterList>& params) {
4293#ifdef HAVE_TPETRA_MMM_TIMINGS
4294 std::string label;
4295 if (!params.is_null())
4296 label = params->get("Timer Label", label);
4297 std::string prefix = std::string("Tpetra ") + label + std::string(": ");
4298 using Teuchos::TimeMonitor;
4299
4300 Teuchos::TimeMonitor all(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-all")));
4301#endif
4302
4303 const char tfecfFuncName[] = "expertStaticFillComplete: ";
4304 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!isFillActive() || isFillComplete(),
4305 std::runtime_error,
4306 "Matrix fill state must be active (isFillActive() "
4307 "must be true) before calling fillComplete().");
4309 myGraph_.is_null(), std::logic_error, "myGraph_ is null. This is not allowed.");
4310
4311 {
4312#ifdef HAVE_TPETRA_MMM_TIMINGS
4313 Teuchos::TimeMonitor graph(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-Graph")));
4314#endif
4315 // We will presume globalAssemble is not needed, so we do the ESFC on the graph
4316 myGraph_->expertStaticFillComplete(domainMap, rangeMap, importer, exporter, params);
4317 }
4318
4319 {
4320#ifdef HAVE_TPETRA_MMM_TIMINGS
4321 TimeMonitor fLGAM(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-fLGAM")));
4322#endif
4323 // Fill the local graph and matrix
4324 fillLocalGraphAndMatrix(params);
4325 }
4326 // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4327
4328 // Now we're fill complete!
4329 fillComplete_ = true;
4330
4331 // Sanity checks at the end (debug builds only): after setting
// fillComplete_, fill must no longer be active and the matrix must report
// itself as fill complete.  Each check reports the condition it tests.
4332#ifdef HAVE_TPETRA_DEBUG
4333 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
4334 ": We're at the end of fillComplete(), but isFillActive() is true. "
4335 "Please report this bug to the Tpetra developers.");
4336 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!isFillComplete(), std::logic_error,
4337 ": We're at the end of fillComplete(), but isFillComplete() is false. "
4338 "Please report this bug to the Tpetra developers.");
4339#endif // HAVE_TPETRA_DEBUG
4340 {
4341#ifdef HAVE_TPETRA_MMM_TIMINGS
4342 Teuchos::TimeMonitor cIS(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-M-cIS")));
4343#endif
4344
4345 checkInternalState();
4346 }
4347}
4348
// NOTE(review): the lines naming this function and declaring beg, end,
// newend, vcur and vend are missing from this listing.  The call site in
// sortAndMergeIndicesAndValues suggests this is
// mergeRowIndicesAndValues(rowLen, cols, vals) -- TODO confirm.
//
// Walks the index range [beg, end) and collapses each run of equal adjacent
// indices into one entry; the values of the collapsed duplicates are added
// into the value of the surviving entry.  Returns newend - beg, i.e. the
// number of entries kept.
4349template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4353 // beg,end define a half-open interval [beg, end) over which to iterate.
4357 if (beg != end) {
4358 LocalOrdinal* cur = beg + 1;
// NOTE(review): cur was already initialized to beg + 1 in its declaration
// above, so this reassignment is redundant.
4361 cur = beg + 1;
4362 while (cur != end) {
4363 if (*cur != *newend) {
4364 // new entry; save it
4365 ++newend;
4366 ++vend;
4367 (*newend) = (*cur);
4368 (*vend) = (*vcur);
4369 } else {
4370 // old entry; merge it
4371 //(*vend) = f (*vend, *vcur);
4372 (*vend) += *vcur;
4373 }
4374 ++cur;
4375 ++vcur;
4376 }
4377 ++newend; // one past the last entry, per typical [beg,end) semantics
4378 }
4379 return newend - beg;
4380}
4381
// Sorts and/or merges the entries of every local row, operating on the host
// views of the unpacked column indices and values.
//
//   sorted  If false, sort2(cols, cols + rowLen, vals) is called on each row.
//   merged  If false, mergeRowIndicesAndValues(rowLen, cols, vals) is called
//           on each row.
//
// Nothing happens when both flags are true.  Otherwise this throws
// std::runtime_error if the graph is static, and std::logic_error if myGraph_
// is null or the storage is already optimized.  On completion the graph's
// indicesAreSorted_ / noRedundancies_ flags are set for whichever step ran.
4382template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4384 sortAndMergeIndicesAndValues(const bool sorted, const bool merged) {
4385 using ::Tpetra::Details::ProfilingRegion;
4386 typedef LocalOrdinal LO;
4387 typedef typename Kokkos::View<LO*, device_type>::host_mirror_type::execution_space
4388 host_execution_space;
4389 typedef Kokkos::RangePolicy<host_execution_space, LO> range_type;
4390 const char tfecfFuncName[] = "sortAndMergeIndicesAndValues: ";
4391 ProfilingRegion regionSAM("Tpetra::CrsMatrix::sortAndMergeIndicesAndValues");
4392
4393 if (!sorted || !merged) {
4394 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->isStaticGraph(), std::runtime_error,
4395 "Cannot sort or merge with "
4396 "\"static\" (const) graph, since the matrix does not own the graph.");
4397 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->myGraph_.is_null(), std::logic_error,
4398 "myGraph_ is null, but "
4399 "this matrix claims ! isStaticGraph(). "
4400 "Please report this bug to the Tpetra developers.");
4401 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(this->isStorageOptimized(), std::logic_error,
4402 "It is invalid to call "
4403 "this method if the graph's storage has already been optimized. "
4404 "Please report this bug to the Tpetra developers.");
4405
4406 crs_graph_type& graph = *(this->myGraph_);
4407 const LO lclNumRows = static_cast<LO>(this->getLocalNumRows());
4408 size_t totalNumDups = 0;
4409 {
4410 // Accessing host unpacked (4-array CRS) local matrix.
4411 auto rowBegins_ = graph.getRowPtrsUnpackedHost();
4412 auto rowLengths_ = graph.k_numRowEntries_;
4413 auto vals_ = this->valuesUnpacked_wdv.getHostView(Access::ReadWrite);
4414 auto cols_ = graph.lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
// One iteration per local row on the host execution space; the reduction
// result is written to totalNumDups.
4415 Kokkos::parallel_reduce(
4416 "sortAndMergeIndicesAndValues", range_type(0, lclNumRows),
4417 [=](const LO lclRow, size_t& numDups) {
4418 size_t rowBegin = rowBegins_(lclRow);
4419 size_t rowLen = rowLengths_(lclRow);
4420 LO* cols = cols_.data() + rowBegin;
4421 impl_scalar_type* vals = vals_.data() + rowBegin;
4422 if (!sorted) {
4423 sort2(cols, cols + rowLen, vals);
4424 }
4425 if (!merged) {
4426 size_t newRowLength = mergeRowIndicesAndValues(rowLen, cols, vals);
// NOTE(review): the statements that consume newRowLength are missing from
// this listing; presumably they store the new row length and accumulate
// the duplicate count into numDups -- TODO confirm.
4429 }
4430 },
4431 totalNumDups);
4432 }
4433 if (!sorted) {
4434 graph.indicesAreSorted_ = true; // we just sorted every row
4435 }
4436 if (!merged) {
4437 graph.noRedundancies_ = true; // we just merged every row
4438 }
4439 }
4440}
4441
// NOTE(review): the signature lines naming this method and its X_in / Y_in
// parameters are missing from this listing; apply() invokes it as
// applyNonTranspose(X, Y, alpha, beta) -- TODO confirm.
//
// Non-transpose sparse matrix-multivector product,
// Y_in = beta*Y_in + alpha*A*X_in, in these stages:
//   1. alpha == 0 shortcut: Y_in is zeroed (beta == 0) or scaled by beta.
//   2. If the graph has an importer, X_in is Imported into a column Map MV.
//   3. localApply() computes the on-process product.
//   4. If the graph has an exporter, the row Map result is Exported into Y_in
//      with ADD_ASSIGN after Y_in has been zeroed or scaled by beta.
//   5. If Y_in is not distributed and there is more than one process,
//      Y_in.reduce() is called at the end.
// alpha and beta are taken by value; beta is overwritten locally in the
// replicated-output case.
4442template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4446 Scalar alpha,
4447 Scalar beta) const {
4448 using Teuchos::RCP;
4449 using Teuchos::rcp;
4450 using Teuchos::rcp_const_cast;
4451 using Teuchos::rcpFromRef;
4453 const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero();
4454 const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one();
4455
4456 // mfh 05 Jun 2014: Special case for alpha == 0. I added this to
4457 // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
4458 // failing only for the Kokkos refactor version of Tpetra. It's a
4459 // good idea regardless to have the bypass.
4460 if (alpha == ZERO) {
4461 if (beta == ZERO) {
4462 Y_in.putScalar(ZERO);
4463 } else if (beta != ONE) {
4464 Y_in.scale(beta);
4465 }
4466 return;
4467 }
4468
4469 // It's possible that X is a view of Y or vice versa. We don't
4470 // allow this (apply() requires that X and Y not alias one
4471 // another), but it's helpful to detect and work around this case.
4472 // We don't try to detect the more subtle cases (e.g., one is a
4473 // subview of the other, but their initial pointers differ). We
4474 // only need to do this if this matrix's Import is trivial;
4475 // otherwise, we don't actually apply the operator from X into Y.
4476
4477 RCP<const import_type> importer = this->getGraph()->getImporter();
4478 RCP<const export_type> exporter = this->getGraph()->getExporter();
4479
4480 // If beta == 0, then the output MV will be overwritten; none of
4481 // its entries should be read. (Sparse BLAS semantics say that we
4482 // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
4483 // This matters if we need to do an Export operation; see below.
4484 const bool Y_is_overwritten = (beta == ZERO);
4485
4486 // We treat the case of a replicated MV output specially.
4487 const bool Y_is_replicated =
4488 (!Y_in.isDistributed() && this->getComm()->getSize() != 1);
4489
4490 // This is part of the special case for replicated MV output.
4491 // We'll let each process do its thing, but do an all-reduce at
4492 // the end to sum up the results. Setting beta=0 on all processes
4493 // but Proc 0 makes the math work out for the all-reduce. (This
4494 // assumes that the replicated data is correctly replicated, so
4495 // that the data are the same on all processes.)
4496 if (Y_is_replicated && this->getComm()->getRank() > 0) {
4497 beta = ZERO;
4498 }
4499
4500 // Temporary MV for Import operation. After the block of code
4501 // below, this will be an (Imported if necessary) column Map MV
4502 // ready to give to localApply(...).
// NOTE(review): the declaration of X_colMap and the statements assigning it
// in each branch below are missing from this listing.
4504 if (importer.is_null()) {
4505 if (!X_in.isConstantStride()) {
4506 // Not all sparse mat-vec kernels can handle an input MV with
4507 // nonconstant stride correctly, so we have to copy it in that
4508 // case into a constant stride MV. To make a constant stride
4509 // copy of X_in, we force creation of the column (== domain)
4510 // Map MV (if it hasn't already been created, else fetch the
4511 // cached copy). This avoids creating a new MV each time.
4512 RCP<MV> X_colMapNonConst = getColumnMapMultiVector(X_in, true);
4515 } else {
4516 // The domain and column Maps are the same, so do the local
4517 // multiply using the domain Map input MV X_in.
4519 }
4520 } else { // need to Import source (multi)vector
4521 ProfilingRegion regionImport("Tpetra::CrsMatrix::apply: Import");
4522
4523 // We're doing an Import anyway, which will copy the relevant
4524 // elements of the domain Map MV X_in into a separate column Map
4525 // MV. Thus, we don't have to worry whether X_in is constant
4526 // stride.
4527 RCP<MV> X_colMapNonConst = getColumnMapMultiVector(X_in);
4528
4529 // Import from the domain Map MV to the column Map MV.
4530 X_colMapNonConst->doImport(X_in, *importer, INSERT);
4532 }
4533
4534 // Temporary MV for doExport (if needed), or for copying a
4535 // nonconstant stride output MV into a constant stride MV. This
4536 // is null if we don't need the temporary MV, that is, if the
4537 // Export is trivial (null).
4538 RCP<MV> Y_rowMap = getRowMapMultiVector(Y_in);
4539
4540 // If we have a nontrivial Export object, we must perform an
4541 // Export. In that case, the local multiply result will go into
4542 // the row Map multivector. We don't have to make a
4543 // constant-stride version of Y_in in this case, because we had to
4544 // make a constant stride Y_rowMap MV and do an Export anyway.
4545 if (!exporter.is_null()) {
4546 this->localApply(*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, ZERO);
4547 {
4548 ProfilingRegion regionExport("Tpetra::CrsMatrix::apply: Export");
4549
4550 // If we're overwriting the output MV Y_in completely (beta ==
4551 // 0), then make sure that it is filled with zeros before we
4552 // do the Export. Otherwise, the ADD combine mode will use
4553 // data in Y_in, which is supposed to be zero.
4554 if (Y_is_overwritten) {
4555 Y_in.putScalar(ZERO);
4556 } else {
4557 // Scale output MV by beta, so that doExport sums in the
4558 // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
4559 Y_in.scale(beta);
4560 }
4561 // Do the Export operation.
4562 Y_in.doExport(*Y_rowMap, *exporter, ADD_ASSIGN);
4563 }
4564 } else { // Don't do an Export: row Map and range Map are the same.
4565 //
4566 // If Y_in does not have constant stride, or if the column Map
4567 // MV aliases Y_in, then we can't let the kernel write directly
4568 // to Y_in. Instead, we have to use the cached row (== range)
4569 // Map MV as temporary storage.
4570 //
4571 // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4572 // the user passed in the same MultiVector for both X and Y. It
4573 // won't detect whether one MultiVector views the other. We
4574 // should also check the MultiVectors' raw data pointers.
4575 if (!Y_in.isConstantStride() || X_colMap.getRawPtr() == &Y_in) {
4576 // Force creating the MV if it hasn't been created already.
4577 // This will reuse a previously created cached MV.
4578 Y_rowMap = getRowMapMultiVector(Y_in, true);
4579
4580 // If beta == 0, we don't need to copy Y_in into Y_rowMap,
4581 // since we're overwriting it anyway.
4582 if (beta != ZERO) {
4584 }
4585 this->localApply(*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, beta);
// NOTE(review): the statement that copies Y_rowMap back into Y_in is
// presumably on a line missing from this listing -- TODO confirm.
4587 } else {
4588 this->localApply(*X_colMap, Y_in, Teuchos::NO_TRANS, alpha, beta);
4589 }
4590 }
4591
4592 // If the range Map is a locally replicated Map, sum up
4593 // contributions from each process. We set beta = 0 on all
4594 // processes but Proc 0 initially, so this will handle the scaling
4595 // factor beta correctly.
4596 if (Y_is_replicated) {
4597 ProfilingRegion regionReduce("Tpetra::CrsMatrix::apply: Reduce Y");
4598 Y_in.reduce();
4599 }
4600}
4601
// NOTE(review): the signature lines naming this method and its X_in / Y_in
// parameters are missing from this listing; apply() invokes it as
// applyTranspose(X, Y, mode, alpha, beta) -- TODO confirm.
//
// Transpose / conjugate-transpose product.  The roles of the graph's Import
// and Export objects are swapped relative to the non-transpose case:
//   - if an exporter exists, X_in is brought into exportMV_ with doImport
//     and the multiply reads from exportMV_;
//   - if an importer exists, the local result is computed into importMV_ and
//     then added into Y_in with doExport(..., ADD_ASSIGN).
// importMV_ and exportMV_ are cached members; each is reallocated when its
// number of vectors differs from X_in's.
// alpha == 0 is handled up front without touching the matrix or X_in.
4602template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4606 const Teuchos::ETransp mode,
4607 Scalar alpha,
4608 Scalar beta) const {
4609 using Teuchos::null;
4610 using Teuchos::RCP;
4611 using Teuchos::rcp;
4612 using Teuchos::rcp_const_cast;
4613 using Teuchos::rcpFromRef;
4615 const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero();
4616
4617 // Take shortcuts for alpha == 0.
4618 if (alpha == ZERO) {
4619 // Follow the Sparse BLAS convention by ignoring both the matrix
4620 // and X_in, in this case.
4621 if (beta == ZERO) {
4622 // Follow the Sparse BLAS convention by overwriting any Inf or
4623 // NaN values in Y_in, in this case.
4624 Y_in.putScalar(ZERO);
4625 } else {
4626 Y_in.scale(beta);
4627 }
4628 return;
4629 }
4630
4631 const size_t numVectors = X_in.getNumVectors();
4632
4633 // We don't allow X_in and Y_in to alias one another. It's hard
4634 // to check this, because advanced users could create views from
4635 // raw pointers. However, if X_in and Y_in reference the same
4636 // object, we will do the user a favor by copying X into new
4637 // storage (with a warning). We only need to do this if we have
4638 // trivial importers; otherwise, we don't actually apply the
4639 // operator from X into Y.
4640 RCP<const import_type> importer = this->getGraph()->getImporter();
4641 RCP<const export_type> exporter = this->getGraph()->getExporter();
4642 // access X indirectly, in case we need to create temporary storage
// NOTE(review): the declaration of X is on a line missing from this listing.
4644
4645 // some parameters for below
4646 const bool Y_is_replicated = (!Y_in.isDistributed() && this->getComm()->getSize() != 1);
4647 const bool Y_is_overwritten = (beta == ZERO);
// On every rank but 0, drop the beta*Y_in term so that the final reduce()
// does not count it more than once.
4648 if (Y_is_replicated && this->getComm()->getRank() > 0) {
4649 beta = ZERO;
4650 }
4651
4652 // The kernels do not allow input or output with nonconstant stride.
4653 if (!X_in.isConstantStride() && importer.is_null()) {
4654 X = rcp(new MV(X_in, Teuchos::Copy)); // Constant-stride copy of X_in
4655 } else {
4656 X = rcpFromRef(X_in); // Reference to X_in
4657 }
4658
4659 // Set up temporary multivectors for Import and/or Export.
4660 if (importer != Teuchos::null) {
4661 if (importMV_ != Teuchos::null && importMV_->getNumVectors() != numVectors) {
4662 importMV_ = null;
4663 }
4664 if (importMV_ == null) {
4665 importMV_ = rcp(new MV(this->getColMap(), numVectors));
4666 }
4667 }
4668 if (exporter != Teuchos::null) {
4669 if (exportMV_ != Teuchos::null && exportMV_->getNumVectors() != numVectors) {
4670 exportMV_ = null;
4671 }
4672 if (exportMV_ == null) {
4673 exportMV_ = rcp(new MV(this->getRowMap(), numVectors));
4674 }
4675 }
4676
4677 // If we have a non-trivial exporter, we must import elements that
4678 // are permuted or are on other processors.
4679 if (!exporter.is_null()) {
4680 ProfilingRegion regionImport("Tpetra::CrsMatrix::apply (transpose): Import");
4681 exportMV_->doImport(X_in, *exporter, INSERT);
4682 X = exportMV_; // multiply out of exportMV_
4683 }
4684
4685 // If we have a non-trivial importer, we must export elements that
4686 // are permuted or belong to other processors. We will compute
4687 // solution into the to-be-exported MV; get a view.
4688 if (importer != Teuchos::null) {
4689 ProfilingRegion regionExport("Tpetra::CrsMatrix::apply (transpose): Export");
4690
4691 // FIXME (mfh 18 Apr 2015) Temporary fix suggested by Clark
4692 // Dohrmann on Fri 17 Apr 2015. At some point, we need to go
4693 // back and figure out why this helps. importMV_ SHOULD be
4694 // completely overwritten in the localApply(...) call
4695 // below, because beta == ZERO there.
4696 importMV_->putScalar(ZERO);
4697 // Do the local computation.
4698 this->localApply(*X, *importMV_, mode, alpha, ZERO);
4699
4700 if (Y_is_overwritten) {
4701 Y_in.putScalar(ZERO);
4702 } else {
4703 Y_in.scale(beta);
4704 }
4705 Y_in.doExport(*importMV_, *importer, ADD_ASSIGN);
4706 }
4707 // otherwise, multiply into Y
4708 else {
4709 // can't multiply in-situ; can't multiply into non-strided multivector
4710 //
4711 // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4712 // the user passed in the same MultiVector for both X and Y. It
4713 // won't detect whether one MultiVector views the other. We
4714 // should also check the MultiVectors' raw data pointers.
4715 if (!Y_in.isConstantStride() || X.getRawPtr() == &Y_in) {
4716 // Make a deep copy of Y_in, into which to write the multiply result.
4717 MV Y(Y_in, Teuchos::Copy);
4718 this->localApply(*X, Y, mode, alpha, beta);
// NOTE(review): the statement copying Y back into Y_in is presumably on a
// line missing from this listing -- TODO confirm.
4720 } else {
4721 this->localApply(*X, Y_in, mode, alpha, beta);
4722 }
4723 }
4724
4725 // If the range Map is a locally replicated map, sum the
4726 // contributions from each process. (That's why we set beta=0
4727 // above for all processes but Proc 0.)
4728 if (Y_is_replicated) {
4729 ProfilingRegion regionReduce("Tpetra::CrsMatrix::apply (transpose): Reduce Y");
4730 Y_in.reduce();
4731 }
4732}
4733
// NOTE(review): the signature lines naming this method and its X / Y
// parameters are missing from this listing; the profiling label and
// tfecfFuncName identify it as localApply.
//
// On-process sparse matrix-multivector product via KokkosSparse::spmv, using
// the device views of X (read-only) and Y (read-write).
//
//   mode   NO_TRANS, TRANS or CONJ_TRANS; any other value throws
//          std::invalid_argument.
//   alpha, beta  Scaling factors for the product.
//
// When Details::Behavior::debug() is true, the following are checked and
// reported with std::runtime_error: matching numbers of vectors, local
// lengths of X and Y against the column/row Maps (which Map applies to which
// vector depends on mode), fill-complete state, constant stride of X and Y,
// and that X and Y do not share a nonnull, nonempty data pointer.
//
// applyHelper is created on the first call and reused afterwards.
4734template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4738 const Teuchos::ETransp mode,
4739 const Scalar& alpha,
4740 const Scalar& beta) const {
4741 using Teuchos::NO_TRANS;
4743 ProfilingRegion regionLocalApply("Tpetra::CrsMatrix::localApply");
4744
4745 auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
4746 auto Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
4747
4748 const bool debug = ::Tpetra::Details::Behavior::debug();
4749 if (debug) {
4750 const char tfecfFuncName[] = "localApply: ";
4751 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(X.getNumVectors() != Y.getNumVectors(), std::runtime_error,
4752 "X.getNumVectors() = " << X.getNumVectors() << " != "
4753 "Y.getNumVectors() = "
4754 << Y.getNumVectors() << ".");
4755 const bool transpose = (mode != Teuchos::NO_TRANS);
4756 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!transpose && X.getLocalLength() !=
4757 getColMap()->getLocalNumElements(),
4758 std::runtime_error,
4759 "NO_TRANS case: X has the wrong number of local rows. "
4760 "X.getLocalLength() = "
4761 << X.getLocalLength() << " != "
4762 "getColMap()->getLocalNumElements() = "
4763 << getColMap()->getLocalNumElements() << ".");
4764 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!transpose && Y.getLocalLength() !=
4765 getRowMap()->getLocalNumElements(),
4766 std::runtime_error,
4767 "NO_TRANS case: Y has the wrong number of local rows. "
4768 "Y.getLocalLength() = "
4769 << Y.getLocalLength() << " != "
4770 "getRowMap()->getLocalNumElements() = "
4771 << getRowMap()->getLocalNumElements() << ".");
4773 getRowMap()->getLocalNumElements(),
4774 std::runtime_error,
4775 "TRANS or CONJ_TRANS case: X has the wrong number of local "
4776 "rows. X.getLocalLength() = "
4777 << X.getLocalLength()
4778 << " != getRowMap()->getLocalNumElements() = "
4779 << getRowMap()->getLocalNumElements() << ".");
4781 getColMap()->getLocalNumElements(),
4782 std::runtime_error,
4783 "TRANS or CONJ_TRANS case: Y has the wrong number of local "
4784 "rows. Y.getLocalLength() = "
4785 << Y.getLocalLength()
4786 << " != getColMap()->getLocalNumElements() = "
4787 << getColMap()->getLocalNumElements() << ".");
4788 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!isFillComplete(), std::runtime_error,
4789 "The matrix is not "
4790 "fill complete. You must call fillComplete() (possibly with "
4791 "domain and range Map arguments) without an intervening "
4792 "resumeFill() call before you may call this method.");
4793 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!X.isConstantStride() || !Y.isConstantStride(),
4794 std::runtime_error, "X and Y must be constant stride.");
4795 // If the two pointers are null, then they don't alias one
4796 // another, even though they are equal.
4797 // Kokkos does not guarantee that zero row-extent vectors
4798 // point to different places, so we have to check that too.
4799 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(X_lcl.data() == Y_lcl.data() && X_lcl.data() != nullptr && X_lcl.extent(0) != 0,
4800 std::runtime_error, "X and Y may not alias one another.");
4801 }
4802
4803 auto A_lcl = getLocalMatrixDevice();
4804
4805 if (!applyHelper.get()) {
4806 // The apply helper does not exist, so create it.
4807 // Decide now whether to use the imbalanced row path, or the default.
4808 bool useMergePath = false;
4809#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
4810 // TODO: once https://github.com/kokkos/kokkos-kernels/issues/2166 is fixed,
4811 // we can use SPMV_MERGE_PATH for the native spmv as well.
4812 // Take out this ifdef to enable that.
4813 //
4814 // Until then, only use SPMV_MERGE_PATH when calling cuSPARSE.
4815 if constexpr (std::is_same_v<execution_space, Kokkos::Cuda>) {
4816 LocalOrdinal nrows = getLocalNumRows();
// NOTE(review): the declaration of maxRowImbalance and the threshold test
// guarding useMergePath are on lines missing from this listing.
4818 if (nrows != 0)
4819 maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
4820
4822 useMergePath = true;
4823 }
4824#endif
4825 applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map,
4826 useMergePath ? KokkosSparse::SPMV_MERGE_PATH : KokkosSparse::SPMV_DEFAULT);
4827 }
4828
4829 // Translate mode (Teuchos enum) to KokkosKernels (1-character string)
4830 const char* modeKK = nullptr;
4831 switch (mode) {
4832 case Teuchos::NO_TRANS:
4833 modeKK = KokkosSparse::NoTranspose;
4834 break;
4835 case Teuchos::TRANS:
4836 modeKK = KokkosSparse::Transpose;
4837 break;
4838 case Teuchos::CONJ_TRANS:
4839 modeKK = KokkosSparse::ConjugateTranspose;
4840 break;
4841 default:
4842 throw std::invalid_argument("Tpetra::CrsMatrix::localApply: invalid mode");
4843 }
4844
// Two spmv call paths: one using a matrix obtained from
// applyHelper->getIntRowptrMatrix(A_lcl) with handle_int, the other using
// the default handle.  The remaining spmv arguments are on lines missing
// from this listing.
4845 if (applyHelper->shouldUseIntRowptrs()) {
4846 auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
4847 KokkosSparse::spmv(
4848 &applyHelper->handle_int, modeKK,
4850 } else {
4851 KokkosSparse::spmv(
4852 &applyHelper->handle, modeKK,
4854 }
4855}
4856
// Public sparse matrix-multivector product entry point.
//
// NOTE(review): the signature lines declaring the X and Y parameters are
// missing from this listing.
//
// Throws std::runtime_error if the matrix is not fill complete.  Dispatches
// to applyNonTranspose() when mode == Teuchos::NO_TRANS and to
// applyTranspose() for every other mode; each path runs inside its own
// profiling region.
4857template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4861 Teuchos::ETransp mode,
4862 Scalar alpha,
4863 Scalar beta) const {
4865 const char fnName[] = "Tpetra::CrsMatrix::apply";
4866
4867 TEUCHOS_TEST_FOR_EXCEPTION(!isFillComplete(), std::runtime_error,
4868 fnName << ": Cannot call apply() until fillComplete() "
4869 "has been called.");
4870
4871 if (mode == Teuchos::NO_TRANS) {
4872 ProfilingRegion regionNonTranspose(fnName);
4873 this->applyNonTranspose(X, Y, alpha, beta);
4874 } else {
4875 ProfilingRegion regionTranspose("Tpetra::CrsMatrix::apply (transpose)");
4876 this->applyTranspose(X, Y, mode, alpha, beta);
4877 }
4878}
4879
// Returns a new, fill-complete CrsMatrix with scalar type T that is built on
// this matrix's graph (getCrsGraph()) and whose values are converted copies
// of this matrix's values.
//
// Throws std::runtime_error if this matrix is not fill complete.
//
// NOTE(review): the declaration of output_matrix_type is on a line missing
// from this listing; from the return type it is presumably
// CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> -- TODO confirm.
4880template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4881template <class T>
4882Teuchos::RCP<CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node>>
4884 convert() const {
4885 using Teuchos::RCP;
4887 const char tfecfFuncName[] = "convert: ";
4888
4889 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!this->isFillComplete(), std::runtime_error,
4890 "This matrix (the source "
4891 "of the conversion) is not fill complete. You must first call "
4892 "fillComplete() (possibly with the domain and range Map) without an "
4893 "intervening call to resumeFill(), before you may call this method.");
4894
4895 RCP<output_matrix_type> newMatrix(new output_matrix_type(this->getCrsGraph()));
4896 // Copy old values into new values. impl_scalar_type and T may
4897 // differ, so we can't use Kokkos::deep_copy.
4898 using ::Tpetra::Details::copyConvert;
4899 copyConvert(newMatrix->getLocalMatrixDevice().values,
4900 this->getLocalMatrixDevice().values);
4901 // Since newMatrix has a static (const) graph, the graph already has
4902 // a column Map, and Import and Export objects already exist (if
4903 // applicable). Thus, calling fillComplete is cheap.
4904 newMatrix->fillComplete(this->getDomainMap(), this->getRangeMap());
4905
4906 return newMatrix;
4907}
4908
// Debug-only consistency check of the matrix's internal state.  When the
// debug flag below is false this does nothing; otherwise each violated
// invariant throws std::logic_error.
//
// NOTE(review): the debug flag is queried with the key "CrsGraph" although
// this is a CrsMatrix method -- confirm whether that key is intended.
4909template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4911 checkInternalState() const {
4912 const bool debug = ::Tpetra::Details::Behavior::debug("CrsGraph");
4913 if (debug) {
4914 const char tfecfFuncName[] = "checkInternalState: ";
4915 const char err[] =
4916 "Internal state is not consistent. "
4917 "Please report this bug to the Tpetra developers.";
4918
4919 // This version of the graph (RCP<const crs_graph_type>) must
4920 // always be nonnull.
4921 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(staticGraph_.is_null(), std::logic_error, err);
4922 // myGraph == null means that the matrix has a const ("static")
4923 // graph. Otherwise, the matrix has a dynamic graph (it owns its
4924 // graph).
4925 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!myGraph_.is_null() && myGraph_ != staticGraph_,
4926 std::logic_error, err);
4927 // if matrix is fill complete, then graph must be fill complete
4928 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillComplete() && !staticGraph_->isFillComplete(),
4929 std::logic_error, err << " Specifically, the matrix is fill complete, "
4930 "but its graph is NOT fill complete.");
4931 // If the graph's indices are allocated, its local allocation size and
4932 // local row count are both positive, then the matrix's unpacked values
// must have nonzero extent as well.
4933 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(staticGraph_->indicesAreAllocated() &&
4934 staticGraph_->getLocalAllocationSize() > 0 &&
4935 staticGraph_->getLocalNumRows() > 0 &&
4936 valuesUnpacked_wdv.extent(0) == 0,
4937 std::logic_error, err);
4938 }
4939}
4940
// Returns a one-line, brace-delimited summary of the matrix: the object label
// (if nonempty), whether the matrix is fill complete, and its global
// dimensions.  The global number of entries is included only when the matrix
// is fill complete.
4941template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4942std::string
4944 description() const {
4945 std::ostringstream os;
4946
4947 os << "Tpetra::CrsMatrix (Kokkos refactor): {";
4948 if (this->getObjectLabel() != "") {
4949 os << "Label: \"" << this->getObjectLabel() << "\", ";
4950 }
4951 if (isFillComplete()) {
4952 os << "isFillComplete: true"
4953 << ", global dimensions: [" << getGlobalNumRows() << ", "
4954 << getGlobalNumCols() << "]"
4955 << ", global number of entries: " << getGlobalNumEntries()
4956 << "}";
4957 } else {
4958 os << "isFillComplete: false"
4959 << ", global dimensions: [" << getGlobalNumRows() << ", "
4960 << getGlobalNumCols() << "]}";
4961 }
4962 return os.str();
4963}
4964
4965template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4967 describe(Teuchos::FancyOStream& out,
4968 const Teuchos::EVerbosityLevel verbLevel) const {
4969 using std::endl;
4970 using std::setw;
4971 using Teuchos::ArrayView;
4972 using Teuchos::Comm;
4973 using Teuchos::RCP;
4974 using Teuchos::TypeNameTraits;
4975 using Teuchos::VERB_DEFAULT;
4976 using Teuchos::VERB_EXTREME;
4977 using Teuchos::VERB_HIGH;
4978 using Teuchos::VERB_LOW;
4979 using Teuchos::VERB_MEDIUM;
4980 using Teuchos::VERB_NONE;
4981
4982 const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4983
4984 if (vl == VERB_NONE) {
4985 return; // Don't print anything at all
4986 }
4987
4988 // By convention, describe() always begins with a tab.
4989 Teuchos::OSTab tab0(out);
4990
4991 RCP<const Comm<int>> comm = this->getComm();
4992 const int myRank = comm->getRank();
4993 const int numProcs = comm->getSize();
4994 size_t width = 1;
4995 for (size_t dec = 10; dec < getGlobalNumRows(); dec *= 10) {
4996 ++width;
4997 }
4998 width = std::max<size_t>(width, static_cast<size_t>(11)) + 2;
4999
5000 // none: print nothing
5001 // low: print O(1) info from node 0
5002 // medium: print O(P) info, num entries per process
5003 // high: print O(N) info, num entries per row
5004 // extreme: print O(NNZ) info: print indices and values
5005 //
5006 // for medium and higher, print constituent objects at specified verbLevel
5007 if (myRank == 0) {
5008 out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
5009 }
5010 Teuchos::OSTab tab1(out);
5011
5012 if (myRank == 0) {
5013 if (this->getObjectLabel() != "") {
5014 out << "Label: \"" << this->getObjectLabel() << "\", ";
5015 }
5016 {
5017 out << "Template parameters:" << endl;
5018 Teuchos::OSTab tab2(out);
5019 out << "Scalar: " << TypeNameTraits<Scalar>::name() << endl
5020 << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name() << endl
5021 << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name() << endl
5022 << "Node: " << TypeNameTraits<Node>::name() << endl;
5023 }
5024 if (isFillComplete()) {
5025 out << "isFillComplete: true" << endl
5026 << "Global dimensions: [" << getGlobalNumRows() << ", "
5027 << getGlobalNumCols() << "]" << endl
5028 << "Global number of entries: " << getGlobalNumEntries() << endl
5029 << endl
5030 << "Global max number of entries in a row: "
5031 << getGlobalMaxNumRowEntries() << endl;
5032 } else {
5033 out << "isFillComplete: false" << endl
5034 << "Global dimensions: [" << getGlobalNumRows() << ", "
5035 << getGlobalNumCols() << "]" << endl;
5036 }
5037 }
5038
5039 if (vl < VERB_MEDIUM) {
5040 return; // all done!
5041 }
5042
5043 // Describe the row Map.
5044 if (myRank == 0) {
5045 out << endl
5046 << "Row Map:" << endl;
5047 }
5048 if (getRowMap().is_null()) {
5049 if (myRank == 0) {
5050 out << "null" << endl;
5051 }
5052 } else {
5053 if (myRank == 0) {
5054 out << endl;
5055 }
5056 getRowMap()->describe(out, vl);
5057 }
5058
5059 // Describe the column Map.
5060 if (myRank == 0) {
5061 out << "Column Map: ";
5062 }
5063 if (getColMap().is_null()) {
5064 if (myRank == 0) {
5065 out << "null" << endl;
5066 }
5067 } else if (getColMap() == getRowMap()) {
5068 if (myRank == 0) {
5069 out << "same as row Map" << endl;
5070 }
5071 } else {
5072 if (myRank == 0) {
5073 out << endl;
5074 }
5075 getColMap()->describe(out, vl);
5076 }
5077
5078 // Describe the domain Map.
5079 if (myRank == 0) {
5080 out << "Domain Map: ";
5081 }
5082 if (getDomainMap().is_null()) {
5083 if (myRank == 0) {
5084 out << "null" << endl;
5085 }
5086 } else if (getDomainMap() == getRowMap()) {
5087 if (myRank == 0) {
5088 out << "same as row Map" << endl;
5089 }
5090 } else if (getDomainMap() == getColMap()) {
5091 if (myRank == 0) {
5092 out << "same as column Map" << endl;
5093 }
5094 } else {
5095 if (myRank == 0) {
5096 out << endl;
5097 }
5098 getDomainMap()->describe(out, vl);
5099 }
5100
5101 // Describe the range Map.
5102 if (myRank == 0) {
5103 out << "Range Map: ";
5104 }
5105 if (getRangeMap().is_null()) {
5106 if (myRank == 0) {
5107 out << "null" << endl;
5108 }
5109 } else if (getRangeMap() == getDomainMap()) {
5110 if (myRank == 0) {
5111 out << "same as domain Map" << endl;
5112 }
5113 } else if (getRangeMap() == getRowMap()) {
5114 if (myRank == 0) {
5115 out << "same as row Map" << endl;
5116 }
5117 } else {
5118 if (myRank == 0) {
5119 out << endl;
5120 }
5121 getRangeMap()->describe(out, vl);
5122 }
5123
5124 // O(P) data
5125 for (int curRank = 0; curRank < numProcs; ++curRank) {
5126 if (myRank == curRank) {
5127 out << "Process rank: " << curRank << endl;
5128 Teuchos::OSTab tab2(out);
5129 if (!staticGraph_->indicesAreAllocated()) {
5130 out << "Graph indices not allocated" << endl;
5131 } else {
5132 out << "Number of allocated entries: "
5133 << staticGraph_->getLocalAllocationSize() << endl;
5134 }
5135 out << "Number of entries: " << getLocalNumEntries() << endl
5136 << "Max number of entries per row: " << getLocalMaxNumRowEntries()
5137 << endl;
5138 }
5139 // Give output time to complete by executing some barriers.
5140 comm->barrier();
5141 comm->barrier();
5142 comm->barrier();
5143 }
5144
5145 if (vl < VERB_HIGH) {
5146 return; // all done!
5147 }
5148
5149 // O(N) and O(NNZ) data
5150 for (int curRank = 0; curRank < numProcs; ++curRank) {
5151 if (myRank == curRank) {
5152 out << std::setw(width) << "Proc Rank"
5153 << std::setw(width) << "Global Row"
5154 << std::setw(width) << "Num Entries";
5155 if (vl == VERB_EXTREME) {
5156 out << std::setw(width) << "(Index,Value)";
5157 }
5158 out << endl;
5159 for (size_t r = 0; r < getLocalNumRows(); ++r) {
5160 const size_t nE = getNumEntriesInLocalRow(r);
5161 GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
5162 out << std::setw(width) << myRank
5163 << std::setw(width) << gid
5164 << std::setw(width) << nE;
5165 if (vl == VERB_EXTREME) {
5166 if (isGloballyIndexed()) {
5167 global_inds_host_view_type rowinds;
5168 values_host_view_type rowvals;
5169 getGlobalRowView(gid, rowinds, rowvals);
5170 for (size_t j = 0; j < nE; ++j) {
5171 out << " (" << rowinds[j]
5172 << ", " << rowvals[j]
5173 << ") ";
5174 }
5175 } else if (isLocallyIndexed()) {
5176 local_inds_host_view_type rowinds;
5177 values_host_view_type rowvals;
5178 getLocalRowView(r, rowinds, rowvals);
5179 for (size_t j = 0; j < nE; ++j) {
5180 out << " (" << getColMap()->getGlobalElement(rowinds[j])
5181 << ", " << rowvals[j]
5182 << ") ";
5183 }
5184 } // globally or locally indexed
5185 } // vl == VERB_EXTREME
5186 out << endl;
5187 } // for each row r on this process
5188 } // if (myRank == curRank)
5189
5190 // Give output time to complete
5191 comm->barrier();
5192 comm->barrier();
5193 comm->barrier();
5194 } // for each process p
5195}
5196
5197template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5200 // It's not clear what kind of compatibility checks on sizes can
5201 // be performed here. Epetra_CrsGraph doesn't check any sizes for
5202 // compatibility.
5203
5204 // Currently, the source object must be a RowMatrix with the same
5205 // four template parameters as the target CrsMatrix. We might
5206 // relax this requirement later.
5207 const row_matrix_type* srcRowMat =
5208 dynamic_cast<const row_matrix_type*>(&source);
5209 return (srcRowMat != nullptr);
5210}
5211
5212template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5215 const typename crs_graph_type::padding_type& padding,
5216 const bool verbose) {
5219 using std::endl;
5220 using LO = local_ordinal_type;
5221 using row_ptrs_type =
5222 typename local_graph_device_type::row_map_type::non_const_type;
5223 using range_policy =
5224 Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
5225 const char tfecfFuncName[] = "applyCrsPadding";
5226 const char suffix[] =
5227 ". Please report this bug to the Tpetra developers.";
5228 ProfilingRegion regionCAP("Tpetra::CrsMatrix::applyCrsPadding");
5229
5230 std::unique_ptr<std::string> prefix;
5231 if (verbose) {
5232 prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
5233 std::ostringstream os;
5234 os << *prefix << "padding: ";
5235 padding.print(os);
5236 os << endl;
5237 std::cerr << os.str();
5238 }
5239 const int myRank = !verbose ? -1 : [&]() {
5240 auto map = this->getMap();
5241 if (map.is_null()) {
5242 return -1;
5243 }
5244 auto comm = map->getComm();
5245 if (comm.is_null()) {
5246 return -1;
5247 }
5248 return comm->getRank();
5249 }();
5250
5251 // NOTE (mfh 29 Jan 2020) This allocates the values array.
5252 if (!myGraph_->indicesAreAllocated()) {
5253 if (verbose) {
5254 std::ostringstream os;
5255 os << *prefix << "Call allocateIndices" << endl;
5256 std::cerr << os.str();
5257 }
5258 allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
5259 }
5260
5261 // FIXME (mfh 10 Feb 2020) We shouldn't actually reallocate
5262 // row_ptrs_beg or allocate row_ptrs_end unless the allocation
5263 // size needs to increase. That should be the job of
5264 // padCrsArrays.
5265
5266 // Making copies here because rowPtrsUnpacked_ has a const type. Otherwise, we
5267 // would use it directly.
5268
5269 if (verbose) {
5270 std::ostringstream os;
5271 os << *prefix << "Allocate row_ptrs_beg: "
5272 << myGraph_->getRowPtrsUnpackedHost().extent(0) << endl;
5273 std::cerr << os.str();
5274 }
5275 using Kokkos::view_alloc;
5276 using Kokkos::WithoutInitializing;
5277 row_ptrs_type row_ptr_beg(view_alloc("row_ptr_beg", WithoutInitializing),
5278 myGraph_->rowPtrsUnpacked_dev_.extent(0));
5279 // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
5280 Kokkos::deep_copy(execution_space(), row_ptr_beg, myGraph_->rowPtrsUnpacked_dev_);
5281
5282 const size_t N = row_ptr_beg.extent(0) == 0 ? size_t(0) : size_t(row_ptr_beg.extent(0) - 1);
5283 if (verbose) {
5284 std::ostringstream os;
5285 os << *prefix << "Allocate row_ptrs_end: " << N << endl;
5286 std::cerr << os.str();
5287 }
5288 row_ptrs_type row_ptr_end(
5289 view_alloc("row_ptr_end", WithoutInitializing), N);
5290
5291 row_ptrs_type num_row_entries_d;
5292
5293 const bool refill_num_row_entries =
5294 myGraph_->k_numRowEntries_.extent(0) != 0;
5295
5296 if (refill_num_row_entries) { // unpacked storage
5297 // We can't assume correct *this capture until C++17, and it's
5298 // likely more efficient just to capture what we need anyway.
5299 num_row_entries_d = create_mirror_view_and_copy(memory_space(),
5300 myGraph_->k_numRowEntries_);
5301 Kokkos::parallel_for(
5302 "Fill end row pointers", range_policy(0, N),
5303 KOKKOS_LAMBDA(const size_t i) {
5304 row_ptr_end(i) = row_ptr_beg(i) + num_row_entries_d(i);
5305 });
5306 } else {
5307 // FIXME (mfh 04 Feb 2020) Fix padCrsArrays so that if packed
5308 // storage, we don't need row_ptr_end to be separate allocation;
5309 // could just have it alias row_ptr_beg+1.
5310 Kokkos::parallel_for(
5311 "Fill end row pointers", range_policy(0, N),
5312 KOKKOS_LAMBDA(const size_t i) {
5313 row_ptr_end(i) = row_ptr_beg(i + 1);
5314 });
5315 }
5316
5317 if (myGraph_->isGloballyIndexed()) {
5318 padCrsArrays(row_ptr_beg, row_ptr_end,
5319 myGraph_->gblInds_wdv,
5320 valuesUnpacked_wdv, padding, myRank, verbose);
5321 const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5322 const auto newColIndsLen = myGraph_->gblInds_wdv.extent(0);
5323 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(newValuesLen != newColIndsLen, std::logic_error,
5324 ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5325 << " != myGraph_->gblInds_wdv.extent(0)=" << newColIndsLen
5326 << suffix);
5327 } else {
5328 padCrsArrays(row_ptr_beg, row_ptr_end,
5329 myGraph_->lclIndsUnpacked_wdv,
5330 valuesUnpacked_wdv, padding, myRank, verbose);
5331 const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5332 const auto newColIndsLen = myGraph_->lclIndsUnpacked_wdv.extent(0);
5333 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(newValuesLen != newColIndsLen, std::logic_error,
5334 ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5335 << " != myGraph_->lclIndsUnpacked_wdv.extent(0)=" << newColIndsLen
5336 << suffix);
5337 }
5338
5339 if (refill_num_row_entries) {
5340 Kokkos::parallel_for(
5341 "Fill num entries", range_policy(0, N),
5342 KOKKOS_LAMBDA(const size_t i) {
5343 num_row_entries_d(i) = row_ptr_end(i) - row_ptr_beg(i);
5344 });
5345 Kokkos::deep_copy(myGraph_->k_numRowEntries_, num_row_entries_d);
5346 }
5347
5348 if (verbose) {
5349 std::ostringstream os;
5350 os << *prefix << "Assign myGraph_->rowPtrsUnpacked_; "
5351 << "old size: " << myGraph_->rowPtrsUnpacked_host_.extent(0)
5352 << ", new size: " << row_ptr_beg.extent(0) << endl;
5353 std::cerr << os.str();
5354 TEUCHOS_ASSERT(myGraph_->getRowPtrsUnpackedHost().extent(0) ==
5355 row_ptr_beg.extent(0));
5356 }
5357 myGraph_->setRowPtrsUnpacked(row_ptr_beg);
5358}
5359
5360template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5361void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5362 copyAndPermuteStaticGraph(
5363 const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5364 const size_t numSameIDs,
5365 const LocalOrdinal permuteToLIDs[],
5366 const LocalOrdinal permuteFromLIDs[],
5367 const size_t numPermutes) {
5368 using Details::ProfilingRegion;
5369 using std::endl;
5370 using Teuchos::Array;
5371 using Teuchos::ArrayView;
5372 using LO = LocalOrdinal;
5373 using GO = GlobalOrdinal;
5374 const char tfecfFuncName[] = "copyAndPermuteStaticGraph";
5375 const char suffix[] =
5376 " Please report this bug to the Tpetra developers.";
5377 ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermuteStaticGraph");
5378
5379 const bool debug = Details::Behavior::debug("CrsGraph");
5380 const bool verbose = Details::Behavior::verbose("CrsGraph");
5381 std::unique_ptr<std::string> prefix;
5382 if (verbose) {
5383 prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5384 std::ostringstream os;
5385 os << *prefix << "Start" << endl;
5386 }
5387 const char* const prefix_raw =
5388 verbose ? prefix.get()->c_str() : nullptr;
5389
5390 const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed();
5391 //
5392 // Copy the first numSame row from source to target (this matrix).
5393 // This involves copying rows corresponding to LIDs [0, numSame-1].
5394 //
5395 const map_type& srcRowMap = *(srcMat.getRowMap());
5396 nonconst_global_inds_host_view_type rowInds;
5397 nonconst_values_host_view_type rowVals;
5398 const LO numSameIDs_as_LID = static_cast<LO>(numSameIDs);
5399 for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5400 // Global ID for the current row index in the source matrix.
5401 // The first numSameIDs GIDs in the two input lists are the
5402 // same, so sourceGID == targetGID in this case.
5403 const GO sourceGID = srcRowMap.getGlobalElement(sourceLID);
5404 const GO targetGID = sourceGID;
5405
5406 ArrayView<const GO> rowIndsConstView;
5407 ArrayView<const Scalar> rowValsConstView;
5408
5409 if (sourceIsLocallyIndexed) {
5410 const size_t rowLength = srcMat.getNumEntriesInGlobalRow(sourceGID);
5411 if (rowLength > static_cast<size_t>(rowInds.size())) {
5412 Kokkos::resize(rowInds, rowLength);
5413 Kokkos::resize(rowVals, rowLength);
5414 }
5415 // Resizing invalidates an Array's views, so we must make new
5416 // ones, even if rowLength hasn't changed.
5417 nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds, std::make_pair((size_t)0, rowLength));
5418 nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals, std::make_pair((size_t)0, rowLength));
5419
5420 // The source matrix is locally indexed, so we have to get a
5421 // copy. Really it's the GIDs that have to be copied (because
5422 // they have to be converted from LIDs).
5423 size_t checkRowLength = 0;
5424 srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5425 rowValsView, checkRowLength);
5426 if (debug) {
5427 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowLength != checkRowLength, std::logic_error,
5428 "For "
5429 "global row index "
5430 << sourceGID << ", the source "
5431 "matrix's getNumEntriesInGlobalRow returns a row length "
5432 "of "
5433 << rowLength << ", but getGlobalRowCopy reports "
5434 "a row length of "
5435 << checkRowLength << "." << suffix);
5436 }
5437
5438 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5439 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5440 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5441 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5442 rowIndsConstView = Teuchos::ArrayView<const GO>( // BAD BAD BAD
5443 rowIndsView.data(), rowIndsView.extent(0),
5444 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5445 rowValsConstView = Teuchos::ArrayView<const Scalar>( // BAD BAD BAD
5446 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5447 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5448 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5449 // KDDKDD UVM TEMPORARY: KokkosView interface
5450 } else { // source matrix is globally indexed.
5451 global_inds_host_view_type rowIndsView;
5452 values_host_view_type rowValsView;
5453 srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5454 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5455 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5456 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5457 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5458 rowIndsConstView = Teuchos::ArrayView<const GO>( // BAD BAD BAD
5459 rowIndsView.data(), rowIndsView.extent(0),
5460 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5461 rowValsConstView = Teuchos::ArrayView<const Scalar>( // BAD BAD BAD
5462 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5463 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5464 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5465 // KDDKDD UVM TEMPORARY: KokkosView interface
5466 }
5467
5468 // Applying a permutation to a matrix with a static graph
5469 // means REPLACE-ing entries.
5470 combineGlobalValues(targetGID, rowIndsConstView,
5471 rowValsConstView, REPLACE,
5472 prefix_raw, debug, verbose);
5473 }
5474
5475 if (verbose) {
5476 std::ostringstream os;
5477 os << *prefix << "Do permutes" << endl;
5478 }
5479
5480 const map_type& tgtRowMap = *(this->getRowMap());
5481 for (size_t p = 0; p < numPermutes; ++p) {
5482 const GO sourceGID = srcRowMap.getGlobalElement(permuteFromLIDs[p]);
5483 const GO targetGID = tgtRowMap.getGlobalElement(permuteToLIDs[p]);
5484
5485 ArrayView<const GO> rowIndsConstView;
5486 ArrayView<const Scalar> rowValsConstView;
5487
5488 if (sourceIsLocallyIndexed) {
5489 const size_t rowLength = srcMat.getNumEntriesInGlobalRow(sourceGID);
5490 if (rowLength > static_cast<size_t>(rowInds.size())) {
5491 Kokkos::resize(rowInds, rowLength);
5492 Kokkos::resize(rowVals, rowLength);
5493 }
5494 // Resizing invalidates an Array's views, so we must make new
5495 // ones, even if rowLength hasn't changed.
5496 nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds, std::make_pair((size_t)0, rowLength));
5497 nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals, std::make_pair((size_t)0, rowLength));
5498
5499 // The source matrix is locally indexed, so we have to get a
5500 // copy. Really it's the GIDs that have to be copied (because
5501 // they have to be converted from LIDs).
5502 size_t checkRowLength = 0;
5503 srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5504 rowValsView, checkRowLength);
5505 if (debug) {
5506 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowLength != checkRowLength, std::logic_error,
5507 "For "
5508 "source matrix global row index "
5509 << sourceGID << ", "
5510 "getNumEntriesInGlobalRow returns a row length of "
5511 << rowLength << ", but getGlobalRowCopy a row length of "
5512 << checkRowLength << "." << suffix);
5513 }
5514
5515 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5516 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5517 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5518 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5519 rowIndsConstView = Teuchos::ArrayView<const GO>( // BAD BAD BAD
5520 rowIndsView.data(), rowIndsView.extent(0),
5521 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5522 rowValsConstView = Teuchos::ArrayView<const Scalar>( // BAD BAD BAD
5523 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5524 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5525 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5526 // KDDKDD UVM TEMPORARY: KokkosView interface
5527 } else {
5528 global_inds_host_view_type rowIndsView;
5529 values_host_view_type rowValsView;
5530 srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5531 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5532 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5533 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5534 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5535 rowIndsConstView = Teuchos::ArrayView<const GO>( // BAD BAD BAD
5536 rowIndsView.data(), rowIndsView.extent(0),
5537 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5538 rowValsConstView = Teuchos::ArrayView<const Scalar>( // BAD BAD BAD
5539 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5540 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5541 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5542 // KDDKDD UVM TEMPORARY: KokkosView interface
5543 }
5544
5545 combineGlobalValues(targetGID, rowIndsConstView,
5546 rowValsConstView, REPLACE,
5547 prefix_raw, debug, verbose);
5548 }
5549
5550 if (verbose) {
5551 std::ostringstream os;
5552 os << *prefix << "Done" << endl;
5553 }
5554}
5555
5556template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5557void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5558 copyAndPermuteNonStaticGraph(
5559 const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5560 const size_t numSameIDs,
5561 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs_dv,
5562 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs_dv,
5563 const size_t numPermutes) {
5564 using Details::ProfilingRegion;
5565 using std::endl;
5566 using Teuchos::Array;
5567 using Teuchos::ArrayView;
5568 using LO = LocalOrdinal;
5569 using GO = GlobalOrdinal;
5570 const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph";
5571 const char suffix[] =
5572 " Please report this bug to the Tpetra developers.";
5573 ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph");
5574
5575 const bool debug = Details::Behavior::debug("CrsGraph");
5576 const bool verbose = Details::Behavior::verbose("CrsGraph");
5577 std::unique_ptr<std::string> prefix;
5578 if (verbose) {
5579 prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5580 std::ostringstream os;
5581 os << *prefix << "Start" << endl;
5582 }
5583 const char* const prefix_raw =
5584 verbose ? prefix.get()->c_str() : nullptr;
5585
5586 {
5587 using row_graph_type = RowGraph<LO, GO, Node>;
5588 const row_graph_type& srcGraph = *(srcMat.getGraph());
5589 auto padding =
5590 myGraph_->computeCrsPadding(srcGraph, numSameIDs,
5591 permuteToLIDs_dv, permuteFromLIDs_dv, verbose);
5592 applyCrsPadding(*padding, verbose);
5593 }
5594 const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed();
5595 //
5596 // Copy the first numSame row from source to target (this matrix).
5597 // This involves copying rows corresponding to LIDs [0, numSame-1].
5598 //
5599 const map_type& srcRowMap = *(srcMat.getRowMap());
5600 const LO numSameIDs_as_LID = static_cast<LO>(numSameIDs);
5601 using gids_type = nonconst_global_inds_host_view_type;
5602 using vals_type = nonconst_values_host_view_type;
5603 gids_type rowInds;
5604 vals_type rowVals;
5605 for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5606 // Global ID for the current row index in the source matrix.
5607 // The first numSameIDs GIDs in the two input lists are the
5608 // same, so sourceGID == targetGID in this case.
5609 const GO sourceGID = srcRowMap.getGlobalElement(sourceLID);
5610 const GO targetGID = sourceGID;
5611
5612 ArrayView<const GO> rowIndsConstView;
5613 ArrayView<const Scalar> rowValsConstView;
5614
5615 if (sourceIsLocallyIndexed) {
5616 const size_t rowLength = srcMat.getNumEntriesInGlobalRow(sourceGID);
5617 if (rowLength > static_cast<size_t>(rowInds.extent(0))) {
5618 Kokkos::resize(rowInds, rowLength);
5619 Kokkos::resize(rowVals, rowLength);
5620 }
5621 // Resizing invalidates an Array's views, so we must make new
5622 // ones, even if rowLength hasn't changed.
5623 gids_type rowIndsView = Kokkos::subview(rowInds, std::make_pair((size_t)0, rowLength));
5624 vals_type rowValsView = Kokkos::subview(rowVals, std::make_pair((size_t)0, rowLength));
5625
5626 // The source matrix is locally indexed, so we have to get a
5627 // copy. Really it's the GIDs that have to be copied (because
5628 // they have to be converted from LIDs).
5629 size_t checkRowLength = 0;
5630 srcMat.getGlobalRowCopy(sourceGID, rowIndsView, rowValsView,
5631 checkRowLength);
5632 if (debug) {
5633 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowLength != checkRowLength, std::logic_error,
5634 ": For "
5635 "global row index "
5636 << sourceGID << ", the source "
5637 "matrix's getNumEntriesInGlobalRow returns a row length "
5638 "of "
5639 << rowLength << ", but getGlobalRowCopy reports "
5640 "a row length of "
5641 << checkRowLength << "." << suffix);
5642 }
5643 rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5644 rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar*>(rowValsView.data()), rowLength);
5645 } else { // source matrix is globally indexed.
5646 global_inds_host_view_type rowIndsView;
5647 values_host_view_type rowValsView;
5648 srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5649
5650 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5651 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5652 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5653 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5654 rowIndsConstView = Teuchos::ArrayView<const GO>( // BAD BAD BAD
5655 rowIndsView.data(), rowIndsView.extent(0),
5656 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5657 rowValsConstView = Teuchos::ArrayView<const Scalar>( // BAD BAD BAD
5658 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5659 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5660 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5661 // KDDKDD UVM TEMPORARY: KokkosView interface
5662 }
5663
5664 // Combine the data into the target matrix.
5665 insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5666 rowValsConstView, prefix_raw, debug, verbose);
5667 }
5668
5669 if (verbose) {
5670 std::ostringstream os;
5671 os << *prefix << "Do permutes" << endl;
5672 }
5673 const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data();
5674 const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data();
5675
5676 const map_type& tgtRowMap = *(this->getRowMap());
5677 for (size_t p = 0; p < numPermutes; ++p) {
5678 const GO sourceGID = srcRowMap.getGlobalElement(permuteFromLIDs[p]);
5679 const GO targetGID = tgtRowMap.getGlobalElement(permuteToLIDs[p]);
5680
5681 ArrayView<const GO> rowIndsConstView;
5682 ArrayView<const Scalar> rowValsConstView;
5683
5684 if (sourceIsLocallyIndexed) {
5685 const size_t rowLength = srcMat.getNumEntriesInGlobalRow(sourceGID);
5686 if (rowLength > static_cast<size_t>(rowInds.extent(0))) {
5687 Kokkos::resize(rowInds, rowLength);
5688 Kokkos::resize(rowVals, rowLength);
5689 }
5690 // Resizing invalidates an Array's views, so we must make new
5691 // ones, even if rowLength hasn't changed.
5692 gids_type rowIndsView = Kokkos::subview(rowInds, std::make_pair((size_t)0, rowLength));
5693 vals_type rowValsView = Kokkos::subview(rowVals, std::make_pair((size_t)0, rowLength));
5694
5695 // The source matrix is locally indexed, so we have to get a
5696 // copy. Really it's the GIDs that have to be copied (because
5697 // they have to be converted from LIDs).
5698 size_t checkRowLength = 0;
5699 srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5700 rowValsView, checkRowLength);
5701 if (debug) {
5702 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowLength != checkRowLength, std::logic_error,
5703 "For "
5704 "source matrix global row index "
5705 << sourceGID << ", "
5706 "getNumEntriesInGlobalRow returns a row length of "
5707 << rowLength << ", but getGlobalRowCopy a row length of "
5708 << checkRowLength << "." << suffix);
5709 }
5710 rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5711 rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar*>(rowValsView.data()), rowLength);
5712 } else {
5713 global_inds_host_view_type rowIndsView;
5714 values_host_view_type rowValsView;
5715 srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5716
5717 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5718 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5719 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5720 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5721 rowIndsConstView = Teuchos::ArrayView<const GO>( // BAD BAD BAD
5722 rowIndsView.data(), rowIndsView.extent(0),
5723 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5724 rowValsConstView = Teuchos::ArrayView<const Scalar>( // BAD BAD BAD
5725 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5726 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5727 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5728 // KDDKDD UVM TEMPORARY: KokkosView interface
5729 }
5730
5731 // Combine the data into the target matrix.
5732 insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5733 rowValsConstView, prefix_raw, debug, verbose);
5734 }
5735
5736 if (verbose) {
5737 std::ostringstream os;
5738 os << *prefix << "Done" << endl;
5739 }
5740}
5741
5742template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5745 const SrcDistObject& srcObj,
5746 const size_t numSameIDs,
5747 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
5748 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
5749 const CombineMode /*CM*/) {
5750 using Details::Behavior;
5753 using std::endl;
5754
5755 // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
5756 const char tfecfFuncName[] = "copyAndPermute: ";
5757 ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermute");
5758
5759 const bool verbose = Behavior::verbose("CrsMatrix");
5760 std::unique_ptr<std::string> prefix;
5761 if (verbose) {
5762 prefix = this->createPrefix("CrsMatrix", "copyAndPermute");
5763 std::ostringstream os;
5764 os << *prefix << endl
5765 << *prefix << " numSameIDs: " << numSameIDs << endl
5766 << *prefix << " numPermute: " << permuteToLIDs.extent(0)
5767 << endl
5768 << *prefix << " "
5769 << dualViewStatusToString(permuteToLIDs, "permuteToLIDs")
5770 << endl
5771 << *prefix << " "
5772 << dualViewStatusToString(permuteFromLIDs, "permuteFromLIDs")
5773 << endl
5774 << *prefix << " "
5775 << "isStaticGraph: " << (isStaticGraph() ? "true" : "false")
5776 << endl;
5777 std::cerr << os.str();
5778 }
5779
5780 const auto numPermute = permuteToLIDs.extent(0);
5782 std::invalid_argument, "permuteToLIDs.extent(0) = " << numPermute << "!= permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0) << ".");
5783
5784 // This dynamic cast should succeed, because we've already tested
5785 // it in checkSizes().
5787 const RMT& srcMat = dynamic_cast<const RMT&>(srcObj);
5788 if (isStaticGraph()) {
5789 TEUCHOS_ASSERT(!permuteToLIDs.need_sync_host());
5790 auto permuteToLIDs_h = permuteToLIDs.view_host();
5791 TEUCHOS_ASSERT(!permuteFromLIDs.need_sync_host());
5792 auto permuteFromLIDs_h = permuteFromLIDs.view_host();
5793
5794 copyAndPermuteStaticGraph(srcMat, numSameIDs,
5795 permuteToLIDs_h.data(),
5796 permuteFromLIDs_h.data(),
5797 numPermute);
5798 } else {
5799 copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs,
5801 }
5802
5803 if (verbose) {
5804 std::ostringstream os;
5805 os << *prefix << "Done" << endl;
5806 std::cerr << os.str();
5807 }
5808}
5809
5810template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5813 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
5814 Kokkos::DualView<char*, buffer_device_type>& exports,
5815 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
5816 size_t& constantNumPackets) {
5817 using Details::Behavior;
5820 using std::endl;
5821 using Teuchos::outArg;
5822 using Teuchos::REDUCE_MAX;
5823 using Teuchos::reduceAll;
5824 typedef LocalOrdinal LO;
5825 typedef GlobalOrdinal GO;
5826 const char tfecfFuncName[] = "packAndPrepare: ";
5827 ProfilingRegion regionPAP("Tpetra::CrsMatrix::packAndPrepare");
5828
5829 const bool debug = Behavior::debug("CrsMatrix");
5830 const bool verbose = Behavior::verbose("CrsMatrix");
5831
5832 // Processes on which the communicator is null should not participate.
5833 Teuchos::RCP<const Teuchos::Comm<int>> pComm = this->getComm();
5834 if (pComm.is_null()) {
5835 return;
5836 }
5837 const Teuchos::Comm<int>& comm = *pComm;
5838 const int myRank = comm.getSize();
5839
5840 std::unique_ptr<std::string> prefix;
5841 if (verbose) {
5842 prefix = this->createPrefix("CrsMatrix", "packAndPrepare");
5843 std::ostringstream os;
5844 os << *prefix << "Start" << endl
5845 << *prefix << " "
5846 << dualViewStatusToString(exportLIDs, "exportLIDs")
5847 << endl
5848 << *prefix << " "
5849 << dualViewStatusToString(exports, "exports")
5850 << endl
5851 << *prefix << " "
5852 << dualViewStatusToString(numPacketsPerLID, "numPacketsPerLID")
5853 << endl;
5854 std::cerr << os.str();
5855 }
5856
5857 // Attempt to cast the source object to CrsMatrix. If successful,
5858 // use the source object's packNew() method to pack its data for
5859 // communication. Otherwise, attempt to cast to RowMatrix; if
5860 // successful, use the source object's pack() method. Otherwise,
5861 // the source object doesn't have the right type.
5862 //
5863 // FIXME (mfh 30 Jun 2013, 11 Sep 2017) We don't even need the
5864 // RowMatrix to have the same Node type. Unfortunately, we don't
5865 // have a way to ask if the RowMatrix is "a RowMatrix with any
5866 // Node type," since RowMatrix doesn't have a base class. A
5867 // hypothetical RowMatrixBase<Scalar, LO, GO> class, which does
5868 // not currently exist, would satisfy this requirement.
5869 //
5870 // Why RowMatrixBase<Scalar, LO, GO>? The source object's Scalar
5871 // type doesn't technically need to match the target object's
5872 // Scalar type, so we could just have RowMatrixBase<LO, GO>. LO
5873 // and GO need not be the same, as long as there is no overflow of
5874 // the indices. However, checking for index overflow is global
5875 // and therefore undesirable.
5876
5877 std::ostringstream msg; // for collecting error messages
5878 int lclBad = 0; // to be set below
5879
5880 using crs_matrix_type = CrsMatrix<Scalar, LO, GO, Node>;
5881 const crs_matrix_type* srcCrsMat =
5882 dynamic_cast<const crs_matrix_type*>(&source);
5883 if (srcCrsMat != nullptr) {
5884 if (verbose) {
5885 std::ostringstream os;
5886 os << *prefix << "Source matrix same (CrsMatrix) type as target; "
5887 "calling packNew"
5888 << endl;
5889 std::cerr << os.str();
5890 }
5891 try {
5892 srcCrsMat->packNew(exportLIDs, exports, numPacketsPerLID,
5893 constantNumPackets);
5894 } catch (std::exception& e) {
5895 lclBad = 1;
5896 msg << "Proc " << myRank << ": " << e.what() << std::endl;
5897 }
5898 } else {
5899 using Kokkos::HostSpace;
5900 using Kokkos::subview;
5901 using exports_type = Kokkos::DualView<char*, buffer_device_type>;
5902 using range_type = Kokkos::pair<size_t, size_t>;
5903
5904 if (verbose) {
5905 std::ostringstream os;
5906 os << *prefix << "Source matrix NOT same (CrsMatrix) type as target"
5907 << endl;
5908 std::cerr << os.str();
5909 }
5910
5911 const row_matrix_type* srcRowMat =
5912 dynamic_cast<const row_matrix_type*>(&source);
5913 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(srcRowMat == nullptr, std::invalid_argument,
5914 "The source object of the Import or Export operation is neither a "
5915 "CrsMatrix (with the same template parameters as the target object), "
5916 "nor a RowMatrix (with the same first four template parameters as the "
5917 "target object).");
5918
5919 // For the RowMatrix case, we need to convert from
5920 // Kokkos::DualView to Teuchos::Array*. This doesn't need to be
5921 // so terribly efficient, since packing a non-CrsMatrix
5922 // RowMatrix for Import/Export into a CrsMatrix is not a
5923 // critical case. Thus, we may allocate Teuchos::Array objects
5924 // here and copy to and from Kokkos::*View.
5925
5926 // View exportLIDs's host data as a Teuchos::ArrayView.
5927 TEUCHOS_ASSERT(!exportLIDs.need_sync_host());
5928 auto exportLIDs_h = exportLIDs.view_host();
5929 Teuchos::ArrayView<const LO> exportLIDs_av(exportLIDs_h.data(),
5930 exportLIDs_h.size());
5931
5932 // pack() will allocate exports_a as needed. We'll copy back
5933 // into exports (after (re)allocating exports if needed) below.
5934 Teuchos::Array<char> exports_a;
5935
5936 // View exportLIDs' host data as a Teuchos::ArrayView. We don't
5937 // need to sync, since we're doing write-only access, but we do
5938 // need to mark the DualView as modified on host.
5939
5940 numPacketsPerLID.clear_sync_state(); // write-only access
5941 numPacketsPerLID.modify_host();
5942 auto numPacketsPerLID_h = numPacketsPerLID.view_host();
5943 Teuchos::ArrayView<size_t> numPacketsPerLID_av(numPacketsPerLID_h.data(),
5944 numPacketsPerLID_h.size());
5945
5946 // Invoke RowMatrix's legacy pack() interface, using above
5947 // Teuchos::Array* objects.
5948 try {
5949 srcRowMat->pack(exportLIDs_av, exports_a, numPacketsPerLID_av,
5950 constantNumPackets);
5951 } catch (std::exception& e) {
5952 lclBad = 1;
5953 msg << "Proc " << myRank << ": " << e.what() << std::endl;
5954 }
5955
5956 // Allocate 'exports', and copy exports_a back into it.
5957 const size_t newAllocSize = static_cast<size_t>(exports_a.size());
5958 if (static_cast<size_t>(exports.extent(0)) < newAllocSize) {
5959 const std::string oldLabel = exports.view_device().label();
5960 const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
5961 exports = exports_type(newLabel, newAllocSize);
5962 }
5963 // It's safe to assume that we're working on host anyway, so
5964 // just keep exports sync'd to host.
5965 // ignore current device contents
5966 exports.modify_host();
5967
5968 auto exports_h = exports.view_host();
5969 auto exports_h_sub = subview(exports_h, range_type(0, newAllocSize));
5970
5971 // Kokkos::deep_copy needs a Kokkos::View input, so turn
5972 // exports_a into a nonowning Kokkos::View first before copying.
5973 typedef typename exports_type::t_host::execution_space HES;
5974 typedef Kokkos::Device<HES, HostSpace> host_device_type;
5975 Kokkos::View<const char*, host_device_type>
5976 exports_a_kv(exports_a.getRawPtr(), newAllocSize);
5977 // DEEP_COPY REVIEW - NOT TESTED
5978 Kokkos::deep_copy(exports_h_sub, exports_a_kv);
5979 }
5980
5981 if (debug) {
5982 int gblBad = 0; // output argument; to be set below
5983 reduceAll<int, int>(comm, REDUCE_MAX, lclBad, outArg(gblBad));
5984 if (gblBad != 0) {
5985 Tpetra::Details::gathervPrint(std::cerr, msg.str(), comm);
5986 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error,
5987 "packNew() or pack() threw an exception on "
5988 "one or more participating processes.");
5989 }
5990 } else {
5991 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(lclBad != 0, std::logic_error,
5992 "packNew threw an exception on one "
5993 "or more participating processes. Here is this process' error "
5994 "message: "
5995 << msg.str());
5996 }
5997
5998 if (verbose) {
5999 std::ostringstream os;
6000 os << *prefix << "packAndPrepare: Done!" << endl
6001 << *prefix << " "
6002 << dualViewStatusToString(exportLIDs, "exportLIDs")
6003 << endl
6004 << *prefix << " "
6005 << dualViewStatusToString(exports, "exports")
6006 << endl
6007 << *prefix << " "
6008 << dualViewStatusToString(numPacketsPerLID, "numPacketsPerLID")
6009 << endl;
6010 std::cerr << os.str();
6011 }
6012}
6013
6014template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6015size_t
6016CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6017 packRow(char exports[],
6018 const size_t offset,
6019 const size_t numEnt,
6020 const GlobalOrdinal gidsIn[],
6021 const impl_scalar_type valsIn[],
6022 const size_t numBytesPerValue) const {
6023 using Kokkos::subview;
6024 using Kokkos::View;
6026 typedef LocalOrdinal LO;
6027 typedef GlobalOrdinal GO;
6028 typedef impl_scalar_type ST;
6029
6030 if (numEnt == 0) {
6031 // Empty rows always take zero bytes, to ensure sparsity.
6032 return 0;
6033 }
6034
6035 const GO gid = 0; // packValueCount wants this
6036 const LO numEntLO = static_cast<size_t>(numEnt);
6037
6038 const size_t numEntBeg = offset;
6039 const size_t numEntLen = PackTraits<LO>::packValueCount(numEntLO);
6040 const size_t gidsBeg = numEntBeg + numEntLen;
6041 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount(gid);
6042 const size_t valsBeg = gidsBeg + gidsLen;
6043 const size_t valsLen = numEnt * numBytesPerValue;
6044
6045 char* const numEntOut = exports + numEntBeg;
6046 char* const gidsOut = exports + gidsBeg;
6047 char* const valsOut = exports + valsBeg;
6048
6049 size_t numBytesOut = 0;
6050 int errorCode = 0;
6051 numBytesOut += PackTraits<LO>::packValue(numEntOut, numEntLO);
6052
6053 {
6054 Kokkos::pair<int, size_t> p;
6055 p = PackTraits<GO>::packArray(gidsOut, gidsIn, numEnt);
6056 errorCode += p.first;
6057 numBytesOut += p.second;
6058
6059 p = PackTraits<ST>::packArray(valsOut, valsIn, numEnt);
6060 errorCode += p.first;
6061 numBytesOut += p.second;
6062 }
6063
6064 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6065 TEUCHOS_TEST_FOR_EXCEPTION(numBytesOut != expectedNumBytes, std::logic_error,
6066 "packRow: "
6067 "numBytesOut = "
6068 << numBytesOut << " != expectedNumBytes = "
6069 << expectedNumBytes << ".");
6070 TEUCHOS_TEST_FOR_EXCEPTION(errorCode != 0, std::runtime_error,
6071 "packRow: "
6072 "PackTraits::packArray returned a nonzero error code");
6073
6074 return numBytesOut;
6075}
6076
6077template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6078size_t
6079CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6080 unpackRow(GlobalOrdinal gidsOut[],
6081 impl_scalar_type valsOut[],
6082 const char imports[],
6083 const size_t offset,
6084 const size_t numBytes,
6085 const size_t numEnt,
6086 const size_t numBytesPerValue) {
6087 using Kokkos::subview;
6088 using Kokkos::View;
6090 typedef LocalOrdinal LO;
6091 typedef GlobalOrdinal GO;
6092 typedef impl_scalar_type ST;
6093
6094 Details::ProfilingRegion region_upack_row(
6095 "Tpetra::CrsMatrix::unpackRow",
6096 "Import/Export");
6097
6098 if (numBytes == 0) {
6099 // Rows with zero bytes should always have zero entries.
6100 if (numEnt != 0) {
6101 const int myRank = this->getMap()->getComm()->getRank();
6102 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6103 "unpackRow: The number of bytes to unpack numBytes=0, but the "
6104 "number of entries to unpack (as reported by numPacketsPerLID) "
6105 "for this row numEnt="
6106 << numEnt << " != 0.");
6107 }
6108 return 0;
6109 }
6110
6111 if (numEnt == 0 && numBytes != 0) {
6112 const int myRank = this->getMap()->getComm()->getRank();
6113 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6114 "unpackRow: The number of entries to unpack (as reported by "
6115 "numPacketsPerLID) numEnt=0, but the number of bytes to unpack "
6116 "numBytes="
6117 << numBytes << " != 0.");
6118 }
6119
6120 const GO gid = 0; // packValueCount wants this
6121 const LO lid = 0; // packValueCount wants this
6122
6123 const size_t numEntBeg = offset;
6124 const size_t numEntLen = PackTraits<LO>::packValueCount(lid);
6125 const size_t gidsBeg = numEntBeg + numEntLen;
6126 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount(gid);
6127 const size_t valsBeg = gidsBeg + gidsLen;
6128 const size_t valsLen = numEnt * numBytesPerValue;
6129
6130 const char* const numEntIn = imports + numEntBeg;
6131 const char* const gidsIn = imports + gidsBeg;
6132 const char* const valsIn = imports + valsBeg;
6133
6134 size_t numBytesOut = 0;
6135 int errorCode = 0;
6136 LO numEntOut;
6137 numBytesOut += PackTraits<LO>::unpackValue(numEntOut, numEntIn);
6138 if (static_cast<size_t>(numEntOut) != numEnt ||
6139 numEntOut == static_cast<LO>(0)) {
6140 const int myRank = this->getMap()->getComm()->getRank();
6141 std::ostringstream os;
6142 os << "(Proc " << myRank << ") CrsMatrix::unpackRow: ";
6143 bool firstErrorCondition = false;
6144 if (static_cast<size_t>(numEntOut) != numEnt) {
6145 os << "Number of entries from numPacketsPerLID numEnt=" << numEnt
6146 << " does not equal number of entries unpacked from imports "
6147 "buffer numEntOut="
6148 << numEntOut << ".";
6149 firstErrorCondition = true;
6150 }
6151 if (numEntOut == static_cast<LO>(0)) {
6152 if (firstErrorCondition) {
6153 os << " Also, ";
6154 }
6155 os << "Number of entries unpacked from imports buffer numEntOut=0, "
6156 "but number of bytes to unpack for this row numBytes="
6157 << numBytes
6158 << " != 0. This should never happen, since packRow should only "
6159 "ever pack rows with a nonzero number of entries. In this case, "
6160 "the number of entries from numPacketsPerLID is numEnt="
6161 << numEnt
6162 << ".";
6163 }
6164 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, os.str());
6165 }
6166
6167 {
6168 Kokkos::pair<int, size_t> p;
6169 p = PackTraits<GO>::unpackArray(gidsOut, gidsIn, numEnt);
6170 errorCode += p.first;
6171 numBytesOut += p.second;
6172
6173 p = PackTraits<ST>::unpackArray(valsOut, valsIn, numEnt);
6174 errorCode += p.first;
6175 numBytesOut += p.second;
6176 }
6177
6178 TEUCHOS_TEST_FOR_EXCEPTION(numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = " << numBytesOut << " != numBytes = " << numBytes << ".");
6179
6180 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6181 TEUCHOS_TEST_FOR_EXCEPTION(numBytesOut != expectedNumBytes, std::logic_error,
6182 "unpackRow: "
6183 "numBytesOut = "
6184 << numBytesOut << " != expectedNumBytes = "
6185 << expectedNumBytes << ".");
6186
6187 TEUCHOS_TEST_FOR_EXCEPTION(errorCode != 0, std::runtime_error,
6188 "unpackRow: "
6189 "PackTraits::unpackArray returned a nonzero error code");
6190
6191 return numBytesOut;
6192}
6193
6194template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6195void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6196 allocatePackSpaceNew(Kokkos::DualView<char*, buffer_device_type>& exports,
6197 size_t& totalNumEntries,
6198 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs) const {
6199 using Details::Behavior;
6201 using std::endl;
6202 typedef impl_scalar_type IST;
6203 typedef LocalOrdinal LO;
6204 typedef GlobalOrdinal GO;
6205 // const char tfecfFuncName[] = "allocatePackSpaceNew: ";
6206
6207 // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
6208 // output to std::cerr on every MPI process. This is unwise for
6209 // runs with large numbers of MPI processes.
6210 const bool verbose = Behavior::verbose("CrsMatrix");
6211 std::unique_ptr<std::string> prefix;
6212 if (verbose) {
6213 prefix = this->createPrefix("CrsMatrix", "allocatePackSpaceNew");
6214 std::ostringstream os;
6215 os << *prefix << "Before:"
6216 << endl
6217 << *prefix << " "
6218 << dualViewStatusToString(exports, "exports")
6219 << endl
6220 << *prefix << " "
6221 << dualViewStatusToString(exportLIDs, "exportLIDs")
6222 << endl;
6223 std::cerr << os.str();
6224 }
6225
6226 // The number of export LIDs must fit in LocalOrdinal, assuming
6227 // that the LIDs are distinct and valid on the calling process.
6228 const LO numExportLIDs = static_cast<LO>(exportLIDs.extent(0));
6229
6230 TEUCHOS_ASSERT(!exportLIDs.need_sync_host());
6231 auto exportLIDs_h = exportLIDs.view_host();
6232
6233 // Count the total number of matrix entries to send.
6234 totalNumEntries = 0;
6235 for (LO i = 0; i < numExportLIDs; ++i) {
6236 const LO lclRow = exportLIDs_h[i];
6237 size_t curNumEntries = this->getNumEntriesInLocalRow(lclRow);
6238 // FIXME (mfh 25 Jan 2015) We should actually report invalid row
6239 // indices as an error. Just consider them nonowned for now.
6240 if (curNumEntries == Teuchos::OrdinalTraits<size_t>::invalid()) {
6241 curNumEntries = 0;
6242 }
6243 totalNumEntries += curNumEntries;
6244 }
6245
6246 // FIXME (mfh 24 Feb 2013, 24 Mar 2017) This code is only correct
6247 // if sizeof(IST) is a meaningful representation of the amount of
6248 // data in a Scalar instance. (LO and GO are always built-in
6249 // integer types.)
6250 //
6251 // Allocate the exports array. It does NOT need padding for
6252 // alignment, since we use memcpy to write to / read from send /
6253 // receive buffers.
6254 const size_t allocSize =
6255 static_cast<size_t>(numExportLIDs) * sizeof(LO) +
6256 totalNumEntries * (sizeof(IST) + sizeof(GO));
6257 if (static_cast<size_t>(exports.extent(0)) < allocSize) {
6258 using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6259
6260 const std::string oldLabel = exports.view_device().label();
6261 const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6262 exports = exports_type(newLabel, allocSize);
6263 }
6264
6265 if (verbose) {
6266 std::ostringstream os;
6267 os << *prefix << "After:"
6268 << endl
6269 << *prefix << " "
6270 << dualViewStatusToString(exports, "exports")
6271 << endl
6272 << *prefix << " "
6273 << dualViewStatusToString(exportLIDs, "exportLIDs")
6274 << endl;
6275 std::cerr << os.str();
6276 }
6277}
6278
6279template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6281 packNew(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6282 Kokkos::DualView<char*, buffer_device_type>& exports,
6283 const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6284 size_t& constantNumPackets) const {
6285 // The call to packNew in packAndPrepare catches and handles any exceptions.
6286 Details::ProfilingRegion region_pack_new("Tpetra::CrsMatrix::packNew", "Import/Export");
6287 if (this->isStaticGraph()) {
6288 using ::Tpetra::Details::packCrsMatrixNew;
6289 packCrsMatrixNew(*this, exports, numPacketsPerLID, exportLIDs,
6291 } else {
6292 this->packNonStaticNew(exportLIDs, exports, numPacketsPerLID,
6294 }
6295}
6296
6297template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6299 packNonStaticNew(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6300 Kokkos::DualView<char*, buffer_device_type>& exports,
6301 const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6302 size_t& constantNumPackets) const {
6303 using Details::Behavior;
6306 using Details::PackTraits;
6307 using Kokkos::View;
6308 using std::endl;
6309 using LO = LocalOrdinal;
6310 using GO = GlobalOrdinal;
6311 using ST = impl_scalar_type;
6312 const char tfecfFuncName[] = "packNonStaticNew: ";
6313
6314 const bool verbose = Behavior::verbose("CrsMatrix");
6315 std::unique_ptr<std::string> prefix;
6316 if (verbose) {
6317 prefix = this->createPrefix("CrsMatrix", "packNonStaticNew");
6318 std::ostringstream os;
6319 os << *prefix << "Start" << endl;
6320 std::cerr << os.str();
6321 }
6322
6323 const size_t numExportLIDs = static_cast<size_t>(exportLIDs.extent(0));
6324 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numExportLIDs != static_cast<size_t>(numPacketsPerLID.extent(0)),
6325 std::invalid_argument, "exportLIDs.size() = " << numExportLIDs << " != numPacketsPerLID.size() = " << numPacketsPerLID.extent(0) << ".");
6326
6327 // Setting this to zero tells the caller to expect a possibly
6328 // different ("nonconstant") number of packets per local index
6329 // (i.e., a possibly different number of entries per row).
6330 constantNumPackets = 0;
6331
6332 // The pack buffer 'exports' enters this method possibly
6333 // unallocated. Do the first two parts of "Count, allocate, fill,
6334 // compute."
6335 size_t totalNumEntries = 0;
6336 this->allocatePackSpaceNew(exports, totalNumEntries, exportLIDs);
6337 const size_t bufSize = static_cast<size_t>(exports.extent(0));
6338
6339 // Write-only host access
6340 exports.clear_sync_state();
6341 exports.modify_host();
6342 auto exports_h = exports.view_host();
6343 if (verbose) {
6344 std::ostringstream os;
6345 os << *prefix << "After marking exports as modified on host, "
6346 << dualViewStatusToString(exports, "exports") << endl;
6347 std::cerr << os.str();
6348 }
6349
6350 // Read-only host access
6351 auto exportLIDs_h = exportLIDs.view_host();
6352
6353 // Write-only host access
6354 const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->clear_sync_state();
6355 const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->modify_host();
6356 auto numPacketsPerLID_h = numPacketsPerLID.view_host();
6357
6358 // Compute the number of "packets" (in this case, bytes) per
6359 // export LID (in this case, local index of the row to send), and
6360 // actually pack the data.
6361 auto maxRowNumEnt = this->getLocalMaxNumRowEntries();
6362
6363 // Temporary buffer for global column indices.
6364 typename global_inds_host_view_type::non_const_type gidsIn_k;
6365 if (this->isLocallyIndexed()) { // Need storage for Global IDs
6366 gidsIn_k =
6367 typename global_inds_host_view_type::non_const_type("packGids",
6368 maxRowNumEnt);
6369 }
6370
6371 size_t offset = 0; // current index into 'exports' array.
6372 for (size_t i = 0; i < numExportLIDs; ++i) {
6373 const LO lclRow = exportLIDs_h[i];
6374
6375 size_t numBytes = 0;
6376 size_t numEnt = this->getNumEntriesInLocalRow(lclRow);
6377
6378 // Only pack this row's data if it has a nonzero number of
6379 // entries. We can do this because receiving processes get the
6380 // number of packets, and will know that zero packets means zero
6381 // entries.
6382 if (numEnt == 0) {
6383 numPacketsPerLID_h[i] = 0;
6384 continue;
6385 }
6386
6387 if (this->isLocallyIndexed()) {
6388 typename global_inds_host_view_type::non_const_type gidsIn;
6389 values_host_view_type valsIn;
6390 // If the matrix is locally indexed on the calling process, we
6391 // have to use its column Map (which it _must_ have in this
6392 // case) to convert to global indices.
6393 local_inds_host_view_type lidsIn;
6394 this->getLocalRowView(lclRow, lidsIn, valsIn);
6395 const map_type& colMap = *(this->getColMap());
6396 for (size_t k = 0; k < numEnt; ++k) {
6397 gidsIn_k[k] = colMap.getGlobalElement(lidsIn[k]);
6398 }
6399 gidsIn = Kokkos::subview(gidsIn_k, Kokkos::make_pair(GO(0), GO(numEnt)));
6400
6401 const size_t numBytesPerValue =
6402 PackTraits<ST>::packValueCount(valsIn[0]);
6403 numBytes = this->packRow(exports_h.data(), offset, numEnt,
6404 gidsIn.data(), valsIn.data(),
6405 numBytesPerValue);
6406 } else if (this->isGloballyIndexed()) {
6407 global_inds_host_view_type gidsIn;
6408 values_host_view_type valsIn;
6409 // If the matrix is globally indexed on the calling process,
6410 // then we can use the column indices directly. However, we
6411 // have to get the global row index. The calling process must
6412 // have a row Map, since otherwise it shouldn't be participating
6413 // in packing operations.
6414 const map_type& rowMap = *(this->getRowMap());
6415 const GO gblRow = rowMap.getGlobalElement(lclRow);
6416 this->getGlobalRowView(gblRow, gidsIn, valsIn);
6417
6418 const size_t numBytesPerValue =
6419 PackTraits<ST>::packValueCount(valsIn[0]);
6420 numBytes = this->packRow(exports_h.data(), offset, numEnt,
6421 gidsIn.data(), valsIn.data(),
6422 numBytesPerValue);
6423 }
6424 // mfh 11 Sep 2017: Currently, if the matrix is neither globally
6425 // nor locally indexed, then it has no entries. Therefore,
6426 // there is nothing to pack. No worries!
6427
6428 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(offset > bufSize || offset + numBytes > bufSize, std::logic_error,
6429 "First invalid offset into 'exports' pack buffer at index i = " << i
6430 << ". exportLIDs_h[i]: " << exportLIDs_h[i] << ", bufSize: " << bufSize << ", offset: " << offset << ", numBytes: " << numBytes << ".");
6431 // numPacketsPerLID_h[i] is the number of "packets" in the
6432 // current local row i. Packet=char (really "byte") so use the
6433 // number of bytes of the packed data for that row.
6434 numPacketsPerLID_h[i] = numBytes;
6435 offset += numBytes;
6436 }
6437
6438 if (verbose) {
6439 std::ostringstream os;
6440 os << *prefix << "Tpetra::CrsMatrix::packNonStaticNew: After:" << endl
6441 << *prefix << " "
6442 << dualViewStatusToString(exports, "exports")
6443 << endl
6444 << *prefix << " "
6445 << dualViewStatusToString(exportLIDs, "exportLIDs")
6446 << endl;
6447 std::cerr << os.str();
6448 }
6449}
6450
6451template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6452LocalOrdinal
6453CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6454 combineGlobalValuesRaw(const LocalOrdinal lclRow,
6455 const LocalOrdinal numEnt,
6456 const impl_scalar_type vals[],
6457 const GlobalOrdinal cols[],
6458 const Tpetra::CombineMode combMode,
6459 const char* const prefix,
6460 const bool debug,
6461 const bool verbose) {
6462 using GO = GlobalOrdinal;
6463
6464 // mfh 23 Mar 2017: This branch is not thread safe in a debug
6465 // build, due to use of Teuchos::ArrayView; see #229.
6466 const GO gblRow = myGraph_->rowMap_->getGlobalElement(lclRow);
6467 Teuchos::ArrayView<const GO> cols_av(numEnt == 0 ? nullptr : cols, numEnt);
6468 Teuchos::ArrayView<const Scalar> vals_av(numEnt == 0 ? nullptr : reinterpret_cast<const Scalar*>(vals), numEnt);
6469
6470 // FIXME (mfh 23 Mar 2017) This is a work-around for less common
6471 // combine modes. combineGlobalValues throws on error; it does
6472 // not return an error code. Thus, if it returns, it succeeded.
6473 combineGlobalValues(gblRow, cols_av, vals_av, combMode,
6474 prefix, debug, verbose);
6475 return numEnt;
6476}
6477
6478template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6479void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6480 combineGlobalValues(
6481 const GlobalOrdinal globalRowIndex,
6482 const Teuchos::ArrayView<const GlobalOrdinal>& columnIndices,
6483 const Teuchos::ArrayView<const Scalar>& values,
6484 const Tpetra::CombineMode combineMode,
6485 const char* const prefix,
6486 const bool debug,
6487 const bool verbose) {
6488 const char tfecfFuncName[] = "combineGlobalValues: ";
6489
6490 if (isStaticGraph()) {
6491 // INSERT doesn't make sense for a static graph, since you
6492 // aren't allowed to change the structure of the graph.
6493 // However, all the other combine modes work.
6494 if (combineMode == ADD) {
6495 sumIntoGlobalValues(globalRowIndex, columnIndices, values);
6496 } else if (combineMode == REPLACE) {
6497 replaceGlobalValues(globalRowIndex, columnIndices, values);
6498 } else if (combineMode == ABSMAX) {
6499 using ::Tpetra::Details::AbsMax;
6500 AbsMax<Scalar> f;
6501 this->template transformGlobalValues<AbsMax<Scalar>>(globalRowIndex,
6502 columnIndices,
6503 values, f);
6504 } else if (combineMode == INSERT) {
6505 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isStaticGraph() && combineMode == INSERT,
6506 std::invalid_argument,
6507 "INSERT combine mode is forbidden "
6508 "if the matrix has a static (const) graph (i.e., was "
6509 "constructed with the CrsMatrix constructor that takes a "
6510 "const CrsGraph pointer).");
6511 } else {
6512 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error,
6513 "Invalid combine mode; should "
6514 "never get here! "
6515 "Please report this bug to the Tpetra developers.");
6516 }
6517 } else { // The matrix has a dynamic graph.
6518 if (combineMode == ADD || combineMode == INSERT) {
6519 // For a dynamic graph, all incoming column indices are
6520 // inserted into the target graph. Duplicate indices will
6521 // have their values summed. In this context, ADD and INSERT
6522 // are equivalent. We need to call insertGlobalValues()
6523 // anyway if the column indices don't yet exist in this row,
6524 // so we just call insertGlobalValues() for both cases.
6525 insertGlobalValuesFilteredChecked(globalRowIndex,
6526 columnIndices, values, prefix, debug, verbose);
6527 }
6528 // FIXME (mfh 14 Mar 2012):
6529 //
6530 // Implementing ABSMAX or REPLACE for a dynamic graph would
6531 // require modifying assembly to attach a possibly different
6532 // combine mode to each inserted (i, j, A_ij) entry. For
6533 // example, consider two different Export operations to the same
6534 // target CrsMatrix, the first with ABSMAX combine mode and the
6535 // second with REPLACE. This isn't a common use case, so we
6536 // won't mess with it for now.
6537 else if (combineMode == ABSMAX) {
6538 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6539 !isStaticGraph() && combineMode == ABSMAX, std::logic_error,
6540 "ABSMAX combine mode when the matrix has a dynamic graph is not yet "
6541 "implemented.");
6542 } else if (combineMode == REPLACE) {
6543 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6544 !isStaticGraph() && combineMode == REPLACE, std::logic_error,
6545 "REPLACE combine mode when the matrix has a dynamic graph is not yet "
6546 "implemented.");
6547 } else {
6548 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6549 true, std::logic_error,
6550 "Should never get here! Please report this "
6551 "bug to the Tpetra developers.");
6552 }
6553 }
6554}
6555
6556template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6558 unpackAndCombine(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
6559 Kokkos::DualView<char*, buffer_device_type> imports,
6560 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6561 const size_t constantNumPackets,
6562 const CombineMode combineMode) {
6563 using Details::Behavior;
6566 using std::endl;
6567 const char tfecfFuncName[] = "unpackAndCombine: ";
6568 ProfilingRegion regionUAC("Tpetra::CrsMatrix::unpackAndCombine");
6569
6570 const bool debug = Behavior::debug("CrsMatrix");
6571 const bool verbose = Behavior::verbose("CrsMatrix");
6572 constexpr int numValidModes = 5;
6575 const char* validModeNames[numValidModes] =
6576 {"ADD", "REPLACE", "ABSMAX", "INSERT", "ZERO"};
6577
6578 std::unique_ptr<std::string> prefix;
6579 if (verbose) {
6580 prefix = this->createPrefix("CrsMatrix", "unpackAndCombine");
6581 std::ostringstream os;
6582 os << *prefix << "Start:" << endl
6583 << *prefix << " "
6584 << dualViewStatusToString(importLIDs, "importLIDs")
6585 << endl
6586 << *prefix << " "
6587 << dualViewStatusToString(imports, "imports")
6588 << endl
6589 << *prefix << " "
6590 << dualViewStatusToString(numPacketsPerLID, "numPacketsPerLID")
6591 << endl
6592 << *prefix << " constantNumPackets: " << constantNumPackets
6593 << endl
6594 << *prefix << " combineMode: " << combineModeToString(combineMode)
6595 << endl;
6596 std::cerr << os.str();
6597 }
6598
6599 if (debug) {
6600 if (std::find(validModes, validModes + numValidModes, combineMode) ==
6602 std::ostringstream os;
6603 os << "Invalid combine mode. Valid modes are {";
6604 for (int k = 0; k < numValidModes; ++k) {
6605 os << validModeNames[k];
6606 if (k < numValidModes - 1) {
6607 os << ", ";
6608 }
6609 }
6610 os << "}.";
6611 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::invalid_argument, os.str());
6612 }
6614 std::invalid_argument, "importLIDs.extent(0)=" << importLIDs.extent(0) << " != numPacketsPerLID.extent(0)=" << numPacketsPerLID.extent(0) << ".");
6615 }
6616
6617 if (combineMode == ZERO) {
6618 return; // nothing to do
6619 }
6620
6621 if (debug) {
6622 using Teuchos::reduceAll;
6623 std::unique_ptr<std::ostringstream> msg(new std::ostringstream());
6624 int lclBad = 0;
6625 try {
6626 unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
6628 verbose);
6629 } catch (std::exception& e) {
6630 lclBad = 1;
6631 *msg << e.what();
6632 }
6633 int gblBad = 0;
6634 const Teuchos::Comm<int>& comm = *(this->getComm());
6635 reduceAll<int, int>(comm, Teuchos::REDUCE_MAX,
6636 lclBad, Teuchos::outArg(gblBad));
6637 if (gblBad != 0) {
6638 // mfh 22 Oct 2017: 'prefix' might be null, since it is only
6639 // initialized when verbose output is enabled. Thus, we get the
6640 // process rank again here. This is an error message, so the small
6641 // run-time cost doesn't matter. See #1887.
6642 std::ostringstream os;
6643 os << "Proc " << comm.getRank() << ": " << msg->str() << endl;
6644 msg = std::unique_ptr<std::ostringstream>(new std::ostringstream());
6645 ::Tpetra::Details::gathervPrint(*msg, os.str(), comm);
6646 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::logic_error, std::endl
6647 << "unpackAndCombineImpl "
6648 "threw an exception on one or more participating processes: "
6649 << endl
6650 << msg->str());
6651 }
6652 } else {
6653 unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
6655 verbose);
6656 }
6657
6658 if (verbose) {
6659 std::ostringstream os;
6660 os << *prefix << "Done!" << endl
6661 << *prefix << " "
6662 << dualViewStatusToString(importLIDs, "importLIDs")
6663 << endl
6664 << *prefix << " "
6665 << dualViewStatusToString(imports, "imports")
6666 << endl
6667 << *prefix << " "
6668 << dualViewStatusToString(numPacketsPerLID, "numPacketsPerLID")
6669 << endl;
6670 std::cerr << os.str();
6671 }
6672}
6673
// Unpack the received rows in `imports` and combine them into this
// matrix, choosing the code path by whether the graph is static.
//
//  - isStaticGraph() is true: hand everything to
//    Details::unpackCrsMatrixAndCombineNew.
//  - otherwise: ask myGraph_ for the padding the incoming entries need
//    (computePaddingForCrsMatrixUnpack), apply it with applyCrsPadding,
//    and then unpack via unpackAndCombineImplNonStatic.
//
// importLIDs, imports, numPacketsPerLID and constantNumPackets are
// passed on to those callees.  When `verbose` is true, progress lines
// are written to std::cerr, each starting with the string returned by
// createPrefix.
//
// Throws std::runtime_error (the message carries the process rank, or
// -1 if the row Map or its communicator is null) when
// computePaddingForCrsMatrixUnpack throws a std::exception.
//
// NOTE(review): `combineMode` is used below, but its declaration is not
// visible in this listing -- presumably a `const CombineMode` parameter
// between constantNumPackets and verbose; confirm against the full
// source.  The line naming the function is likewise not visible.
6674template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6677 const Kokkos::DualView<const local_ordinal_type*,
6678 buffer_device_type>& importLIDs,
6679 Kokkos::DualView<char*, buffer_device_type> imports,
6680 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6681 const size_t constantNumPackets,
6683 const bool verbose) {
6685 "Tpetra::CrsMatrix::unpackAndCombineImpl",
6686 "Import/Export");
6687 using std::endl;
6688 const char tfecfFuncName[] = "unpackAndCombineImpl";
// `prefix` is only created in verbose mode and stays null otherwise;
// every dereference of it below is inside an `if (verbose)` block.
6689 std::unique_ptr<std::string> prefix;
6690 if (verbose) {
6691 prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
6692 std::ostringstream os;
6693 os << *prefix << "isStaticGraph(): "
6694 << (isStaticGraph() ? "true" : "false")
6695 << ", importLIDs.extent(0): "
6696 << importLIDs.extent(0)
6697 << ", imports.extent(0): "
6698 << imports.extent(0)
6699 << ", numPacketsPerLID.extent(0): "
6700 << numPacketsPerLID.extent(0)
6701 << endl;
6702 std::cerr << os.str();
6703 }
6704
// Static graph: the structure cannot change, so no padding step is
// performed on this path.
6705 if (isStaticGraph()) {
6706 using Details::unpackCrsMatrixAndCombineNew;
6707 unpackCrsMatrixAndCombineNew(*this, imports, numPacketsPerLID,
6708 importLIDs, constantNumPackets,
6709 combineMode);
6710 } else {
// Inner scope limits the lifetime of `padding` to the padding step.
6711 {
6712 using padding_type = typename crs_graph_type::padding_type;
6713 std::unique_ptr<padding_type> padding;
// Any std::exception from the graph is rethrown as a runtime_error
// that names this process.
6714 try {
6715 padding = myGraph_->computePaddingForCrsMatrixUnpack(
6716 importLIDs, imports, numPacketsPerLID, verbose);
6717 } catch (std::exception& e) {
6718 const auto rowMap = getRowMap();
6719 const auto comm = rowMap.is_null() ? Teuchos::null : rowMap->getComm();
6720 const int myRank = comm.is_null() ? -1 : comm->getRank();
6721 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "Proc " << myRank << ": "
6722 "Tpetra::CrsGraph::computePaddingForCrsMatrixUnpack "
6723 "threw an exception: "
6724 << e.what());
6725 }
6726 if (verbose) {
6727 std::ostringstream os;
6728 os << *prefix << "Call applyCrsPadding" << endl;
6729 std::cerr << os.str();
6730 }
// NOTE(review): `padding` is dereferenced without a null check; this
// assumes computePaddingForCrsMatrixUnpack never returns null -- confirm.
6731 applyCrsPadding(*padding, verbose);
6732 }
6733 if (verbose) {
6734 std::ostringstream os;
6735 os << *prefix << "Call unpackAndCombineImplNonStatic" << endl;
6736 std::cerr << os.str();
6737 }
6738 unpackAndCombineImplNonStatic(importLIDs, imports,
6739 numPacketsPerLID,
6740 constantNumPackets,
6741 combineMode);
6742 }
6743
6744 if (verbose) {
6745 std::ostringstream os;
6746 os << *prefix << "Done" << endl;
6747 std::cerr << os.str();
6748 }
6749}
6750
// Unpack the packed rows in `imports` on the host and combine each one
// into this matrix with combineGlobalValuesRaw.
//
// Parameters:
//   importLIDs         - local row index for each incoming row; its host
//                        view must already be current (asserted below).
//   imports            - packed byte buffer; synced to host here if needed.
//   numPacketsPerLID   - entry i is read as the number of bytes that row
//                        i occupies in `imports`; synced to host if needed.
//   constantNumPackets - not referenced in the visible body.
//   combineMode        - passed to combineGlobalValuesRaw; ZERO makes
//                        this function return immediately.
//
// The work is done in two passes over the buffer.  Pass 1 reads only the
// leading entry count of every nonempty row to find the largest row, so
// that scratch storage can be allocated once.  Pass 2 unpacks each row
// into that scratch storage (unpackRow) and combines it.
//
// debug / verbose come from Behavior::debug("CrsMatrix") and
// Behavior::verbose("CrsMatrix").  The checks in pass 1 run only when
// debug is true; the byte-count check in pass 2 runs always.  All of
// them pass std::logic_error as the exception type.
6751template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6752void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6753 unpackAndCombineImplNonStatic(
6754 const Kokkos::DualView<const local_ordinal_type*,
6755 buffer_device_type>& importLIDs,
6756 Kokkos::DualView<char*, buffer_device_type> imports,
6757 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6758 const size_t constantNumPackets,
6759 const CombineMode combineMode) {
6760 using Details::Behavior;
6763 using Details::PackTraits;
6764 using Details::ScalarViewTraits;
6765 using Kokkos::MemoryUnmanaged;
6766 using Kokkos::subview;
6767 using Kokkos::View;
6768 using std::endl;
6769 using LO = LocalOrdinal;
6770 using GO = GlobalOrdinal;
6771 using ST = impl_scalar_type;
6772 using size_type = typename Teuchos::ArrayView<LO>::size_type;
// HES: execution space of the host mirror of a device_type View; all
// scratch Views below are allocated in it.
6773 using HES =
6774 typename View<int*, device_type>::host_mirror_type::execution_space;
6775 using pair_type = std::pair<typename View<int*, HES>::size_type,
6776 typename View<int*, HES>::size_type>;
6777 using gids_out_type = View<GO*, HES, MemoryUnmanaged>;
6778 using vals_out_type = View<ST*, HES, MemoryUnmanaged>;
6779 const char tfecfFuncName[] = "unpackAndCombineImplNonStatic";
6780
6781 const bool debug = Behavior::debug("CrsMatrix");
6782 const bool verbose = Behavior::verbose("CrsMatrix");
6783 std::unique_ptr<std::string> prefix;
6784 if (verbose) {
6785 prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
6786 std::ostringstream os;
6787 os << *prefix << endl; // we've already printed DualViews' statuses
6788 std::cerr << os.str();
6789 }
// C-string form of the prefix handed to combineGlobalValuesRaw; null
// unless verbose, because `prefix` itself is null in that case.
6790 const char* const prefix_raw =
6791 verbose ? prefix.get()->c_str() : nullptr;
6792
6793 const size_type numImportLIDs = importLIDs.extent(0);
6794 if (combineMode == ZERO || numImportLIDs == 0) {
6795 return; // nothing to do; no need to combine entries
6796 }
6797
6798 Details::ProfilingRegion region_unpack_and_combine_impl_non_static(
6799 "Tpetra::CrsMatrix::unpackAndCombineImplNonStatic",
6800 "Import/Export");
6801
6802 // We're unpacking on host. This is read-only host access.
6803 if (imports.need_sync_host()) {
6804 imports.sync_host();
6805 }
6806 auto imports_h = imports.view_host();
6807
6808 // Read-only host access.
6809 if (numPacketsPerLID.need_sync_host()) {
6810 numPacketsPerLID.sync_host();
6811 }
6812 auto numPacketsPerLID_h = numPacketsPerLID.view_host();
6813
// importLIDs is taken by const reference and is not synced here;
// instead, require that its host view is already up to date.
6814 TEUCHOS_ASSERT(!importLIDs.need_sync_host());
6815 auto importLIDs_h = importLIDs.view_host();
6816
6817 size_t numBytesPerValue;
6818 {
6819 // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
6820 // with run-time size? We already assume that all entries in both the
6821 // source and target matrices have the same size. If the calling process
6822 // owns at least one entry in either matrix, we can use that entry to set
6823 // the size. However, it is possible that the calling process owns no
6824 // entries. In that case, we're in trouble. One way to fix this would be
6825 // for each row's data to contain the run-time size. This is only
6826 // necessary if the size is not a compile-time constant.
// `val` is never assigned; it only serves as the sample argument to
// packValueCount.
6827 Scalar val;
6828 numBytesPerValue = PackTraits<ST>::packValueCount(val);
6829 }
6830
// Pass 1: walk the buffer once, reading only each row's leading count.
6831 // Determine the maximum number of entries in any one row
6832 size_t offset = 0;
6833 size_t maxRowNumEnt = 0;
6834 for (size_type i = 0; i < numImportLIDs; ++i) {
6835 const size_t numBytes = numPacketsPerLID_h[i];
6836 if (numBytes == 0) {
6837 continue; // empty buffer for that row means that the row is empty
6838 }
6839 // We need to unpack a nonzero number of entries for this row.
6840 if (debug) {
6841 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(offset + numBytes > size_t(imports_h.extent(0)),
6842 std::logic_error, ": At local row index importLIDs_h[i=" << i << "]=" << importLIDs_h[i] << ", offset (=" << offset << ") + numBytes (=" << numBytes << ") > "
6843 "imports_h.extent(0)="
6844 << imports_h.extent(0) << ".");
6845 }
6846 LO numEntLO = 0;
6847
6848 if (debug) {
6849 const size_t theNumBytes =
6850 PackTraits<LO>::packValueCount(numEntLO);
6851 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(theNumBytes > numBytes, std::logic_error, ": theNumBytes=" << theNumBytes << " > numBytes = " << numBytes << ".");
6852 }
// The row's packed data starts with its entry count, stored as an LO.
6853 const char* const inBuf = imports_h.data() + offset;
6854 const size_t actualNumBytes =
6855 PackTraits<LO>::unpackValue(numEntLO, inBuf);
6856
6857 if (debug) {
6858 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(actualNumBytes > numBytes, std::logic_error, ": At i=" << i << ", actualNumBytes=" << actualNumBytes << " > numBytes=" << numBytes << ".");
6859 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numEntLO == 0, std::logic_error,
6860 ": At local row index "
6861 "importLIDs_h[i="
6862 << i << "]=" << importLIDs_h[i] << ", "
6863 "the number of entries read from the packed data is "
6864 "numEntLO="
6865 << numEntLO << ", but numBytes=" << numBytes
6866 << " != 0.");
6867 }
6868
6869 maxRowNumEnt = std::max(size_t(numEntLO), maxRowNumEnt);
6870 offset += numBytes;
6871 }
6872
6873 // Temporary space to cache incoming global column indices and
6874 // values. Column indices come in as global indices, in case the
6875 // source object's column Map differs from the target object's
6876 // (this's) column Map.
6877 View<GO*, HES> gblColInds;
// NOTE(review): lclColInds is allocated below but not otherwise read in
// the visible body.
6878 View<LO*, HES> lclColInds;
6879 View<ST*, HES> vals;
6880 {
6881 GO gid = 0;
6882 LO lid = 0;
6883 // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
6884 // with run-time size? We already assume that all entries in both the
6885 // source and target matrices have the same size. If the calling process
6886 // owns at least one entry in either matrix, we can use that entry to set
6887 // the size. However, it is possible that the calling process owns no
6888 // entries. In that case, we're in trouble. One way to fix this would be
6889 // for each row's data to contain the run-time size. This is only
6890 // necessary if the size is not a compile-time constant.
6891 Scalar val;
6892 gblColInds = ScalarViewTraits<GO, HES>::allocateArray(
6893 gid, maxRowNumEnt, "gids");
6894 lclColInds = ScalarViewTraits<LO, HES>::allocateArray(
6895 lid, maxRowNumEnt, "lids");
6896 vals = ScalarViewTraits<ST, HES>::allocateArray(
6897 val, maxRowNumEnt, "vals");
6898 }
6899
// Pass 2: rewind to the start of the buffer, unpack every nonempty row
// into the scratch views, and combine it into the matrix.
6900 offset = 0;
6901 for (size_type i = 0; i < numImportLIDs; ++i) {
6902 const size_t numBytes = numPacketsPerLID_h[i];
6903 if (numBytes == 0) {
6904 continue; // empty buffer for that row means that the row is empty
6905 }
6906 LO numEntLO = 0;
6907 const char* const inBuf = imports_h.data() + offset;
6908 (void)PackTraits<LO>::unpackValue(numEntLO, inBuf);
6909
6910 const size_t numEnt = static_cast<size_t>(numEntLO);
// NOTE(review): the lone ';' on the next line is an empty statement.
6911 ;
6912 const LO lclRow = importLIDs_h[i];
6913
// Unmanaged views over the first numEnt slots of the scratch arrays.
6914 gids_out_type gidsOut = subview(gblColInds, pair_type(0, numEnt));
6915 vals_out_type valsOut = subview(vals, pair_type(0, numEnt));
6916
6917 const size_t numBytesOut =
6918 unpackRow(gidsOut.data(), valsOut.data(), imports_h.data(),
6919 offset, numBytes, numEnt, numBytesPerValue);
// Unlike the checks in pass 1, this one is not guarded by `debug`.
6920 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numBytes != numBytesOut, std::logic_error, ": At i=" << i << ", numBytes=" << numBytes << " != numBytesOut=" << numBytesOut << ".");
6921
6922 const ST* const valsRaw = const_cast<const ST*>(valsOut.data());
6923 const GO* const gidsRaw = const_cast<const GO*>(gidsOut.data());
6924 combineGlobalValuesRaw(lclRow, numEnt, valsRaw, gidsRaw,
6925 combineMode, prefix_raw, debug, verbose);
6926 // Don't update offset until current LID has succeeded.
6927 offset += numBytes;
6928 } // for each import LID i
6929
6930 if (verbose) {
6931 std::ostringstream os;
6932 os << *prefix << "Done" << endl;
6933 std::cerr << os.str();
6934 }
6935}
6936
// Return a multivector built on this matrix's column Map with
// X_domainMap.getNumVectors() columns, or a null RCP when none is
// produced.
//
// A multivector is produced only if the graph's Importer is nonnull or
// `force` is true.  The cached importMV_ is reused when it exists and
// has the requested number of columns; otherwise a new MV is allocated
// and stored in importMV_ for later calls.  A reused multivector is
// returned as is; its entries are not reset.
//
// The two argument lists below name std::runtime_error as the exception
// type, for (1) a matrix that has no column Map and (2) a graph that is
// not fill complete.
//
// NOTE(review): the line declaring the function name and X_domainMap,
// and the lines that open the two checks, are not visible in this
// listing; confirm them against the full source.
6937template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6938Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>>
6941 const bool force) const {
6942 using Teuchos::null;
6943 using Teuchos::RCP;
6944 using Teuchos::rcp;
6945
6947 !this->hasColMap(), std::runtime_error,
6948 "Tpetra::CrsMatrix::getColumn"
6949 "MapMultiVector: You may only call this method if the matrix has a "
6950 "column Map. If the matrix does not yet have a column Map, you should "
6951 "first call fillComplete (with domain and range Map if necessary).");
6952
6953 // If the graph is not fill complete, then the Import object (if
6954 // one should exist) hasn't been constructed yet.
6956 !this->getGraph()->isFillComplete(), std::runtime_error,
6957 "Tpetra::"
6958 "CrsMatrix::getColumnMapMultiVector: You may only call this method if "
6959 "this matrix's graph is fill complete.");
6960
6961 const size_t numVecs = X_domainMap.getNumVectors();
6962 RCP<const import_type> importer = this->getGraph()->getImporter();
6963 RCP<const map_type> colMap = this->getColMap();
6964
6965 RCP<MV> X_colMap; // null by default
6966
6967 // If the Import object is trivial (null), then we don't need a
6968 // separate column Map multivector. Just return null in that
6969 // case. The caller is responsible for knowing not to use the
6970 // returned null pointer.
6971 //
6972 // If the Import is nontrivial, then we do need a separate
6973 // column Map multivector for the Import operation. Check in
6974 // that case if we have to (re)create the column Map
6975 // multivector.
6976 if (!importer.is_null() || force) {
// The cache is replaced when it is empty or has a different column count.
6977 if (importMV_.is_null() || importMV_->getNumVectors() != numVecs) {
6978 X_colMap = rcp(new MV(colMap, numVecs));
6979
6980 // Cache the newly created multivector for later reuse.
6981 importMV_ = X_colMap;
6982 } else { // Yay, we can reuse the cached multivector!
6983 X_colMap = importMV_;
6984 // mfh 09 Jan 2013: We don't have to fill with zeros first,
6985 // because the Import uses INSERT combine mode, which overwrites
6986 // existing entries.
6987 //
6988 // X_colMap->putScalar (ZERO);
6989 }
6990 }
6991 return X_colMap;
6992}
6993
// Return a multivector built on this matrix's row Map with
// Y_rangeMap.getNumVectors() columns, or a null RCP when none is
// produced.
//
// A multivector is produced only if the graph's Exporter is nonnull or
// `force` is true.  The cached exportMV_ is reused when it exists and
// has the requested number of columns; otherwise a new MV is allocated
// and stored in exportMV_ for later calls.
//
// The argument list below names std::runtime_error as the exception
// type for a graph that is not fill complete.
//
// NOTE(review): the line declaring the function name and Y_rangeMap,
// and the line that opens the check, are not visible in this listing;
// confirm them against the full source.
6994template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6995Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>>
6998 const bool force) const {
6999 using Teuchos::null;
7000 using Teuchos::RCP;
7001 using Teuchos::rcp;
7002
7003 // If the graph is not fill complete, then the Export object (if
7004 // one should exist) hasn't been constructed yet.
7006 !this->getGraph()->isFillComplete(), std::runtime_error,
7007 "Tpetra::"
7008 "CrsMatrix::getRowMapMultiVector: You may only call this method if this "
7009 "matrix's graph is fill complete.");
7010
7011 const size_t numVecs = Y_rangeMap.getNumVectors();
7012 RCP<const export_type> exporter = this->getGraph()->getExporter();
7013 // Every version of the constructor takes either a row Map, or a
7014 // graph (all of whose constructors take a row Map). Thus, the
7015 // matrix always has a row Map.
7016 RCP<const map_type> rowMap = this->getRowMap();
7017
7018 RCP<MV> Y_rowMap; // null by default
7019
7020 // If the Export object is trivial (null), then we don't need a
7021 // separate row Map multivector. Just return null in that case.
7022 // The caller is responsible for knowing not to use the returned
7023 // null pointer.
7024 //
7025 // If the Export is nontrivial, then we do need a separate row
7026 // Map multivector for the Export operation. Check in that case
7027 // if we have to (re)create the row Map multivector.
7028 if (!exporter.is_null() || force) {
// The cache is replaced when it is empty or has a different column count.
7029 if (exportMV_.is_null() || exportMV_->getNumVectors() != numVecs) {
7030 Y_rowMap = rcp(new MV(rowMap, numVecs));
7031 exportMV_ = Y_rowMap; // Cache the newly created MV for later reuse.
7032 } else { // Yay, we can reuse the cached multivector!
7033 Y_rowMap = exportMV_;
7034 }
7035 }
7036 return Y_rowMap;
7037}
7038
// Forward the request to the owned graph, then bring this object's own
// members back in line with it.
//
// Steps: call myGraph_->removeEmptyProcessesInPlace(newMap), reset the
// DistObject Map (map_) from getRowMap(), and reassign staticGraph_
// from myGraph_.
//
// The argument list below names std::logic_error as the exception type
// for the case where myGraph_ is null, i.e. the matrix was created with
// a constant graph.
//
// NOTE(review): the line carrying the return type and class scope, and
// the line that opens the check, are not visible in this listing.
7039template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7041 removeEmptyProcessesInPlace(const Teuchos::RCP<const map_type>& newMap) {
7043 myGraph_.is_null(), std::logic_error,
7044 "Tpetra::CrsMatrix::"
7045 "removeEmptyProcessesInPlace: This method does not work when the matrix "
7046 "was created with a constant graph (that is, when it was created using "
7047 "the version of its constructor that takes an RCP<const CrsGraph>). "
7048 "This is because the matrix is not allowed to modify the graph in that "
7049 "case, but removing empty processes requires modifying the graph.");
7050 myGraph_->removeEmptyProcessesInPlace(newMap);
7051 // Even though CrsMatrix's row Map (as returned by getRowMap())
7052 // comes from its CrsGraph, CrsMatrix still implements DistObject,
7053 // so we also have to change the DistObject's Map.
7054 this->map_ = this->getRowMap();
7055 // In the nonconst graph case, staticGraph_ is just a const
7056 // pointer to myGraph_. This assignment is probably redundant,
7057 // but it doesn't hurt.
7058 staticGraph_ = Teuchos::rcp_const_cast<const Graph>(myGraph_);
7059}
7060
// Build a new matrix C = alpha*A + beta*B, where B is this matrix.
//
// Parameters visible here:
//   alpha, beta - scalar weights.  A's rows are skipped entirely when
//                 alpha equals zero, and B's rows when beta equals zero.
//   domainMap   - optional domain Map; when null, one is taken from B
//                 or else from A (see the fallback logic below).
//   rangeMap    - optional range Map, with the same fallback.
//   params      - optional list.  The entries read below are
//                 "Call fillComplete" and the sublists
//                 "Constructor parameters" and "fillComplete parameters".
//
// Procedure: require that A's and B's row Maps are the same (isSameAs),
// construct C on B's row Map, copy each row of A and of B out by global
// index, scale it, insert it into C with insertGlobalValues, and then
// optionally call fillComplete on C.
//
// Exceptions named in the visible checks:
//   std::invalid_argument - row Maps of A and B differ; no domain/range
//                           Map is available; or (debug mode) the Maps
//                           of A, B and the inputs do not match.
//   std::logic_error      - C is null after construction.
//
// Verbose mode (Behavior::verbose("CrsMatrix")) prints progress lines
// to std::cerr.
//
// NOTE(review): several lines are not visible in this listing: the
// declaration of the parameter `A`, and of theDomainMap, theRangeMap,
// C_maxNumEntriesPerRow, constructorSublist and fillCompleteSublist,
// plus the final return statement (presumably returning C converted to
// the RowMatrix return type).  Confirm against the full source.
7061template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7062Teuchos::RCP<RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>>
7064 add(const Scalar& alpha,
7066 const Scalar& beta,
7067 const Teuchos::RCP<const map_type>& domainMap,
7068 const Teuchos::RCP<const map_type>& rangeMap,
7069 const Teuchos::RCP<Teuchos::ParameterList>& params) const {
7070 using std::endl;
7071 using Teuchos::Array;
7072 using Teuchos::ArrayView;
7073 using Teuchos::ParameterList;
7074 using Teuchos::RCP;
7075 using Teuchos::rcp;
7076 using Teuchos::rcp_implicit_cast;
7077 using Teuchos::sublist;
7078 using LO = local_ordinal_type;
7079 using GO = global_ordinal_type;
7080 using crs_matrix_type =
7082 const char errPfx[] = "Tpetra::CrsMatrix::add: ";
7083
7084 const bool debug = Details::Behavior::debug("CrsMatrix");
7085 const bool verbose = Details::Behavior::verbose("CrsMatrix");
// `prefix` is only created in verbose mode; every dereference of it
// below is inside an `if (verbose)` branch.
7086 std::unique_ptr<std::string> prefix;
7087 if (verbose) {
7088 prefix = this->createPrefix("CrsMatrix", "add");
7089 std::ostringstream os;
7090 os << *prefix << "Start" << endl;
7091 std::cerr << os.str();
7092 }
7093
7094 const crs_matrix_type& B = *this; // a convenient abbreviation
// Within this function, ZERO and ONE are these local Scalar constants.
7095 const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero();
7096 const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one();
7097
7098 // If the user didn't supply a domain or range Map, then try to
7099 // get one from B first (if it has them), then from A (if it has
7100 // them). If we don't have any domain or range Maps, scold the
7101 // user.
7102 RCP<const map_type> A_domainMap = A.getDomainMap();
7103 RCP<const map_type> A_rangeMap = A.getRangeMap();
7104 RCP<const map_type> B_domainMap = B.getDomainMap();
7105 RCP<const map_type> B_rangeMap = B.getRangeMap();
7106
7109
7110 if (domainMap.is_null()) {
7111 if (B_domainMap.is_null()) {
7113 A_domainMap.is_null(), std::invalid_argument,
7114 "Tpetra::CrsMatrix::add: If neither A nor B have a domain Map, "
7115 "then you must supply a nonnull domain Map to this method.");
7117 } else {
7119 }
7120 }
7121 if (rangeMap.is_null()) {
7122 if (B_rangeMap.is_null()) {
7124 A_rangeMap.is_null(), std::invalid_argument,
7125 "Tpetra::CrsMatrix::add: If neither A nor B have a range Map, "
7126 "then you must supply a nonnull range Map to this method.");
7128 } else {
7130 }
7131 }
7132
7133 if (debug) {
7134 // In debug mode, check that A and B have matching domain and
7135 // range Maps, if they have domain and range Maps at all. (If
7136 // they aren't fill complete, then they may not yet have them.)
7137 if (!A_domainMap.is_null() && !A_rangeMap.is_null()) {
7138 if (!B_domainMap.is_null() && !B_rangeMap.is_null()) {
7140 std::invalid_argument,
7141 errPfx << "The input RowMatrix A must have a domain Map "
7142 "which is the same as (isSameAs) this RowMatrix's "
7143 "domain Map.");
7144 TEUCHOS_TEST_FOR_EXCEPTION(!B_rangeMap->isSameAs(*A_rangeMap), std::invalid_argument,
7145 errPfx << "The input RowMatrix A must have a range Map "
7146 "which is the same as (isSameAs) this RowMatrix's range "
7147 "Map.");
7149 !domainMap->isSameAs(*B_domainMap),
7150 std::invalid_argument,
7151 errPfx << "The input domain Map must be the same as "
7152 "(isSameAs) this RowMatrix's domain Map.");
7154 !rangeMap->isSameAs(*B_rangeMap),
7155 std::invalid_argument,
7156 errPfx << "The input range Map must be the same as "
7157 "(isSameAs) this RowMatrix's range Map.");
7158 }
7159 } else if (!B_domainMap.is_null() && !B_rangeMap.is_null()) {
7161 !domainMap->isSameAs(*B_domainMap),
7162 std::invalid_argument,
7163 errPfx << "The input domain Map must be the same as "
7164 "(isSameAs) this RowMatrix's domain Map.");
7165 TEUCHOS_TEST_FOR_EXCEPTION(!rangeMap.is_null() && !rangeMap->isSameAs(*B_rangeMap),
7166 std::invalid_argument,
7167 errPfx << "The input range Map must be the same as "
7168 "(isSameAs) this RowMatrix's range Map.");
7169 } else {
7170 TEUCHOS_TEST_FOR_EXCEPTION(domainMap.is_null() || rangeMap.is_null(),
7171 std::invalid_argument, errPfx << "If neither A nor B "
7172 "have a domain and range Map, then you must supply a "
7173 "nonnull domain and range Map to this method.");
7174 }
7175 }
7176
7177 // What parameters do we pass to C's constructor? Do we call
7178 // fillComplete on C after filling it? And if so, what parameters
7179 // do we pass to C's fillComplete call?
7180 bool callFillComplete = true;
7183 if (!params.is_null()) {
7185 params->get("Call fillComplete", callFillComplete);
7186 constructorSublist = sublist(params, "Constructor parameters");
7187 fillCompleteSublist = sublist(params, "fillComplete parameters");
7188 }
7189
7190 RCP<const map_type> A_rowMap = A.getRowMap();
7191 RCP<const map_type> B_rowMap = B.getRowMap();
7192 RCP<const map_type> C_rowMap = B_rowMap; // see discussion in documentation
7193 RCP<crs_matrix_type> C; // The result matrix.
7194
7195 // If A and B's row Maps are the same, we can compute an upper
7196 // bound on the number of entries in each row of C, before
7197 // actually computing the sum. A reasonable upper bound is the
7198 // sum of the two entry counts in each row.
7199 if (A_rowMap->isSameAs(*B_rowMap)) {
7200 const LO localNumRows = static_cast<LO>(A_rowMap->getLocalNumElements());
7202
7203 // Get the number of entries in each row of A.
7204 if (alpha != ZERO) {
7205 for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7206 const size_t A_numEntries = A.getNumEntriesInLocalRow(localRow);
7208 }
7209 }
7210 // Get the number of entries in each row of B.
7211 if (beta != ZERO) {
7212 for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7213 const size_t B_numEntries = B.getNumEntriesInLocalRow(localRow);
7215 }
7216 }
7217 // Construct the result matrix C.
7218 if (constructorSublist.is_null()) {
7219 C = rcp(new crs_matrix_type(C_rowMap, C_maxNumEntriesPerRow()));
7220 } else {
7221 C = rcp(new crs_matrix_type(C_rowMap, C_maxNumEntriesPerRow(),
7223 }
7224 // Since A and B have the same row Maps, we could add them
7225 // together all at once and merge values before we call
7226 // insertGlobalValues. However, we don't really need to, since
7227 // we've already allocated enough space in each row of C for C
7228 // to do the merge itself.
7229 } else { // the row Maps of A and B are not the same
7230 // Construct the result matrix C.
7231 // true: !A_rowMap->isSameAs (*B_rowMap)
7232 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, errPfx << "The row maps must "
7233 "be the same for statically allocated matrices, to ensure "
7234 "that there is sufficient space to do the addition.");
7235 }
7236
7237 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null(), std::logic_error,
7238 errPfx << "C should not be null at this point. "
7239 "Please report this bug to the Tpetra developers.");
7240
7241 if (verbose) {
7242 std::ostringstream os;
7243 os << *prefix << "Compute C = alpha*A + beta*B" << endl;
7244 std::cerr << os.str();
7245 }
// Scratch buffers shared by both loops below.  They are resized only
// when a row has more entries than they currently hold.
7246 using gids_type = nonconst_global_inds_host_view_type;
7247 using vals_type = nonconst_values_host_view_type;
7248 gids_type ind;
7249 vals_type val;
7250
// Insert alpha * (row of A) into C, one row at a time.
7251 if (alpha != ZERO) {
7252 const LO A_localNumRows = static_cast<LO>(A_rowMap->getLocalNumElements());
7253 for (LO localRow = 0; localRow < A_localNumRows; ++localRow) {
7254 size_t A_numEntries = A.getNumEntriesInLocalRow(localRow);
7255 const GO globalRow = A_rowMap->getGlobalElement(localRow);
7256 if (A_numEntries > static_cast<size_t>(ind.size())) {
7257 Kokkos::resize(ind, A_numEntries);
7258 Kokkos::resize(val, A_numEntries);
7259 }
// Views over exactly the first A_numEntries slots of the scratch buffers.
7260 gids_type indView = Kokkos::subview(ind, std::make_pair((size_t)0, A_numEntries));
7261 vals_type valView = Kokkos::subview(val, std::make_pair((size_t)0, A_numEntries));
7262 A.getGlobalRowCopy(globalRow, indView, valView, A_numEntries);
7263
// Scaling is skipped when alpha equals one.
7264 if (alpha != ONE) {
7265 for (size_t k = 0; k < A_numEntries; ++k) {
7266 valView[k] *= alpha;
7267 }
7268 }
// NOTE(review): the reinterpret_cast assumes the view's element type is
// layout-compatible with Scalar -- confirm.
7269 C->insertGlobalValues(globalRow, A_numEntries,
7270 reinterpret_cast<Scalar*>(valView.data()),
7271 indView.data());
7272 }
7273 }
7274
// Same procedure for beta * (row of B).
7275 if (beta != ZERO) {
7276 const LO B_localNumRows = static_cast<LO>(B_rowMap->getLocalNumElements());
7277 for (LO localRow = 0; localRow < B_localNumRows; ++localRow) {
7278 size_t B_numEntries = B.getNumEntriesInLocalRow(localRow);
7279 const GO globalRow = B_rowMap->getGlobalElement(localRow);
7280 if (B_numEntries > static_cast<size_t>(ind.size())) {
7281 Kokkos::resize(ind, B_numEntries);
7282 Kokkos::resize(val, B_numEntries);
7283 }
7284 gids_type indView = Kokkos::subview(ind, std::make_pair((size_t)0, B_numEntries));
7285 vals_type valView = Kokkos::subview(val, std::make_pair((size_t)0, B_numEntries));
7286 B.getGlobalRowCopy(globalRow, indView, valView, B_numEntries);
7287
7288 if (beta != ONE) {
7289 for (size_t k = 0; k < B_numEntries; ++k) {
7290 valView[k] *= beta;
7291 }
7292 }
7293 C->insertGlobalValues(globalRow, B_numEntries,
7294 reinterpret_cast<Scalar*>(valView.data()),
7295 indView.data());
7296 }
7297 }
7298
// callFillComplete defaults to true (see its declaration above).
7299 if (callFillComplete) {
7300 if (verbose) {
7301 std::ostringstream os;
7302 os << *prefix << "Call fillComplete on C" << endl;
7303 std::cerr << os.str();
7304 }
7305 if (fillCompleteSublist.is_null()) {
7306 C->fillComplete(theDomainMap, theRangeMap);
7307 } else {
7309 }
7310 } else if (verbose) {
7311 std::ostringstream os;
7312 os << *prefix << "Do NOT call fillComplete on C" << endl;
7313 std::cerr << os.str();
7314 }
7315
7316 if (verbose) {
7317 std::ostringstream os;
7318 os << *prefix << "Done" << endl;
7319 std::cerr << os.str();
7320 }
7322}
7323
7324template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7327 const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& rowTransfer,
7328 const Teuchos::RCP<const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>>& domainTransfer,
7329 const Teuchos::RCP<const map_type>& domainMap,
7330 const Teuchos::RCP<const map_type>& rangeMap,
7331 const Teuchos::RCP<Teuchos::ParameterList>& params) const {
7332 using Details::Behavior;
7337 using std::endl;
7338 using Teuchos::ArrayRCP;
7339 using Teuchos::ArrayView;
7340 using Teuchos::Comm;
7341 using Teuchos::ParameterList;
7342 using Teuchos::RCP;
7343 typedef LocalOrdinal LO;
7344 typedef GlobalOrdinal GO;
7345 typedef node_type NT;
7348 using Teuchos::as;
7349
7350 const bool debug = Behavior::debug("CrsMatrix");
7351 const bool verbose = Behavior::verbose("CrsMatrix");
7352 int MyPID = getComm()->getRank();
7353
7354 std::unique_ptr<std::string> verbosePrefix;
7355 if (verbose) {
7357 this->createPrefix("CrsMatrix", "transferAndFillComplete");
7358 std::ostringstream os;
7359 os << "Start" << endl;
7360 std::cerr << os.str();
7361 }
7362
7363 //
7364 // Get the caller's parameters
7365 //
7366 bool isMM = false; // optimize for matrix-matrix ops.
7367 bool reverseMode = false; // Are we in reverse mode?
7368 bool restrictComm = false; // Do we need to restrict the communicator?
7369
7370 int mm_optimization_core_count =
7371 Behavior::TAFC_OptimizationCoreCount();
7372 RCP<ParameterList> matrixparams; // parameters for the destination matrix
7373 bool overrideAllreduce = false;
7374 bool useKokkosPath = false;
7375 if (!params.is_null()) {
7376 matrixparams = sublist(params, "CrsMatrix");
7377 reverseMode = params->get("Reverse Mode", reverseMode);
7378 useKokkosPath = params->get("TAFC: use kokkos path", useKokkosPath);
7379 restrictComm = params->get("Restrict Communicator", restrictComm);
7380 auto& slist = params->sublist("matrixmatrix: kernel params", false);
7381 isMM = slist.get("isMatrixMatrix_TransferAndFillComplete", false);
7382 mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
7383
7384 overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
7385 if (getComm()->getSize() < mm_optimization_core_count && isMM) isMM = false;
7386 if (reverseMode) isMM = false;
7387 }
7388
7389 // Only used in the sparse matrix-matrix multiply (isMM) case.
7390 std::shared_ptr<::Tpetra::Details::CommRequest> iallreduceRequest;
7391 int mismatch = 0;
7392 int reduced_mismatch = 0;
7393 if (isMM && !overrideAllreduce) {
7394 // Test for pathological matrix transfer
7395 const bool source_vals = !getGraph()->getImporter().is_null();
7396 const bool target_vals = !(rowTransfer.getExportLIDs().size() == 0 ||
7397 rowTransfer.getRemoteLIDs().size() == 0);
7398 mismatch = (source_vals != target_vals) ? 1 : 0;
7399 iallreduceRequest =
7400 ::Tpetra::Details::iallreduce(mismatch, reduced_mismatch,
7401 Teuchos::REDUCE_MAX, *(getComm()));
7402 }
7403
7404#ifdef HAVE_TPETRA_MMM_TIMINGS
7405 using Teuchos::TimeMonitor;
7406 std::string label;
7407 if (!params.is_null())
7408 label = params->get("Timer Label", label);
7409 std::string prefix = std::string("Tpetra ") + label + std::string(": ");
7410 std::string tlstr;
7411 {
7412 std::ostringstream os;
7413 if (isMM)
7414 os << ":MMOpt";
7415 else
7416 os << ":MMLegacy";
7417 tlstr = os.str();
7418 }
7419
7420 Teuchos::TimeMonitor MMall(*TimeMonitor::getNewTimer(prefix + std::string("TAFC All") + tlstr));
7421#endif
7422
7423 // Make sure that the input argument rowTransfer is either an
7424 // Import or an Export. Import and Export are the only two
7425 // subclasses of Transfer that we defined, but users might
7426 // (unwisely, for now at least) decide to implement their own
7427 // subclasses. Exclude this possibility.
7428 const import_type* xferAsImport = dynamic_cast<const import_type*>(&rowTransfer);
7429 const export_type* xferAsExport = dynamic_cast<const export_type*>(&rowTransfer);
7430 TEUCHOS_TEST_FOR_EXCEPTION(
7431 xferAsImport == nullptr && xferAsExport == nullptr, std::invalid_argument,
7432 "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' input "
7433 "argument must be either an Import or an Export, and its template "
7434 "parameters must match the corresponding template parameters of the "
7435 "CrsMatrix.");
7436
7437 // Make sure that the input argument domainTransfer is either an
7438 // Import or an Export. Import and Export are the only two
7439 // subclasses of Transfer that we defined, but users might
7440 // (unwisely, for now at least) decide to implement their own
7441 // subclasses. Exclude this possibility.
7442 Teuchos::RCP<const import_type> xferDomainAsImport = Teuchos::rcp_dynamic_cast<const import_type>(domainTransfer);
7443 Teuchos::RCP<const export_type> xferDomainAsExport = Teuchos::rcp_dynamic_cast<const export_type>(domainTransfer);
7444
7445 if (!domainTransfer.is_null()) {
7446 TEUCHOS_TEST_FOR_EXCEPTION(
7447 (xferDomainAsImport.is_null() && xferDomainAsExport.is_null()), std::invalid_argument,
7448 "Tpetra::CrsMatrix::transferAndFillComplete: The 'domainTransfer' input "
7449 "argument must be either an Import or an Export, and its template "
7450 "parameters must match the corresponding template parameters of the "
7451 "CrsMatrix.");
7452
7453 TEUCHOS_TEST_FOR_EXCEPTION(
7454 (xferAsImport != nullptr || !xferDomainAsImport.is_null()) &&
7455 ((xferAsImport != nullptr && xferDomainAsImport.is_null()) ||
7456 (xferAsImport == nullptr && !xferDomainAsImport.is_null())),
7457 std::invalid_argument,
7458 "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7459 "arguments must be of the same type (either Import or Export).");
7460
7461 TEUCHOS_TEST_FOR_EXCEPTION(
7462 (xferAsExport != nullptr || !xferDomainAsExport.is_null()) &&
7463 ((xferAsExport != nullptr && xferDomainAsExport.is_null()) ||
7464 (xferAsExport == nullptr && !xferDomainAsExport.is_null())),
7465 std::invalid_argument,
7466 "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7467 "arguments must be of the same type (either Import or Export).");
7468 } // domainTransfer != null
7469
7470 // FIXME (mfh 15 May 2014) Wouldn't communication still be needed,
7471 // if the source Map is not distributed but the target Map is?
7472 const bool communication_needed = rowTransfer.getSourceMap()->isDistributed();
7473
7474 // Get the new domain and range Maps. We need some of them for
7475 // error checking, now that we have the reverseMode parameter.
7476 RCP<const map_type> MyRowMap = reverseMode ? rowTransfer.getSourceMap() : rowTransfer.getTargetMap();
7477 RCP<const map_type> MyColMap; // create this below
7478 RCP<const map_type> MyDomainMap = !domainMap.is_null() ? domainMap : getDomainMap();
7479 RCP<const map_type> MyRangeMap = !rangeMap.is_null() ? rangeMap : getRangeMap();
7480 RCP<const map_type> BaseRowMap = MyRowMap;
7481 RCP<const map_type> BaseDomainMap = MyDomainMap;
7482
7483 // If the user gave us a nonnull destMat, then check whether it's
7484 // "pristine." That means that it has no entries.
7485 //
7486 // FIXME (mfh 15 May 2014) If this is not true on all processes,
7487 // then this exception test may hang. It would be better to
7488 // forward an error flag to the next communication phase.
7489 if (!destMat.is_null()) {
7490 // FIXME (mfh 15 May 2014): The Epetra idiom for checking
7491 // whether a graph or matrix has no entries on the calling
7492 // process, is that it is neither locally nor globally indexed.
7493 // This may change eventually with the Kokkos refactor version
7494 // of Tpetra, so it would be better just to check the quantity
7495 // of interest directly. Note that with the Kokkos refactor
7496 // version of Tpetra, asking for the total number of entries in
7497 // a graph or matrix that is not fill complete might require
7498 // computation (kernel launch), since it is not thread scalable
7499 // to update a count every time an entry is inserted.
7500 const bool NewFlag = !destMat->getGraph()->isLocallyIndexed() &&
7501 !destMat->getGraph()->isGloballyIndexed();
7502 TEUCHOS_TEST_FOR_EXCEPTION(
7503 !NewFlag, std::invalid_argument,
7504 "Tpetra::CrsMatrix::"
7505 "transferAndFillComplete: The input argument 'destMat' is only allowed "
7506 "to be nonnull, if its graph is empty (neither locally nor globally "
7507 "indexed).");
7508 // FIXME (mfh 15 May 2014) At some point, we want to change
7509 // graphs and matrices so that their DistObject Map
7510 // (this->getMap()) may differ from their row Map. This will
7511 // make redistribution for 2-D distributions more efficient. I
7512 // hesitate to change this check, because I'm not sure how much
7513 // the code here depends on getMap() and getRowMap() being the
7514 // same.
7515 TEUCHOS_TEST_FOR_EXCEPTION(
7516 !destMat->getRowMap()->isSameAs(*MyRowMap), std::invalid_argument,
7517 "Tpetra::CrsMatrix::transferAndFillComplete: The (row) Map of the "
7518 "input argument 'destMat' is not the same as the (row) Map specified "
7519 "by the input argument 'rowTransfer'.");
7520 TEUCHOS_TEST_FOR_EXCEPTION(
7521 !destMat->checkSizes(*this), std::invalid_argument,
7522 "Tpetra::CrsMatrix::transferAndFillComplete: You provided a nonnull "
7523 "destination matrix, but checkSizes() indicates that it is not a legal "
7524 "legal target for redistribution from the source matrix (*this). This "
7525 "may mean that they do not have the same dimensions.");
7526 }
7527
7528 // If forward mode (the default), then *this's (row) Map must be
7529 // the same as the source Map of the Transfer. If reverse mode,
7530 // then *this's (row) Map must be the same as the target Map of
7531 // the Transfer.
7532 //
7533 // FIXME (mfh 15 May 2014) At some point, we want to change graphs
7534 // and matrices so that their DistObject Map (this->getMap()) may
7535 // differ from their row Map. This will make redistribution for
7536 // 2-D distributions more efficient. I hesitate to change this
7537 // check, because I'm not sure how much the code here depends on
7538 // getMap() and getRowMap() being the same.
7539 TEUCHOS_TEST_FOR_EXCEPTION(
7540 !(reverseMode || getRowMap()->isSameAs(*rowTransfer.getSourceMap())),
7541 std::invalid_argument,
7542 "Tpetra::CrsMatrix::transferAndFillComplete: "
7543 "rowTransfer->getSourceMap() must match this->getRowMap() in forward mode.");
7544 TEUCHOS_TEST_FOR_EXCEPTION(
7545 !(!reverseMode || getRowMap()->isSameAs(*rowTransfer.getTargetMap())),
7546 std::invalid_argument,
7547 "Tpetra::CrsMatrix::transferAndFillComplete: "
7548 "rowTransfer->getTargetMap() must match this->getRowMap() in reverse mode.");
7549
7550 // checks for domainTransfer
7551 TEUCHOS_TEST_FOR_EXCEPTION(
7552 !xferDomainAsImport.is_null() && !xferDomainAsImport->getTargetMap()->isSameAs(*domainMap),
7553 std::invalid_argument,
7554 "Tpetra::CrsMatrix::transferAndFillComplete: The target map of the 'domainTransfer' input "
7555 "argument must be the same as the rebalanced domain map 'domainMap'");
7556
7557 TEUCHOS_TEST_FOR_EXCEPTION(
7558 !xferDomainAsExport.is_null() && !xferDomainAsExport->getSourceMap()->isSameAs(*domainMap),
7559 std::invalid_argument,
7560 "Tpetra::CrsMatrix::transferAndFillComplete: The source map of the 'domainTransfer' input "
7561 "argument must be the same as the rebalanced domain map 'domainMap'");
7562
7563 // The basic algorithm here is:
7564 //
7565 // 1. Call the moral equivalent of "Distor.do" to handle the import.
7566 // 2. Copy all the Imported and Copy/Permuted data into the raw
7567 // CrsMatrix / CrsGraphData pointers, still using GIDs.
7568 // 3. Call an optimized version of MakeColMap that avoids the
7569 // Directory lookups (since the importer knows who owns all the
7570 // GIDs) AND reindexes to LIDs.
7571 // 4. Call expertStaticFillComplete()
7572
7573 // Get information from the Importer
7574 const size_t NumSameIDs = rowTransfer.getNumSameIDs();
7575 ArrayView<const LO> ExportLIDs = reverseMode ? rowTransfer.getRemoteLIDs() : rowTransfer.getExportLIDs();
7576 auto RemoteLIDs = reverseMode ? rowTransfer.getExportLIDs_dv() : rowTransfer.getRemoteLIDs_dv();
7577 auto PermuteToLIDs = reverseMode ? rowTransfer.getPermuteFromLIDs_dv() : rowTransfer.getPermuteToLIDs_dv();
7578 auto PermuteFromLIDs = reverseMode ? rowTransfer.getPermuteToLIDs_dv() : rowTransfer.getPermuteFromLIDs_dv();
7579 Distributor& Distor = rowTransfer.getDistributor();
7580
7581 // Owning PIDs
7582 Teuchos::Array<int> SourcePids;
7583
7584 // Temp variables for sub-communicators
7585 RCP<const map_type> ReducedRowMap, ReducedColMap,
7586 ReducedDomainMap, ReducedRangeMap;
7587 RCP<const Comm<int>> ReducedComm;
7588
7589 // If the user gave us a null destMat, then construct the new
7590 // destination matrix. We will replace its column Map later.
7591 if (destMat.is_null()) {
7592 destMat = rcp(new this_CRS_type(MyRowMap, 0, matrixparams));
7593 }
7594
7595 /***************************************************/
7596 /***** 1) First communicator restriction phase ****/
7597 /***************************************************/
7598 if (restrictComm) {
7599#ifdef HAVE_TPETRA_MMM_TIMINGS
7600 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrictComm")));
7601#endif
7602 ReducedRowMap = MyRowMap->removeEmptyProcesses();
7603 ReducedComm = ReducedRowMap.is_null() ? Teuchos::null : ReducedRowMap->getComm();
7604 destMat->removeEmptyProcessesInPlace(ReducedRowMap);
7605
7606 ReducedDomainMap = MyRowMap.getRawPtr() == MyDomainMap.getRawPtr() ? ReducedRowMap : MyDomainMap->replaceCommWithSubset(ReducedComm);
7607 ReducedRangeMap = MyRowMap.getRawPtr() == MyRangeMap.getRawPtr() ? ReducedRowMap : MyRangeMap->replaceCommWithSubset(ReducedComm);
7608
7609 // Reset the "my" maps
7610 MyRowMap = ReducedRowMap;
7611 MyDomainMap = ReducedDomainMap;
7612 MyRangeMap = ReducedRangeMap;
7613
7614 // Update my PID, if we've restricted the communicator
7615 if (!ReducedComm.is_null()) {
7616 MyPID = ReducedComm->getRank();
7617 } else {
7618 MyPID = -2; // For debugging
7619 }
7620 } else {
7621 ReducedComm = MyRowMap->getComm();
7622 }
7623
7624 /***************************************************/
7625 /***** 2) From Tpetra::DistObject::doTransfer() ****/
7626 /***************************************************/
7627 // Get the owning PIDs
7628 RCP<const import_type> MyImporter = getGraph()->getImporter();
7629
7630 // Check whether the domain Map of the source matrix and the base domain Map are the same
7631 bool bSameDomainMap = BaseDomainMap->isSameAs(*getDomainMap());
7632
7633 if (!restrictComm && !MyImporter.is_null() && bSameDomainMap) {
7634#ifdef HAVE_TPETRA_MMM_TIMINGS
7635 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs same map")));
7636#endif
7637 // Same domain map as source matrix
7638 //
7639 // NOTE: This won't work for restrictComm (because the Import
7640 // doesn't know the restricted PIDs), though writing an
7641 // optimized version for that case would be easy (Import an
7642 // IntVector of the new PIDs). Might want to add this later.
7643 Import_Util::getPids(*MyImporter, SourcePids, false);
7644 } else if (restrictComm && !MyImporter.is_null() && bSameDomainMap) {
7645 // Same domain map as source matrix (restricted communicator)
7646 // We need one import from the domain to the column map
7647#ifdef HAVE_TPETRA_MMM_TIMINGS
7648 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs restricted comm")));
7649#endif
7650 IntVectorType SourceDomain_pids(getDomainMap(), true);
7651 IntVectorType SourceCol_pids(getColMap());
7652 // SourceDomain_pids contains the restricted pids
7653 SourceDomain_pids.putScalar(MyPID);
7654
7655 SourceCol_pids.doImport(SourceDomain_pids, *MyImporter, INSERT);
7656 SourcePids.resize(getColMap()->getLocalNumElements());
7657 SourceCol_pids.get1dCopy(SourcePids());
7658 } else if (MyImporter.is_null()) {
7659 // Matrix has no off-process entries
7660#ifdef HAVE_TPETRA_MMM_TIMINGS
7661 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs all local entries")));
7662#endif
7663 SourcePids.resize(getColMap()->getLocalNumElements());
7664 SourcePids.assign(getColMap()->getLocalNumElements(), MyPID);
7665 } else if (!MyImporter.is_null() &&
7666 !domainTransfer.is_null()) {
7667 // general implementation for rectangular matrices with
7668 // domain map different than SourceMatrix domain map.
7669 // User has to provide a DomainTransfer object. We need
7670 // two communications (import/export)
7671#ifdef HAVE_TPETRA_MMM_TIMINGS
7672 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs rectangular case")));
7673#endif
7674
7675 // TargetDomain_pids lives on the rebalanced new domain map
7676 IntVectorType TargetDomain_pids(domainMap);
7677 TargetDomain_pids.putScalar(MyPID);
7678
7679 // SourceDomain_pids lives on the non-rebalanced old domain map
7680 IntVectorType SourceDomain_pids(getDomainMap());
7681
7682 // SourceCol_pids lives on the non-rebalanced old column map
7683 IntVectorType SourceCol_pids(getColMap());
7684
7685 if (!reverseMode && !xferDomainAsImport.is_null()) {
7686 SourceDomain_pids.doExport(TargetDomain_pids, *xferDomainAsImport, INSERT);
7687 } else if (reverseMode && !xferDomainAsExport.is_null()) {
7688 SourceDomain_pids.doExport(TargetDomain_pids, *xferDomainAsExport, INSERT);
7689 } else if (!reverseMode && !xferDomainAsExport.is_null()) {
7690 SourceDomain_pids.doImport(TargetDomain_pids, *xferDomainAsExport, INSERT);
7691 } else if (reverseMode && !xferDomainAsImport.is_null()) {
7692 SourceDomain_pids.doImport(TargetDomain_pids, *xferDomainAsImport, INSERT);
7693 } else {
7694 TEUCHOS_TEST_FOR_EXCEPTION(
7695 true, std::logic_error,
7696 "Tpetra::CrsMatrix::"
7697 "transferAndFillComplete: Should never get here! "
7698 "Please report this bug to a Tpetra developer.");
7699 }
7700 SourceCol_pids.doImport(SourceDomain_pids, *MyImporter, INSERT);
7701 SourcePids.resize(getColMap()->getLocalNumElements());
7702 SourceCol_pids.get1dCopy(SourcePids());
7703 } else if (!MyImporter.is_null() &&
7704 BaseDomainMap->isSameAs(*BaseRowMap) &&
7705 getDomainMap()->isSameAs(*getRowMap())) {
7706 // We can use the rowTransfer + SourceMatrix's Import to find out who owns what.
7707#ifdef HAVE_TPETRA_MMM_TIMINGS
7708 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs query import")));
7709#endif
7710
7711 IntVectorType TargetRow_pids(domainMap);
7712 IntVectorType SourceRow_pids(getRowMap());
7713 IntVectorType SourceCol_pids(getColMap());
7714
7715 TargetRow_pids.putScalar(MyPID);
7716 if (!reverseMode && xferAsImport != nullptr) {
7717 SourceRow_pids.doExport(TargetRow_pids, *xferAsImport, INSERT);
7718 } else if (reverseMode && xferAsExport != nullptr) {
7719 SourceRow_pids.doExport(TargetRow_pids, *xferAsExport, INSERT);
7720 } else if (!reverseMode && xferAsExport != nullptr) {
7721 SourceRow_pids.doImport(TargetRow_pids, *xferAsExport, INSERT);
7722 } else if (reverseMode && xferAsImport != nullptr) {
7723 SourceRow_pids.doImport(TargetRow_pids, *xferAsImport, INSERT);
7724 } else {
7725 TEUCHOS_TEST_FOR_EXCEPTION(
7726 true, std::logic_error,
7727 "Tpetra::CrsMatrix::"
7728 "transferAndFillComplete: Should never get here! "
7729 "Please report this bug to a Tpetra developer.");
7730 }
7731
7732 SourceCol_pids.doImport(SourceRow_pids, *MyImporter, INSERT);
7733 SourcePids.resize(getColMap()->getLocalNumElements());
7734 SourceCol_pids.get1dCopy(SourcePids());
7735 } else {
7736 TEUCHOS_TEST_FOR_EXCEPTION(
7737 true, std::invalid_argument,
7738 "Tpetra::CrsMatrix::"
7739 "transferAndFillComplete: This method only allows either domainMap == "
7740 "getDomainMap (), or (domainMap == rowTransfer.getTargetMap () and "
7741 "getDomainMap () == getRowMap ()).");
7742 }
7743
7744 // Tpetra-specific stuff
7745 size_t constantNumPackets = destMat->constantNumberOfPackets();
7746 {
7747#ifdef HAVE_TPETRA_MMM_TIMINGS
7748 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC reallocate buffers")));
7749#endif
7750 if (constantNumPackets == 0) {
7751 destMat->reallocArraysForNumPacketsPerLid(ExportLIDs.size(),
7752 RemoteLIDs.view_host().size());
7753 } else {
7754 // There are a constant number of packets per element. We
7755 // already know (from the number of "remote" (incoming)
7756 // elements) how many incoming elements we expect, so we can
7757 // resize the buffer accordingly.
7758 const size_t rbufLen = RemoteLIDs.view_host().size() * constantNumPackets;
7759 destMat->reallocImportsIfNeeded(rbufLen, false, nullptr);
7760 }
7761 }
7762
7763 // Pack & Prepare w/ owning PIDs
7764 {
7765#ifdef HAVE_TPETRA_MMM_TIMINGS
7766 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC pack and prepare")));
7767#endif
7768 if (debug) {
7769 using std::cerr;
7770 using std::endl;
7771 using Teuchos::outArg;
7772 using Teuchos::REDUCE_MAX;
7773 using Teuchos::reduceAll;
7774 RCP<const Teuchos::Comm<int>> comm = this->getComm();
7775 const int myRank = comm->getRank();
7776
7777 std::ostringstream errStrm;
7778 int lclErr = 0;
7779 int gblErr = 0;
7780
7781 Teuchos::ArrayView<size_t> numExportPacketsPerLID;
7782 try {
7783 // packAndPrepare* methods modify numExportPacketsPerLID_.
7784 destMat->numExportPacketsPerLID_.modify_host();
7785 numExportPacketsPerLID =
7786 getArrayViewFromDualView(destMat->numExportPacketsPerLID_);
7787 } catch (std::exception& e) {
7788 errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw: "
7789 << e.what() << std::endl;
7790 lclErr = 1;
7791 } catch (...) {
7792 errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw "
7793 "an exception not a subclass of std::exception"
7794 << std::endl;
7795 lclErr = 1;
7796 }
7797
7798 if (!comm.is_null()) {
7799 reduceAll<int, int>(*comm, REDUCE_MAX, lclErr, outArg(gblErr));
7800 }
7801 if (gblErr != 0) {
7802 ::Tpetra::Details::gathervPrint(cerr, errStrm.str(), *comm);
7803 TEUCHOS_TEST_FOR_EXCEPTION(
7804 true, std::runtime_error,
7805 "getArrayViewFromDualView threw an "
7806 "exception on at least one process.");
7807 }
7808
7809 if (verbose) {
7810 std::ostringstream os;
7811 os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
7812 << std::endl;
7813 std::cerr << os.str();
7814 }
7815 try {
7817 destMat->exports_,
7818 numExportPacketsPerLID,
7819 ExportLIDs,
7820 SourcePids,
7821 constantNumPackets);
7822 } catch (std::exception& e) {
7823 errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw: "
7824 << e.what() << std::endl;
7825 lclErr = 1;
7826 } catch (...) {
7827 errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw "
7828 "an exception not a subclass of std::exception"
7829 << std::endl;
7830 lclErr = 1;
7831 }
7832
7833 if (verbose) {
7834 std::ostringstream os;
7835 os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
7836 << std::endl;
7837 std::cerr << os.str();
7838 }
7839
7840 if (!comm.is_null()) {
7841 reduceAll<int, int>(*comm, REDUCE_MAX, lclErr, outArg(gblErr));
7842 }
7843 if (gblErr != 0) {
7844 ::Tpetra::Details::gathervPrint(cerr, errStrm.str(), *comm);
7845 TEUCHOS_TEST_FOR_EXCEPTION(
7846 true, std::runtime_error,
7847 "packCrsMatrixWithOwningPIDs threw an "
7848 "exception on at least one process.");
7849 }
7850 } else {
7851 // packAndPrepare* methods modify numExportPacketsPerLID_.
7852 destMat->numExportPacketsPerLID_.modify_host();
7853 Teuchos::ArrayView<size_t> numExportPacketsPerLID =
7854 getArrayViewFromDualView(destMat->numExportPacketsPerLID_);
7855 if (verbose) {
7856 std::ostringstream os;
7857 os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
7858 << std::endl;
7859 std::cerr << os.str();
7860 }
7862 destMat->exports_,
7863 numExportPacketsPerLID,
7864 ExportLIDs,
7865 SourcePids,
7866 constantNumPackets);
7867 if (verbose) {
7868 std::ostringstream os;
7869 os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
7870 << std::endl;
7871 std::cerr << os.str();
7872 }
7873 }
7874 }
7875
7876 // Do the exchange of remote data.
7877 {
7878#ifdef HAVE_TPETRA_MMM_TIMINGS
7879 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs exchange remote data")));
7880#endif
7881 if (!communication_needed) {
7882 if (verbose) {
7883 std::ostringstream os;
7884 os << *verbosePrefix << "Communication not needed" << std::endl;
7885 std::cerr << os.str();
7886 }
7887 } else {
7888 if (reverseMode) {
7889 if (constantNumPackets == 0) { // variable number of packets per LID
7890 if (verbose) {
7891 std::ostringstream os;
7892 os << *verbosePrefix << "Reverse mode, variable # packets / LID"
7893 << std::endl;
7894 std::cerr << os.str();
7895 }
7896 // Make sure that host has the latest version, since we're
7897 // using the version on host. If host has the latest
7898 // version, syncing to host does nothing.
7899 destMat->numExportPacketsPerLID_.sync_host();
7900 Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
7901 getArrayViewFromDualView(destMat->numExportPacketsPerLID_);
7902 destMat->numImportPacketsPerLID_.sync_host();
7903 Teuchos::ArrayView<size_t> numImportPacketsPerLID =
7904 getArrayViewFromDualView(destMat->numImportPacketsPerLID_);
7905
7906 if (verbose) {
7907 std::ostringstream os;
7908 os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
7909 << std::endl;
7910 std::cerr << os.str();
7911 }
7912 Distor.doReversePostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1,
7913 destMat->numImportPacketsPerLID_.view_host());
7914 if (verbose) {
7915 std::ostringstream os;
7916 os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
7917 << std::endl;
7918 std::cerr << os.str();
7919 }
7920
7921 size_t totalImportPackets = 0;
7922 for (Array_size_type i = 0; i < numImportPacketsPerLID.size(); ++i) {
7923 totalImportPackets += numImportPacketsPerLID[i];
7924 }
7925
7926 // Reallocation MUST go before setting the modified flag,
7927 // because it may clear out the flags.
7928 destMat->reallocImportsIfNeeded(totalImportPackets, verbose,
7929 verbosePrefix.get());
7930 destMat->imports_.modify_host();
7931 auto hostImports = destMat->imports_.view_host();
7932 // This is a legacy host pack/unpack path, so use the host
7933 // version of exports_.
7934 destMat->exports_.sync_host();
7935 auto hostExports = destMat->exports_.view_host();
7936 if (verbose) {
7937 std::ostringstream os;
7938 os << *verbosePrefix << "Calling 4-arg doReversePostsAndWaits"
7939 << std::endl;
7940 std::cerr << os.str();
7941 }
7942 Distor.doReversePostsAndWaits(hostExports,
7943 numExportPacketsPerLID,
7944 hostImports,
7945 numImportPacketsPerLID);
7946 if (verbose) {
7947 std::ostringstream os;
7948 os << *verbosePrefix << "Finished 4-arg doReversePostsAndWaits"
7949 << std::endl;
7950 std::cerr << os.str();
7951 }
7952 } else { // constant number of packets per LID
7953 if (verbose) {
7954 std::ostringstream os;
7955 os << *verbosePrefix << "Reverse mode, constant # packets / LID"
7956 << std::endl;
7957 std::cerr << os.str();
7958 }
7959 destMat->imports_.modify_host();
7960 auto hostImports = destMat->imports_.view_host();
7961 // This is a legacy host pack/unpack path, so use the host
7962 // version of exports_.
7963 destMat->exports_.sync_host();
7964 auto hostExports = destMat->exports_.view_host();
7965 if (verbose) {
7966 std::ostringstream os;
7967 os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
7968 << std::endl;
7969 std::cerr << os.str();
7970 }
7971 Distor.doReversePostsAndWaits(hostExports,
7972 constantNumPackets,
7973 hostImports);
7974 if (verbose) {
7975 std::ostringstream os;
7976 os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
7977 << std::endl;
7978 std::cerr << os.str();
7979 }
7980 }
7981 } else { // forward mode (the default)
7982 if (constantNumPackets == 0) { // variable number of packets per LID
7983 if (verbose) {
7984 std::ostringstream os;
7985 os << *verbosePrefix << "Forward mode, variable # packets / LID"
7986 << std::endl;
7987 std::cerr << os.str();
7988 }
7989 // Make sure that host has the latest version, since we're
7990 // using the version on host. If host has the latest
7991 // version, syncing to host does nothing.
7992 destMat->numExportPacketsPerLID_.sync_host();
7993 Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
7994 getArrayViewFromDualView(destMat->numExportPacketsPerLID_);
7995 destMat->numImportPacketsPerLID_.sync_host();
7996 Teuchos::ArrayView<size_t> numImportPacketsPerLID =
7997 getArrayViewFromDualView(destMat->numImportPacketsPerLID_);
7998 if (verbose) {
7999 std::ostringstream os;
8000 os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8001 << std::endl;
8002 std::cerr << os.str();
8003 }
8004 Distor.doPostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1,
8005 destMat->numImportPacketsPerLID_.view_host());
8006 if (verbose) {
8007 std::ostringstream os;
8008 os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8009 << std::endl;
8010 std::cerr << os.str();
8011 }
8012
8013 size_t totalImportPackets = 0;
8014 for (Array_size_type i = 0; i < numImportPacketsPerLID.size(); ++i) {
8015 totalImportPackets += numImportPacketsPerLID[i];
8016 }
8017
8018 // Reallocation MUST go before setting the modified flag,
8019 // because it may clear out the flags.
8020 destMat->reallocImportsIfNeeded(totalImportPackets, verbose,
8021 verbosePrefix.get());
8022 destMat->imports_.modify_host();
8023 auto hostImports = destMat->imports_.view_host();
8024 // This is a legacy host pack/unpack path, so use the host
8025 // version of exports_.
8026 destMat->exports_.sync_host();
8027 auto hostExports = destMat->exports_.view_host();
8028 if (verbose) {
8029 std::ostringstream os;
8030 os << *verbosePrefix << "Calling 4-arg doPostsAndWaits"
8031 << std::endl;
8032 std::cerr << os.str();
8033 }
8034 Distor.doPostsAndWaits(hostExports,
8035 numExportPacketsPerLID,
8036 hostImports,
8037 numImportPacketsPerLID);
8038 if (verbose) {
8039 std::ostringstream os;
8040 os << *verbosePrefix << "Finished 4-arg doPostsAndWaits"
8041 << std::endl;
8042 std::cerr << os.str();
8043 }
8044 } else { // constant number of packets per LID
8045 if (verbose) {
8046 std::ostringstream os;
8047 os << *verbosePrefix << "Forward mode, constant # packets / LID"
8048 << std::endl;
8049 std::cerr << os.str();
8050 }
8051 destMat->imports_.modify_host();
8052 auto hostImports = destMat->imports_.view_host();
8053 // This is a legacy host pack/unpack path, so use the host
8054 // version of exports_.
8055 destMat->exports_.sync_host();
8056 auto hostExports = destMat->exports_.view_host();
8057 if (verbose) {
8058 std::ostringstream os;
8059 os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8060 << std::endl;
8061 std::cerr << os.str();
8062 }
8063 Distor.doPostsAndWaits(hostExports,
8064 constantNumPackets,
8065 hostImports);
8066 if (verbose) {
8067 std::ostringstream os;
8068 os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8069 << std::endl;
8070 std::cerr << os.str();
8071 }
8072 }
8073 }
8074 }
8075 }
8076
8077 /*********************************************************************/
8078 /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/
8079 /*********************************************************************/
8080
8081 bool runOnHost = std::is_same_v<typename device_type::memory_space, Kokkos::HostSpace> && !useKokkosPath;
8082
8083 Teuchos::Array<int> RemotePids;
8084 if (runOnHost) {
8085 Teuchos::Array<int> TargetPids;
8086 // Backwards compatibility measure. We'll use this again below.
8087
8088 // TODO JHU Need to track down why numImportPacketsPerLID_ has not been correctly marked as modified on host (which it has been)
8089 // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
8090 // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
8091 destMat->numImportPacketsPerLID_.modify_host(); // FIXME
8092
8093#ifdef HAVE_TPETRA_MMM_TIMINGS
8094 RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
8095#endif
8096 ArrayRCP<size_t> CSR_rowptr;
8097 ArrayRCP<GO> CSR_colind_GID;
8098 ArrayRCP<LO> CSR_colind_LID;
8099 ArrayRCP<Scalar> CSR_vals;
8100
8101 destMat->imports_.sync_device();
8102 destMat->numImportPacketsPerLID_.sync_device();
8103
8104 size_t N = BaseRowMap->getLocalNumElements();
8105
8106 auto RemoteLIDs_d = RemoteLIDs.view_device();
8107 auto PermuteToLIDs_d = PermuteToLIDs.view_device();
8108 auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();
8109
8111 *this,
8112 RemoteLIDs_d,
8113 destMat->imports_.view_device(), // hostImports
8114 destMat->numImportPacketsPerLID_.view_device(), // numImportPacketsPerLID
8115 NumSameIDs,
8116 PermuteToLIDs_d,
8117 PermuteFromLIDs_d,
8118 N,
8119 MyPID,
8120 CSR_rowptr,
8121 CSR_colind_GID,
8122 CSR_vals,
8123 SourcePids(),
8124 TargetPids);
8125
8126 // If LO and GO are the same, we can reuse memory when
8127 // converting the column indices from global to local indices.
8128 if (typeid(LO) == typeid(GO)) {
8129 CSR_colind_LID = Teuchos::arcp_reinterpret_cast<LO>(CSR_colind_GID);
8130 } else {
8131 CSR_colind_LID.resize(CSR_colind_GID.size());
8132 }
8133 CSR_colind_LID.resize(CSR_colind_GID.size());
8134
8135 // On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally
8136 // owned entries. Convert them to the actual PID.
8137 // JHU FIXME This can be done within unpackAndCombineIntoCrsArrays with a parallel_for.
8138 for (size_t i = 0; i < static_cast<size_t>(TargetPids.size()); i++) {
8139 if (TargetPids[i] == -1) TargetPids[i] = MyPID;
8140 }
8141#ifdef HAVE_TPETRA_MMM_TIMINGS
8142 tmCopySPRdata = Teuchos::null;
8143#endif
8144 /**************************************************************/
8145 /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8146 /**************************************************************/
8147 // Call an optimized version of makeColMap that avoids the
8148 // Directory lookups (since the Import object knows who owns all
8149 // the GIDs).
8150 if (verbose) {
8151 std::ostringstream os;
8152 os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8153 << std::endl;
8154 std::cerr << os.str();
8155 }
8156 {
8157#ifdef HAVE_TPETRA_MMM_TIMINGS
8158 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC makeColMap")));
8159#endif
8160 Import_Util::lowCommunicationMakeColMapAndReindexSerial(CSR_rowptr(),
8161 CSR_colind_LID(),
8162 CSR_colind_GID(),
8163 BaseDomainMap,
8164 TargetPids,
8165 RemotePids,
8166 MyColMap);
8167 }
8168
8169 if (verbose) {
8170 std::ostringstream os;
8171 os << *verbosePrefix << "restrictComm="
8172 << (restrictComm ? "true" : "false") << std::endl;
8173 std::cerr << os.str();
8174 }
8175
8176 /*******************************************************/
8177 /**** 4) Second communicator restriction phase ****/
8178 /*******************************************************/
8179 {
8180#ifdef HAVE_TPETRA_MMM_TIMINGS
8181 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrict colmap")));
8182#endif
8183 if (restrictComm) {
8184 ReducedColMap = (MyRowMap.getRawPtr() == MyColMap.getRawPtr()) ? ReducedRowMap : MyColMap->replaceCommWithSubset(ReducedComm);
8185 MyColMap = ReducedColMap; // Reset the "my" maps
8186 }
8187
8188 // Replace the col map
8189 if (verbose) {
8190 std::ostringstream os;
8191 os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8192 std::cerr << os.str();
8193 }
8194 destMat->replaceColMap(MyColMap);
8195
8196 // Short circuit if the processor is no longer in the communicator
8197 //
8198 // NOTE: Epetra modifies all "removed" processes so they
8199 // have a dummy (serial) Map that doesn't touch the original
8200 // communicator. Duplicating that here might be a good idea.
8201 if (ReducedComm.is_null()) {
8202 if (verbose) {
8203 std::ostringstream os;
8204 os << *verbosePrefix << "I am no longer in the communicator; "
8205 "returning"
8206 << std::endl;
8207 std::cerr << os.str();
8208 }
8209 return;
8210 }
8211 }
8212
8213 /***************************************************/
8214 /**** 5) Sort ****/
8215 /***************************************************/
8216 if ((!reverseMode && xferAsImport != nullptr) ||
8217 (reverseMode && xferAsExport != nullptr)) {
8218 if (verbose) {
8219 std::ostringstream os;
8220 os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8221 std::cerr << os.str();
8222 }
8223#ifdef HAVE_TPETRA_MMM_TIMINGS
8224 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortCrsEntries")));
8225#endif
8226 Import_Util::sortCrsEntries(CSR_rowptr(),
8227 CSR_colind_LID(),
8228 CSR_vals());
8229 } else if ((!reverseMode && xferAsExport != nullptr) ||
8230 (reverseMode && xferAsImport != nullptr)) {
8231 if (verbose) {
8232 std::ostringstream os;
8233 os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8234 << endl;
8235 std::cerr << os.str();
8236 }
8237#ifdef HAVE_TPETRA_MMM_TIMINGS
8238 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries")));
8239#endif
8240 Import_Util::sortAndMergeCrsEntries(CSR_rowptr(),
8241 CSR_colind_LID(),
8242 CSR_vals());
8243 if (CSR_rowptr[N] != static_cast<size_t>(CSR_vals.size())) {
8244 CSR_colind_LID.resize(CSR_rowptr[N]);
8245 CSR_vals.resize(CSR_rowptr[N]);
8246 }
8247 } else {
8248 TEUCHOS_TEST_FOR_EXCEPTION(
8249 true, std::logic_error,
8250 "Tpetra::CrsMatrix::"
8251 "transferAndFillComplete: Should never get here! "
8252 "Please report this bug to a Tpetra developer.");
8253 }
8254 /***************************************************/
8255 /**** 6) Reset the colmap and the arrays ****/
8256 /***************************************************/
8257
8258 if (verbose) {
8259 std::ostringstream os;
8260 os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8261 std::cerr << os.str();
8262 }
8263
8264 // Call constructor for the new matrix (restricted as needed)
8265 //
8266 // NOTE (mfh 15 May 2014) This should work fine for the Kokkos
8267 // refactor version of CrsMatrix, though it reserves the right to
8268 // make a deep copy of the arrays.
8269 {
8270#ifdef HAVE_TPETRA_MMM_TIMINGS
8271 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC setAllValues")));
8272#endif
8273 destMat->setAllValues(CSR_rowptr, CSR_colind_LID, CSR_vals);
8274 }
8275
8276 } else {
8277 // run on device
8278
8279 // Backwards compatibility measure. We'll use this again below.
8280
8281 // TODO JHU Need to track down why numImportPacketsPerLID_ has not been correctly marked as modified on host (which it has been)
8282 // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
8283 // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
8284 destMat->numImportPacketsPerLID_.modify_host(); // FIXME
8285
8286#ifdef HAVE_TPETRA_MMM_TIMINGS
8287 RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
8288#endif
8289 ArrayRCP<size_t> CSR_rowptr;
8290 ArrayRCP<GO> CSR_colind_GID;
8291 ArrayRCP<LO> CSR_colind_LID;
8292 ArrayRCP<Scalar> CSR_vals;
8293
8294 destMat->imports_.sync_device();
8295 destMat->numImportPacketsPerLID_.sync_device();
8296
8297 size_t N = BaseRowMap->getLocalNumElements();
8298
8299 auto RemoteLIDs_d = RemoteLIDs.view_device();
8300 auto PermuteToLIDs_d = PermuteToLIDs.view_device();
8301 auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();
8302
8303 Kokkos::View<size_t*, device_type> CSR_rowptr_d;
8304 Kokkos::View<GO*, device_type> CSR_colind_GID_d;
8305 Kokkos::View<LO*, device_type> CSR_colind_LID_d;
8306 Kokkos::View<impl_scalar_type*, device_type> CSR_vals_d;
8307 Kokkos::View<int*, device_type> TargetPids_d;
8308
8310 *this,
8311 RemoteLIDs_d,
8312 destMat->imports_.view_device(), // hostImports
8313 destMat->numImportPacketsPerLID_.view_device(), // numImportPacketsPerLID
8314 NumSameIDs,
8315 PermuteToLIDs_d,
8316 PermuteFromLIDs_d,
8317 N,
8318 MyPID,
8319 CSR_rowptr_d,
8320 CSR_colind_GID_d,
8321 CSR_vals_d,
8322 SourcePids(),
8323 TargetPids_d);
8324
8325 Kokkos::resize(CSR_colind_LID_d, CSR_colind_GID_d.size());
8326
8327#ifdef HAVE_TPETRA_MMM_TIMINGS
8328 tmCopySPRdata = Teuchos::null;
8329#endif
8330 /**************************************************************/
8331 /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8332 /**************************************************************/
8333 // Call an optimized version of makeColMap that avoids the
8334 // Directory lookups (since the Import object knows who owns all
8335 // the GIDs).
8336 if (verbose) {
8337 std::ostringstream os;
8338 os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8339 << std::endl;
8340 std::cerr << os.str();
8341 }
8342 {
8343#ifdef HAVE_TPETRA_MMM_TIMINGS
8344 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC makeColMap")));
8345#endif
8346 Import_Util::lowCommunicationMakeColMapAndReindex(CSR_rowptr_d,
8347 CSR_colind_LID_d,
8348 CSR_colind_GID_d,
8349 BaseDomainMap,
8350 TargetPids_d,
8351 RemotePids,
8352 MyColMap);
8353 }
8354
8355 if (verbose) {
8356 std::ostringstream os;
8357 os << *verbosePrefix << "restrictComm="
8358 << (restrictComm ? "true" : "false") << std::endl;
8359 std::cerr << os.str();
8360 }
8361
8362 /*******************************************************/
8363 /**** 4) Second communicator restriction phase ****/
8364 /*******************************************************/
8365 {
8366#ifdef HAVE_TPETRA_MMM_TIMINGS
8367 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrict colmap")));
8368#endif
8369 if (restrictComm) {
8370 ReducedColMap = (MyRowMap.getRawPtr() == MyColMap.getRawPtr()) ? ReducedRowMap : MyColMap->replaceCommWithSubset(ReducedComm);
8371 MyColMap = ReducedColMap; // Reset the "my" maps
8372 }
8373
8374 // Replace the col map
8375 if (verbose) {
8376 std::ostringstream os;
8377 os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8378 std::cerr << os.str();
8379 }
8380 destMat->replaceColMap(MyColMap);
8381
8382 // Short circuit if the processor is no longer in the communicator
8383 //
8384 // NOTE: Epetra modifies all "removed" processes so they
8385 // have a dummy (serial) Map that doesn't touch the original
8386 // communicator. Duplicating that here might be a good idea.
8387 if (ReducedComm.is_null()) {
8388 if (verbose) {
8389 std::ostringstream os;
8390 os << *verbosePrefix << "I am no longer in the communicator; "
8391 "returning"
8392 << std::endl;
8393 std::cerr << os.str();
8394 }
8395 return;
8396 }
8397 }
8398
8399 /***************************************************/
8400 /**** 5) Sort ****/
8401 /***************************************************/
8402
8403 if ((!reverseMode && xferAsImport != nullptr) ||
8404 (reverseMode && xferAsExport != nullptr)) {
8405 if (verbose) {
8406 std::ostringstream os;
8407 os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8408 std::cerr << os.str();
8409 }
8410#ifdef HAVE_TPETRA_MMM_TIMINGS
8411 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortCrsEntries")));
8412#endif
8413 Import_Util::sortCrsEntries(CSR_rowptr_d,
8414 CSR_colind_LID_d,
8415 CSR_vals_d);
8416 } else if ((!reverseMode && xferAsExport != nullptr) ||
8417 (reverseMode && xferAsImport != nullptr)) {
8418 if (verbose) {
8419 std::ostringstream os;
8420 os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8421 << endl;
8422 std::cerr << os.str();
8423 }
8424#ifdef HAVE_TPETRA_MMM_TIMINGS
8425 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries")));
8426#endif
8427 Import_Util::sortAndMergeCrsEntries(CSR_rowptr_d,
8428 CSR_colind_LID_d,
8429 CSR_vals_d);
8430 } else {
8431 TEUCHOS_TEST_FOR_EXCEPTION(
8432 true, std::logic_error,
8433 "Tpetra::CrsMatrix::"
8434 "transferAndFillComplete: Should never get here! "
8435 "Please report this bug to a Tpetra developer.");
8436 }
8437
8438 /***************************************************/
8439 /**** 6) Reset the colmap and the arrays ****/
8440 /***************************************************/
8441
8442 if (verbose) {
8443 std::ostringstream os;
8444 os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8445 std::cerr << os.str();
8446 }
8447
8448 {
8449#ifdef HAVE_TPETRA_MMM_TIMINGS
8450 Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC setAllValues")));
8451#endif
8452 destMat->setAllValues(CSR_rowptr_d, CSR_colind_LID_d, CSR_vals_d);
8453 }
8454
8455 } // if (runOnHost) .. else ..
8456
8457 /***************************************************/
8458 /**** 7) Build Importer & Call ESFC ****/
8459 /***************************************************/
8460#ifdef HAVE_TPETRA_MMM_TIMINGS
8461 RCP<TimeMonitor> tmIESFC = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC build importer and esfc"))));
8462#endif
8463 // Pre-build the importer using the existing PIDs
8464 Teuchos::ParameterList esfc_params;
8465
8466 RCP<import_type> MyImport;
8467
8468 // Fulfill the non-blocking allreduce on reduced_mismatch.
8469 if (iallreduceRequest.get() != nullptr) {
8470 if (verbose) {
8471 std::ostringstream os;
8472 os << *verbosePrefix << "Calling iallreduceRequest->wait()"
8473 << endl;
8474 std::cerr << os.str();
8475 }
8476 iallreduceRequest->wait();
8477 if (reduced_mismatch != 0) {
8478 isMM = false;
8479 }
8480 }
8481
8482 if (isMM) {
8483#ifdef HAVE_TPETRA_MMM_TIMINGS
8484 Teuchos::TimeMonitor MMisMM(*TimeMonitor::getNewTimer(prefix + std::string("isMM Block")));
8485#endif
8486 // Combine all type1/2/3 lists, [filter them], then call the expert import constructor.
8487
8488 if (verbose) {
8489 std::ostringstream os;
8490 os << *verbosePrefix << "Getting CRS pointers" << endl;
8491 std::cerr << os.str();
8492 }
8493
8494 Teuchos::ArrayRCP<LocalOrdinal> type3LIDs;
8495 Teuchos::ArrayRCP<int> type3PIDs;
8496 auto rowptr = getCrsGraph()->getLocalRowPtrsHost();
8497 auto colind = getCrsGraph()->getLocalIndicesHost();
8498
8499 if (verbose) {
8500 std::ostringstream os;
8501 os << *verbosePrefix << "Calling reverseNeighborDiscovery" << std::endl;
8502 std::cerr << os.str();
8503 }
8504
8505 {
8506#ifdef HAVE_TPETRA_MMM_TIMINGS
8507 TimeMonitor tm_rnd(*TimeMonitor::getNewTimer(prefix + std::string("isMMrevNeighDis")));
8508#endif
8509 Import_Util::reverseNeighborDiscovery(*this,
8510 rowptr,
8511 colind,
8512 rowTransfer,
8513 MyImporter,
8514 MyDomainMap,
8515 type3PIDs,
8516 type3LIDs,
8517 ReducedComm);
8518 }
8519
8520 if (verbose) {
8521 std::ostringstream os;
8522 os << *verbosePrefix << "Done with reverseNeighborDiscovery" << std::endl;
8523 std::cerr << os.str();
8524 }
8525
8526 Teuchos::ArrayView<const int> EPID1 = MyImporter.is_null() ? Teuchos::ArrayView<const int>() : MyImporter->getExportPIDs();
8527 Teuchos::ArrayView<const LO> ELID1 = MyImporter.is_null() ? Teuchos::ArrayView<const LO>() : MyImporter->getExportLIDs();
8528
8529 Teuchos::ArrayView<const int> TEPID2 = rowTransfer.getExportPIDs(); // row matrix
8530 Teuchos::ArrayView<const LO> TELID2 = rowTransfer.getExportLIDs();
8531
8532 const int numCols = getGraph()->getColMap()->getLocalNumElements(); // may be dup
8533 // from EpetraExt_MMHelpers.cpp: build_type2_exports
8534 std::vector<bool> IsOwned(numCols, true);
8535 std::vector<int> SentTo(numCols, -1);
8536 if (!MyImporter.is_null()) {
8537 for (auto&& rlid : MyImporter->getRemoteLIDs()) { // the remoteLIDs must be from sourcematrix
8538 IsOwned[rlid] = false;
8539 }
8540 }
8541
8542 std::vector<std::pair<int, GO>> usrtg;
8543 usrtg.reserve(TEPID2.size());
8544
8545 {
8546 const auto& colMap = *(this->getColMap()); // *this is sourcematrix
8547 for (Array_size_type i = 0; i < TEPID2.size(); ++i) {
8548 const LO row = TELID2[i];
8549 const int pid = TEPID2[i];
8550 for (auto j = rowptr[row]; j < rowptr[row + 1]; ++j) {
8551 const int col = colind[j];
8552 if (IsOwned[col] && SentTo[col] != pid) {
8553 SentTo[col] = pid;
8554 GO gid = colMap.getGlobalElement(col);
8555 usrtg.push_back(std::pair<int, GO>(pid, gid));
8556 }
8557 }
8558 }
8559 }
8560
8561 // This sort can _not_ be omitted.
8562 std::sort(usrtg.begin(), usrtg.end()); // default comparator does the right thing, now sorted in gid order
8563 auto eopg = std ::unique(usrtg.begin(), usrtg.end());
8564 // 25 Jul 2018: Could just ignore the entries at and after eopg.
8565 usrtg.erase(eopg, usrtg.end());
8566
8567 const Array_size_type type2_us_size = usrtg.size();
8568 Teuchos::ArrayRCP<int> EPID2 = Teuchos::arcp(new int[type2_us_size], 0, type2_us_size, true);
8569 Teuchos::ArrayRCP<LO> ELID2 = Teuchos::arcp(new LO[type2_us_size], 0, type2_us_size, true);
8570
8571 int pos = 0;
8572 for (auto&& p : usrtg) {
8573 EPID2[pos] = p.first;
8574 ELID2[pos] = this->getDomainMap()->getLocalElement(p.second);
8575 pos++;
8576 }
8577
8578 Teuchos::ArrayView<int> EPID3 = type3PIDs();
8579 Teuchos::ArrayView<LO> ELID3 = type3LIDs();
8580 GO InfGID = std::numeric_limits<GO>::max();
8581 int InfPID = INT_MAX;
8582#ifdef TPETRA_MIN3
8583#undef TPETRA_MIN3
8584#endif // TPETRA_MIN3
8585#define TPETRA_MIN3(x, y, z) ((x) < (y) ? (std::min(x, z)) : (std::min(y, z)))
8586 int i1 = 0, i2 = 0, i3 = 0;
8587 int Len1 = EPID1.size();
8588 int Len2 = EPID2.size();
8589 int Len3 = EPID3.size();
8590
8591 int MyLen = Len1 + Len2 + Len3;
8592 Teuchos::ArrayRCP<LO> userExportLIDs = Teuchos::arcp(new LO[MyLen], 0, MyLen, true);
8593 Teuchos::ArrayRCP<int> userExportPIDs = Teuchos::arcp(new int[MyLen], 0, MyLen, true);
8594 int iloc = 0; // will be the size of the userExportLID/PIDs
8595
8596 while (i1 < Len1 || i2 < Len2 || i3 < Len3) {
8597 int PID1 = (i1 < Len1) ? (EPID1[i1]) : InfPID;
8598 int PID2 = (i2 < Len2) ? (EPID2[i2]) : InfPID;
8599 int PID3 = (i3 < Len3) ? (EPID3[i3]) : InfPID;
8600
8601 GO GID1 = (i1 < Len1) ? getDomainMap()->getGlobalElement(ELID1[i1]) : InfGID;
8602 GO GID2 = (i2 < Len2) ? getDomainMap()->getGlobalElement(ELID2[i2]) : InfGID;
8603 GO GID3 = (i3 < Len3) ? getDomainMap()->getGlobalElement(ELID3[i3]) : InfGID;
8604
8605 int MIN_PID = TPETRA_MIN3(PID1, PID2, PID3);
8606 GO MIN_GID = TPETRA_MIN3(((PID1 == MIN_PID) ? GID1 : InfGID), ((PID2 == MIN_PID) ? GID2 : InfGID), ((PID3 == MIN_PID) ? GID3 : InfGID));
8607#ifdef TPETRA_MIN3
8608#undef TPETRA_MIN3
8609#endif // TPETRA_MIN3
8610 bool added_entry = false;
8611
8612 if (PID1 == MIN_PID && GID1 == MIN_GID) {
8613 userExportLIDs[iloc] = ELID1[i1];
8614 userExportPIDs[iloc] = EPID1[i1];
8615 i1++;
8616 added_entry = true;
8617 iloc++;
8618 }
8619 if (PID2 == MIN_PID && GID2 == MIN_GID) {
8620 if (!added_entry) {
8621 userExportLIDs[iloc] = ELID2[i2];
8622 userExportPIDs[iloc] = EPID2[i2];
8623 added_entry = true;
8624 iloc++;
8625 }
8626 i2++;
8627 }
8628 if (PID3 == MIN_PID && GID3 == MIN_GID) {
8629 if (!added_entry) {
8630 userExportLIDs[iloc] = ELID3[i3];
8631 userExportPIDs[iloc] = EPID3[i3];
8632 iloc++;
8633 }
8634 i3++;
8635 }
8636 }
8637
8638 if (verbose) {
8639 std::ostringstream os;
8640 os << *verbosePrefix << "Create Import" << std::endl;
8641 std::cerr << os.str();
8642 }
8643
8644#ifdef HAVE_TPETRA_MMM_TIMINGS
8645 auto ismmIctor(*TimeMonitor::getNewTimer(prefix + std::string("isMMIportCtor")));
8646#endif
8647 Teuchos::RCP<Teuchos::ParameterList> plist = rcp(new Teuchos::ParameterList());
8648 // 25 Jul 2018: Test for equality with the non-isMM path's Import object.
8649 if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
8650 MyImport = rcp(new import_type(MyDomainMap,
8651 MyColMap,
8652 RemotePids,
8653 userExportLIDs.view(0, iloc).getConst(),
8654 userExportPIDs.view(0, iloc).getConst(),
8655 plist));
8656
8657 if (verbose) {
8658 std::ostringstream os;
8659 os << *verbosePrefix << "Call expertStaticFillComplete" << std::endl;
8660 std::cerr << os.str();
8661 }
8662
8663 {
8664#ifdef HAVE_TPETRA_MMM_TIMINGS
8665 TimeMonitor esfc(*TimeMonitor::getNewTimer(prefix + std::string("isMM::destMat->eSFC")));
8666 esfc_params.set("Timer Label", label + std::string("isMM eSFC"));
8667#endif
8668 if (!params.is_null())
8669 esfc_params.set("compute global constants", params->get("compute global constants", true));
8670 destMat->expertStaticFillComplete(MyDomainMap, MyRangeMap, MyImport, Teuchos::null, rcp(new Teuchos::ParameterList(esfc_params)));
8671 }
8672
8673 } // if(isMM)
8674 else {
8675#ifdef HAVE_TPETRA_MMM_TIMINGS
8676 TimeMonitor MMnotMMblock(*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMblock")));
8677#endif
8678 if (verbose) {
8679 std::ostringstream os;
8680 os << *verbosePrefix << "Create Import" << std::endl;
8681 std::cerr << os.str();
8682 }
8683
8684#ifdef HAVE_TPETRA_MMM_TIMINGS
8685 TimeMonitor notMMIcTor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMCreateImporter")));
8686#endif
8687 Teuchos::RCP<Teuchos::ParameterList> mypars = rcp(new Teuchos::ParameterList);
8688 mypars->set("Timer Label", "notMMFrom_tAFC");
8689 if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
8690 MyImport = rcp(new import_type(MyDomainMap, MyColMap, RemotePids, mypars));
8691
8692 if (verbose) {
8693 std::ostringstream os;
8694 os << *verbosePrefix << "Call expertStaticFillComplete" << endl;
8695 std::cerr << os.str();
8696 }
8697
8698#ifdef HAVE_TPETRA_MMM_TIMINGS
8699 TimeMonitor esfcnotmm(*TimeMonitor::getNewTimer(prefix + std::string("notMMdestMat->expertStaticFillComplete")));
8700 esfc_params.set("Timer Label", prefix + std::string("notMM eSFC"));
8701#else
8702 esfc_params.set("Timer Label", std::string("notMM eSFC"));
8703#endif
8704
8705 if (!params.is_null()) {
8706 esfc_params.set("compute global constants",
8707 params->get("compute global constants", true));
8708 }
8709 destMat->expertStaticFillComplete(MyDomainMap, MyRangeMap,
8710 MyImport, Teuchos::null,
8711 rcp(new Teuchos::ParameterList(esfc_params)));
8712 }
8713
8714#ifdef HAVE_TPETRA_MMM_TIMINGS
8715 tmIESFC = Teuchos::null;
8716#endif
8717
8718 if (verbose) {
8719 std::ostringstream os;
8720 os << *verbosePrefix << "Done" << endl;
8721 std::cerr << os.str();
8722 }
8723} // transferAndFillComplete
8724
// Single-importer convenience wrapper (const member function): forwards
// destMatrix, importer, domainMap, rangeMap and params unchanged to
// transferAndFillComplete(), passing Teuchos::null in the third argument
// slot (the slot the two-importer wrapper fills with its domain importer).
// NOTE(review): this listing jumps from line 8725 to 8728, so the line(s)
// carrying the function name and the destMatrix parameter are not visible
// here; presumably this is CrsMatrix::importAndFillComplete -- confirm
// against the real source file.
8725 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8728 const import_type& importer,
8729 const Teuchos::RCP<const map_type>& domainMap,
8730 const Teuchos::RCP<const map_type>& rangeMap,
8731 const Teuchos::RCP<Teuchos::ParameterList>& params) const {
8732 transferAndFillComplete(destMatrix, importer, Teuchos::null, domainMap, rangeMap, params);
8733}
8734
// Two-importer convenience wrapper (const member function): forwards to
// transferAndFillComplete() with rowImporter as the row transfer and
// domainImporter wrapped by Teuchos::rcpFromRef as the third argument.
// NOTE(review): listing lines 8736-8737 and 8739 are missing from this
// extraction, so the function name and the declarations of destMatrix and
// domainImporter are not visible here; presumably this is the
// (rowImporter, domainImporter) overload of
// CrsMatrix::importAndFillComplete -- confirm against the real source file.
// NOTE(review): rcpFromRef is presumably non-owning, in which case
// domainImporter only needs to stay alive for the duration of the call --
// confirm against the Teuchos::RCP documentation.
8735 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8738 const import_type& rowImporter,
8740 const Teuchos::RCP<const map_type>& domainMap,
8741 const Teuchos::RCP<const map_type>& rangeMap,
8742 const Teuchos::RCP<Teuchos::ParameterList>& params) const {
8743 transferAndFillComplete(destMatrix, rowImporter, Teuchos::rcpFromRef(domainImporter), domainMap, rangeMap, params);
8744}
8745
// Single-exporter convenience wrapper (const member function): forwards
// destMatrix, exporter, domainMap, rangeMap and params unchanged to
// transferAndFillComplete(), passing Teuchos::null in the third argument
// slot (the slot the two-exporter wrapper fills with its domain exporter).
// NOTE(review): this listing jumps from line 8746 to 8749, so the line(s)
// carrying the function name and the destMatrix parameter are not visible
// here; presumably this is CrsMatrix::exportAndFillComplete -- confirm
// against the real source file.
8746 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8749 const export_type& exporter,
8750 const Teuchos::RCP<const map_type>& domainMap,
8751 const Teuchos::RCP<const map_type>& rangeMap,
8752 const Teuchos::RCP<Teuchos::ParameterList>& params) const {
8753 transferAndFillComplete(destMatrix, exporter, Teuchos::null, domainMap, rangeMap, params);
8754}
8755
// Two-exporter convenience wrapper (const member function): forwards to
// transferAndFillComplete() with rowExporter as the row transfer and
// domainExporter wrapped by Teuchos::rcpFromRef as the third argument.
// NOTE(review): listing lines 8757-8758 and 8760 are missing from this
// extraction, so the function name and the declarations of destMatrix and
// domainExporter are not visible here; presumably this is the
// (rowExporter, domainExporter) overload of
// CrsMatrix::exportAndFillComplete -- confirm against the real source file.
// NOTE(review): rcpFromRef is presumably non-owning, in which case
// domainExporter only needs to stay alive for the duration of the call --
// confirm against the Teuchos::RCP documentation.
8756 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8759 const export_type& rowExporter,
8761 const Teuchos::RCP<const map_type>& domainMap,
8762 const Teuchos::RCP<const map_type>& rangeMap,
8763 const Teuchos::RCP<Teuchos::ParameterList>& params) const {
8764 transferAndFillComplete(destMatrix, rowExporter, Teuchos::rcpFromRef(domainExporter), domainMap, rangeMap, params);
8765}
8766
8767} // namespace Tpetra
8768
8769//
8770// Explicit instantiation macro
8771//
8772// Must be expanded from within the Tpetra namespace!
8773//
8774
// Explicitly instantiates the CrsMatrix class template for one
// (SCALAR, LO, GO, NODE) combination. The expansion names CrsMatrix
// unqualified, so it has to appear where that name resolves (see the
// "Must be expanded from within the Tpetra namespace" note above).
8775#define TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR, LO, GO, NODE) \
8776 \
8777 template class CrsMatrix<SCALAR, LO, GO, NODE>;
8778
// Explicitly instantiates the member template
// CrsMatrix<SI, LO, GO, NODE>::convert<SO>() const, whose return type is
// Teuchos::RCP<CrsMatrix<SO, LO, GO, NODE>>.
// SI is the scalar type of the matrix convert() is called on; SO is the
// scalar type of the matrix it returns.
8779#define TPETRA_CRSMATRIX_CONVERT_INSTANT(SO, SI, LO, GO, NODE) \
8780 \
8781 template Teuchos::RCP<CrsMatrix<SO, LO, GO, NODE>> \
8782 CrsMatrix<SI, LO, GO, NODE>::convert<SO>() const;
8783
// Emits a `template <>` declaration (terminated by ';', no body) of the
// namespace-scope function importAndFillCompleteCrsMatrix for
// CrsMatrix<SCALAR, LO, GO, NODE>, single-Import form:
//   (sourceMatrix, importer, domainMap, rangeMap, params)
//   -> Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>.
// The Import and Map template arguments are spelled through the matrix's
// own local_ordinal_type / global_ordinal_type / node_type typedefs rather
// than through LO / GO / NODE directly.
8784#define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
8785 template <> \
8786 Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>> \
8787 importAndFillCompleteCrsMatrix(const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE>>& sourceMatrix, \
8788 const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8789 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8790 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& importer, \
8791 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8792 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8793 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& domainMap, \
8794 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8795 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8796 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& rangeMap, \
8797 const Teuchos::RCP<Teuchos::ParameterList>& params);
8798
// Emits a `template <>` declaration (terminated by ';', no body) of the
// namespace-scope function importAndFillCompleteCrsMatrix for
// CrsMatrix<SCALAR, LO, GO, NODE>, two-Import form:
//   (sourceMatrix, rowImporter, domainImporter, domainMap, rangeMap, params)
//   -> Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>.
// The Import and Map template arguments are spelled through the matrix's
// own local_ordinal_type / global_ordinal_type / node_type typedefs rather
// than through LO / GO / NODE directly.
8799#define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
8800 template <> \
8801 Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>> \
8802 importAndFillCompleteCrsMatrix(const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE>>& sourceMatrix, \
8803 const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8804 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8805 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowImporter, \
8806 const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8807 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8808 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainImporter, \
8809 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8810 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8811 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& domainMap, \
8812 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8813 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8814 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& rangeMap, \
8815 const Teuchos::RCP<Teuchos::ParameterList>& params);
8816
// Emits a `template <>` declaration (terminated by ';', no body) of the
// namespace-scope function exportAndFillCompleteCrsMatrix for
// CrsMatrix<SCALAR, LO, GO, NODE>, single-Export form:
//   (sourceMatrix, exporter, domainMap, rangeMap, params)
//   -> Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>.
// The Export and Map template arguments are spelled through the matrix's
// own local_ordinal_type / global_ordinal_type / node_type typedefs rather
// than through LO / GO / NODE directly.
8817#define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
8818 template <> \
8819 Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>> \
8820 exportAndFillCompleteCrsMatrix(const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE>>& sourceMatrix, \
8821 const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8822 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8823 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& exporter, \
8824 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8825 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8826 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& domainMap, \
8827 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8828 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8829 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& rangeMap, \
8830 const Teuchos::RCP<Teuchos::ParameterList>& params);
8831
// Emits a `template <>` declaration (terminated by ';', no body) of the
// namespace-scope function exportAndFillCompleteCrsMatrix for
// CrsMatrix<SCALAR, LO, GO, NODE>, two-Export form:
//   (sourceMatrix, rowExporter, domainExporter, domainMap, rangeMap, params)
//   -> Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>.
// The Export and Map template arguments are spelled through the matrix's
// own local_ordinal_type / global_ordinal_type / node_type typedefs rather
// than through LO / GO / NODE directly.
8832#define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
8833 template <> \
8834 Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>> \
8835 exportAndFillCompleteCrsMatrix(const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE>>& sourceMatrix, \
8836 const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8837 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8838 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowExporter, \
8839 const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8840 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8841 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainExporter, \
8842 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8843 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8844 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& domainMap, \
8845 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8846 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8847 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>>& rangeMap, \
8848 const Teuchos::RCP<Teuchos::ParameterList>& params);
8849
// Umbrella macro for one (SCALAR, LO, GO, NODE) combination. Expands, in
// this order, to: the CrsMatrix class instantiation, the single-Import and
// single-Export and-fill-complete declarations, then the two-Import and
// two-Export variants.
// TPETRA_CRSMATRIX_CONVERT_INSTANT is not expanded here (it takes a
// different parameter list: SO, SI, LO, GO, NODE).
8850#define TPETRA_CRSMATRIX_INSTANT(SCALAR, LO, GO, NODE) \
8851 TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR, LO, GO, NODE) \
8852 TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
8853 TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
8854 TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
8855 TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE)
8856
8857#endif // TPETRA_CRSMATRIX_DEF_HPP
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::EquilibrationInfo.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Declare and define Tpetra::Details::copyConvert, an implementation detail of Tpetra (in particular,...
Declare and define Tpetra::Details::copyOffsets, an implementation detail of Tpetra (in particular,...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Functions for manipulating CRS arrays.
Declaration of a function that prints strings from each process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Declaration of Tpetra::Details::iallreduce.
Declaration and definition of Tpetra::Details::leftScaleLocalCrsMatrix.
KOKKOS_FUNCTION size_t packRow(const LocalMapType &col_map, const Kokkos::View< Packet *, BufferDeviceType > &exports, const InputLidsType &lids_in, const InputPidsType &pids_in, const size_t offset, const size_t num_ent, const bool pack_pids)
Packs a single row of the CrsGraph.
Declaration and definition of Tpetra::Details::rightScaleLocalCrsMatrix.
KOKKOS_FUNCTION int unpackRow(const Kokkos::View< GO *, Device, Kokkos::MemoryUnmanaged > &gids_out, const Kokkos::View< int *, Device, Kokkos::MemoryUnmanaged > &pids_out, const Kokkos::View< const Packet *, BufferDevice > &imports, const size_t offset, const size_t num_ent)
Unpack a single row of a CrsGraph.
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
#define TPETRA_ABUSE_WARNING(throw_exception_test, Exception, msg)
Handle an abuse warning, according to HAVE_TPETRA_THROW_ABUSE_WARNINGS and HAVE_TPETRA_PRINT_ABUSE_WA...
Declaration of Tpetra::computeRowAndColumnOneNorms.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
global_inds_dualv_type::t_host::const_type getGlobalIndsViewHost(const RowInfo &rowinfo) const
Get a const, globally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myR...
Teuchos::RCP< const map_type > colMap_
The Map describing the distribution of columns of the graph.
Teuchos::RCP< const map_type > rowMap_
The Map describing the distribution of rows of the graph.
bool isGloballyIndexed() const override
Whether the graph's column indices are stored as global indices.
Struct that holds views of the contents of a CrsMatrix.
Teuchos::RCP< const map_type > colMap
Col map for the original version of the matrix.
Teuchos::RCP< const map_type > domainMap
Domain map for original matrix.
Teuchos::RCP< const map_type > rowMap
Desired row map for "imported" version of the matrix.
Teuchos::RCP< const map_type > origRowMap
Original row map of matrix.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
virtual void insertGlobalValuesImpl(crs_graph_type &graph, RowInfo &rowInfo, const GlobalOrdinal gblColInds[], const impl_scalar_type vals[], const size_t numInputEnt)
Common implementation detail of insertGlobalValues and insertGlobalValuesFiltered.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const override
Print this object with the given verbosity level to the given output stream.
void localApply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, const Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar &alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar &beta=Teuchos::ScalarTraits< Scalar >::zero()) const
Compute the local part of a sparse matrix-(Multi)Vector multiply.
void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< char *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode CM) override
Unpack the imported column indices and values, and combine into matrix.
void replaceRangeMap(const Teuchos::RCP< const map_type > &newRangeMap)
Replace the current range Map with the given objects.
typename device_type::execution_space execution_space
The Kokkos execution space.
void applyNonTranspose(const MV &X_in, MV &Y_in, Scalar alpha, Scalar beta) const
Special case of apply() for mode == Teuchos::NO_TRANS.
void importAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const import_type &importer, const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Import from this to the given destination matrix, and make the result fill complete.
CrsGraph< LocalOrdinal, GlobalOrdinal, Node > crs_graph_type
The CrsGraph specialization suitable for this CrsMatrix specialization.
local_ordinal_type replaceGlobalValues(const global_ordinal_type globalRow, const Kokkos::View< const global_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries' values, using global indices.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, over all processes in the matrix's communicator.
void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row,...
size_t getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const override
Number of entries in the sparse matrix in the given global row, on the calling (MPI) process.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
void sortAndMergeIndicesAndValues(const bool sorted, const bool merged)
Sort and merge duplicate local column indices in all rows on the calling process, along with their co...
void packNew(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< char *, buffer_device_type > &exports, const Kokkos::DualView< size_t *, buffer_device_type > &numPacketsPerLID, size_t &constantNumPackets) const
Pack this object's data for an Import or Export.
size_t getLocalNumCols() const override
The number of columns connected to the locally owned rows of this matrix.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
bool hasColMap() const override
Whether the matrix has a well-defined column Map.
mag_type getNormInf() const
Compute and return the infinity norm of the matrix.
Teuchos::RCP< CrsMatrix< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another CrsMatrix with the same entries, but converted to a different Scalar type T.
values_dualv_type::t_dev getValuesViewDeviceNonConst(const RowInfo &rowinfo)
Get a non-const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myR...
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
local_ordinal_type sumIntoLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using local row and column indices.
virtual Teuchos::RCP< RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params) const override
Implementation of RowMatrix::add: return alpha*A + beta*this.
void applyTranspose(const MV &X_in, MV &Y_in, const Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Special case of apply() for mode != Teuchos::NO_TRANS.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Number of entries in the sparse matrix in the given local row, on the calling (MPI) process.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
bool isFillActive() const
Whether the matrix is not fill complete.
void replaceDomainMapAndImporter(const Teuchos::RCP< const map_type > &newDomainMap, Teuchos::RCP< const import_type > &newImporter)
Replace the current domain Map and Import with the given objects.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
mag_type getNorm1(bool assumeSymmetric=false) const
Compute and return the 1-norm of the matrix.
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
Compute a sparse matrix-MultiVector multiply.
mag_type getFrobeniusNorm() const override
Compute and return the Frobenius norm of the matrix.
void insertLocalValues(const LocalOrdinal localRow, const Teuchos::ArrayView< const LocalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const CombineMode CM=ADD)
Insert one or more entries into the matrix, using local column indices.
global_size_t getGlobalNumCols() const override
The number of global columns in the matrix.
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
void allocateValues(ELocalGlobal lg, GraphAllocationStatus gas, const bool verbose)
Allocate values (and optionally indices) using the Node.
size_t getLocalNumEntries() const override
The local number of entries in this matrix.
typename Node::device_type device_type
The Kokkos device type.
virtual LocalOrdinal sumIntoGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoGlobalValues.
void replaceDomainMap(const Teuchos::RCP< const map_type > &newDomainMap)
Replace the current domain Map with the given Map.
std::string description() const override
A one-line description of this object.
void reindexColumns(crs_graph_type *const graph, const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
Teuchos::RCP< MV > getColumnMapMultiVector(const MV &X_domainMap, const bool force=false) const
Create a (or fetch a cached) column Map MultiVector.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void replaceRangeMapAndExporter(const Teuchos::RCP< const map_type > &newRangeMap, Teuchos::RCP< const export_type > &newExporter)
Replace the current range Map and Export with the given objects.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void globalAssemble()
Communicate nonlocal contributions to other processes.
void checkInternalState() const
Check that this object's state is sane; throw if it's not.
bool hasTransposeApply() const override
Whether apply() allows applying the transpose or conjugate transpose.
GlobalOrdinal getIndexBase() const override
The index base for global indices for this matrix.
LocalOrdinal local_ordinal_type
The type of each local index in the matrix.
void getLocalDiagCopy(Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &diag) const override
Get a copy of the diagonal entries of the matrix.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
void fillLocalGraphAndMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local graph and matrix.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
void getLocalRowView(LocalOrdinal LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant view of a row of this matrix, using local row and column indices.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void getGlobalRowView(GlobalOrdinal GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting view of a row of this matrix, using global row and column indices.
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
virtual void copyAndPermute(const SrcDistObject &source, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap) override
Remove processes owning zero rows from the Maps and their communicator.
virtual LocalOrdinal sumIntoLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoLocalValues.
void swap(CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &matrix)
Swaps the data from *this with the data and maps from the given matrix.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
global_size_t getGlobalNumEntries() const override
The global number of entries in this matrix.
virtual LocalOrdinal replaceLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceLocalValues.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
bool isFillComplete() const override
Whether the matrix is fill complete.
virtual bool checkSizes(const SrcDistObject &source) override
Compare the source and target (this) objects for compatibility.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
local_ordinal_type replaceLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries' values, using local row and column indices.
void exportAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const export_type &exporter, const Teuchos::RCP< const map_type > &domainMap=Teuchos::null, const Teuchos::RCP< const map_type > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Export from this to the given destination matrix, and make the result fill complete.
values_dualv_type::t_host::const_type getValuesViewHost(const RowInfo &rowinfo) const
Get a const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow).
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
Teuchos::RCP< MV > getRowMapMultiVector(const MV &Y_rangeMap, const bool force=false) const
Create a (or fetch a cached) row Map MultiVector.
virtual LocalOrdinal replaceGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceGlobalValues.
values_dualv_type::t_host getValuesViewHostNonConst(const RowInfo &rowinfo)
Get a non-const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow...
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
void getLocalDiagOffsets(Teuchos::ArrayRCP< size_t > &offsets) const
Get offsets of the diagonal entries in the matrix.
void fillLocalMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local matrix.
void rightScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the right with the given Vector.
bool isStorageOptimized() const
Returns true if storage has been optimized.
void getLocalRowCopy(LocalOrdinal LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row,...
values_dualv_type::t_dev::const_type getValuesViewDevice(const RowInfo &rowinfo) const
Get a const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow).
void leftScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the left with the given Vector.
virtual bool supportsRowViews() const override
Return true if getLocalRowView() and getGlobalRowView() are valid for this object.
static size_t mergeRowIndicesAndValues(size_t rowLen, local_ordinal_type *cols, impl_scalar_type *vals)
Merge duplicate row indices in the given row, along with their corresponding values.
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix's graph, as a CrsGraph.
Description of Tpetra's behavior.
static bool debug()
Whether Tpetra is in debug mode.
static bool verbose()
Whether Tpetra is in verbose mode.
static size_t verbosePrintCountThreshold()
Number of entries below which arrays, lists, etc. will be printed in debug mode.
static size_t rowImbalanceThreshold()
Threshold for deciding if a local matrix is "imbalanced" in the number of entries per row....
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
A parallel distribution of indices over processes.
One or more distributed dense vectors.
A read-only, row-oriented interface to a sparse matrix.
Abstract base class for objects that can be the source of an Import or Export operation.
Implementation details of Tpetra.
void padCrsArrays(const RowPtr &rowPtrBeg, const RowPtr &rowPtrEnd, Indices &indices_wdv, const Padding &padding, const int my_rank, const bool verbose)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries....
void verbosePrintArray(std::ostream &out, const ArrayType &x, const char name[], const size_t maxNumToPrint)
Print min(x.size(), maxNumToPrint) entries of x.
void copyOffsets(const OutputViewType &dst, const InputViewType &src)
Copy row offsets (in a sparse graph or matrix) from src to dst. The offsets may have different types.
void leftScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Left-scale a KokkosSparse::CrsMatrix.
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
Teuchos::ArrayView< typename DualViewType::t_dev::value_type > getArrayViewFromDualView(const DualViewType &x)
Get a Teuchos::ArrayView which views the host Kokkos::View of the input 1-D Kokkos::DualView.
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
std::unique_ptr< std::string > createPrefix(const int myRank, const char prefix[])
Create string prefix for each line of verbose output.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Details::EquilibrationInfo< typename Kokkos::ArithTraits< SC >::val_type, typename NT::device_type > computeRowOneNorms(const Tpetra::RowMatrix< SC, LO, GO, NT > &A)
Compute global row one-norms ("row sums") of the input sparse matrix A, in a way suitable for one-sid...
Details::EquilibrationInfo< typename Kokkos::ArithTraits< SC >::val_type, typename NT::device_type > computeRowAndColumnOneNorms(const Tpetra::RowMatrix< SC, LO, GO, NT > &A, const bool assumeSymmetric)
Compute global row and column one-norms ("row sums" and "column sums") of the input sparse matrix A,...
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2, const bool stableSort=false)
Sort the first array, and apply the resulting permutation to the second array.
Teuchos_Ordinal Array_size_type
Size type for Teuchos Array objects.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > createOneToOne(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &M)
Create a one-to-one version of the given Map, in which each global index is owned by exactly one process.
void merge2(IT1 &indResultOut, IT2 &valResultOut, IT1 indBeg, IT1 indEnd, IT2 valBeg, IT2)
Merge values in place, additively, with the same index.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes)
@ INSERT
Insert new values that don't currently exist.
@ ZERO
Replace old values with zero.
Functor for the ABSMAX CombineMode of Import and Export operations.
Scalar operator()(const Scalar &x, const Scalar &y)
Return the maximum of the magnitudes (absolute values) of x and y.
Traits class for packing / unpacking data of type T.
Allocation information for a locally owned row in a CrsGraph or CrsMatrix.