Tpetra_Import_Util2.hpp
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_IMPORT_UTIL2_HPP
11#define TPETRA_IMPORT_UTIL2_HPP
12
17
18#include "Tpetra_ConfigDefs.hpp"
19#include "Tpetra_Import.hpp"
20#include "Tpetra_HashTable.hpp"
21#include "Tpetra_Map.hpp"
22#include "Tpetra_Util.hpp"
23#include "Tpetra_Distributor.hpp"
26#include "Tpetra_Vector.hpp"
27#include "Kokkos_DualView.hpp"
28#include "KokkosSparse_SortCrs.hpp"
29#include <Teuchos_Array.hpp>
31#include <Kokkos_UnorderedMap.hpp>
32#include <unordered_map>
33#include <utility>
34#include <set>
35
37
38#include <Kokkos_Core.hpp>
39#include <Kokkos_Sort.hpp>
40
41namespace Tpetra {
42namespace Import_Util {
43
46template <typename Scalar, typename Ordinal>
47void sortCrsEntries(const Teuchos::ArrayView<size_t>& CRS_rowptr,
48 const Teuchos::ArrayView<Ordinal>& CRS_colind,
49 const Teuchos::ArrayView<Scalar>& CRS_vals);
50
51template <typename Ordinal>
52void sortCrsEntries(const Teuchos::ArrayView<size_t>& CRS_rowptr,
53 const Teuchos::ArrayView<Ordinal>& CRS_colind);
54
55template <typename rowptr_array_type, typename colind_array_type, typename vals_array_type>
56void sortCrsEntries(const rowptr_array_type& CRS_rowptr,
57 const colind_array_type& CRS_colind,
58 const vals_array_type& CRS_vals);
59
60template <typename rowptr_array_type, typename colind_array_type>
61void sortCrsEntries(const rowptr_array_type& CRS_rowptr,
62 const colind_array_type& CRS_colind);
63
68template <typename Scalar, typename Ordinal>
69void sortAndMergeCrsEntries(const Teuchos::ArrayView<size_t>& CRS_rowptr,
70 const Teuchos::ArrayView<Ordinal>& CRS_colind,
71 const Teuchos::ArrayView<Scalar>& CRS_vals);
72
73template <typename Ordinal>
74void sortAndMergeCrsEntries(const Teuchos::ArrayView<size_t>& CRS_rowptr,
75 const Teuchos::ArrayView<Ordinal>& CRS_colind);
76
77template <class rowptr_view_type, class colind_view_type, class vals_view_type>
78void sortAndMergeCrsEntries(const rowptr_view_type& CRS_rowptr,
79 const colind_view_type& CRS_colind,
80 const vals_view_type& CRS_vals);
81
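// Usage sketch (illustrative, not part of the original header): calling the
// Teuchos::ArrayView overloads above on a small raw-CSR block. The arrays and
// values below are hypothetical.
//
//   std::vector<size_t> rowptr = {0, 3, 5};        // 2 rows
//   std::vector<int> colind    = {2, 0, 2, 1, 0};  // unsorted; row 0 repeats column 2
//   std::vector<double> vals   = {1.0, 2.0, 3.0, 4.0, 5.0};
//
//   Tpetra::Import_Util::sortCrsEntries<double, int>(
//       Teuchos::ArrayView<size_t>(rowptr.data(), rowptr.size()),
//       Teuchos::ArrayView<int>(colind.data(), colind.size()),
//       Teuchos::ArrayView<double>(vals.data(), vals.size()));
//   // colind becomes {0, 2, 2, 0, 1}; vals are permuted the same way.
//
//   // sortAndMergeCrsEntries with the same views additionally sums the
//   // duplicate column 2 in row 0 and compacts the arrays, leaving
//   // colind = {0, 2, 0, 1}, vals = {2.0, 4.0, 5.0, 4.0}, rowptr = {0, 2, 4}.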
97template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
98void lowCommunicationMakeColMapAndReindex(
99 const Teuchos::ArrayView<const size_t>& rowptr,
100 const Teuchos::ArrayView<LocalOrdinal>& colind_LID,
101 const Teuchos::ArrayView<GlobalOrdinal>& colind_GID,
102 const Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>>& domainMapRCP,
103 const Teuchos::ArrayView<const int>& owningPIDs,
104 Teuchos::Array<int>& remotePIDs,
105 Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>>& colMap);
106
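// Illustrative sketch (hypothetical values, assuming a two-rank run) of what
// lowCommunicationMakeColMapAndReindex produces on one rank:
//
//   Inputs:
//     domainMap GIDs : {10, 11, 12}          (locally owned)
//     colind_GID     : {10, 12, 47, 11, 46}  (47 and 46 owned by rank 1)
//     owningPIDs     : {0, 0, 1, 0, 1}       (owner of each column entry)
//   Outputs:
//     colMap GIDs    : {10, 11, 12, 46, 47}  (domain GIDs first, then remote
//                                             GIDs grouped by owning PID and
//                                             sorted ascending within a PID)
//     colind_LID     : {0, 2, 4, 1, 3}       (colind_GID reindexed into colMap)
//     remotePIDs     : {1, 1}                (owning PID of each remote column)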
111template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
112void lowCommunicationMakeColMapAndReindex(
113 const Kokkos::View<size_t*, typename Node::device_type> rowptr_view,
114 const Kokkos::View<LocalOrdinal*, typename Node::device_type> colind_LID_view,
115 const Kokkos::View<GlobalOrdinal*, typename Node::device_type> colind_GID_view,
116 const Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>>& domainMapRCP,
117 const Teuchos::ArrayView<const int>& owningPIDs,
118 Teuchos::Array<int>& remotePIDs,
119 Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>>& colMap);
120
134template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
135void getTwoTransferOwnershipVector(const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& transferThatDefinesOwnership,
136 bool useReverseModeForOwnership,
137 const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& transferForMigratingData,
138 bool useReverseModeForMigration,
139 Tpetra::Vector<int, LocalOrdinal, GlobalOrdinal, Node>& owningPIDs);
140
141} // namespace Import_Util
142} // namespace Tpetra
143
144//
145// Implementations
146//
147
148namespace Tpetra {
149namespace Import_Util {
150
151template <typename PID, typename GlobalOrdinal>
152bool sort_PID_then_GID(const std::pair<PID, GlobalOrdinal>& a,
153 const std::pair<PID, GlobalOrdinal>& b) {
154 if (a.first != b.first)
155 return (a.first < b.first);
156 return (a.second < b.second);
157}
158
159template <typename PID,
160 typename GlobalOrdinal,
161 typename LocalOrdinal>
162bool sort_PID_then_pair_GID_LID(const std::pair<PID, std::pair<GlobalOrdinal, LocalOrdinal>>& a,
163 const std::pair<PID, std::pair<GlobalOrdinal, LocalOrdinal>>& b) {
164 if (a.first != b.first)
165 return a.first < b.first;
166 else
167 return (a.second.first < b.second.first);
168}
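// Example (illustrative): these comparators are intended for std::sort on
// (PID, GID) pairs so that entries are grouped by owning process first and
// ordered by global index within each process. Values below are hypothetical.
//
//   std::vector<std::pair<int, long long>> pairs = {{2, 7}, {0, 9}, {2, 3}, {0, 1}};
//   std::sort(pairs.begin(), pairs.end(), sort_PID_then_GID<int, long long>);
//   // pairs is now {{0, 1}, {0, 9}, {2, 3}, {2, 7}}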
169
170template <typename Scalar,
171 typename LocalOrdinal,
172 typename GlobalOrdinal,
173 typename Node>
174void reverseNeighborDiscovery(const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& SourceMatrix,
175 const typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::row_ptrs_host_view_type& rowptr,
176 const typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_inds_host_view_type& colind,
177 const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& RowTransfer,
178 Teuchos::RCP<const Tpetra::Import<LocalOrdinal, GlobalOrdinal, Node>> MyImporter,
179 Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>> MyDomainMap,
180 Teuchos::ArrayRCP<int>& type3PIDs,
181 Teuchos::ArrayRCP<LocalOrdinal>& type3LIDs,
182 Teuchos::RCP<const Teuchos::Comm<int>>& rcomm) {
183#ifdef HAVE_TPETRACORE_MPI
184 using Teuchos::TimeMonitor;
185 using ::Tpetra::Details::Behavior;
186 typedef LocalOrdinal LO;
187 typedef GlobalOrdinal GO;
188 typedef std::pair<GO, GO> pidgidpair_t;
189 using Teuchos::RCP;
190 const std::string prefix{" Import_Util2::ReverseND:: "};
191 const std::string label("IU2::Neighbor");
192
193 // There can be no neighbor discovery if you don't have an importer
194 if (MyImporter.is_null()) return;
195
196 std::ostringstream errstr;
197 bool error = false;
198 auto const comm = MyDomainMap->getComm();
199
200 MPI_Comm rawComm = getRawMpiComm(*comm);
201 const int MyPID = rcomm->getRank();
202
203 // Things related to messages I am sending in forward mode (RowTransfer)
204 // *** Note: this will be incorrect for transferAndFillComplete if it is in reverse mode. FIXME cbl.
205 auto ExportPIDs = RowTransfer.getExportPIDs();
206 auto ExportLIDs = RowTransfer.getExportLIDs();
207 auto NumExportLIDs = RowTransfer.getNumExportIDs();
208
209 Distributor& Distor = MyImporter->getDistributor();
210 const size_t NumRecvs = Distor.getNumReceives();
211 const size_t NumSends = Distor.getNumSends();
212 auto RemoteLIDs = MyImporter->getRemoteLIDs();
213 auto const ProcsFrom = Distor.getProcsFrom();
214 auto const ProcsTo = Distor.getProcsTo();
215
216 auto LengthsFrom = Distor.getLengthsFrom();
217 auto MyColMap = SourceMatrix.getColMap();
218 const size_t numCols = MyColMap->getLocalNumElements();
219 RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>> target = MyImporter->getTargetMap();
220
221 // Get the owning pids in a special way,
222 // s.t. ProcsFrom[RemotePIDOrder[RemoteLIDs[j]]] is the proc that owns RemoteLIDs[j].
223 Teuchos::Array<int> RemotePIDOrder(numCols, -1);
224
225 // For each remote ID, record index into ProcsFrom, who owns it.
226 for (size_t i = 0, j = 0; i < NumRecvs; ++i) {
227 for (size_t k = 0; k < LengthsFrom[i]; ++k) {
228 const int pid = ProcsFrom[i];
229 if (pid != MyPID) {
230 RemotePIDOrder[RemoteLIDs[j]] = i;
231 }
232 j++;
233 }
234 }
235
236 // Step One: Start tacking the (GID,PID) pairs on the std sets
237 //
238 // For each index in ProcsFrom, we will insert into a set of (PID,
239 // GID) pairs, in order to build a list of such pairs for each of
240 // those processes. Since this is building a reverse, we will send
241 // to these processes.
242 Teuchos::Array<int> ReverseSendSizes(NumRecvs, 0);
243 // do this as C array to avoid Teuchos::Array value initialization of all reserved memory
244 Teuchos::Array<Teuchos::ArrayRCP<pidgidpair_t>> RSB(NumRecvs);
245
246 {
247#ifdef HAVE_TPETRA_MMM_TIMINGS
248 TimeMonitor set_all(*TimeMonitor::getNewTimer(prefix + std::string("isMMallSetRSB")));
249#endif
250
251 // 25 Jul 2018: CBL
252 // todo:std::unordered_set (hash table),
253 // with an adequate prereservation ("bucket count").
254 // An unordered_set has to have a custom hasher for the pid/gid pair.
255 // However, when pidsets is copied to RSB, it will be in key
256 // order _not_ in pid,gid order. (unlike std::set).
257 // Implement this with a reserve, and time BOTH building pidsets
258 // _and_ the sort after the receive. Even if unordered_set saves
259 // time, if it causes the sort to be longer, it's not a win.
260
261 Teuchos::Array<std::set<pidgidpair_t>> pidsets(NumRecvs);
262 {
263#ifdef HAVE_TPETRA_MMM_TIMINGS
264 TimeMonitor set_insert(*TimeMonitor::getNewTimer(prefix + std::string("isMMallSetRSBinsert")));
265#endif
266 for (size_t i = 0; i < NumExportLIDs; i++) {
267 LO lid = ExportLIDs[i];
268 GO exp_pid = ExportPIDs[i];
269 for (auto j = rowptr[lid]; j < rowptr[lid + 1]; j++) {
270 int pid_order = RemotePIDOrder[colind[j]];
271 if (pid_order != -1) {
272 GO gid = MyColMap->getGlobalElement(colind[j]); // Epetra SM.GCID46 =>sm->graph-> {colmap(colind)}
273 auto tpair = pidgidpair_t(exp_pid, gid);
274 pidsets[pid_order].insert(pidsets[pid_order].end(), tpair);
275 }
276 }
277 }
278 }
279
280 {
281#ifdef HAVE_TPETRA_MMM_TIMINGS
282 TimeMonitor set_cpy(*TimeMonitor::getNewTimer(prefix + std::string("isMMallSetRSBcpy")));
283#endif
284 int jj = 0;
285 for (auto&& ps : pidsets) {
286 auto s = ps.size();
287 RSB[jj] = Teuchos::arcp(new pidgidpair_t[s], 0, s, true);
288 std::copy(ps.begin(), ps.end(), RSB[jj]);
289 ReverseSendSizes[jj] = s;
290 ++jj;
291 }
292 }
293 } // end of set based packing.
294
295 Teuchos::Array<int> ReverseRecvSizes(NumSends, -1);
296 Teuchos::Array<MPI_Request> rawBreq(ProcsFrom.size() + ProcsTo.size(), MPI_REQUEST_NULL);
297 // 25 Jul 2018: MPI_TAG_UB is the largest tag value; could be < 32768.
298 const int mpi_tag_base_ = 3;
299
300 int mpireq_idx = 0;
301 for (int i = 0; i < ProcsTo.size(); ++i) {
302 int Rec_Tag = mpi_tag_base_ + ProcsTo[i];
303 int* thisrecv = (int*)(&ReverseRecvSizes[i]);
304 MPI_Request rawRequest = MPI_REQUEST_NULL;
305 MPI_Irecv(const_cast<int*>(thisrecv),
306 1,
307 MPI_INT,
308 ProcsTo[i],
309 Rec_Tag,
310 rawComm,
311 &rawRequest);
312 rawBreq[mpireq_idx++] = rawRequest;
313 }
314 for (int i = 0; i < ProcsFrom.size(); ++i) {
315 int Send_Tag = mpi_tag_base_ + MyPID;
316 int* mysend = (int*)(&ReverseSendSizes[i]);
317 MPI_Request rawRequest = MPI_REQUEST_NULL;
318 MPI_Isend(mysend,
319 1,
320 MPI_INT,
321 ProcsFrom[i],
322 Send_Tag,
323 rawComm,
324 &rawRequest);
325 rawBreq[mpireq_idx++] = rawRequest;
326 }
327 Teuchos::Array<MPI_Status> rawBstatus(rawBreq.size());
328#ifdef HAVE_TPETRA_DEBUG
329 const int err1 =
330#endif
331 MPI_Waitall(rawBreq.size(), rawBreq.getRawPtr(),
332 rawBstatus.getRawPtr());
333
334#ifdef HAVE_TPETRA_DEBUG
335 if (err1) {
336 errstr << MyPID << "sE1 reverseNeighborDiscovery Mpi_Waitall error on send ";
337 error = true;
338 std::cerr << errstr.str() << std::flush;
339 }
340#endif
341
342 int totalexportpairrecsize = 0;
343 for (size_t i = 0; i < NumSends; ++i) {
344 totalexportpairrecsize += ReverseRecvSizes[i];
345#ifdef HAVE_TPETRA_DEBUG
346 if (ReverseRecvSizes[i] < 0) {
347 errstr << MyPID << "E4 reverseNeighborDiscovery: got < 0 for receive size " << ReverseRecvSizes[i] << std::endl;
348 error = true;
349 }
350#endif
351 }
352 Teuchos::ArrayRCP<pidgidpair_t> AllReverseRecv = Teuchos::arcp(new pidgidpair_t[totalexportpairrecsize], 0, totalexportpairrecsize, true);
353 int offset = 0;
354 mpireq_idx = 0;
355 for (int i = 0; i < ProcsTo.size(); ++i) {
356 int recv_data_size = ReverseRecvSizes[i] * 2;
357 int recvData_MPI_Tag = mpi_tag_base_ * 2 + ProcsTo[i];
358 MPI_Request rawRequest = MPI_REQUEST_NULL;
359 GO* rec_bptr = (GO*)(&AllReverseRecv[offset]);
360 offset += ReverseRecvSizes[i];
361 MPI_Irecv(rec_bptr,
362 recv_data_size,
363 ::Tpetra::Details::MpiTypeTraits<GO>::getType(rec_bptr[0]),
364 ProcsTo[i],
365 recvData_MPI_Tag,
366 rawComm,
367 &rawRequest);
368 rawBreq[mpireq_idx++] = rawRequest;
369 }
370 for (int ii = 0; ii < ProcsFrom.size(); ++ii) {
371 GO* send_bptr = (GO*)(RSB[ii].getRawPtr());
372 MPI_Request rawSequest = MPI_REQUEST_NULL;
373 int send_data_size = ReverseSendSizes[ii] * 2; // 2 == count of pair
374 int sendData_MPI_Tag = mpi_tag_base_ * 2 + MyPID;
375 MPI_Isend(send_bptr,
376 send_data_size,
377 ::Tpetra::Details::MpiTypeTraits<GO>::getType(send_bptr[0]),
378 ProcsFrom[ii],
379 sendData_MPI_Tag,
380 rawComm,
381 &rawSequest);
382
383 rawBreq[mpireq_idx++] = rawSequest;
384 }
385#ifdef HAVE_TPETRA_DEBUG
386 const int err =
387#endif
388 MPI_Waitall(rawBreq.size(),
389 rawBreq.getRawPtr(),
390 rawBstatus.getRawPtr());
391#ifdef HAVE_TPETRA_DEBUG
392 if (err) {
393 errstr << MyPID << "E3.r reverseNeighborDiscovery Mpi_Waitall error on receive ";
394 error = true;
395 std::cerr << errstr.str() << std::flush;
396 }
397#endif
398 std::sort(AllReverseRecv.begin(), AllReverseRecv.end(), Tpetra::Import_Util::sort_PID_then_GID<GlobalOrdinal, GlobalOrdinal>);
399
400 auto newEndOfPairs = std::unique(AllReverseRecv.begin(), AllReverseRecv.end());
401 // don't resize to remove non-unique, just use the end-of-unique iterator
402 if (AllReverseRecv.begin() == newEndOfPairs) return;
403 int ARRsize = std::distance(AllReverseRecv.begin(), newEndOfPairs);
404 auto rPIDs = Teuchos::arcp(new int[ARRsize], 0, ARRsize, true);
405 auto rLIDs = Teuchos::arcp(new LocalOrdinal[ARRsize], 0, ARRsize, true);
406
407 int tsize = 0;
408 for (auto itr = AllReverseRecv.begin(); itr != newEndOfPairs; ++itr) {
409 if ((int)(itr->first) != MyPID) {
410 rPIDs[tsize] = (int)itr->first;
411 LocalOrdinal lid = MyDomainMap->getLocalElement(itr->second);
412 rLIDs[tsize] = lid;
413 tsize++;
414 }
415 }
416
417 type3PIDs = rPIDs.persistingView(0, tsize);
418 type3LIDs = rLIDs.persistingView(0, tsize);
419
420 if (error) {
421 std::cerr << errstr.str() << std::flush;
422 comm->barrier();
423 comm->barrier();
424 comm->barrier();
425 MPI_Abort(MPI_COMM_WORLD, -1);
426 }
427#endif
428}
429
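// Wire-format sketch for the exchange above (illustrative, hypothetical
// numbers): for each forward neighbor this rank first sends a one-int count
// of (PID, GID) pairs (tag 3 + sender rank), then the payload of
// 2 * count consecutive GlobalOrdinals (tag 6 + sender rank). Sending the
// pairs {(2, 40), (2, 57)} therefore transmits the count 2 followed by the
// buffer {2, 40, 2, 57}. The receiver concatenates all payloads into
// AllReverseRecv, sorts by (PID, GID), drops duplicates, and translates the
// surviving GIDs into local IDs of MyDomainMap to build type3PIDs/type3LIDs.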
430// Note: This should get merged with the other Tpetra sort routines eventually.
431template <typename Scalar, typename Ordinal>
432void sortCrsEntries(const Teuchos::ArrayView<size_t>& CRS_rowptr,
433 const Teuchos::ArrayView<Ordinal>& CRS_colind,
434 const Teuchos::ArrayView<Scalar>& CRS_vals) {
435 // For each row, sort column entries from smallest to largest.
436 // Use shell sort. Stable sort so it is fast if indices are already sorted.
437 // Code copied from Epetra_CrsMatrix::SortEntries()
438 size_t NumRows = CRS_rowptr.size() - 1;
439 size_t nnz = CRS_colind.size();
440
441 const bool permute_values_array = CRS_vals.size() > 0;
442
443 for (size_t i = 0; i < NumRows; i++) {
444 size_t start = CRS_rowptr[i];
445 if (start >= nnz) continue;
446
447 size_t NumEntries = CRS_rowptr[i + 1] - start;
448 Teuchos::ArrayRCP<Scalar> locValues;
449 if (permute_values_array)
450 locValues = Teuchos::arcp<Scalar>(&CRS_vals[start], 0, NumEntries, false);
451 Teuchos::ArrayRCP<Ordinal> locIndices(&CRS_colind[start], 0, NumEntries, false);
452
453 Ordinal n = static_cast<Ordinal>(NumEntries);
454 Ordinal m = 1;
455 while (m < n) m = m * 3 + 1;
456 m /= 3;
457
458 while (m > 0) {
459 Ordinal max = n - m;
460 for (Ordinal j = 0; j < max; j++) {
461 for (Ordinal k = j; k >= 0; k -= m) {
462 if (locIndices[k + m] >= locIndices[k])
463 break;
464 if (permute_values_array) {
465 Scalar dtemp = locValues[k + m];
466 locValues[k + m] = locValues[k];
467 locValues[k] = dtemp;
468 }
469 Ordinal itemp = locIndices[k + m];
470 locIndices[k + m] = locIndices[k];
471 locIndices[k] = itemp;
472 }
473 }
474 m = m / 3;
475 }
476 }
477}
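// Illustrative trace (hypothetical row): for a row with column indices
// {9, 3, 7, 1} and values {a, b, c, d}, the shell sort above produces
// indices {1, 3, 7, 9} and values {d, b, c, a}. Because the gap sequence
// shrinks to 1, the final pass is a plain insertion sort, so rows whose
// indices are already sorted are handled in a single cheap sweep.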
478
479template <typename Ordinal>
480void sortCrsEntries(const Teuchos::ArrayView<size_t>& CRS_rowptr,
481 const Teuchos::ArrayView<Ordinal>& CRS_colind) {
482 // Generate dummy values array
483 Teuchos::ArrayView<Tpetra::Details::DefaultTypes::scalar_type> CRS_vals;
484 sortCrsEntries(CRS_rowptr, CRS_colind, CRS_vals);
485}
486
487namespace Impl {
488
489template <class RowOffsetsType, class ColumnIndicesType, class ValuesType>
490class SortCrsEntries {
491 private:
492 typedef typename ColumnIndicesType::non_const_value_type ordinal_type;
493 typedef typename ValuesType::non_const_value_type scalar_type;
494
495 public:
496 SortCrsEntries(const RowOffsetsType& ptr,
497 const ColumnIndicesType& ind,
498 const ValuesType& val)
499 : ptr_(ptr)
500 , ind_(ind)
501 , val_(val) {
502 static_assert(std::is_signed<ordinal_type>::value,
503 "The type of each "
504 "column index -- that is, the type of each entry of ind "
505 "-- must be signed in order for this functor to work.");
506 }
507
508 KOKKOS_FUNCTION void operator()(const size_t i) const {
509 const size_t nnz = ind_.extent(0);
510 const size_t start = ptr_(i);
511 const bool permute_values_array = val_.extent(0) > 0;
512
513 if (start < nnz) {
514 const size_t NumEntries = ptr_(i + 1) - start;
515
516 const ordinal_type n = static_cast<ordinal_type>(NumEntries);
517 ordinal_type m = 1;
518 while (m < n) m = m * 3 + 1;
519 m /= 3;
520
521 while (m > 0) {
522 ordinal_type max = n - m;
523 for (ordinal_type j = 0; j < max; j++) {
524 for (ordinal_type k = j; k >= 0; k -= m) {
525 const size_t sk = start + k;
526 if (ind_(sk + m) >= ind_(sk)) {
527 break;
528 }
529 if (permute_values_array) {
530 const scalar_type dtemp = val_(sk + m);
531 val_(sk + m) = val_(sk);
532 val_(sk) = dtemp;
533 }
534 const ordinal_type itemp = ind_(sk + m);
535 ind_(sk + m) = ind_(sk);
536 ind_(sk) = itemp;
537 }
538 }
539 m = m / 3;
540 }
541 }
542 }
543
544 static void
545 sortCrsEntries(const RowOffsetsType& ptr,
546 const ColumnIndicesType& ind,
547 const ValuesType& val) {
548 // For each row, sort column entries from smallest to largest.
549 // Use shell sort. Stable sort so it is fast if indices are already sorted.
550 // Code copied from Epetra_CrsMatrix::SortEntries()
551 // NOTE: This should not be taken as a particularly efficient way to sort
552 // rows of matrices in parallel. But it is correct, so that's something.
553 if (ptr.extent(0) == 0) {
554 return; // no rows, so nothing to sort
555 }
556 const size_t NumRows = ptr.extent(0) - 1;
557
558 typedef size_t index_type; // what this function was using; not my choice
559 typedef typename ValuesType::execution_space execution_space;
560 // Specify RangePolicy explicitly, in order to use correct execution
561 // space. See #1345.
562 typedef Kokkos::RangePolicy<execution_space, index_type> range_type;
563 Kokkos::parallel_for("sortCrsEntries", range_type(0, NumRows),
564 SortCrsEntries(ptr, ind, val));
565 }
566
567 private:
568 RowOffsetsType ptr_;
569 ColumnIndicesType ind_;
570 ValuesType val_;
571};
572
573} // namespace Impl
574
575template <typename rowptr_array_type, typename colind_array_type, typename vals_array_type>
576void sortCrsEntries(const rowptr_array_type& CRS_rowptr,
577 const colind_array_type& CRS_colind,
578 const vals_array_type& CRS_vals) {
579 Impl::SortCrsEntries<rowptr_array_type, colind_array_type,
580 vals_array_type>::sortCrsEntries(CRS_rowptr, CRS_colind, CRS_vals);
581}
582
583template <typename rowptr_array_type, typename colind_array_type>
584void sortCrsEntries(const rowptr_array_type& CRS_rowptr,
585 const colind_array_type& CRS_colind) {
586 // Generate dummy values array
587 typedef typename colind_array_type::execution_space execution_space;
588 typedef Tpetra::Details::DefaultTypes::scalar_type scalar_type;
589 typedef typename Kokkos::View<scalar_type*, execution_space> scalar_view_type;
590 scalar_view_type CRS_vals;
591 sortCrsEntries<rowptr_array_type, colind_array_type,
592 scalar_view_type>(CRS_rowptr, CRS_colind, CRS_vals);
593}
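// Usage sketch (illustrative, hypothetical names): the Kokkos::View overloads
// sort each row in the views' own execution space, e.g. for a device CRS block
//
//   using device_type = Tpetra::Map<>::device_type;  // default node's device (assumption)
//   Kokkos::View<size_t*, device_type> ptr("ptr", numRows + 1);
//   Kokkos::View<int*, device_type>    ind("ind", numEntries);   // ordinal must be signed
//   Kokkos::View<double*, device_type> val("val", numEntries);
//   // ... fill ptr/ind/val ...
//   Tpetra::Import_Util::sortCrsEntries(ptr, ind, val);
//
// Each row is sorted by an independent iteration of the parallel_for above.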
594
595// Note: This should get merged with the other Tpetra sort routines eventually.
596template <typename Scalar, typename Ordinal>
597void sortAndMergeCrsEntries(const Teuchos::ArrayView<size_t>& CRS_rowptr,
598 const Teuchos::ArrayView<Ordinal>& CRS_colind,
599 const Teuchos::ArrayView<Scalar>& CRS_vals) {
600 // For each row, sort column entries from smallest to largest,
601 // merging column ids that are identical by adding values. Use shell
602 // sort. Stable sort so it is fast if indices are already sorted.
603 // Code copied from Epetra_CrsMatrix::SortEntries()
604
605 if (CRS_rowptr.size() == 0) {
606 return; // no rows, so nothing to sort
607 }
608 const size_t NumRows = CRS_rowptr.size() - 1;
609 const size_t nnz = CRS_colind.size();
610 size_t new_curr = CRS_rowptr[0];
611 size_t old_curr = CRS_rowptr[0];
612
613 const bool permute_values_array = CRS_vals.size() > 0;
614
615 for (size_t i = 0; i < NumRows; i++) {
616 const size_t old_rowptr_i = CRS_rowptr[i];
617 CRS_rowptr[i] = old_curr;
618 if (old_rowptr_i >= nnz) continue;
619
620 size_t NumEntries = CRS_rowptr[i + 1] - old_rowptr_i;
621 Teuchos::ArrayRCP<Scalar> locValues;
622 if (permute_values_array)
623 locValues = Teuchos::arcp<Scalar>(&CRS_vals[old_rowptr_i], 0, NumEntries, false);
624 Teuchos::ArrayRCP<Ordinal> locIndices(&CRS_colind[old_rowptr_i], 0, NumEntries, false);
625
626 // Sort phase
627 Ordinal n = static_cast<Ordinal>(NumEntries);
628 Ordinal m = n / 2;
629
630 while (m > 0) {
631 Ordinal max = n - m;
632 for (Ordinal j = 0; j < max; j++) {
633 for (Ordinal k = j; k >= 0; k -= m) {
634 if (locIndices[k + m] >= locIndices[k])
635 break;
636 if (permute_values_array) {
637 Scalar dtemp = locValues[k + m];
638 locValues[k + m] = locValues[k];
639 locValues[k] = dtemp;
640 }
641 Ordinal itemp = locIndices[k + m];
642 locIndices[k + m] = locIndices[k];
643 locIndices[k] = itemp;
644 }
645 }
646 m = m / 2;
647 }
648
649 // Merge & shrink
650 for (size_t j = old_rowptr_i; j < CRS_rowptr[i + 1]; j++) {
651 if (j > old_rowptr_i && CRS_colind[j] == CRS_colind[new_curr - 1]) {
652 if (permute_values_array) CRS_vals[new_curr - 1] += CRS_vals[j];
653 } else if (new_curr == j) {
654 new_curr++;
655 } else {
656 CRS_colind[new_curr] = CRS_colind[j];
657 if (permute_values_array) CRS_vals[new_curr] = CRS_vals[j];
658 new_curr++;
659 }
660 }
661 old_curr = new_curr;
662 }
663
664 CRS_rowptr[NumRows] = new_curr;
665}
666
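// Illustrative before/after (hypothetical row): with row entries
//   colind = {4, 2, 4}, vals = {1.0, 5.0, 2.0}
// the routine above first sorts to {2, 4, 4} / {5.0, 1.0, 2.0} and then merges
// the duplicate column 4 by adding its values, leaving
//   colind = {2, 4}, vals = {5.0, 3.0}
// and shrinking CRS_rowptr so later rows start at the compacted offsets.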
667template <typename Ordinal>
668void sortAndMergeCrsEntries(const Teuchos::ArrayView<size_t>& CRS_rowptr,
669 const Teuchos::ArrayView<Ordinal>& CRS_colind) {
670 Teuchos::ArrayView<Tpetra::Details::DefaultTypes::scalar_type> CRS_vals;
671 sortAndMergeCrsEntries(CRS_rowptr, CRS_colind, CRS_vals);
672}
673
674template <class rowptr_view_type, class colind_view_type, class vals_view_type>
675void sortAndMergeCrsEntries(rowptr_view_type& CRS_rowptr,
676 colind_view_type& CRS_colind,
677 vals_view_type& CRS_vals) {
678 using execution_space = typename vals_view_type::execution_space;
679
680 auto CRS_rowptr_in = CRS_rowptr;
681 auto CRS_colind_in = CRS_colind;
682 auto CRS_vals_in = CRS_vals;
683
684 KokkosSparse::sort_and_merge_matrix<execution_space, rowptr_view_type,
685 colind_view_type, vals_view_type>(CRS_rowptr_in, CRS_colind_in, CRS_vals_in,
686 CRS_rowptr, CRS_colind, CRS_vals);
687}
688
689template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
690void lowCommunicationMakeColMapAndReindexSerial(const Teuchos::ArrayView<const size_t>& rowptr,
691 const Teuchos::ArrayView<LocalOrdinal>& colind_LID,
692 const Teuchos::ArrayView<GlobalOrdinal>& colind_GID,
693 const Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>>& domainMapRCP,
694 const Teuchos::ArrayView<const int>& owningPIDs,
695 Teuchos::Array<int>& remotePIDs,
696 Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>>& colMap) {
697 using Teuchos::rcp;
698 typedef LocalOrdinal LO;
699 typedef GlobalOrdinal GO;
700 typedef Tpetra::global_size_t GST;
701 typedef Tpetra::Map<LO, GO, Node> map_type;
702 const char prefix[] = "lowCommunicationMakeColMapAndReindexSerial: ";
703
704 // The domainMap is an RCP because there is a shortcut for a
705 // (common) special case to return the columnMap = domainMap.
706 const map_type& domainMap = *domainMapRCP;
707
708 // Scan all column indices and sort into two groups:
709 // Local: those whose GID matches a GID of the domain map on this processor and
710 // Remote: All others.
711 const size_t numDomainElements = domainMap.getLocalNumElements();
712 Teuchos::Array<bool> LocalGIDs;
713 if (numDomainElements > 0) {
714 LocalGIDs.resize(numDomainElements, false); // Assume domain GIDs are not local
715 }
716
717 // In principle it is good to have RemoteGIDs and RemoteGIDList be as
718 // long as the number of remote GIDs on this processor, but this
719 // would require two passes through the column IDs, so we make it
720 // the max of 100 and the number of block rows.
721 //
722 // FIXME (mfh 11 Feb 2015) Tpetra::Details::HashTable can hold at
723 // most INT_MAX entries, but it's possible to have more rows than
724 // that (if size_t is 64 bits and int is 32 bits).
725 const size_t numMyRows = rowptr.size() - 1;
726 const int hashsize = std::max(static_cast<int>(numMyRows), 100);
727
728 Tpetra::Details::HashTable<GO, LO> RemoteGIDs(hashsize);
729 Teuchos::Array<GO> RemoteGIDList;
730 RemoteGIDList.reserve(hashsize);
731 Teuchos::Array<int> PIDList;
732 PIDList.reserve(hashsize);
733
734 // Here we start using the *LocalOrdinal* colind_LID array. This is
735 // safe even if both columnIndices arrays are actually the same
736 // (because LocalOrdinal==GO). For *local* GID's set
737 // colind_LID with their LID in the domainMap. For *remote*
738 // GIDs, we set colind_LID with (numDomainElements+NumRemoteColGIDs)
739 // before the increment of the remote count. These numberings will
740 // be separate because no local LID is greater than
741 // numDomainElements.
742
743 size_t NumLocalColGIDs = 0;
744 LO NumRemoteColGIDs = 0;
745 for (size_t i = 0; i < numMyRows; ++i) {
746 for (size_t j = rowptr[i]; j < rowptr[i + 1]; ++j) {
747 const GO GID = colind_GID[j];
748 // Check if GID matches a row GID
749 const LO LID = domainMap.getLocalElement(GID);
750 if (LID != -1) {
751 const bool alreadyFound = LocalGIDs[LID];
752 if (!alreadyFound) {
753 LocalGIDs[LID] = true; // There is a column in the graph associated with this domain map GID
754 NumLocalColGIDs++;
755 }
756 colind_LID[j] = LID;
757 } else {
758 const LO hash_value = RemoteGIDs.get(GID);
759 if (hash_value == -1) { // This means its a new remote GID
760 const int PID = owningPIDs[j];
761 TEUCHOS_TEST_FOR_EXCEPTION(
762 PID == -1, std::invalid_argument, prefix << "Cannot figure out if "
763 "PID is owned.");
764 colind_LID[j] = static_cast<LO>(numDomainElements + NumRemoteColGIDs);
765 RemoteGIDs.add(GID, NumRemoteColGIDs);
766 RemoteGIDList.push_back(GID);
767 PIDList.push_back(PID);
768 NumRemoteColGIDs++;
769 } else {
770 colind_LID[j] = static_cast<LO>(numDomainElements + hash_value);
771 }
772 }
773 }
774 }
775
776 // Possible short-circuit: If all domain map GIDs are present as
777 // column indices, then set ColMap=domainMap and quit.
778 if (domainMap.getComm()->getSize() == 1) {
779 // Sanity check: When there is only one process, there can be no
780 // remoteGIDs.
781 TEUCHOS_TEST_FOR_EXCEPTION(
782 NumRemoteColGIDs != 0, std::runtime_error, prefix << "There is only one "
783 "process in the domain Map's communicator, which means that there are no "
784 "\"remote\" indices. Nevertheless, some column indices are not in the "
785 "domain Map.");
786 if (static_cast<size_t>(NumLocalColGIDs) == numDomainElements) {
787 // In this case, we just use the domainMap's indices, which is,
788 // not coincidentally, what we clobbered colind with up above
789 // anyway. No further reindexing is needed.
790 colMap = domainMapRCP;
791 return;
792 }
793 }
794
795 // Now build the array containing column GIDs
796 // Build back end, containing remote GIDs, first
797 const LO numMyCols = NumLocalColGIDs + NumRemoteColGIDs;
798 Teuchos::Array<GO> ColIndices;
799 GO* RemoteColIndices = NULL;
800 if (numMyCols > 0) {
801 ColIndices.resize(numMyCols);
802 if (NumLocalColGIDs != static_cast<size_t>(numMyCols)) {
803 RemoteColIndices = &ColIndices[NumLocalColGIDs]; // Points to back half of ColIndices
804 }
805 }
806
807 for (LO i = 0; i < NumRemoteColGIDs; ++i) {
808 RemoteColIndices[i] = RemoteGIDList[i];
809 }
810
811 // Build permute array for *remote* reindexing.
812 Teuchos::Array<LO> RemotePermuteIDs(NumRemoteColGIDs);
813 for (LO i = 0; i < NumRemoteColGIDs; ++i) {
814 RemotePermuteIDs[i] = i;
815 }
816
817 // Sort External column indices so that all columns coming from a
818 // given remote processor are contiguous. This is a sort with two
819 // auxiliary arrays: RemoteColIndices and RemotePermuteIDs.
820 Tpetra::sort3(PIDList.begin(), PIDList.end(),
821 ColIndices.begin() + NumLocalColGIDs,
822 RemotePermuteIDs.begin());
823
824 // Stash the RemotePIDs.
825 //
826 // Note: If Teuchos::Array had a shrink_to_fit like std::vector,
827 // we'd call it here.
828 remotePIDs = PIDList;
829
830 // Sort external column indices so that columns from a given remote
831 // processor are not only contiguous but also in ascending
832 // order. NOTE: I don't know if the number of externals associated
833 // with a given remote processor is known at this point ... so I
834 // count them here.
835
836 // NTS: Only sort the RemoteColIndices this time...
837 LO StartCurrent = 0, StartNext = 1;
838 while (StartNext < NumRemoteColGIDs) {
839 if (PIDList[StartNext] == PIDList[StartNext - 1]) {
840 StartNext++;
841 } else {
842 Tpetra::sort2(ColIndices.begin() + NumLocalColGIDs + StartCurrent,
843 ColIndices.begin() + NumLocalColGIDs + StartNext,
844 RemotePermuteIDs.begin() + StartCurrent);
845 StartCurrent = StartNext;
846 StartNext++;
847 }
848 }
849 Tpetra::sort2(ColIndices.begin() + NumLocalColGIDs + StartCurrent,
850 ColIndices.begin() + NumLocalColGIDs + StartNext,
851 RemotePermuteIDs.begin() + StartCurrent);
852
853 // Reverse the permutation to get the information we actually care about
854 Teuchos::Array<LO> ReverseRemotePermuteIDs(NumRemoteColGIDs);
855 for (LO i = 0; i < NumRemoteColGIDs; ++i) {
856 ReverseRemotePermuteIDs[RemotePermuteIDs[i]] = i;
857 }
858
859 // Build permute array for *local* reindexing.
860 bool use_local_permute = false;
861 Teuchos::Array<LO> LocalPermuteIDs(numDomainElements);
862
863 // Now fill front end. Two cases:
864 //
865 // (1) If the number of Local column GIDs is the same as the number
866 // of Local domain GIDs, we can simply read the domain GIDs into
867 // the front part of ColIndices, otherwise
868 //
869 // (2) We step through the GIDs of the domainMap, checking to see if
870 // each domain GID is a column GID. we want to do this to
871 // maintain a consistent ordering of GIDs between the columns
872 // and the domain.
873 Teuchos::ArrayView<const GO> domainGlobalElements = domainMap.getLocalElementList();
874 if (static_cast<size_t>(NumLocalColGIDs) == numDomainElements) {
875 if (NumLocalColGIDs > 0) {
876 // Load Global Indices into first numMyCols elements column GID list
877 std::copy(domainGlobalElements.begin(), domainGlobalElements.end(),
878 ColIndices.begin());
879 }
880 } else {
881 LO NumLocalAgain = 0;
882 use_local_permute = true;
883 for (size_t i = 0; i < numDomainElements; ++i) {
884 if (LocalGIDs[i]) {
885 LocalPermuteIDs[i] = NumLocalAgain;
886 ColIndices[NumLocalAgain++] = domainGlobalElements[i];
887 }
888 }
889 TEUCHOS_TEST_FOR_EXCEPTION(
890 static_cast<size_t>(NumLocalAgain) != NumLocalColGIDs,
891 std::runtime_error, prefix << "Local ID count test failed.");
892 }
893
894 // Make column Map
895 const GST minus_one = Teuchos::OrdinalTraits<GST>::invalid();
896 colMap = rcp(new map_type(minus_one, ColIndices, domainMap.getIndexBase(),
897 domainMap.getComm()));
898
899 // Low-cost reindex of the matrix
900 for (size_t i = 0; i < numMyRows; ++i) {
901 for (size_t j = rowptr[i]; j < rowptr[i + 1]; ++j) {
902 const LO ID = colind_LID[j];
903 if (static_cast<size_t>(ID) < numDomainElements) {
904 if (use_local_permute) {
905 colind_LID[j] = LocalPermuteIDs[colind_LID[j]];
906 }
907 // In the case where use_local_permute==false, we just copy
908 // the DomainMap's ordering, which it so happens is what we
909 // put in colind_LID to begin with.
910 } else {
911 colind_LID[j] = NumLocalColGIDs + ReverseRemotePermuteIDs[colind_LID[j] - numDomainElements];
912 }
913 }
914 }
915}
916
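// Worked example for the local-permute branch above (hypothetical values):
// if the domain map owns GIDs {20, 21, 22} but only {20, 22} appear as column
// indices, LocalGIDs = {true, false, true}, so
//   LocalPermuteIDs = {0, -, 1}   (the entry for 21 is never read)
// and the front of ColIndices becomes {20, 22}; remote GIDs follow starting at
// index NumLocalColGIDs = 2. The final reindex loop then maps each colind_LID
// entry either through LocalPermuteIDs (local columns) or through
// ReverseRemotePermuteIDs shifted by NumLocalColGIDs (remote columns).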
917template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
918void lowCommunicationMakeColMapAndReindex(
919 const Teuchos::ArrayView<const size_t>& rowptr,
920 const Teuchos::ArrayView<LocalOrdinal>& colind_LID,
921 const Teuchos::ArrayView<GlobalOrdinal>& colind_GID,
922 const Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>>& domainMapRCP,
923 const Teuchos::ArrayView<const int>& owningPIDs,
924 Teuchos::Array<int>& remotePIDs,
925 Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>>& colMap) {
926 using Teuchos::rcp;
927 typedef LocalOrdinal LO;
928 typedef GlobalOrdinal GO;
929 typedef Tpetra::global_size_t GST;
930 typedef Tpetra::Map<LO, GO, Node> map_type;
931 const char prefix[] = "lowCommunicationMakeColMapAndReindex: ";
932
933 typedef typename Node::device_type DT;
934 using execution_space = typename DT::execution_space;
935 execution_space exec;
936 using team_policy = Kokkos::TeamPolicy<execution_space, Kokkos::Schedule<Kokkos::Dynamic>>;
937 typedef typename map_type::local_map_type local_map_type;
938
939 // Create device mirror and host mirror views from function parameters
940 // When we pass in views instead of Teuchos::ArrayViews, we can avoid copying views
941 auto colind_LID_view = Details::create_mirror_view_from_raw_host_array(exec, colind_LID.getRawPtr(), colind_LID.size(), true, "colind_LID");
942 auto rowptr_view = Details::create_mirror_view_from_raw_host_array(exec, rowptr.getRawPtr(), rowptr.size(), true, "rowptr");
943 auto colind_GID_view = Details::create_mirror_view_from_raw_host_array(exec, colind_GID.getRawPtr(), colind_GID.size(), true, "colind_GID");
944 auto owningPIDs_view = Details::create_mirror_view_from_raw_host_array(exec, owningPIDs.getRawPtr(), owningPIDs.size(), true, "owningPIDs");
945
946 typename decltype(colind_LID_view)::host_mirror_type colind_LID_host(colind_LID.getRawPtr(), colind_LID.size());
947 typename decltype(colind_GID_view)::host_mirror_type colind_GID_host(colind_GID.getRawPtr(), colind_GID.size());
948
949 Kokkos::deep_copy(colind_LID_view, colind_LID_host);
950 Kokkos::deep_copy(colind_GID_view, colind_GID_host);
951
952 // The domainMap is an RCP because there is a shortcut for a
953 // (common) special case to return the columnMap = domainMap.
954 const map_type& domainMap = *domainMapRCP;
955
956 Kokkos::UnorderedMap<LO, bool, DT> LocalGIDs_view_map(colind_LID.size());
957 Kokkos::UnorderedMap<GO, LO, DT> RemoteGIDs_view_map(colind_LID.size());
958
959 const size_t numMyRows = rowptr.size() - 1;
960 local_map_type domainMap_local = domainMap.getLocalMap();
961
962 const size_t numDomainElements = domainMap.getLocalNumElements();
963 Kokkos::View<bool*, DT> LocalGIDs_view("LocalGIDs", numDomainElements);
964 auto LocalGIDs_host = Kokkos::create_mirror_view(LocalGIDs_view);
965
966 size_t NumLocalColGIDs = 0;
967
968 // Scan all column indices and sort into two groups:
969 // Local: those whose GID matches a GID of the domain map on this processor and
970 // Remote: All others.
971 // Kokkos::Parallel_reduce sums up NumLocalColGIDs, while we use the size of the Remote GIDs map to find NumRemoteColGIDs
972 Kokkos::parallel_reduce(
973 team_policy(numMyRows, Kokkos::AUTO), KOKKOS_LAMBDA(const typename team_policy::member_type& member, size_t& update) {
974 const int i = member.league_rank();
975 size_t NumLocalColGIDs_temp = 0;
976 size_t rowptr_start = rowptr_view[i];
977 size_t rowptr_end = rowptr_view[i + 1];
978 Kokkos::parallel_reduce(
979 Kokkos::TeamThreadRange(member, rowptr_start, rowptr_end), [&](const size_t j, size_t& innerUpdate) {
980 const GO GID = colind_GID_view[j];
981 // Check if GID matches a row GID in local domain map
982 const LO LID = domainMap_local.getLocalElement(GID);
983 if (LID != -1) {
984 auto outcome = LocalGIDs_view_map.insert(LID);
985 // Fresh insert
986 if (outcome.success()) {
987 LocalGIDs_view[LID] = true;
988 innerUpdate++;
989 }
990 } else {
991 const int PID = owningPIDs_view[j];
992 auto outcome = RemoteGIDs_view_map.insert(GID, PID);
993 if (outcome.success() && PID == -1) {
994 Kokkos::abort("Cannot figure out if ID is owned.\n");
995 }
996 }
997 },
998 NumLocalColGIDs_temp);
999 if (member.team_rank() == 0) update += NumLocalColGIDs_temp;
1000 },
1001 NumLocalColGIDs);
1002
1003 LO NumRemoteColGIDs = RemoteGIDs_view_map.size();
1004
1005 Kokkos::View<int*, DT> PIDList_view("PIDList", NumRemoteColGIDs);
1006 auto PIDList_host = Kokkos::create_mirror_view(PIDList_view);
1007
1008 Kokkos::View<GO*, DT> RemoteGIDList_view("RemoteGIDList", NumRemoteColGIDs);
1009 auto RemoteGIDList_host = Kokkos::create_mirror_view(RemoteGIDList_view);
1010
1011 // For each index in RemoteGIDs_map that contains a GID, use "update" to indicate the number of GIDs "before" this GID
1012 // This maps each element in the RemoteGIDs hash table to an index in RemoteGIDList / PIDList without any overwriting or empty spaces between indices
1013 Kokkos::parallel_scan(
1014 Kokkos::RangePolicy<execution_space>(0, RemoteGIDs_view_map.capacity()), KOKKOS_LAMBDA(const int i, GO& update, const bool final) {
1015 if (final && RemoteGIDs_view_map.valid_at(i)) {
1016 RemoteGIDList_view[update] = RemoteGIDs_view_map.key_at(i);
1017 PIDList_view[update] = RemoteGIDs_view_map.value_at(i);
1018 }
1019 if (RemoteGIDs_view_map.valid_at(i)) {
1020 update += 1;
1021 }
1022 });
1023
1024 // Possible short-circuit: If all domain map GIDs are present as
1025 // column indices, then set ColMap=domainMap and quit.
1026 if (domainMap.getComm()->getSize() == 1) {
1027 // Sanity check: When there is only one process, there can be no
1028 // remoteGIDs.
1029 TEUCHOS_TEST_FOR_EXCEPTION(
1030 NumRemoteColGIDs != 0, std::runtime_error, prefix << "There is only one "
1031 "process in the domain Map's communicator, which means that there are no "
1032 "\"remote\" indices. Nevertheless, some column indices are not in the "
1033 "domain Map.");
1034 if (static_cast<size_t>(NumLocalColGIDs) == numDomainElements) {
1035 // In this case, we just use the domainMap's indices, which is,
1036 // not coincidentally, what we clobbered colind with up above
1037 // anyway. No further reindexing is needed.
1038 colMap = domainMapRCP;
1039
1040 // Fill out local colMap (which should only contain local GIDs)
1041 auto localColMap = colMap->getLocalMap();
1042 Kokkos::parallel_for(
1043 Kokkos::RangePolicy<execution_space>(0, colind_GID.size()), KOKKOS_LAMBDA(const int i) {
1044 colind_LID_view[i] = localColMap.getLocalElement(colind_GID_view[i]);
1045 });
1046 Kokkos::deep_copy(execution_space(), colind_LID_host, colind_LID_view);
1047 return;
1048 }
1049 }
1050
1051 // Now build the array containing column GIDs
1052 // Build back end, containing remote GIDs, first
1053 const LO numMyCols = NumLocalColGIDs + NumRemoteColGIDs;
1054 Kokkos::View<GO*, DT> ColIndices_view("ColIndices", numMyCols);
1055
1056 // We don't need to load the backend of ColIndices or sort if there are no remote GIDs
1057 if (NumRemoteColGIDs > 0) {
1058 if (NumLocalColGIDs != static_cast<size_t>(numMyCols)) {
1059 Kokkos::parallel_for(
1060 Kokkos::RangePolicy<execution_space>(0, NumRemoteColGIDs), KOKKOS_LAMBDA(const int i) {
1061 ColIndices_view[NumLocalColGIDs + i] = RemoteGIDList_view[i];
1062 });
1063 }
1064
1065 // Find the largest PID for bin sorting purposes
1066 int PID_max = 0;
1067 Kokkos::parallel_reduce(
1068 Kokkos::RangePolicy<execution_space>(0, PIDList_host.size()), KOKKOS_LAMBDA(const int i, int& max) {
1069 if (max < PIDList_view[i]) max = PIDList_view[i];
1070 },
1071 Kokkos::Max<int>(PID_max));
1072
1073 using KeyViewTypePID = decltype(PIDList_view);
1074 using BinSortOpPID = Kokkos::BinOp1D<KeyViewTypePID>;
1075
1076 // Make a subview of ColIndices for remote GID sorting
1077 auto ColIndices_subview = Kokkos::subview(ColIndices_view, Kokkos::make_pair(NumLocalColGIDs, ColIndices_view.size()));
1078
1079 // Make binOp with bins = PID_max + 1, min = 0, max = PID_max
1080 BinSortOpPID binOp2(PID_max + 1, 0, PID_max);
1081
1082 // Sort External column indices so that all columns coming from a
1083 // given remote processor are contiguous. This is a sort with one
1084 // auxiliary array: RemoteColIndices
1085 Kokkos::BinSort<KeyViewTypePID, BinSortOpPID> bin_sort2(PIDList_view, 0, PIDList_view.size(), binOp2, false);
1086 bin_sort2.create_permute_vector(exec);
1087 bin_sort2.sort(exec, PIDList_view);
1088 bin_sort2.sort(exec, ColIndices_subview);
1089
1090 // Deep copy back from device to host
1091 Kokkos::deep_copy(exec, PIDList_host, PIDList_view);
1092
1093 // Stash the RemotePIDs. Once remotePIDs is changed to become a Kokkos view, we can remove this and copy directly.
1094 // Note: If Teuchos::Array had a shrink_to_fit like std::vector,
1095 // we'd call it here.
1096
1097 exec.fence("fence before setting PIDList");
1098 Teuchos::Array<int> PIDList(NumRemoteColGIDs);
1099 for (LO i = 0; i < NumRemoteColGIDs; ++i) {
1100 PIDList[i] = PIDList_host[i];
1101 }
1102
1103 remotePIDs = PIDList;
1104
1105 // Sort external column indices so that columns from a given remote
1106 // processor are not only contiguous but also in ascending
1107 // order. NOTE: I don't know if the number of externals associated
1108 // with a given remote processor is known at this point ... so I
1109 // count them here.
1110 LO StartCurrent = 0, StartNext = 1;
1111 while (StartNext < NumRemoteColGIDs) {
1112 if (PIDList_host[StartNext] == PIDList_host[StartNext - 1]) {
1113 StartNext++;
1114 } else {
1115 Kokkos::sort(ColIndices_view, NumLocalColGIDs + StartCurrent, NumLocalColGIDs + StartNext);
1116 StartCurrent = StartNext;
1117 StartNext++;
1118 }
1119 }
1120
1121 Kokkos::sort(ColIndices_view, NumLocalColGIDs + StartCurrent, NumLocalColGIDs + StartNext);
1122 }
1123
1124 // Build permute array for *local* reindexing.
1125
1126 // Now fill front end. Two cases:
1127 //
1128 // (1) If the number of Local column GIDs is the same as the number
1129 // of Local domain GIDs, we can simply read the domain GIDs into
1130 // the front part of ColIndices, otherwise
1131 //
1132 // (2) We step through the GIDs of the domainMap, checking to see if
1133 // each domain GID is a column GID. we want to do this to
1134 // maintain a consistent ordering of GIDs between the columns
1135 // and the domain.
1136 if (static_cast<size_t>(NumLocalColGIDs) == numDomainElements) {
1137 if (NumLocalColGIDs > 0) {
1138 // Load Global Indices into first numMyCols elements column GID list
1139 Kokkos::parallel_for(
1140 Kokkos::RangePolicy<execution_space>(0, numDomainElements), KOKKOS_LAMBDA(const int i) {
1141 ColIndices_view[i] = domainMap_local.getGlobalElement(i);
1142 });
1143 }
1144 } else {
1145 // This part isn't actually tested in the unit tests
1146 LO NumLocalAgain = 0;
1147 Kokkos::parallel_scan(
1148 Kokkos::RangePolicy<execution_space>(0, numDomainElements), KOKKOS_LAMBDA(const int i, LO& update, const bool final) {
1149 if (final && LocalGIDs_view[i]) {
1150 ColIndices_view[update] = domainMap_local.getGlobalElement(i);
1151 }
1152 if (LocalGIDs_view[i]) {
1153 update++;
1154 }
1155 },
1156 NumLocalAgain);
1157
1158 TEUCHOS_TEST_FOR_EXCEPTION(
1159 static_cast<size_t>(NumLocalAgain) != NumLocalColGIDs,
1160 std::runtime_error, prefix << "Local ID count test failed.");
1161 }
1162
1163 // Make column Map
1164 const GST minus_one = Teuchos::OrdinalTraits<GST>::invalid();
1165
1166 colMap = rcp(new map_type(minus_one, ColIndices_view, domainMap.getIndexBase(),
1167 domainMap.getComm()));
1168
1169 // Fill out colind_LID using local map
1170 auto localColMap = colMap->getLocalMap();
1171 Kokkos::parallel_for(
1172 Kokkos::RangePolicy<execution_space>(0, colind_GID.size()), KOKKOS_LAMBDA(const int i) {
1173 colind_LID_view[i] = localColMap.getLocalElement(colind_GID_view[i]);
1174 });
1175
1176 // For now, we copy back into colind_LID_host (which also overwrites the colind_LID Teuchos array)
1177 // When colind_LID becomes a Kokkos View we can delete this
1178 Kokkos::deep_copy(exec, colind_LID_host, colind_LID_view);
1179}
1180
1181template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1182void lowCommunicationMakeColMapAndReindex(
1183 const Kokkos::View<size_t*, typename Node::device_type> rowptr_view,
1184 const Kokkos::View<LocalOrdinal*, typename Node::device_type> colind_LID_view,
1185 const Kokkos::View<GlobalOrdinal*, typename Node::device_type> colind_GID_view,
1186 const Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>>& domainMapRCP,
1187 const Kokkos::View<int*, typename Node::device_type> owningPIDs_view,
1188 Teuchos::Array<int>& remotePIDs,
1189 Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>>& colMap) {
1190 using Teuchos::rcp;
1191 typedef LocalOrdinal LO;
1192 typedef GlobalOrdinal GO;
1193 typedef Tpetra::global_size_t GST;
1194 typedef Tpetra::Map<LO, GO, Node> map_type;
1195 const char prefix[] = "lowCommunicationMakeColMapAndReindex: ";
1196
1197 typedef typename Node::device_type DT;
1198 using execution_space = typename DT::execution_space;
1199 execution_space exec;
1200 using team_policy = Kokkos::TeamPolicy<execution_space, Kokkos::Schedule<Kokkos::Dynamic>>;
1201 typedef typename map_type::local_map_type local_map_type;
1202
1203 // The domainMap is an RCP because there is a shortcut for a
1204 // (common) special case to return the columnMap = domainMap.
1205 const map_type& domainMap = *domainMapRCP;
1206
1207 Kokkos::UnorderedMap<LO, bool, DT> LocalGIDs_view_map(colind_LID_view.size());
1208 Kokkos::UnorderedMap<GO, LO, DT> RemoteGIDs_view_map(colind_LID_view.size());
1209
1210 const size_t numMyRows = rowptr_view.size() - 1;
1211 local_map_type domainMap_local = domainMap.getLocalMap();
1212
1213 const size_t numDomainElements = domainMap.getLocalNumElements();
1214 Kokkos::View<bool*, DT> LocalGIDs_view("LocalGIDs", numDomainElements);
1215 auto LocalGIDs_host = Kokkos::create_mirror_view(LocalGIDs_view);
1216
1217 size_t NumLocalColGIDs = 0;
1218
1219 // Scan all column indices and sort into two groups:
1220 // Local: those whose GID matches a GID of the domain map on this processor and
1221 // Remote: All others.
1222 // Kokkos::Parallel_reduce sums up NumLocalColGIDs, while we use the size of the Remote GIDs map to find NumRemoteColGIDs
1223 Kokkos::parallel_reduce(
1224 team_policy(numMyRows, Kokkos::AUTO), KOKKOS_LAMBDA(const typename team_policy::member_type& member, size_t& update) {
1225 const int i = member.league_rank();
1226 size_t NumLocalColGIDs_temp = 0;
1227 size_t rowptr_start = rowptr_view[i];
1228 size_t rowptr_end = rowptr_view[i + 1];
1229 Kokkos::parallel_reduce(
1230 Kokkos::TeamThreadRange(member, rowptr_start, rowptr_end), [&](const size_t j, size_t& innerUpdate) {
1231 const GO GID = colind_GID_view[j];
1232 // Check if GID matches a row GID in local domain map
1233 const LO LID = domainMap_local.getLocalElement(GID);
1234 if (LID != -1) {
1235 auto outcome = LocalGIDs_view_map.insert(LID);
1236 // Fresh insert
1237 if (outcome.success()) {
1238 LocalGIDs_view[LID] = true;
1239 innerUpdate++;
1240 }
1241 } else {
1242 const int PID = owningPIDs_view[j];
1243 auto outcome = RemoteGIDs_view_map.insert(GID, PID);
1244 if (outcome.success() && PID == -1) {
1245 Kokkos::abort("Cannot figure out if ID is owned.\n");
1246 }
1247 }
1248 },
1249 NumLocalColGIDs_temp);
1250 if (member.team_rank() == 0) update += NumLocalColGIDs_temp;
1251 },
1252 NumLocalColGIDs);
1253
1254 LO NumRemoteColGIDs = RemoteGIDs_view_map.size();
1255
1256 Kokkos::View<int*, DT> PIDList_view("PIDList_d", NumRemoteColGIDs);
1257
1258 Kokkos::View<GO*, DT> RemoteGIDList_view("RemoteGIDList", NumRemoteColGIDs);
1259 auto RemoteGIDList_host = Kokkos::create_mirror_view(RemoteGIDList_view);
1260
1261 // For each index in RemoteGIDs_map that contains a GID, use "update" to indicate the number of GIDs "before" this GID
1262 // This maps each element in the RemoteGIDs hash table to an index in RemoteGIDList / PIDList without any overwriting or empty spaces between indices
1263 Kokkos::parallel_scan(
1264 Kokkos::RangePolicy<execution_space>(0, RemoteGIDs_view_map.capacity()), KOKKOS_LAMBDA(const int i, GO& update, const bool final) {
1265 if (final && RemoteGIDs_view_map.valid_at(i)) {
1266 RemoteGIDList_view[update] = RemoteGIDs_view_map.key_at(i);
1267 PIDList_view[update] = RemoteGIDs_view_map.value_at(i);
1268 }
1269 if (RemoteGIDs_view_map.valid_at(i)) {
1270 update += 1;
1271 }
1272 });
1273
1274 // Possible short-circuit: If all domain map GIDs are present as
1275 // column indices, then set ColMap=domainMap and quit.
1276 if (domainMap.getComm()->getSize() == 1) {
1277 // Sanity check: When there is only one process, there can be no
1278 // remoteGIDs.
1279 TEUCHOS_TEST_FOR_EXCEPTION(
1280 NumRemoteColGIDs != 0, std::runtime_error, prefix << "There is only one "
1281 "process in the domain Map's communicator, which means that there are no "
1282 "\"remote\" indices. Nevertheless, some column indices are not in the "
1283 "domain Map.");
1284 if (static_cast<size_t>(NumLocalColGIDs) == numDomainElements) {
1285 // In this case, we just use the domainMap's indices, which is,
1286 // not coincidentally, what we clobbered colind with up above
1287 // anyway. No further reindexing is needed.
1288 colMap = domainMapRCP;
1289
1290 // Fill out local colMap (which should only contain local GIDs)
1291 auto localColMap = colMap->getLocalMap();
1292 Kokkos::parallel_for(
1293 Kokkos::RangePolicy<execution_space>(0, colind_GID_view.size()), KOKKOS_LAMBDA(const int i) {
1294 colind_LID_view[i] = localColMap.getLocalElement(colind_GID_view[i]);
1295 });
1296 return;
1297 }
1298 }
1299
1300 // Now build the array containing column GIDs
1301 // Build back end, containing remote GIDs, first
1302 const LO numMyCols = NumLocalColGIDs + NumRemoteColGIDs;
1303 Kokkos::View<GO*, DT> ColIndices_view("ColIndices", numMyCols);
1304
1305 // We don't need to load the backend of ColIndices or sort if there are no remote GIDs
1306 if (NumRemoteColGIDs > 0) {
1307 if (NumLocalColGIDs != static_cast<size_t>(numMyCols)) {
1308 Kokkos::parallel_for(
1309 Kokkos::RangePolicy<execution_space>(0, NumRemoteColGIDs), KOKKOS_LAMBDA(const int i) {
1310 ColIndices_view[NumLocalColGIDs + i] = RemoteGIDList_view[i];
1311 });
1312 }
1313
1314 // Find the largest PID for bin sorting purposes
1315 int PID_max = 0;
1316 Kokkos::parallel_reduce(
1317 Kokkos::RangePolicy<execution_space>(0, PIDList_view.size()), KOKKOS_LAMBDA(const int i, int& max) {
1318 if (max < PIDList_view[i]) max = PIDList_view[i];
1319 },
1320 Kokkos::Max<int>(PID_max));
1321
1322 using KeyViewTypePID = decltype(PIDList_view);
1323 using BinSortOpPID = Kokkos::BinOp1D<KeyViewTypePID>;
1324
1325 // Make a subview of ColIndices for remote GID sorting
1326 auto ColIndices_subview = Kokkos::subview(ColIndices_view, Kokkos::make_pair(NumLocalColGIDs, ColIndices_view.size()));
1327
1328 // Make binOp with bins = PID_max + 1, min = 0, max = PID_max
1329 BinSortOpPID binOp2(PID_max + 1, 0, PID_max);
1330
1331 // Sort External column indices so that all columns coming from a
1332 // given remote processor are contiguous. This is a sort with one
1333 // auxiliary array: RemoteColIndices
1334 Kokkos::BinSort<KeyViewTypePID, BinSortOpPID> bin_sort2(PIDList_view, 0, PIDList_view.size(), binOp2, false);
1335 bin_sort2.create_permute_vector(exec);
1336 bin_sort2.sort(exec, PIDList_view);
1337 bin_sort2.sort(exec, ColIndices_subview);
1338
1339 // Deep copy back from device to host
1340 // Stash the RemotePIDs. Once remotePIDs is changed to become a Kokkos view, we can remove this and copy directly.
1341 Teuchos::Array<int> PIDList(NumRemoteColGIDs);
1342 Kokkos::View<int*, Kokkos::HostSpace> PIDList_host(PIDList.data(), PIDList.size());
1343 Kokkos::deep_copy(exec, PIDList_host, PIDList_view);
1344 exec.fence();
1345
1346 remotePIDs = PIDList;
1347
1348 // Sort external column indices so that columns from a given remote
1349 // processor are not only contiguous but also in ascending
1350 // order. NOTE: I don't know if the number of externals associated
1351 // with a given remote processor is known at this point ... so I
1352 // count them here.
1353 LO StartCurrent = 0, StartNext = 1;
1354 while (StartNext < NumRemoteColGIDs) {
1355 if (PIDList_host[StartNext] == PIDList_host[StartNext - 1]) {
1356 StartNext++;
1357 } else {
1358 Kokkos::sort(ColIndices_view, NumLocalColGIDs + StartCurrent, NumLocalColGIDs + StartNext);
1359 StartCurrent = StartNext;
1360 StartNext++;
1361 }
1362 }
1363
1364 Kokkos::sort(ColIndices_view, NumLocalColGIDs + StartCurrent, NumLocalColGIDs + StartNext);
1365 }
1366
1367 // Build permute array for *local* reindexing.
1368
1369 // Now fill front end. Two cases:
1370 //
1371 // (1) If the number of Local column GIDs is the same as the number
1372 // of Local domain GIDs, we can simply read the domain GIDs into
1373 // the front part of ColIndices, otherwise
1374 //
1375 // (2) We step through the GIDs of the domainMap, checking to see if
1376 // each domain GID is a column GID. we want to do this to
1377 // maintain a consistent ordering of GIDs between the columns
1378 // and the domain.
1379 if (static_cast<size_t>(NumLocalColGIDs) == numDomainElements) {
1380 if (NumLocalColGIDs > 0) {
1381 // Load Global Indices into first numMyCols elements column GID list
1382 Kokkos::parallel_for(
1383 Kokkos::RangePolicy<execution_space>(0, numDomainElements), KOKKOS_LAMBDA(const int i) {
1384 ColIndices_view[i] = domainMap_local.getGlobalElement(i);
1385 });
1386 }
1387 } else {
1388 // This part isn't actually tested in the unit tests
1389 LO NumLocalAgain = 0;
1390 Kokkos::parallel_scan(
1391 Kokkos::RangePolicy<execution_space>(0, numDomainElements), KOKKOS_LAMBDA(const int i, LO& update, const bool final) {
1392 if (final && LocalGIDs_view[i]) {
1393 ColIndices_view[update] = domainMap_local.getGlobalElement(i);
1394 }
1395 if (LocalGIDs_view[i]) {
1396 update++;
1397 }
1398 },
1399 NumLocalAgain);
1400
1401 TEUCHOS_TEST_FOR_EXCEPTION(
1402 static_cast<size_t>(NumLocalAgain) != NumLocalColGIDs,
1403 std::runtime_error, prefix << "Local ID count test failed.");
1404 }
1405
1406 // Make column Map
1407 const GST minus_one = Teuchos::OrdinalTraits<GST>::invalid();
1408
1409 colMap = rcp(new map_type(minus_one, ColIndices_view, domainMap.getIndexBase(),
1410 domainMap.getComm()));
1411
1412 // Fill out colind_LID using local map
1413 auto localColMap = colMap->getLocalMap();
1414 Kokkos::parallel_for(
1415 Kokkos::RangePolicy<execution_space>(0, colind_GID_view.size()), KOKKOS_LAMBDA(const int i) {
1416 colind_LID_view[i] = localColMap.getLocalElement(colind_GID_view[i]);
1417 });
1418}
1419
1420// Generates a list of owning PIDs based on two transfers (aka Import/Export objects)
1421// Let:
1422// OwningMap = useReverseModeForOwnership ? transferThatDefinesOwnership.getTargetMap() : transferThatDefinesOwnership.getSourceMap();
1423// MapAo = useReverseModeForOwnership ? transferThatDefinesOwnership.getSourceMap() : transferThatDefinesOwnership.getTargetMap();
1424// MapAm = useReverseModeForMigration ? transferThatDefinesMigration.getTargetMap() : transferThatDefinesMigration.getSourceMap();
1425// VectorMap = useReverseModeForMigration ? transferThatDefinesMigration.getSourceMap() : transferThatDefinesMigration.getTargetMap();
1426// Precondition:
1427// 1) MapAo.isSameAs(*MapAm) - map compatibility between transfers
1428// 2) VectorMap->isSameAs(*owningPIDs->getMap()) - map compatibility between transfer & vector
1429// 3) OwningMap->isOneToOne() - owning map is 1-to-1
1430// --- Precondition 3 is only checked in DEBUG mode ---
1431// Postcondition:
1432// owningPIDs[VectorMap->getLocalElement(GID i)] = j iff (OwningMap->isLocalElement(GID i) on rank j)
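// Illustrative example (hypothetical): suppose rank 0 owns GIDs {0, 1} and
// rank 1 owns GIDs {2, 3} in OwningMap, and the migration transfer places
// GIDs {1, 2} on this rank's VectorMap. After the call,
//   owningPIDs = [owner(1), owner(2)] = [0, 1]
// i.e. each entry records the rank that owns the corresponding GID in
// OwningMap, regardless of which rank the vector entry now lives on.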
1433template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1434void getTwoTransferOwnershipVector(const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& transferThatDefinesOwnership,
1435 bool useReverseModeForOwnership,
1436 const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& transferThatDefinesMigration,
1437 bool useReverseModeForMigration,
1438 Tpetra::Vector<int, LocalOrdinal, GlobalOrdinal, Node>& owningPIDs) {
1439 typedef Tpetra::Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
1440 typedef Tpetra::Export<LocalOrdinal, GlobalOrdinal, Node> export_type;
1441
1442 Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>> OwningMap = useReverseModeForOwnership ? transferThatDefinesOwnership.getTargetMap() : transferThatDefinesOwnership.getSourceMap();
1443 Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>> MapAo = useReverseModeForOwnership ? transferThatDefinesOwnership.getSourceMap() : transferThatDefinesOwnership.getTargetMap();
1444 Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>> MapAm = useReverseModeForMigration ? transferThatDefinesMigration.getTargetMap() : transferThatDefinesMigration.getSourceMap();
1445 Teuchos::RCP<const Tpetra::Map<LocalOrdinal, GlobalOrdinal, Node>> VectorMap = useReverseModeForMigration ? transferThatDefinesMigration.getSourceMap() : transferThatDefinesMigration.getTargetMap();
1446
1447 TEUCHOS_TEST_FOR_EXCEPTION(!MapAo->isSameAs(*MapAm), std::runtime_error, "Tpetra::Import_Util::getTwoTransferOwnershipVector map mismatch between transfers");
1448 TEUCHOS_TEST_FOR_EXCEPTION(!VectorMap->isSameAs(*owningPIDs.getMap()), std::runtime_error, "Tpetra::Import_Util::getTwoTransferOwnershipVector map mismatch transfer and vector");
1449#ifdef HAVE_TPETRA_DEBUG
1450 TEUCHOS_TEST_FOR_EXCEPTION(!OwningMap->isOneToOne(), std::runtime_error, "Tpetra::Import_Util::getTwoTransferOwnershipVector owner must be 1-to-1");
1451#endif
1452
1453 int rank = OwningMap->getComm()->getRank();
1454 // Generate "A" vector and fill it with owning information. We can read this from transferThatDefinesOwnership w/o communication
1455 // Note: Due to the 1-to-1 requirement, several of these options throw
1456 Tpetra::Vector<int, LocalOrdinal, GlobalOrdinal, Node> temp(MapAo);
1457 const import_type* ownAsImport = dynamic_cast<const import_type*>(&transferThatDefinesOwnership);
1458 const export_type* ownAsExport = dynamic_cast<const export_type*>(&transferThatDefinesOwnership);
1459
1460 Teuchos::ArrayRCP<int> pids = temp.getDataNonConst();
1461 Teuchos::ArrayView<int> v_pids = pids();
1462 if (ownAsImport && useReverseModeForOwnership) {
1463 TEUCHOS_TEST_FOR_EXCEPTION(1, std::runtime_error, "Tpetra::Import_Util::getTwoTransferOwnershipVector owner must be 1-to-1");
1464 } else if (ownAsImport && !useReverseModeForOwnership) {
1465 getPids(*ownAsImport, v_pids, false);
1466 } else if (ownAsExport && useReverseModeForOwnership) {
1467 TEUCHOS_TEST_FOR_EXCEPTION(1, std::runtime_error, "Tpetra::Import_Util::getTwoTransferOwnershipVector this option not yet implemented");
1468 } else {
1469 TEUCHOS_TEST_FOR_EXCEPTION(1, std::runtime_error, "Tpetra::Import_Util::getTwoTransferOwnershipVector owner must be 1-to-1");
1470 }
1471
1472 const import_type* xferAsImport = dynamic_cast<const import_type*>(&transferThatDefinesMigration);
1473 const export_type* xferAsExport = dynamic_cast<const export_type*>(&transferThatDefinesMigration);
1474 TEUCHOS_TEST_FOR_EXCEPTION(!xferAsImport && !xferAsExport, std::runtime_error, "Tpetra::Import_Util::getTwoTransferOwnershipVector transfer undefined");
1475
1476 // Migrate from "A" vector to output vector
1477 owningPIDs.putScalar(rank);
1478 if (!useReverseModeForMigration && xferAsImport)
1479 owningPIDs.doImport(temp, *xferAsImport, Tpetra::REPLACE);
1480 else if (useReverseModeForMigration && xferAsImport)
1481 owningPIDs.doExport(temp, *xferAsImport, Tpetra::REPLACE);
1482 else if (!useReverseModeForMigration && xferAsExport)
1483 owningPIDs.doExport(temp, *xferAsExport, Tpetra::REPLACE);
1484 else
1485 owningPIDs.doImport(temp, *xferAsExport, Tpetra::REPLACE);
1486}
1487
1488} // namespace Import_Util
1489} // namespace Tpetra
1490
1491#endif // TPETRA_IMPORT_UTIL2_HPP