Tpetra_Details_DistributorPlan.cpp
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#include "Tpetra_Details_DistributorPlan.hpp"
11
12#include "Tpetra_Details_Behavior.hpp"
13#include "Teuchos_StandardParameterEntryValidators.hpp"
14#include "Tpetra_Util.hpp"
15#include "Tpetra_Details_Profiling.hpp"
16#include <numeric>
17
18namespace Tpetra {
19namespace Details {
20
21std::string
22DistributorSendTypeEnumToString(EDistributorSendType sendType) {
23 if (sendType == DISTRIBUTOR_ISEND) {
24 return "Isend";
25 } else if (sendType == DISTRIBUTOR_SEND) {
26 return "Send";
27 } else if (sendType == DISTRIBUTOR_ALLTOALL) {
28 return "Alltoall";
29 }
30#if defined(HAVE_TPETRA_MPI)
31 else if (sendType == DISTRIBUTOR_IALLTOFEWV) {
32 return "Ialltofewv";
33 }
34#endif
35#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
36 else if (sendType == DISTRIBUTOR_MPIADVANCE_ALLTOALL) {
37 return "MpiAdvanceAlltoall";
38 } else if (sendType == DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV) {
39 return "MpiAdvanceNbralltoallv";
40 }
41#endif
42 else {
43 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
44 "Invalid "
45 "EDistributorSendType enum value "
46 << sendType << ".");
47 }
48}
49
50EDistributorSendType
51DistributorSendTypeStringToEnum(const std::string_view s) {
52 if (s == "Isend") return DISTRIBUTOR_ISEND;
53 if (s == "Send") return DISTRIBUTOR_SEND;
54 if (s == "Alltoall") return DISTRIBUTOR_ALLTOALL;
55#if defined(HAVE_TPETRA_MPI)
56 if (s == "Ialltofewv") return DISTRIBUTOR_IALLTOFEWV;
57#endif
58#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
59 if (s == "MpiAdvanceAlltoall") return DISTRIBUTOR_MPIADVANCE_ALLTOALL;
60 if (s == "MpiAdvanceNbralltoallv") return DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV;
61#endif
62 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid string to convert to EDistributorSendType enum value: " << s);
63}
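// Note: DistributorSendTypeStringToEnum inverts DistributorSendTypeEnumToString
// for every valid enum value; e.g., DistributorSendTypeStringToEnum("Alltoall")
// returns DISTRIBUTOR_ALLTOALL.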
64
66const std::string& validSendTypeOrThrow(const std::string& s) {
67 const auto valids = distributorSendTypes();
68 if (std::find(valids.begin(), valids.end(), s) == valids.end()) {
69 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid string for EDistributorSendType enum value: " << s);
70 }
71 return s;
72}
73
74std::string
75DistributorHowInitializedEnumToString(EDistributorHowInitialized how) {
76 switch (how) {
77 case Details::DISTRIBUTOR_NOT_INITIALIZED:
78 return "Not initialized yet";
79 case Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS:
80 return "By createFromSends";
81 case Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS:
82 return "By createFromRecvs";
83 case Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS_N_RECVS:
84 return "By createFromSendsAndRecvs";
85 case Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE:
86 return "By createReverseDistributor";
87 case Details::DISTRIBUTOR_INITIALIZED_BY_COPY:
88 return "By copy constructor";
89 default:
90 return "INVALID";
91 }
92}
93
94DistributorPlan::DistributorPlan(Teuchos::RCP<const Teuchos::Comm<int>> comm)
95 : comm_(comm)
96 ,
97#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
98 mpixComm_(Teuchos::null)
99 ,
100#endif
101 howInitialized_(DISTRIBUTOR_NOT_INITIALIZED)
102 , reversePlan_(Teuchos::null)
103 , sendType_(DistributorSendTypeStringToEnum(Behavior::defaultSendType()))
104 , sendMessageToSelf_(false)
105 , numSendsToOtherProcs_(0)
106 , maxSendLength_(0)
107 , numReceives_(0)
108 , totalReceiveLength_(0) {
109}
110
111DistributorPlan::DistributorPlan(const DistributorPlan& otherPlan)
112 : comm_(otherPlan.comm_)
113 ,
114#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
115 mpixComm_(otherPlan.mpixComm_)
116 ,
117#endif
118 howInitialized_(DISTRIBUTOR_INITIALIZED_BY_COPY)
119 , reversePlan_(otherPlan.reversePlan_)
120 , sendType_(otherPlan.sendType_)
121 , sendMessageToSelf_(otherPlan.sendMessageToSelf_)
122 , numSendsToOtherProcs_(otherPlan.numSendsToOtherProcs_)
123 , procIdsToSendTo_(otherPlan.procIdsToSendTo_)
124 , startsTo_(otherPlan.startsTo_)
125 , lengthsTo_(otherPlan.lengthsTo_)
126 , maxSendLength_(otherPlan.maxSendLength_)
127 , indicesTo_(otherPlan.indicesTo_)
128 , numReceives_(otherPlan.numReceives_)
129 , totalReceiveLength_(otherPlan.totalReceiveLength_)
130 , lengthsFrom_(otherPlan.lengthsFrom_)
131 , procsFrom_(otherPlan.procsFrom_)
132 , startsFrom_(otherPlan.startsFrom_)
133 , indicesFrom_(otherPlan.indicesFrom_)
134#if defined(HAVE_TPETRACORE_MPI)
135 , roots_(otherPlan.roots_)
136#endif
137{
138}
139
140size_t DistributorPlan::createFromSends(const Teuchos::ArrayView<const int>& exportProcIDs) {
141 using std::endl;
142 using Teuchos::outArg;
143 using Teuchos::REDUCE_MAX;
144 using Teuchos::reduceAll;
145 const char rawPrefix[] = "Tpetra::DistributorPlan::createFromSends";
146
147 const size_t numExports = exportProcIDs.size();
148 const int myProcID = comm_->getRank();
149 const int numProcs = comm_->getSize();
150 const bool debug = Details::Behavior::debug("Distributor");
151
152 // exportProcIDs tells us the communication pattern for this
153 // distributor. It dictates the way that the export data will be
154 // interpreted in doPosts(). We want to perform at most one
155 // send per process in doPosts; this is for two reasons:
156 // * minimize latency / overhead in the comm routines (nice)
157 // * match the number of receives and sends between processes
158 // (necessary)
159 //
160 // Teuchos::Comm requires that the data for a send are contiguous
161 // in a send buffer. Therefore, if the data in the send buffer
162 // for doPosts() are not contiguous, they will need to be copied
163 // into a contiguous buffer. The user has specified this
164 // noncontiguous pattern and we can't do anything about it.
165 // However, if they do not provide an efficient pattern, we will
166 // warn them if the following compile-time option has been
167 // set:
168 // * HAVE_TPETRA_PRINT_EFFICIENCY_WARNINGS
169 //
170 // If the data are contiguous, then we can post the sends in situ
171 // (i.e., without needing to copy them into a send buffer).
172 //
173 // Determine contiguity. There are a number of ways to do this:
174 // * If the export IDs are sorted, then all exports to a
175 // particular proc must be contiguous. This is what Epetra does.
176 // * If the export ID of the current export already has been
177 // listed, then the previous listing should correspond to the
178 // same export. This tests contiguity, but not sortedness.
179 //
180 // Both of these tests require O(n), where n is the number of
181 // exports. However, the latter will positively identify a greater
182 // portion of contiguous patterns. We use the latter method.
183 //
184 // Check to see if values are grouped by procs without gaps
185 // If so, indices_to -> 0.
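 // For example, exportProcIDs = {0, 0, 2, 2, 2} is grouped by process
 // (each destination appears in one contiguous run), so the sends can be
 // posted directly from the caller's buffer; exportProcIDs = {2, 0, 2} is
 // not, so a send buffer and indicesTo_ are required.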
186
187 if (debug) {
188 // Test whether any process in the communicator got an invalid
189 // process ID. If badID != -1 on this process, then it equals
190 // this process' rank. The max of all badID over all processes
191 // is the max rank which has an invalid process ID.
192 int badID = -1;
193 for (size_t i = 0; i < numExports; ++i) {
194 const int exportID = exportProcIDs[i];
195 if (exportID >= numProcs || exportID < 0) {
196 badID = myProcID;
197 break;
198 }
199 }
200 int gbl_badID;
201 reduceAll<int, int>(*comm_, REDUCE_MAX, badID, outArg(gbl_badID));
202 TEUCHOS_TEST_FOR_EXCEPTION(gbl_badID >= 0, std::runtime_error, rawPrefix << "Proc " << gbl_badID << ", perhaps among other processes, got a bad "
203 "send process ID.");
204 }
205
206 // Set up data structures for quick traversal of arrays.
207 // This contains the number of sends for each process ID.
208 //
209 // FIXME (mfh 20 Mar 2014) This is one of a few places in Tpetra
210 // that create an array of length the number of processes in the
211 // communicator (plus one). Given how this code uses this array,
212 // it should be straightforward to replace it with a hash table or
213 // some other more space-efficient data structure. In practice,
214 // most of the entries of starts should be zero for a sufficiently
215 // large process count, unless the communication pattern is dense.
216 // Note that it's important to be able to iterate through keys (i
217 // for which starts[i] is nonzero) in increasing order.
218 Teuchos::Array<size_t> starts(numProcs + 1, 0);
219
220 // numActive is the number of sends that are not Null
221 size_t numActive = 0;
222 int needSendBuff = 0; // Boolean
223
224 for (size_t i = 0; i < numExports; ++i) {
225 const int exportID = exportProcIDs[i];
226 if (exportID >= 0) {
227 // exportID is a valid process ID. Increment the number of
228 // messages this process will send to that process.
229 ++starts[exportID];
230
231 // If we're sending more than one message to process exportID,
232 // then it is possible that the data are not contiguous.
233 // Check by seeing if the previous process ID in the list
234 // (exportProcIDs[i-1]) is the same. It's safe to use i-1,
235 // because if starts[exportID] > 1, then i must be > 1 (since
236 // the starts array was filled with zeros initially).
237
238 // null entries break continuity.
239 // e.g., [ 0, 0, 0, 1, -99, 1, 2, 2, 2] is not contiguous
240 if (needSendBuff == 0 && starts[exportID] > 1 &&
241 exportID != exportProcIDs[i - 1]) {
242 needSendBuff = 1;
243 }
244 ++numActive;
245 }
246 }
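 // Example: with numProcs = 4 and exportProcIDs = {2, 0, 2}, the loop above
 // leaves starts = {1, 0, 2, 0, 0}, numActive = 3, and needSendBuff = 1,
 // because the second export to process 2 is not adjacent to the first.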
247
248#if defined(HAVE_TPETRA_PRINT_EFFICIENCY_WARNINGS)
249 {
250 int global_needSendBuff;
251 reduceAll<int, int>(*comm_, REDUCE_MAX, needSendBuff,
252 outArg(global_needSendBuff));
253 TPETRA_EFFICIENCY_WARNING(
254 global_needSendBuff != 0,
255 "::createFromSends: Grouping export IDs together by process rank often "
256 "improves performance.");
257 }
258#endif
259
260 // Determine from the caller's data whether or not the current
261 // process should send (a) message(s) to itself.
262 if (starts[myProcID] != 0) {
263 sendMessageToSelf_ = true;
264 } else {
265 sendMessageToSelf_ = false;
266 }
267
268 if (!needSendBuff) {
269 // grouped by proc, no send buffer or indicesTo_ needed
270 numSendsToOtherProcs_ = 0;
271 // Count total number of sends, i.e., total number of procs to
272 // which we are sending. This includes myself, if applicable.
273 for (int i = 0; i < numProcs; ++i) {
274 if (starts[i]) {
275 ++numSendsToOtherProcs_;
276 }
277 }
278
279 // Not only do we not need these, but we must clear them, as
280 // empty status of indicesTo is a flag used later.
281 indicesTo_.resize(0);
282 // Size these to numSendsToOtherProcs_; note, at the moment, numSendsToOtherProcs_
283 // includes self sends. Set their values to zeros.
284 procIdsToSendTo_.assign(numSendsToOtherProcs_, 0);
285 startsTo_.assign(numSendsToOtherProcs_, 0);
286 lengthsTo_.assign(numSendsToOtherProcs_, 0);
287
288 // set startsTo to the offset for each send (i.e., each proc ID)
289 // set procsTo to the proc ID for each send
290 // in interpreting this code, remember that we are assuming contiguity
291 // that is why index skips through the ranks
292 {
293 size_t procIndex = 0;
294 for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
295 while (exportProcIDs[procIndex] < 0) {
296 ++procIndex; // skip all negative proc IDs
297 }
298 startsTo_[i] = procIndex;
299 int procID = exportProcIDs[procIndex];
300 procIdsToSendTo_[i] = procID;
301 procIndex += starts[procID];
302 }
303 }
304 // sort the startsTo and proc IDs together, in ascending order, according
305 // to proc IDs
306 if (numSendsToOtherProcs_ > 0) {
307 sort2(procIdsToSendTo_.begin(), procIdsToSendTo_.end(), startsTo_.begin());
308 }
309 // compute the maximum send length
310 maxSendLength_ = 0;
311 for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
312 int procID = procIdsToSendTo_[i];
313 lengthsTo_[i] = starts[procID];
314 if ((procID != myProcID) && (lengthsTo_[i] > maxSendLength_)) {
315 maxSendLength_ = lengthsTo_[i];
316 }
317 }
318 } else {
319 // not grouped by proc, need send buffer and indicesTo_
320
321 // starts[i] is the number of sends to proc i
322 // numActive equals number of sends total, \sum_i starts[i]
323
324 // this loop starts at starts[1], so explicitly check starts[0]
325 if (starts[0] == 0) {
326 numSendsToOtherProcs_ = 0;
327 } else {
328 numSendsToOtherProcs_ = 1;
329 }
330 for (Teuchos::Array<size_t>::iterator i = starts.begin() + 1,
331 im1 = starts.begin();
332 i != starts.end(); ++i) {
333 if (*i != 0) ++numSendsToOtherProcs_;
334 *i += *im1;
335 im1 = i;
336 }
337 // starts[i] now contains the number of exports to procs 0 through i
338
339 for (Teuchos::Array<size_t>::reverse_iterator ip1 = starts.rbegin(),
340 i = starts.rbegin() + 1;
341 i != starts.rend(); ++i) {
342 *ip1 = *i;
343 ip1 = i;
344 }
345 starts[0] = 0;
346 // starts[i] now contains the number of exports to procs 0 through
347 // i-1, i.e., all procs before proc i
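 // Continuing the example exportProcIDs = {2, 0, 2} on 4 processes: the
 // forward pass turns starts = {1, 0, 2, 0, 0} into the inclusive prefix
 // sums {1, 1, 3, 3, 3}, and the backward shift plus starts[0] = 0 yields
 // the exclusive prefix sums {0, 1, 1, 3, 3}, i.e., each process's offset
 // into the send buffer.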
348
349 indicesTo_.resize(numActive);
350
351 for (size_t i = 0; i < numExports; ++i) {
352 if (exportProcIDs[i] >= 0) {
353 // record the offset to the sendBuffer for this export
354 indicesTo_[starts[exportProcIDs[i]]] = i;
355 // now increment the offset for this proc
356 ++starts[exportProcIDs[i]];
357 }
358 }
359 // our send buffer will contain the export data for each of the procs
360 // we communicate with, in order by proc id
361 // sendBuffer = {proc_0_data, proc_1_data, ..., proc_np-1_data}
362 // indicesTo now maps each export to the location in our send buffer
363 // associated with the export
364 // data for export i located at sendBuffer[indicesTo[i]]
365 //
366 // starts[i] once again contains the number of exports to
367 // procs 0 through i
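 // The loop above produced indicesTo_ = {1, 0, 2} in the running example:
 // the send buffer holds export 1 (bound for process 0) in slot 0 and
 // exports 0 and 2 (bound for process 2) in slots 1 and 2, while starts is
 // back to {1, 1, 3, 3, 3}.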
368 for (int proc = numProcs - 1; proc != 0; --proc) {
369 starts[proc] = starts[proc - 1];
370 }
371 starts.front() = 0;
372 starts[numProcs] = numActive;
373 //
374 // starts[proc] once again contains the number of exports to
375 // procs 0 through proc-1
376 // i.e., the start of my data in the sendBuffer
377
378 // this contains invalid data at procs we don't care about, that is okay
379 procIdsToSendTo_.resize(numSendsToOtherProcs_);
380 startsTo_.resize(numSendsToOtherProcs_);
381 lengthsTo_.resize(numSendsToOtherProcs_);
382
383 // for each group of sends/exports, record the destination proc,
384 // the length, and the offset for this send into the
385 // send buffer (startsTo_)
386 maxSendLength_ = 0;
387 size_t snd = 0;
388 for (int proc = 0; proc < numProcs; ++proc) {
389 if (starts[proc + 1] != starts[proc]) {
390 lengthsTo_[snd] = starts[proc + 1] - starts[proc];
391 startsTo_[snd] = starts[proc];
392 // record max length for all off-proc sends
393 if ((proc != myProcID) && (lengthsTo_[snd] > maxSendLength_)) {
394 maxSendLength_ = lengthsTo_[snd];
395 }
396 procIdsToSendTo_[snd] = proc;
397 ++snd;
398 }
399 }
400 }
401
402 if (sendMessageToSelf_) {
403 --numSendsToOtherProcs_;
404 }
405
406 // Invert map to see what msgs are received and what length
407 computeReceives();
408
409#if defined(HAVE_TPETRA_MPI)
410 maybeInitializeRoots();
411#endif
412
413 // createFromRecvs() calls createFromSends(), but will set
414 // howInitialized_ again after calling createFromSends().
415 howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS;
416
417 return totalReceiveLength_;
418}
419
420void DistributorPlan::createFromRecvs(const Teuchos::ArrayView<const int>& remoteProcIDs) {
421 *this = *getReversePlan();
422 howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS;
423}
424
425void DistributorPlan::createFromSendsAndRecvs(const Teuchos::ArrayView<const int>& exportProcIDs,
426 const Teuchos::ArrayView<const int>& remoteProcIDs) {
427 // Note that exportProcIDs and remoteProcIDs _must_ be lists that have
428 // an entry for each GID. If export/remoteProcIDs are taken from
429 // the getProcs{From|To} lists extracted from a previous distributor,
430 // this will generate a wrong answer, because those lists have a unique entry
431 // per process ID. A version of this that takes lengthsTo and lengthsFrom
432 // should be made.
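 // For example, if this process sends three GIDs to process 1 and one GID to
 // process 2, exportProcIDs must be {1, 1, 1, 2} (one entry per GID), not the
 // unique list {1, 2} that getProcsTo() would report.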
433
434 howInitialized_ = Tpetra::Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS_N_RECVS;
435
436 int myProcID = comm_->getRank();
437 int numProcs = comm_->getSize();
438
439 const size_t numExportIDs = exportProcIDs.size();
440 Teuchos::Array<size_t> starts(numProcs + 1, 0);
441
442 size_t numActive = 0;
443 int needSendBuff = 0; // Boolean
444
445 for (size_t i = 0; i < numExportIDs; i++) {
446 if (needSendBuff == 0 && i && (exportProcIDs[i] < exportProcIDs[i - 1]))
447 needSendBuff = 1;
448 if (exportProcIDs[i] >= 0) {
449 ++starts[exportProcIDs[i]];
450 ++numActive;
451 }
452 }
453
454 sendMessageToSelf_ = (starts[myProcID] != 0) ? 1 : 0;
455
456 numSendsToOtherProcs_ = 0;
457
458 if (needSendBuff) // not grouped by proc, need send buffer and indicesTo_
459 {
460 if (starts[0] == 0) {
461 numSendsToOtherProcs_ = 0;
462 } else {
463 numSendsToOtherProcs_ = 1;
464 }
465 for (Teuchos::Array<size_t>::iterator i = starts.begin() + 1,
466 im1 = starts.begin();
467 i != starts.end(); ++i) {
468 if (*i != 0) ++numSendsToOtherProcs_;
469 *i += *im1;
470 im1 = i;
471 }
472 // starts[i] now contains the number of exports to procs 0 through i
473
474 for (Teuchos::Array<size_t>::reverse_iterator ip1 = starts.rbegin(),
475 i = starts.rbegin() + 1;
476 i != starts.rend(); ++i) {
477 *ip1 = *i;
478 ip1 = i;
479 }
480 starts[0] = 0;
481 // starts[i] now contains the number of exports to procs 0 through
482 // i-1, i.e., all procs before proc i
483
484 indicesTo_.resize(numActive);
485
486 for (size_t i = 0; i < numExportIDs; ++i) {
487 if (exportProcIDs[i] >= 0) {
488 // record the offset to the sendBuffer for this export
489 indicesTo_[starts[exportProcIDs[i]]] = i;
490 // now increment the offset for this proc
491 ++starts[exportProcIDs[i]];
492 }
493 }
494 for (int proc = numProcs - 1; proc != 0; --proc) {
495 starts[proc] = starts[proc - 1];
496 }
497 starts.front() = 0;
498 starts[numProcs] = numActive;
499 procIdsToSendTo_.resize(numSendsToOtherProcs_);
500 startsTo_.resize(numSendsToOtherProcs_);
501 lengthsTo_.resize(numSendsToOtherProcs_);
502 maxSendLength_ = 0;
503 size_t snd = 0;
504 for (int proc = 0; proc < numProcs; ++proc) {
505 if (starts[proc + 1] != starts[proc]) {
506 lengthsTo_[snd] = starts[proc + 1] - starts[proc];
507 startsTo_[snd] = starts[proc];
508 // record max length for all off-proc sends
509 if ((proc != myProcID) && (lengthsTo_[snd] > maxSendLength_)) {
510 maxSendLength_ = lengthsTo_[snd];
511 }
512 procIdsToSendTo_[snd] = proc;
513 ++snd;
514 }
515 }
516 } else {
517 // grouped by proc, no send buffer or indicesTo_ needed
518 numSendsToOtherProcs_ = 0;
519 // Count total number of sends, i.e., total number of procs to
520 // which we are sending. This includes myself, if applicable.
521 for (int i = 0; i < numProcs; ++i) {
522 if (starts[i]) {
523 ++numSendsToOtherProcs_;
524 }
525 }
526
527 // Not only do we not need these, but we must clear them, as
528 // empty status of indicesTo is a flag used later.
529 indicesTo_.resize(0);
530 // Size these to numSendsToOtherProcs_; note, at the moment, numSendsToOtherProcs_
531 // includes self sends. Set their values to zeros.
532 procIdsToSendTo_.assign(numSendsToOtherProcs_, 0);
533 startsTo_.assign(numSendsToOtherProcs_, 0);
534 lengthsTo_.assign(numSendsToOtherProcs_, 0);
535
536 // set startsTo to the offset for each send (i.e., each proc ID)
537 // set procsTo to the proc ID for each send
538 // in interpreting this code, remember that we are assuming contiguity
539 // that is why index skips through the ranks
540 {
541 size_t procIndex = 0;
542 for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
543 while (exportProcIDs[procIndex] < 0) {
544 ++procIndex; // skip all negative proc IDs
545 }
546 startsTo_[i] = procIndex;
547 int procID = exportProcIDs[procIndex];
548 procIdsToSendTo_[i] = procID;
549 procIndex += starts[procID];
550 }
551 }
552 // sort the startsTo and proc IDs together, in ascending order, according
553 // to proc IDs
554 if (numSendsToOtherProcs_ > 0) {
555 sort2(procIdsToSendTo_.begin(), procIdsToSendTo_.end(), startsTo_.begin());
556 }
557 // compute the maximum send length
558 maxSendLength_ = 0;
559 for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
560 int procID = procIdsToSendTo_[i];
561 lengthsTo_[i] = starts[procID];
562 if ((procID != myProcID) && (lengthsTo_[i] > maxSendLength_)) {
563 maxSendLength_ = lengthsTo_[i];
564 }
565 }
566 }
567
568 numSendsToOtherProcs_ -= sendMessageToSelf_;
569 std::vector<int> recv_list;
570 recv_list.reserve(numSendsToOtherProcs_); // reserve an initial guess for size needed
571
572 int last_pid = -2;
573 for (int i = 0; i < remoteProcIDs.size(); i++) {
574 if (remoteProcIDs[i] > last_pid) {
575 recv_list.push_back(remoteProcIDs[i]);
576 last_pid = remoteProcIDs[i];
577 } else if (remoteProcIDs[i] < last_pid)
578 throw std::runtime_error("Tpetra::Distributor::createFromSendsAndRecvs expected RemotePIDs to be in sorted order");
579 }
580 numReceives_ = recv_list.size();
581 if (numReceives_) {
582 procsFrom_.assign(numReceives_, 0);
583 lengthsFrom_.assign(numReceives_, 0);
584 indicesFrom_.assign(numReceives_, 0);
585 startsFrom_.assign(numReceives_, 0);
586 }
587 for (size_t i = 0, j = 0; i < numReceives_; ++i) {
588 int jlast = j;
589 procsFrom_[i] = recv_list[i];
590 startsFrom_[i] = j;
591 for (; j < (size_t)remoteProcIDs.size() &&
592 remoteProcIDs[jlast] == remoteProcIDs[j];
593 j++) {
594 ;
595 }
596 lengthsFrom_[i] = j - jlast;
597 }
598 totalReceiveLength_ = remoteProcIDs.size();
599 indicesFrom_.clear();
600 numReceives_ -= sendMessageToSelf_;
601
602#if defined(HAVE_TPETRA_MPI)
603 maybeInitializeRoots();
604#endif
605}
606
607Teuchos::RCP<DistributorPlan> DistributorPlan::getReversePlan() const {
608 if (reversePlan_.is_null()) createReversePlan();
609 return reversePlan_;
610}
611
612void DistributorPlan::createReversePlan() const {
613 reversePlan_ = Teuchos::rcp(new DistributorPlan(comm_));
614 reversePlan_->howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE;
615 reversePlan_->sendType_ = sendType_;
616
617#if defined(HAVE_TPETRACORE_MPI)
618 // If the forward plan matches an all-to-few communication pattern,
619 // the reverse plan is few-to-all, so don't use a special all-to-few
620 // implementation for it
621 if (DISTRIBUTOR_IALLTOFEWV == sendType_) {
622 if (Behavior::verbose()) {
623 std::stringstream ss;
624 ss << __FILE__ << ":" << __LINE__ << " WARNING (Ialltofewv send type): using default for reversed Ialltofewv\n";
625 std::cerr << ss.str();
626 }
627
628 reversePlan_->sendType_ = DistributorSendTypeStringToEnum(Behavior::defaultSendType());
629 }
630#endif
631
632 // The total length of all the sends of this DistributorPlan. We
633 // calculate it because it's the total length of all the receives
634 // of the reverse DistributorPlan.
635 size_t totalSendLength =
636 std::accumulate(lengthsTo_.begin(), lengthsTo_.end(), 0);
637
638 // The maximum length of any of the receives of this DistributorPlan.
639 // We calculate it because it's the maximum length of any of the
640 // sends of the reverse DistributorPlan.
641 size_t maxReceiveLength = 0;
642 const int myProcID = comm_->getRank();
643 for (size_t i = 0; i < numReceives_; ++i) {
644 if (procsFrom_[i] != myProcID) {
645 // Don't count receives for messages sent by myself to myself.
646 if (lengthsFrom_[i] > maxReceiveLength) {
647 maxReceiveLength = lengthsFrom_[i];
648 }
649 }
650 }
651
652 reversePlan_->sendMessageToSelf_ = sendMessageToSelf_;
653 reversePlan_->numSendsToOtherProcs_ = numReceives_;
654 reversePlan_->procIdsToSendTo_ = procsFrom_;
655 reversePlan_->startsTo_ = startsFrom_;
656 reversePlan_->lengthsTo_ = lengthsFrom_;
657 reversePlan_->maxSendLength_ = maxReceiveLength;
658 reversePlan_->indicesTo_ = indicesFrom_;
659 reversePlan_->numReceives_ = numSendsToOtherProcs_;
660 reversePlan_->totalReceiveLength_ = totalSendLength;
661 reversePlan_->lengthsFrom_ = lengthsTo_;
662 reversePlan_->procsFrom_ = procIdsToSendTo_;
663 reversePlan_->startsFrom_ = startsTo_;
664 reversePlan_->indicesFrom_ = indicesTo_;
665
666#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
667 // is there a smarter way to do this
668 reversePlan_->initializeMpiAdvance();
669#endif
670
671#if defined(HAVE_TPETRA_MPI)
672 reversePlan_->maybeInitializeRoots();
673#endif
674}
675
676void DistributorPlan::computeReceives() {
677 using Teuchos::Array;
678 using Teuchos::ArrayRCP;
679 using Teuchos::as;
680 using Teuchos::CommRequest;
681 using Teuchos::CommStatus;
682 using Teuchos::ireceive;
683 using Teuchos::RCP;
684 using Teuchos::rcp;
685 using Teuchos::receive;
686 using Teuchos::reduce;
687 using Teuchos::REDUCE_SUM;
688 using Teuchos::scatter;
689 using Teuchos::send;
690 using Teuchos::waitAll;
691
692 const int myRank = comm_->getRank();
693 const int numProcs = comm_->getSize();
694
695 const int mpiTag = DEFAULT_MPI_TAG;
696
697 // toProcsFromMe[i] == the number of messages sent by this process
698 // to process i. The data in numSendsToOtherProcs_, procIdsToSendTo_, and lengthsTo_
699 // concern the contiguous sends. Therefore, each process will be
700 // listed in procIdsToSendTo_ at most once, and so toProcsFromMe[i] will
701 // either be 0 or 1.
702 {
703 Array<int> toProcsFromMe(numProcs, 0);
704#ifdef HAVE_TPETRA_DEBUG
705 bool counting_error = false;
706#endif // HAVE_TPETRA_DEBUG
707 for (size_t i = 0; i < (numSendsToOtherProcs_ + (sendMessageToSelf_ ? 1 : 0)); ++i) {
708#ifdef HAVE_TPETRA_DEBUG
709 if (toProcsFromMe[procIdsToSendTo_[i]] != 0) {
710 counting_error = true;
711 }
712#endif // HAVE_TPETRA_DEBUG
713 toProcsFromMe[procIdsToSendTo_[i]] = 1;
714 }
715#ifdef HAVE_TPETRA_DEBUG
716 // Note that SHARED_TEST_FOR_EXCEPTION does a global reduction
717 SHARED_TEST_FOR_EXCEPTION(counting_error, std::logic_error,
718 "Tpetra::Distributor::computeReceives: There was an error on at least "
719 "one process in counting the number of messages send by that process to "
720 "the other processs. Please report this bug to the Tpetra developers.",
721 *comm_);
722#endif // HAVE_TPETRA_DEBUG
723
724 // Compute the number of receives that this process needs to
725 // post. The number of receives includes any self sends (i.e.,
726 // messages sent by this process to itself).
727 //
728 // (We will use numReceives_ below to post exactly that
729 // number of receives, with MPI_ANY_SOURCE as the sending rank.
730 // This will tell us from which processes this process expects
731 // to receive, and how many packets of data we expect to receive
732 // from each process.)
733 //
734 // toProcsFromMe[i] is the number of messages sent by this
735 // process to process i. Compute the sum (elementwise) of all
736 // the toProcsFromMe arrays on all processes in the
737 // communicator. If the array x is that sum, then if this
738 // process has rank j, x[j] is the number of messages sent
739 // to process j, that is, the number of receives on process j
740 // (including any messages sent by process j to itself).
741 //
742 // Yes, this requires storing and operating on an array of
743 // length P, where P is the number of processes in the
744 // communicator. Epetra does this too. Avoiding this O(P)
745 // memory bottleneck would require some research.
746 //
747 // mfh 09 Jan 2012, 15 Jul 2015: There are three ways to
748 // implement this O(P) memory algorithm.
749 //
750 // 1. Use MPI_Reduce and MPI_Scatter: reduce on the root
751 // process (0) from toProcsFromMe, to numRecvsOnEachProc.
752 // Then, scatter the latter, so that each process p gets
753 // numRecvsOnEachProc[p].
754 //
755 // 2. Like #1, but use MPI_Reduce_scatter instead of
756 // MPI_Reduce and MPI_Scatter. MPI_Reduce_scatter might be
757 // optimized to reduce the number of messages, but
758 // MPI_Reduce_scatter is more general than we need (it
759 // allows the equivalent of MPI_Scatterv). See Bug 6336.
760 //
761 // 3. Do an all-reduce on toProcsFromMe, and let my process
762 // (with rank myRank) get numReceives_ from
763 // toProcsFromMe[myRank]. The HPCCG miniapp uses the
764 // all-reduce method.
765 //
766 // Approaches 1 and 3 have the same critical path length.
767 // However, #3 moves more data. This is because the final
768 // result is just one integer, but #3 moves a whole array of
769 // results to all the processes. This is why we use Approach 1
770 // here.
771 //
772 // mfh 12 Apr 2013: See discussion in createFromSends() about
773 // how we could use this communication to propagate an error
774 // flag for "free" in a release build.
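 // Small example of approach 1 on three processes: if rank 0 sends to ranks
 // {1, 2}, rank 1 sends to {0}, and rank 2 sends to {1}, the toProcsFromMe
 // arrays are {0,1,1}, {1,0,0}, and {0,1,0}. Their elementwise sum on the
 // root is {1, 2, 1}, and the scatter delivers numReceives_ = 1, 2, and 1 to
 // ranks 0, 1, and 2, respectively.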
775
776 const int root = 0; // rank of root process of the reduction
777 Array<int> numRecvsOnEachProc; // temp; only needed on root
778 if (myRank == root) {
779 numRecvsOnEachProc.resize(numProcs);
780 }
781 int numReceivesAsInt = 0; // output
782 reduce<int, int>(toProcsFromMe.getRawPtr(),
783 numRecvsOnEachProc.getRawPtr(),
784 numProcs, REDUCE_SUM, root, *comm_);
785 scatter<int, int>(numRecvsOnEachProc.getRawPtr(), 1,
786 &numReceivesAsInt, 1, root, *comm_);
787 numReceives_ = static_cast<size_t>(numReceivesAsInt);
788 }
789
790 // Now we know numReceives_, which is this process' number of
791 // receives. Allocate the lengthsFrom_ and procsFrom_ arrays
792 // with this number of entries.
793 lengthsFrom_.assign(numReceives_, 0);
794 procsFrom_.assign(numReceives_, 0);
795
796 //
797 // Ask (via nonblocking receive) each process from which we are
798 // receiving how many packets we should expect from it in the
799 // communication pattern.
800 //
801
802 // At this point, numReceives_ includes any self message that
803 // there may be. At the end of this routine, we'll subtract off
804 // the self message (if there is one) from numReceives_. In this
805 // routine, we don't need to receive a message from ourselves in
806 // order to figure out our lengthsFrom_ and source process ID; we
807 // can just ask ourselves directly. Thus, the actual number of
808 // nonblocking receives we post here does not include the self
809 // message.
810 const size_t actualNumReceives = numReceives_ - (sendMessageToSelf_ ? 1 : 0);
811
812 // Teuchos' wrapper for nonblocking receives requires receive
813 // buffers that it knows won't go away. This is why we use RCPs,
814 // one RCP per nonblocking receive request. They get allocated in
815 // the loop below.
816 Array<RCP<CommRequest<int>>> requests(actualNumReceives);
817 Array<ArrayRCP<size_t>> lengthsFromBuffers(actualNumReceives);
818 Array<RCP<CommStatus<int>>> statuses(actualNumReceives);
819
820 // Teuchos::Comm treats a negative process ID as MPI_ANY_SOURCE
821 // (receive data from any process).
822#ifdef HAVE_MPI
823 const int anySourceProc = MPI_ANY_SOURCE;
824#else
825 const int anySourceProc = -1;
826#endif
827
828 // Post the (nonblocking) receives.
829 for (size_t i = 0; i < actualNumReceives; ++i) {
830 // Once the receive completes, we can ask the corresponding
831 // CommStatus object (output by wait()) for the sending process'
832 // ID (which we'll assign to procsFrom_[i] -- don't forget to
833 // do that!).
834 lengthsFromBuffers[i].resize(1);
835 lengthsFromBuffers[i][0] = as<size_t>(0);
836 requests[i] = ireceive<int, size_t>(lengthsFromBuffers[i], anySourceProc,
837 mpiTag, *comm_);
838 }
839
840 // Post the sends: Tell each process to which we are sending how
841 // many packets it should expect from us in the communication
842 // pattern. We could use nonblocking sends here, as long as we do
843 // a waitAll() on all the sends and receives at once.
844 //
845 // We assume that numSendsToOtherProcs_ and sendMessageToSelf_ have already been
846 // set. The value of numSendsToOtherProcs_ (my process' number of sends) does
847 // not include any message that it might send to itself.
848 for (size_t i = 0; i < numSendsToOtherProcs_ + (sendMessageToSelf_ ? 1 : 0); ++i) {
849 if (procIdsToSendTo_[i] != myRank) {
850 // Send a message to procIdsToSendTo_[i], telling that process that
851 // this communication pattern will send that process
852 // lengthsTo_[i] blocks of packets.
853 const size_t* const lengthsTo_i = &lengthsTo_[i];
854 send<int, size_t>(lengthsTo_i, 1, as<int>(procIdsToSendTo_[i]), mpiTag, *comm_);
855 } else {
856 // We don't need a send in the self-message case. If this
857 // process will send a message to itself in the communication
858 // pattern, then the last element of lengthsFrom_ and
859 // procsFrom_ corresponds to the self-message. Of course
860 // this process knows how long the message is, and the process
861 // ID is its own process ID.
862 lengthsFrom_[numReceives_ - 1] = lengthsTo_[i];
863 procsFrom_[numReceives_ - 1] = myRank;
864 }
865 }
866
867 //
868 // Wait on all the receives. When they arrive, check the status
869 // output of wait() for the receiving process ID, unpack the
870 // request buffers into lengthsFrom_, and set procsFrom_ from the
871 // status.
872 //
873 waitAll(*comm_, requests(), statuses());
874 for (size_t i = 0; i < actualNumReceives; ++i) {
875 lengthsFrom_[i] = *lengthsFromBuffers[i];
876 procsFrom_[i] = statuses[i]->getSourceRank();
877 }
878
879 // Sort the procsFrom_ array, and apply the same permutation to
880 // lengthsFrom_. This ensures that procsFrom_[i] and
881 // lengthsFrom_[i] refer to the same thing.
882 sort2(procsFrom_.begin(), procsFrom_.end(), lengthsFrom_.begin());
883
884 // Compute indicesFrom_
885 totalReceiveLength_ =
886 std::accumulate(lengthsFrom_.begin(), lengthsFrom_.end(), 0);
887 indicesFrom_.clear();
888
889 startsFrom_.clear();
890 startsFrom_.reserve(numReceives_);
891 for (size_t i = 0, j = 0; i < numReceives_; ++i) {
892 startsFrom_.push_back(j);
893 j += lengthsFrom_[i];
894 }
895
896 if (sendMessageToSelf_) {
897 --numReceives_;
898 }
899}
900
901void DistributorPlan::setParameterList(const Teuchos::RCP<Teuchos::ParameterList>& plist) {
902 using std::endl;
903 using Teuchos::FancyOStream;
904 using Teuchos::getIntegralValue;
905 using Teuchos::ParameterList;
906 using Teuchos::parameterList;
907 using Teuchos::RCP;
908
909 if (!plist.is_null()) {
910 RCP<const ParameterList> validParams = getValidParameters();
911 plist->validateParametersAndSetDefaults(*validParams);
912
913 const Details::EDistributorSendType sendType =
914 getIntegralValue<Details::EDistributorSendType>(*plist, "Send type");
915
916 // Now that we've validated the input list, save the results.
917 sendType_ = sendType;
918
919#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
920 initializeMpiAdvance();
921#endif
922
923 // ParameterListAcceptor semantics require pointer identity of the
924 // sublist passed to setParameterList(), so we save the pointer.
925 this->setMyParamList(plist);
926
927#if defined(HAVE_TPETRA_MPI)
928 maybeInitializeRoots();
929#endif
930 }
931}
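// Usage sketch (not taken from Tpetra itself; "plan" stands for some
// DistributorPlan instance): the send type can be selected at run time
// through the validated "Send type" parameter, e.g.
//
//   Teuchos::RCP<Teuchos::ParameterList> plist = Teuchos::parameterList();
//   plist->set("Send type", std::string("Alltoall"));
//   plan.setParameterList(plist);
//
// Strings not listed by distributorSendTypes() are rejected by the validator
// installed in getValidParameters() below.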
932
933Teuchos::Array<std::string> distributorSendTypes() {
934 Teuchos::Array<std::string> sendTypes;
935 sendTypes.push_back("Isend");
936 sendTypes.push_back("Send");
937 sendTypes.push_back("Alltoall");
938#if defined(HAVE_TPETRA_MPI)
939 sendTypes.push_back("Ialltofewv");
940#endif
941#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
942 sendTypes.push_back("MpiAdvanceAlltoall");
943 sendTypes.push_back("MpiAdvanceNbralltoallv");
944#endif
945 return sendTypes;
946}
947
948Teuchos::Array<EDistributorSendType> distributorSendTypeEnums() {
949 Teuchos::Array<EDistributorSendType> res;
950 res.push_back(DISTRIBUTOR_ISEND);
951 res.push_back(DISTRIBUTOR_SEND);
952 res.push_back(DISTRIBUTOR_ALLTOALL);
953#if defined(HAVE_TPETRA_MPI)
954 res.push_back(DISTRIBUTOR_IALLTOFEWV);
955#endif
956#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
957 res.push_back(DISTRIBUTOR_MPIADVANCE_ALLTOALL);
958 res.push_back(DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV);
959#endif
960 return res;
961}
962
963Teuchos::RCP<const Teuchos::ParameterList>
964DistributorPlan::getValidParameters() const {
965 using Teuchos::Array;
966 using Teuchos::ParameterList;
967 using Teuchos::parameterList;
968 using Teuchos::RCP;
969 using Teuchos::setStringToIntegralParameter;
970
971 Array<std::string> sendTypes = distributorSendTypes();
972 const Array<Details::EDistributorSendType> sendTypeEnums = distributorSendTypeEnums();
973
974 const std::string validatedSendType = validSendTypeOrThrow(Behavior::defaultSendType());
975
976 RCP<ParameterList> plist = parameterList("Tpetra::Distributor");
977
980 "When using MPI, the variant of send to use in "
981 "do[Reverse]Posts()",
982 sendTypes(), sendTypeEnums(), plist.getRawPtr());
983 plist->set("Timer Label", "", "Label for Time Monitor output");
984
985 return Teuchos::rcp_const_cast<const ParameterList>(plist);
986}
987
988#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
989
990// Used by Teuchos::RCP to clean up an owned MPIX_Comm*
991struct MpixCommDeallocator {
992 void free(MPIX_Comm** comm) const {
993 MPIX_Comm_free(comm);
994 }
995};
996
997void DistributorPlan::initializeMpiAdvance() {
998 // The MPIX communicator must still be null here; if it is not, we need to figure out why it was already initialized.
999 TEUCHOS_ASSERT(mpixComm_.is_null());
1000
1001 // use the members to initialize the graph for neighborhood mode, or just the MPIX communicator for non-neighborhood mode
1002 Teuchos::RCP<const Teuchos::MpiComm<int>> mpiComm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int>>(comm_);
1003 Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm>> rawComm = mpiComm->getRawMpiComm();
1004 int err = 0;
1005 if (sendType_ == DISTRIBUTOR_MPIADVANCE_ALLTOALL ||
1006 sendType_ == DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV) {
1007 MPIX_Comm** mpixComm = new (MPIX_Comm*);
1008 err = MPIX_Comm_init(mpixComm, (*rawComm)());
1009 mpixComm_ = Teuchos::RCP(mpixComm,
1010 MpixCommDeallocator(),
1011 true /*take ownership*/
1012 );
1013 }
1014
1015 TEUCHOS_ASSERT(err == 0);
1016}
1017#endif
1018
1019#if defined(HAVE_TPETRA_MPI)
1020// FIXME: probably need to rename this function since it might change the sendType
1021void DistributorPlan::maybeInitializeRoots() {
1022 // Only IALLTOFEWV needs to know the roots
1023 if (DISTRIBUTOR_IALLTOFEWV != sendType_) {
1024 roots_.clear();
1025 return;
1026 }
1027
1028 ProfilingRegion region_maybeInitializeRoots("Tpetra::DistributorPlan::maybeInitializeRoots");
1029
1030 // send my number of recvs to everyone
1031 const int numRecvs = (int)(getNumReceives() + (hasSelfMessage() ? 1 : 0));
1032 std::vector<int> sendbuf(comm_->getSize(), numRecvs);
1033 std::vector<int> recvbuf(comm_->getSize());
1034
1035 // FIXME: is there a more natural way to do this?
1036 // Maybe MPI_Allreduce is better, we just care if anyone is sending anything to each process
1037 // we just need to know all processes that receive anything (including a self message)
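 // (Note: since every rank puts the same value in every slot of sendbuf, the
 // MPI_Alltoall below is equivalent to an MPI_Allgather of one int per rank;
 // that is one possible simplification in the spirit of the FIXME above, not
 // what the code currently does.)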
1038 Teuchos::RCP<const Teuchos::MpiComm<int>> mpiComm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int>>(comm_);
1039 Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm>> rawComm = mpiComm->getRawMpiComm();
1040 MPI_Comm comm = (*rawComm)();
1041 MPI_Alltoall(sendbuf.data(), 1, MPI_INT, recvbuf.data(), 1, MPI_INT, comm);
1042
1043 roots_.clear();
1044 for (size_t root = 0; root < recvbuf.size(); ++root) {
1045 if (recvbuf[root] > 0) {
1046 roots_.push_back(root);
1047 }
1048 }
1049
1050 // In "slow-path" communication, the data is not blocked according to sending / receiving proc.
1051 // The root-detection algorithm expects data to be blocked, so disable.
1052 int slow = !getIndicesTo().is_null() ? 1 : 0;
1053 MPI_Allreduce(MPI_IN_PLACE, &slow, 1, MPI_INT, MPI_LOR, comm);
1054 if (slow) {
1055 if (Behavior::verbose()) {
1056 {
1057 std::stringstream ss;
1058 ss << __FILE__ << ":" << __LINE__ << " " << comm_->getRank() << ": WARNING: Ialltofewv send mode set, at least one rank's data is not grouped by rank. Setting to \"Send\"" << std::endl;
1059 std::cerr << ss.str();
1060 }
1061 }
1062
1063 roots_.clear();
1064 sendType_ = DISTRIBUTOR_SEND;
1065 }
1066
1067 // Ialltofewv is intended for gather-like patterns with few roots. If there are
1068 // many roots, the pattern is closer to a scatter, which Ialltofewv does not handle well,
1069 // so fall back to SEND if the number of roots is at least the square root of the communicator size.
1070 if (roots_.size() * roots_.size() >= size_t(comm_->getSize())) {
1071 if (Behavior::verbose()) {
1072 std::stringstream ss;
1073 ss << __FILE__ << ":" << __LINE__ << " " << comm_->getRank() << ": WARNING (Ialltofewv send type): too many roots (" << roots_.size() << ") for " << comm_->getSize() << " ranks. Setting send-type to \"Send\"" << std::endl;
1074 std::cerr << ss.str();
1075 }
1076 roots_.clear();
1077 sendType_ = DISTRIBUTOR_SEND;
1078 }
1079}
1080#endif // HAVE_TPETRA_MPI
1081
1082DistributorPlan::SubViewLimits DistributorPlan::getImportViewLimits(size_t numPackets) const {
1083 const size_t actualNumReceives = getNumReceives() + (hasSelfMessage() ? 1 : 0);
1084
1085 IndexView importStarts(actualNumReceives);
1086 IndexView importLengths(actualNumReceives);
1087
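 // For example, with getLengthsFrom() = {3, 2} and numPackets = 2, the loop
 // below produces importStarts = {0, 6} and importLengths = {6, 4}.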
1088 size_t offset = 0;
1089 for (size_t i = 0; i < actualNumReceives; ++i) {
1090 importStarts[i] = offset;
1091 offset += getLengthsFrom()[i] * numPackets;
1092 importLengths[i] = getLengthsFrom()[i] * numPackets;
1093 }
1094 return std::make_pair(importStarts, importLengths);
1095}
1096
1097DistributorPlan::SubViewLimits DistributorPlan::getImportViewLimits(const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID) const {
1098 const size_t actualNumReceives = getNumReceives() + (hasSelfMessage() ? 1 : 0);
1099
1100 IndexView importStarts(actualNumReceives);
1101 IndexView importLengths(actualNumReceives);
1102
1103 size_t offset = 0;
1104 size_t curLIDoffset = 0;
1105 for (size_t i = 0; i < actualNumReceives; ++i) {
1106 size_t totalPacketsFrom_i = 0;
1107 for (size_t j = 0; j < getLengthsFrom()[i]; ++j) {
1108 totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset + j];
1109 }
1110 curLIDoffset += getLengthsFrom()[i];
1111 importStarts[i] = offset;
1112 offset += totalPacketsFrom_i;
1113 importLengths[i] = totalPacketsFrom_i;
1114 }
1115 return std::make_pair(importStarts, importLengths);
1116}
1117
1118DistributorPlan::SubViewLimits DistributorPlan::getExportViewLimits(size_t numPackets) const {
1119 if (getIndicesTo().is_null()) {
1120 const size_t actualNumSends = getNumSends() + (hasSelfMessage() ? 1 : 0);
1121 IndexView exportStarts(actualNumSends);
1122 IndexView exportLengths(actualNumSends);
1123 for (size_t pp = 0; pp < actualNumSends; ++pp) {
1124 exportStarts[pp] = getStartsTo()[pp] * numPackets;
1125 exportLengths[pp] = getLengthsTo()[pp] * numPackets;
1126 }
1127 return std::make_pair(exportStarts, exportLengths);
1128 } else {
1129 const size_t numIndices = getIndicesTo().size();
1130 IndexView exportStarts(numIndices);
1131 IndexView exportLengths(numIndices);
1132 for (size_t j = 0; j < numIndices; ++j) {
1133 exportStarts[j] = getIndicesTo()[j] * numPackets;
1134 exportLengths[j] = numPackets;
1135 }
1136 return std::make_pair(exportStarts, exportLengths);
1137 }
1138}
1139
1140DistributorPlan::SubViewLimits DistributorPlan::getExportViewLimits(const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID) const {
1141 if (getIndicesTo().is_null()) {
1142 const size_t actualNumSends = getNumSends() + (hasSelfMessage() ? 1 : 0);
1143 IndexView exportStarts(actualNumSends);
1144 IndexView exportLengths(actualNumSends);
1145 size_t offset = 0;
1146 for (size_t pp = 0; pp < actualNumSends; ++pp) {
1147 size_t numPackets = 0;
1148 for (size_t j = getStartsTo()[pp];
1149 j < getStartsTo()[pp] + getLengthsTo()[pp]; ++j) {
1150 numPackets += numExportPacketsPerLID[j];
1151 }
1152 exportStarts[pp] = offset;
1153 offset += numPackets;
1154 exportLengths[pp] = numPackets;
1155 }
1156 return std::make_pair(exportStarts, exportLengths);
1157 } else {
1158 const size_t numIndices = getIndicesTo().size();
1159 IndexView exportStarts(numIndices);
1160 IndexView exportLengths(numIndices);
1161 size_t offset = 0;
1162 for (size_t j = 0; j < numIndices; ++j) {
1163 exportStarts[j] = offset;
1164 offset += numExportPacketsPerLID[j];
1165 exportLengths[j] = numExportPacketsPerLID[j];
1166 }
1167 return std::make_pair(exportStarts, exportLengths);
1168 }
1169}
1170
1171} // namespace Details
1172} // namespace Tpetra