Zoltan2
Loading...
Searching...
No Matches
Zoltan2_AlgMultiJagged.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Zoltan2: A package of combinatorial algorithms for scientific computing
4//
5// Copyright 2012 NTESS and the Zoltan2 contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
14#ifndef _ZOLTAN2_ALGMultiJagged_HPP_
15#define _ZOLTAN2_ALGMultiJagged_HPP_
16
20#include <Zoltan2_Algorithm.hpp>
23#include <Zoltan2_Util.hpp>
24#include <Tpetra_Distributor.hpp>
25#include <Teuchos_StandardParameterEntryValidators.hpp>
26#include <Teuchos_ParameterList.hpp>
27#include <Kokkos_Sort.hpp>
28
29#include <algorithm> // std::sort
30#include <vector>
31#include <unordered_map>
32
33#ifdef ZOLTAN2_USEZOLTANCOMM
34#ifdef HAVE_ZOLTAN2_MPI
35#define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
36#include "zoltan_comm_cpp.h"
37#include "zoltan_types.h" // for error codes
38#endif
39#endif
40
41namespace Teuchos{
42
46template <typename Ordinal, typename T>
47class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
48{
49private:
50 Ordinal size;
51 T epsilon;
52
53public:
57 epsilon(std::numeric_limits<T>::epsilon()) {}
58
63 size(s_), epsilon(std::numeric_limits<T>::epsilon()) {}
64
70 void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const {
71 for(Ordinal i = 0; i < count; i++) {
72 if(Z2_ABS(inBuffer[i]) > epsilon) {
73 inoutBuffer[i] = inBuffer[i];
74 }
75 }
76 }
77};
78
79} // namespace Teuchos
80
81namespace Zoltan2{
82
89template <typename IT, typename CT, typename WT>
91{
92public:
93 // TODO: Why volatile?
94 // no idea, another intel compiler failure.
95 volatile IT index;
96 volatile CT count;
97 volatile WT *val;
98 volatile WT epsilon;
99
101 this->index = 0;
102 this->count = 0;
103 this->val = NULL;
104 this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
105 }
106
107 // TODO: Document these methods?
108 uMultiSortItem(IT index_ ,CT count_, WT *vals_) {
109 this->index = index_;
110 this->count = count_;
111 this->val = vals_;
112 this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
113 }
114
117
118 void set(IT index_ ,CT count_, WT *vals_) {
119 this->index = index_;
120 this->count = count_;
121 this->val = vals_;
122 }
123
124 bool operator<(const uMultiSortItem<IT,CT,WT>& other) const {
125 assert(this->count == other.count);
126 for(CT i = 0; i < this->count; ++i) {
127 // if the values are equal go to next one.
128 if(std::abs(this->val[i] - other.val[i]) < this->epsilon) {
129 continue;
130 }
131 // if next value is smaller return true;
132 if(this->val[i] < other.val[i]) {
133 return true;
134 }
135 // if next value is bigger return false;
136 else {
137 return false;
138 }
139 }
140 // if they are totally equal.
141 return this->index < other.index;
142 }
143};
144
/// Sort item: an identifier together with the value used as the sort key.
template <class IT, class WT>
struct uSortItem
{
  IT id;
  WT val;
};

/// Sorts arr[0..n-1] in ascending order of val.
/// Non-recursive quicksort with a median-of-three pivot and an explicit
/// stack of pending sub-ranges; ranges shorter than M entries are finished
/// with insertion sort. The larger sub-range is pushed and the smaller one
/// is processed first. If more than NSTACK stack slots would be needed, a
/// message is printed and std::terminate() is called.
/// \param n number of items in arr.
/// \param arr items to sort in place.
template <class IT, class WT>
void uqsort(IT n, uSortItem<IT, WT> * arr) {
  const IT NSTACK = 50;
  const IT M = 7;
  IT i, ir = n, j, k, l = 1;
  // The stack is indexed 1..NSTACK (slot 0 is never used), so it needs
  // NSTACK + 1 entries; with only NSTACK entries the last push wrote one
  // element past the end.
  IT jstack = 0, istack[NSTACK + 1];
  WT aval;
  uSortItem<IT, WT> a;

  // All indexing below is 1-based.
  --arr;
  for(;;) {
    // Written as ir < l + M (not ir - l < M) so that an empty range does
    // not wrap around when IT is unsigned.
    if(ir < l + M) {
      // Insertion sort of the short range [l, ir].
      for(j = l + 1; j <= ir; j++) {
        a = arr[j];
        aval = a.val;
        for(i = j - 1; i >= 1; i--) {
          if(arr[i].val <= aval)
            break;
          arr[i + 1] = arr[i];
        }
        arr[i + 1] = a;
      }
      if(jstack == 0)
        break;
      // Pop the next pending range.
      ir = istack[jstack--];
      l = istack[jstack--];
    }
    else {
      // Median of arr[l], arr[middle], arr[ir] becomes the pivot in arr[l];
      // arr[l+1] and arr[ir] act as sentinels for the scans below.
      k = (l + ir) >> 1;
      std::swap(arr[k], arr[l + 1]);
      if(arr[l + 1].val > arr[ir].val) {
        std::swap(arr[l + 1], arr[ir]);
      }
      if(arr[l].val > arr[ir].val) {
        std::swap(arr[l], arr[ir]);
      }
      if(arr[l + 1].val > arr[l].val) {
        std::swap(arr[l + 1], arr[l]);
      }
      i = l + 1;
      j = ir;
      a = arr[l];
      aval = a.val;
      for(;;) {
        do i++; while (arr[i].val < aval);
        do j--; while (arr[j].val > aval);
        if(j < i) break;
        std::swap(arr[i], arr[j]);
      }
      // Put the pivot in its final position.
      arr[l] = arr[j];
      arr[j] = a;
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      // Push the larger sub-range, continue with the smaller one.
      if(ir - i + 1 >= j - l) {
        istack[jstack] = ir;
        istack[jstack - 1] = i;
        ir = j - 1;
      }
      else {
        istack[jstack] = j - 1;
        istack[jstack - 1] = l;
        l = i;
      }
    }
  }
}
227
228template <class IT, class WT, class SIGN>
230{
231 IT id;
232 WT val;
233 SIGN signbit; // 1 means positive, 0 means negative.
235 /*if I am negative, the other is positive*/
236 if(this->signbit < rhs.signbit) {
237 return true;
238 }
239 /*if both has the same sign*/
240 else if(this->signbit == rhs.signbit) {
241 if(this->val < rhs.val) {//if my value is smaller,
242 return this->signbit;//then if we both are positive return true.
243 //if we both are negative, return false.
244 }
245 else if(this->val > rhs.val) {//if my value is larger,
246 return !this->signbit; //then if we both are positive return false.
247 //if we both are negative, return true.
248 }
249 else { //if both are equal.
250 return false;
251 }
252 }
253 else {
254 /*if I am positive, the other is negative*/
255 return false;
256 }
257 }
258
260 return (this->val == rhs.val && this->signbit == rhs.signbit) || (*this < rhs);
261 }
262};
263
267template <class IT, class WT, class SIGN>
269 const IT NSTACK = 50;
270 IT M = 7;
271 IT i, ir=n, j, k, l=1;
272 IT jstack=0, istack[NSTACK];
274
275 --arr;
276 for(;;) {
277 if(ir < M + l) {
278 for(j=l+1;j<=ir;j++) {
279 a=arr[j];
280 for(i=j-1;i>=1;i--) {
281 if(arr[i] <= a) {
282 break;
283 }
284 arr[i+1] = arr[i];
285 }
286 arr[i+1]=a;
287 }
288 if(jstack == 0) {
289 break;
290 }
291 ir=istack[jstack--];
292 l=istack[jstack--];
293 }
294 else {
295 k=(l+ir) >> 1;
296 std::swap(arr[k],arr[l+1]);
297 if(arr[ir] < arr[l+1]) {
298 std::swap(arr[l+1],arr[ir]);
299 }
300 if(arr[ir] < arr[l] ) {
301 std::swap(arr[l],arr[ir]);
302 }
303 if(arr[l] < arr[l+1]) {
304 std::swap(arr[l+1],arr[l]);
305 }
306 i=l+1;
307 j=ir;
308 a=arr[l];
309 for(;;) {
310 do i++; while (arr[i] < a);
311 do j--; while (a < arr[j]);
312 if(j < i) break;
313 std::swap(arr[i],arr[j]);
314 }
315 arr[l]=arr[j];
316 arr[j]=a;
317 jstack += 2;
318 if(jstack > NSTACK) {
319 std::cout << "uqsort: NSTACK too small in sort." << std::endl;
320 std::terminate();
321 }
322 if(ir+l+1 >= j+i) {
323 istack[jstack]=ir;
324 istack[jstack-1]=i;
325 ir=j-1;
326 }
327 else {
328 istack[jstack]=j-1;
329 istack[jstack-1]=l;
330 l=i;
331 }
332 }
333 }
334}
335
336// This exists only so we can track how many times the MJ algorithm is
337// called and put each of those into different timer names.
338// Currently the MultiJaggedTest.cpp will actually call it twice.
339// First time with data from a Tpetra MultiVector and then a second time using
340// a BasicVectorAdapter which allows us to turn UVM off for some tests. The
341// results of the two runs are compared which helps to catch a lot of bugs. For
342// profiling I'm mostly just interested in the UVM off case and need it to be
343// in separate timers. Passing a value through would mess up the API. Possibly
344// we could check the Adapter and use that. The statics have to be outside the
345// templated class as the two called instances will be different template
346// parameters. Another complication is that MultiJagged.cpp will call through
347// the Zoltan2_AlgMJ class and we want to time things in both classes. However
348// TaskMapper will directly call AlgMJ so I made two counters for the two
349// classes to make sure it was always correct. This does not impact any
350// behavior and has the sole purpose of generating unique timer names. If you
351// run an MJ test you'll see MJ(0) and MJ(1) in the names to distinguish the
352// 1st and 2nd run. Right now only MultijaggedTest.cpp cares about this.
/// Returns the number of times this function was called before (0 on the
/// first call) and then increments the function-local counter.
static int get_counter_AlgMJ() {
  static int counter = 0;
  return counter++;
}
359 static int counter = 0;
360 return counter++;
361 }
362};
363
366template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
367 typename mj_part_t, typename mj_node_t>
368class AlgMJ
369{
370private:
371 typedef typename mj_node_t::device_type device_t; // for views
373 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
374
375 //if the (last dimension reduce all count) x the mpi world size
376 //estimated to be bigger than this number then migration will be forced
377 //in earlier iterations.
378 static constexpr size_t future_reduceall_cutoff = 1500000;
379
380 //if parts right before last dimension are estimated to have fewer than
381 //min_work_last_dim coords, migration will be forced in earlier iterations.
382 static constexpr mj_lno_t min_work_last_dim = 1000;
383
384 static constexpr mj_scalar_t least_signifiance = 0.0001;
385 static constexpr int significance_mul = 1000;
386
387 std::string mj_timer_base_string; // for convenience making timer names
388
389 RCP<const Environment> mj_env; // the environment object
390 RCP<const Comm<int> > mj_problemComm; // initial comm object
391 RCP<Comm<int> > comm; // comm object than can be altered during execution
392 double imbalance_tolerance; // input imbalance tolerance.
393 int recursion_depth; // number of steps that partitioning will be solved in.
394 int coord_dim; // coordinate dim
395 int num_weights_per_coord; // # of weights per coord
396 size_t initial_num_loc_coords; // initial num local coords.
397 global_size_t initial_num_glob_coords; // initial num global coords.
398 mj_lno_t num_local_coords; // number of local coords.
399 mj_gno_t num_global_coords; // number of global coords.
400 mj_scalar_t sEpsilon; // epsilon for mj_scalar_t
401
402 // can distribute points on the same coordinate to different parts.
403 bool distribute_points_on_cut_lines;
404
405 // how many parts we can calculate concurrently.
406 mj_part_t max_concurrent_part_calculation;
407
408 bool mj_run_as_rcb; // means recursion depth is adjusted to maximum value.
409 int mj_user_recursion_depth; // the recursion depth value provided by user.
410 bool mj_keep_part_boxes; // if the boxes need to be kept.
411
412 // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
413 int check_migrate_avoid_migration_option;
414
415 // when doing the migration, 0 will aim for perfect load-imbalance, 1 - will
416 // aim for minimized number of messages with possibly bad load-imbalance
417 int migration_type;
418
419 // when MJ decides whether to migrate, the minimum imbalance for migration.
420 double minimum_migration_imbalance;
421
422 // Nonuniform first level partitioning
423 // (Currently available only for sequential_task_partitioning):
424 // Used for Dragonfly task mapping by partitioning Dragonfly RCA
425 // machine coordinates and application coordinates.
426 // An optimization that completely partitions the most important machine dimension
427 // first (i.e. the Dragonfly group coordinate, or RCA's x coordinate). The standard
428 // MJ alg follows after the nonuniform first level partitioning.
429 //
430 // Ex. (first level partitioning): If we have 120 elements,
431 // num_first_level_parts = 3, first_level_distribution = [4, 10, 6], then
432 // part sizes after first level will be [24, 60, 36]. Standard uniform MJ
433 // continues for all subsequent levels.
434
435 // If used, number of parts requested for a nonuniform
436 // first level partitioning
437 mj_part_t num_first_level_parts;
438
439 // If used, the requested distribution of parts for the
440 // nonuniform first level partitioning
441 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
442
443 mj_part_t total_num_cut ; // how many cuts will be totally
444 mj_part_t total_num_part; // how many parts will be totally
445
446 mj_part_t max_num_part_along_dim ; // maximum part count along a dimension.
447 mj_part_t max_num_cut_along_dim; // maximum cut count along a dimension.
448
449 // maximum part+cut count along a dimension.
450 size_t max_num_total_part_along_dim;
451
452 mj_part_t total_dim_num_reduce_all; // estimate on #reduceAlls can be done.
453
454 // max no of parts that might occur during the partition before the last
455 // partitioning dimension.
456 mj_part_t last_dim_num_part;
457
458 // input part array specifying num part to divide along each dim.
459 Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
460
461 // two dimension coordinate array
462 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
463 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
464 mj_coordinates;
465
466 // two dimension weight array
467 Kokkos::View<mj_scalar_t **, device_t> mj_weights;
468
469 // if the target parts are uniform
470 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
471
472 // if the coordinates have uniform weights
473 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
474
475 int mj_num_teams; // the number of teams
476
477 size_t num_global_parts; // the targeted number of parts
478
479 // vector of all boxes for all parts, constructed if mj_keep_part_boxes true
480 RCP<mj_partBoxVector_t> kept_boxes;
481
482 RCP<mj_partBox_t> global_box;
483
484 int myRank; // processor rank
485 int myActualRank; // initial rank
486
487 bool divide_to_prime_first;
488
489 // initial global ids of the coordinates.
490 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
491
492 // current global ids of the coordinates, might change during migration.
493 Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
494
495 // the actual processor owner of the coordinate, to track after migrations.
496 Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
497
498 // permutation of coordinates, for partitioning.
499 Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
500
501 // permutation work array.
502 Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
503
504 // the part ids assigned to coordinates.
505 Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
506
507 // beginning and end of each part.
508 Kokkos::View<mj_lno_t *, device_t> part_xadj;
509
510 // work array for beginning and end of each part.
511 Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
512
513 Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
514
515 // how much weight should a MPI put left side of the each cutline
516 Kokkos::View<mj_scalar_t *, device_t>
517 process_cut_line_weight_to_put_left;
518
519 // weight percentage each thread in MPI puts left side of the each outline
520 Kokkos::View<mj_scalar_t *, device_t>
521 thread_cut_line_weight_to_put_left;
522
523 // work array to manipulate coordinate of cutlines in different iterations.
524 // necessary because previous cut line information is used for determining
525 // the next cutline information. therefore, cannot update the cut work array
526 // until all cutlines are determined.
527 Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
528
529 // Used for swapping above cut_coordinates_work_array
530 Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
531
532 // cumulative part weight array.
533 Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
534
535 // upper bound coordinate of a cut line
536 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
537
538 // lower bound coordinate of a cut line
539 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
540
541 // lower bound weight of a cut line
542 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
543
544 // upper bound weight of a cut line
545 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
546
547 // combined array to exchange the min and max coordinate, and total
548 // weight of part.
549 Kokkos::View<mj_scalar_t *, device_t>
550 process_local_min_max_coord_total_weight;
551
552 // global combined array with the results for min, max and total weight.
553 Kokkos::View<mj_scalar_t *, device_t>
554 global_min_max_coord_total_weight;
555
556 // isDone is used to determine if a cutline is determined already. If a cut
557 // line is already determined, the next iterations will skip this cut line.
558 Kokkos::View<bool *, device_t> is_cut_line_determined;
559
560 // incomplete_cut_count count holds the number of cutlines that have not
561 // been finalized for each part when concurrentPartCount>1, using this
562 // information, if incomplete_cut_count[x]==0, then no work is done
563 // for this part.
564 Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
565 typename decltype(device_incomplete_cut_count)::host_mirror_type
566 incomplete_cut_count;
567
568 // Need a quick accessor for this on host
569 typename decltype (part_xadj)::host_mirror_type host_part_xadj;
570
571 // local part weights of each thread.
572 Kokkos::View<double *, device_t>
573 thread_part_weights;
574
575 // the work manipulation array for part weights.
576 Kokkos::View<double *, device_t>
577 thread_part_weight_work;
578
579 // thread_cut_left_closest_point to hold the closest coordinate
580 // to a cutline from left (for each thread).
581 Kokkos::View<mj_scalar_t *, device_t>
582 thread_cut_left_closest_point;
583
584 // thread_cut_right_closest_point to hold the closest coordinate
585 // to a cutline from right (for each thread)
586 Kokkos::View<mj_scalar_t *, device_t>
587 thread_cut_right_closest_point;
588
589 // to store how many points in each part a thread has.
590 Kokkos::View<mj_lno_t *, device_t>
591 thread_point_counts;
592
593 Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
594 Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
595
596 // for faster communication, concatenation of
597 // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
598 // leftClosest distances sized P-1, since P-1 cut lines
599 // rightClosest distances size P-1, since P-1 cut lines.
600 Kokkos::View<mj_scalar_t *, device_t>
601 total_part_weight_left_right_closests;
602 Kokkos::View<mj_scalar_t *, device_t>
603 global_total_part_weight_left_right_closests;
604
605 Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
606 typename decltype(device_num_partitioning_in_current_dim)::host_mirror_type
607 host_num_partitioning_in_current_dim; // for quick access on host
608
609 /* \brief helper functio to calculate imbalance.
610 * \param achieved balance we achieved.
611 * \param expected balance expected.
612 */
613 static
614 KOKKOS_INLINE_FUNCTION
615 double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
616 return static_cast<double>(achieved) / static_cast<double>(expected) - 1.0;
617 }
618
619 /* \brief Either the mj array (part_no_array) or num_global_parts should be
620 * provided in the input. part_no_array takes precedence if both are
621 * provided. Depending on these parameters, total cut/part number, maximum
622 * part/cut number along a dimension, estimated number of reduceAlls,
623 * and the number of parts before the last dimension is calculated.
624 * */
625 void set_part_specifications();
626
627 /* \brief Tries to determine the part number for current dimension,
628 * by trying to make the partitioning as square as possible.
629 * \param num_total_future how many more partitionings are required.
630 * \param root how many more recursion depth is left.
631 */
632 inline mj_part_t get_part_count(
633 mj_part_t num_total_future,
634 double root);
635
636 /* \brief for part communication we keep track of the box boundaries.
637 * This is performed when either asked specifically, or when geometric
638 * mapping is performed afterwards. This function initializes a single box
639 * with all global min and max coordinates.
640 * \param initial_partitioning_boxes the input and output vector for boxes.
641 */
642 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
643
644 /* \brief Function returns how many parts that will be obtained after this
645 * dimension partitioning. It sets how many parts each current part will be
646 * partitioned into in this dimension to device_num_partitioning_in_current_dim
647 * vector, sets how many total future parts each obtained part will be
648 * partitioned into in next_future_num_parts_in_parts vector, If part boxes
649 * are kept, then sets initializes the output_part_boxes as its ancestor.
650 * \param future_num_part_in_parts: input, how many future parts each
651 * current part will be partitioned into.
652 * \param next_future_num_parts_in_parts: output, how many future parts
653 * each obtained part will be partitioned into.
654 * \param future_num_parts: output, max number of future parts that will be
655 * obtained from a single
656 * \param current_num_parts: input, how many parts are there currently.
657 * \param current_iteration: input, current dimension iteration number.
658 * \param input_part_boxes: input, if boxes are kept, current boxes.
659 * \param output_part_boxes: output, if boxes are kept, the initial box
660 * boundaries for obtained parts.
661 * \param atomic_part_count // DOCWORK: Documentation
662 */
663 mj_part_t update_part_num_arrays(
664 std::vector<mj_part_t> *future_num_part_in_parts,
665 std::vector<mj_part_t> *next_future_num_parts_in_parts,
666 mj_part_t &future_num_parts,
667 mj_part_t current_num_parts,
668 int current_iteration,
669 RCP<mj_partBoxVector_t> input_part_boxes,
670 RCP<mj_partBoxVector_t> output_part_boxes,
671 mj_part_t atomic_part_count);
672
684 static
685 KOKKOS_INLINE_FUNCTION
686 void mj_calculate_new_cut_position (
687 mj_scalar_t cut_upper_bound,
688 mj_scalar_t cut_lower_bound,
689 mj_scalar_t cut_upper_weight,
690 mj_scalar_t cut_lower_weight,
691 mj_scalar_t expected_weight,
692 mj_scalar_t &new_cut_position,
693 mj_scalar_t sEpsilon);
694
719 bool mj_perform_migration(
720 mj_part_t in_num_parts, //current number of parts
721 mj_part_t &out_num_parts, //output number of parts.
722 std::vector<mj_part_t> *next_future_num_parts_in_parts,
723 mj_part_t &output_part_begin_index,
724 size_t migration_reduce_all_population,
725 mj_lno_t num_coords_for_last_dim_part,
726 std::string iteration,
727 RCP<mj_partBoxVector_t> &input_part_boxes,
728 RCP<mj_partBoxVector_t> &output_part_boxes);
729
747 bool mj_check_to_migrate(
748 size_t migration_reduce_all_population,
749 mj_lno_t num_coords_for_last_dim_part,
750 mj_part_t num_procs,
751 mj_part_t num_parts,
752 mj_gno_t *num_points_in_all_processor_parts);
753
778 void mj_migration_part_proc_assignment(
779 mj_gno_t * num_points_in_all_processor_parts,
780 mj_part_t num_parts,
781 mj_part_t num_procs,
782 mj_lno_t *send_count_to_each_proc,
783 std::vector<mj_part_t> &processor_ranks_for_subcomm,
784 std::vector<mj_part_t> *next_future_num_parts_in_parts,
785 mj_part_t &out_num_part,
786 std::vector<mj_part_t> &out_part_indices,
787 mj_part_t &output_part_numbering_begin_index,
788 int *coordinate_destinations);
789
815 void mj_assign_proc_to_parts(
816 mj_gno_t * num_points_in_all_processor_parts,
817 mj_part_t num_parts,
818 mj_part_t num_procs,
819 mj_lno_t *send_count_to_each_proc,
820 std::vector<mj_part_t> &processor_ranks_for_subcomm,
821 std::vector<mj_part_t> *next_future_num_parts_in_parts,
822 mj_part_t &out_part_index,
823 mj_part_t &output_part_numbering_begin_index,
824 int *coordinate_destinations);
825
841 void assign_send_destinations(
842 mj_part_t num_parts,
843 mj_part_t *part_assignment_proc_begin_indices,
844 mj_part_t *processor_chains_in_parts,
845 mj_lno_t *send_count_to_each_proc,
846 int *coordinate_destinations);
847
862 void assign_send_destinations2(
863 mj_part_t num_parts,
864 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
865 int *coordinate_destinations,
866 mj_part_t &output_part_numbering_begin_index,
867 std::vector<mj_part_t> *next_future_num_parts_in_parts);
868
891 void mj_assign_parts_to_procs(
892 mj_gno_t * num_points_in_all_processor_parts,
893 mj_part_t num_parts,
894 mj_part_t num_procs,
895 mj_lno_t *send_count_to_each_proc,
896 std::vector<mj_part_t> *next_future_num_parts_in_parts,
897 mj_part_t &out_num_part,
898 std::vector<mj_part_t> &out_part_indices,
899 mj_part_t &output_part_numbering_begin_index,
900 int *coordinate_destinations);
901
915 void mj_migrate_coords(
916 mj_part_t num_procs,
917 mj_lno_t &num_new_local_points,
918 std::string iteration,
919 int *coordinate_destinations,
920 mj_part_t num_parts);
921
927 void create_sub_communicator(
928 std::vector<mj_part_t> &processor_ranks_for_subcomm);
929
934 mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
935 mj_part_t largest_factor = 1;
936 mj_part_t n = num_parts;
937 mj_part_t divisor = 2;
938 while (n > 1) {
939 while (n % divisor == 0) {
940 n = n / divisor;
941 largest_factor = divisor;
942 }
943 ++divisor;
944 if(divisor * divisor > n) {
945 if(n > 1) {
946 largest_factor = n;
947 }
948 break;
949 }
950 }
951 return largest_factor;
952 }
953
954public:
955 AlgMJ();
956
957 // DOCWORK: Make param documentation use : consistently
984 const RCP<const Environment> &env,
985 RCP<const Comm<int> > &problemComm,
986 double imbalance_tolerance,
987 int num_teams,
988 size_t num_global_parts,
989 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
990 int recursion_depth,
991 int coord_dim,
992 mj_lno_t num_local_coords,
993 mj_gno_t num_global_coords,
994 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
995 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
996 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
997 int num_weights_per_coord,
998 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
999 Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1000 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1001 Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1002 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1003
1017 bool distribute_points_on_cut_lines_,
1018 int max_concurrent_part_calculation_,
1019 int check_migrate_avoid_migration_option_,
1020 double minimum_migration_imbalance_,
1021 int migration_type_ = 0);
1022
1026
1029 RCP<mj_partBox_t> get_global_box() const;
1030
1033 RCP<mj_partBoxVector_t> get_kept_boxes() const;
1034
1037 RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1038 RCP<mj_partBoxVector_t> &localPartBoxes) const;
1039
1079 const RCP<const Environment> &env,
1080 mj_lno_t num_total_coords,
1081 mj_lno_t num_selected_coords,
1082 size_t num_target_part,
1083 int coord_dim,
1084 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1085 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1086 Kokkos::View<mj_lno_t *, device_t> &
1087 initial_selected_coords_output_permutation,
1088 mj_lno_t *output_xadj,
1089 int recursion_depth_,
1090 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1091 bool partition_along_longest_dim,
1092 int num_ranks_per_node,
1093 bool divide_to_prime_first_,
1094 mj_part_t num_first_level_parts_ = 1,
1095 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1096 = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1097
1098#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
1099 public:
1100#else
1101 private:
1102#endif
1103
1104 /* \brief Allocates all required memory for the mj partitioning algorithm.
1105 */
1106 void allocate_set_work_memory();
1107
1108 /* \brief compute global bounding box: min/max coords of global domain */
1109 void compute_global_box();
1110
1111 // DOCWORK: Inconsistent use of ! for descriptive/brief commenting - decide.
1118 void mj_get_local_min_max_coord_totW(
1119 mj_part_t current_work_part,
1120 mj_part_t current_concurrent_num_parts,
1121 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);
1122
1135 void mj_get_global_min_max_coord_totW(
1136 mj_part_t current_concurrent_num_parts,
1137 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
1138 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);
1139
1170 void mj_get_initial_cut_coords_target_weights(
1171 mj_scalar_t min_coord,
1172 mj_scalar_t max_coord,
1173 mj_part_t num_cuts/*p-1*/ ,
1174 mj_scalar_t global_weight,
1175 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
1176 Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
1177 std::vector <mj_part_t> *future_num_part_in_parts,
1178 std::vector <mj_part_t> *next_future_num_parts_in_parts,
1179 mj_part_t concurrent_current_part,
1180 mj_part_t obtained_part_index,
1181 mj_part_t num_target_first_level_parts = 1,
1182 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
1183 Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1184
1201 void set_initial_coordinate_parts(
1202 mj_scalar_t &max_coordinate,
1203 mj_scalar_t &min_coordinate,
1204 mj_lno_t coordinate_begin_index,
1205 mj_lno_t coordinate_end_index,
1206 Kokkos::View<mj_lno_t *, device_t> &
1207 mj_current_coordinate_permutations,
1208 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1209 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
1210 mj_part_t &partition_count);
1211
1228 void mj_1D_part(
1229 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1230 double imbalanceTolerance,
1231 mj_part_t current_work_part,
1232 mj_part_t current_concurrent_num_parts,
1233 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1234 mj_part_t total_incomplete_cut_count,
1235 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
1236 Kokkos::View<size_t*, device_t> & view_total_reduction_size);
1237
1243 void mj_1D_part_get_part_weights(
1244 mj_part_t current_concurrent_num_parts,
1245 mj_part_t current_work_part,
1246 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1247 int loop_count);
1248
1256 void mj_combine_rightleft_and_weights(
1257 mj_part_t current_work_part,
1258 mj_part_t current_concurrent_num_parts);
1259
1272 void mj_create_new_partitions(
1273 mj_part_t num_parts,
1274 mj_part_t current_concurrent_work_part,
1275 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1276 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1277 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1278 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);
1279
1315 void mj_get_new_cut_coordinates(
1316 mj_part_t current_concurrent_num_parts,
1317 mj_part_t kk,
1318 const mj_part_t &num_cuts,
1319 const double &used_imbalance_tolerance,
1320 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
1321 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
1322 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
1323 Kokkos::View<bool *, device_t> & current_cut_line_determined,
1324 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1325 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
1326 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
1327 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
1328 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
1329 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
1330 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
1331 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
1332 Kokkos::View<mj_scalar_t *, device_t> &
1333 current_part_cut_line_weight_to_put_left,
1334 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);
1335
1345 void get_processor_num_points_in_parts(
1346 mj_part_t num_procs,
1347 mj_part_t num_parts,
1348 mj_gno_t *&num_points_in_all_processor_parts);
1349
1354 void fill_permutation_array(
1355 mj_part_t output_num_parts,
1356 mj_part_t num_parts);
1357
// Member-function declaration only; the definition is not part of this
// excerpt.
// The sequential task-partitioning routine later in this file calls it for a
// work part that is split into more than one part, passing:
//  - num_parts: the number of parts the work part is split into;
//  - mj_current_dim_coords: the coordinate column of the chosen axis;
//  - current_concurrent_cut_coordinate / used_local_cut_line_weight_to_left:
//    subviews starting at the work part's cut offset;
//  - coordinate_begin / coordinate_end: the work part's index range taken
//    from host_part_xadj;
//  - out_part_xadj: a subview of new_part_xadj starting at the work part's
//    first output slot;
//  - coordInd: the index of the axis being partitioned;
//  - longest_dim_part: the caller's partition_along_longest_dim flag;
//  - p_coord_dimension_range_sorted: pointer to the per-axis sort items.
// NOTE(review): what the callee does with these is not visible here.
1379 void create_consistent_chunks(
1380 mj_part_t num_parts,
1381 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1382 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1383 mj_lno_t coordinate_begin,
1384 mj_lno_t coordinate_end,
1385 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1386 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
1387 int coordInd,
1388 bool longest_dim_part,
1389 uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted);
1390
// Member-function declaration only; the definition is not part of this
// excerpt.
// output_part_boxes is a reference to an RCP, so the callee can re-seat the
// caller's handle as well as modify the pointee.
// NOTE(review): presumably writes the final part assignment after the last
// partitioning level -- inferred from the name only; confirm in the definition.
1399 void set_final_parts(
1400 mj_part_t current_num_parts,
1401 mj_part_t output_part_begin_index,
1402 RCP<mj_partBoxVector_t> &output_part_boxes,
1403 bool is_data_ever_migrated);
1404};
1405
// Constructor taking no work in its body: everything happens in the
// member-initializer list below.
// NOTE(review): the line carrying the qualified constructor name is absent
// from this listing (numbering jumps from 1409 to 1411); presumably this is
// the AlgMJ default constructor.
//
// Defaults established here:
//  - handles/containers (mj_env, mj_problemComm, comm, kept_boxes,
//    global_box) are value-initialized;
//  - all counters and sizes start at 0, except max_concurrent_part_calculation,
//    num_first_level_parts and num_global_parts, which start at 1;
//  - sEpsilon is 100 times the machine epsilon of mj_scalar_t;
//  - distribute_points_on_cut_lines starts true; mj_run_as_rcb,
//    mj_keep_part_boxes and divide_to_prime_first start false;
//  - minimum_migration_imbalance starts at 0.30.
1408template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1409 typename mj_part_t, typename mj_node_t>
1411 mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
1412 recursion_depth(0), coord_dim(0),
1413 num_weights_per_coord(0), initial_num_loc_coords(0),
1414 initial_num_glob_coords(0),
1415 num_local_coords(0), num_global_coords(0),
1416 sEpsilon(std::numeric_limits<mj_scalar_t>::epsilon() * 100),
1417 distribute_points_on_cut_lines(true),
1418 max_concurrent_part_calculation(1),
1419 mj_run_as_rcb(false), mj_user_recursion_depth(0),
1420 mj_keep_part_boxes(false),
1421 check_migrate_avoid_migration_option(0), migration_type(0),
1422 minimum_migration_imbalance(0.30),
1423 num_first_level_parts(1),
1424 total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1425 max_num_cut_along_dim(0),
1426 max_num_total_part_along_dim(0),
1427 total_dim_num_reduce_all(0),
1428 last_dim_num_part(0),
1429 mj_num_teams(0),
1430 num_global_parts(1),
1431 kept_boxes(), global_box(),
1432 myRank(0), myActualRank(0),
1433 divide_to_prime_first(false)
1434{
1435}
1436
// Sequential (single-communicator) multi-jagged partitioning of a coordinate
// set, driven entirely by the arguments instead of by an input adapter.
// NOTE(review): the line(s) carrying the return type and the qualified
// function name are absent from this listing (numbering jumps from 1481 to
// 1484); the name used for this block is taken from context -- confirm.
//
// Outline of the visible body:
//  1. install the default serial communicator and copy the arguments into
//     the corresponding members; weights and parts are marked uniform;
//  2. call set_part_specifications() and allocate_set_work_memory();
//  3. seed part_xadj(0) with num_selected_coords and coordinate_permutations
//     with initial_adjList_output_adjlist;
//  4. for each recursion level rd: obtain the per-part split counts from
//     update_part_num_arrays(), compute min/max/total weight per work part,
//     run mj_1D_part() to find the cuts, build the new chunks with
//     create_consistent_chunks(), shift the new part offsets, then swap the
//     permutation arrays and promote new_part_xadj to part_xadj;
//  5. copy the final permutation back into initial_adjList_output_adjlist and
//     write the part offsets into output_xadj.
//
// Parameters, as used below:
//  env                         stored in mj_env (used for timers).
//  num_total_coords            becomes num_local_coords and num_global_coords.
//  num_selected_coords         written into part_xadj(0).
//  num_target_part             becomes num_global_parts.
//  coord_dim_                  becomes coord_dim.
//  mj_coordinates_             stored in mj_coordinates; the offset-shift
//                              kernel negates entries through a subview of it.
//  initial_adjList_output_adjlist
//                              in: initial permutation; out: final permutation.
//  output_xadj                 out: output_xadj[0] = 0 and
//                              output_xadj[i+1] = host_part_xadj(i) for
//                              i < num_global_parts, so it must provide at
//                              least num_global_parts + 1 entries.
//  recursion_depth_            becomes recursion_depth.
//  part_no_array_              becomes part_no_array.
//  partition_along_longest_dim when true, the axis is re-chosen per work part
//                              from the last item after uqSignsort over the
//                              per-axis ranges (presumably the largest range).
//  num_ranks_per_node          forwarded as the last argument of
//                              update_part_num_arrays().
//  divide_to_prime_first_      becomes divide_to_prime_first.
//  num_first_level_parts_, first_level_distribution_
//                              stored; handed to
//                              mj_get_initial_cut_coords_target_weights()
//                              (the count only when rd == 0, otherwise 1).
1480template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1481 typename mj_part_t, typename mj_node_t>
1484 const RCP<const Environment> &env,
1485 mj_lno_t num_total_coords,
1486 mj_lno_t num_selected_coords,
1487 size_t num_target_part,
1488 int coord_dim_,
1489 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1490 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
1491 mj_coordinates_,
1492 Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
1493 mj_lno_t *output_xadj,
1494 int recursion_depth_,
1495 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
1496 bool partition_along_longest_dim,
1497 int num_ranks_per_node,
1498 bool divide_to_prime_first_,
1499 mj_part_t num_first_level_parts_,
1500 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
1501{
1502 this->mj_env = env;
// This routine does not use the caller's communicator: it installs the
// default serial communicator for both mj_problemComm and comm.
1503 const RCP<Comm<int> > commN;
1504 this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1505 this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
// NOTE(review): both ranks are set to 1 although the communicator installed
// above is serial -- confirm this value is intentional.
1506 this->myActualRank = this->myRank = 1;
1507
1508 this->divide_to_prime_first = divide_to_prime_first_;
1509 //weights are uniform for task mapping
1510
1511 //parts are uniform for task mapping
1512 //as input indices.
1513 this->imbalance_tolerance = 0;
1514 this->num_global_parts = num_target_part;
1515 this->part_no_array = part_no_array_;
1516 this->recursion_depth = recursion_depth_;
1517
1518 // If nonuniform first level partitioning, the requested num of parts and the
1519 // requested distribution of elements for each part
1520 this->num_first_level_parts = num_first_level_parts_;
1521
1522 this->first_level_distribution = first_level_distribution_;
1523
1524 this->coord_dim = coord_dim_;
1525 this->num_local_coords = num_total_coords;
1526
1527 this->num_global_coords = num_total_coords;
1528 this->mj_coordinates = mj_coordinates_;
1529
1530
// Allocated with one slot per local coordinate; nothing in this function
// writes ids into it.
1531 this->initial_mj_gnos =
1532 Kokkos::View<mj_gno_t*, device_t>("gids", this->num_local_coords);
1533
1534 this->num_weights_per_coord = 0;
1535
1536 this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
1537 "uniform weights", 1);
1538 this->mj_uniform_weights(0) = true;
1539
1540 this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
1541 ("weights", 1, 1);
1542
1543 this->mj_uniform_parts =
1544 Kokkos::View<bool*, Kokkos::HostSpace>("uniform parts", 1);
1545 this->mj_uniform_parts(0) = true;
1546
1547 this->set_part_specifications();
1548
1549 this->allocate_set_work_memory();
1550
1551 // Do single init
// A one-iteration kernel is used so that the write to part_xadj(0) happens in
// the execution space of mj_node_t.
// NOTE(review): host_part_xadj, which the main loop reads, is not refreshed
// from part_xadj after this write in the code visible here -- confirm the two
// agree before the first recursion level.
1552 auto local_part_xadj = this->part_xadj;
1553 Kokkos::parallel_for(
1554 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
1555 KOKKOS_LAMBDA (int dummy) {
1556 local_part_xadj(0) = static_cast<mj_lno_t>(num_selected_coords);
1557 });
1558
1559 Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);
1560
1561 mj_part_t current_num_parts = 1;
1562
1563 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
1564 this->all_cut_coordinates;
1565
1566 mj_part_t future_num_parts = this->total_num_part;
1567
// NOTE(review): these two vectors are owned through raw pointers and released
// by the delete statements at the end of the function; they are not released
// if anything in between throws.
1568 std::vector<mj_part_t> *future_num_part_in_parts =
1569 new std::vector<mj_part_t>();
1570 std::vector<mj_part_t> *next_future_num_parts_in_parts =
1571 new std::vector<mj_part_t>();
1572 next_future_num_parts_in_parts->push_back(this->num_global_parts);
// t1/t2 stay null; they are only passed as the box arguments of
// update_part_num_arrays().
1573 RCP<mj_partBoxVector_t> t1;
1574 RCP<mj_partBoxVector_t> t2;
1575
// Per-axis scratch used only when partition_along_longest_dim is set.
1576 std::vector <uSignedSortItem<int, mj_scalar_t, char>>
1577 coord_dimension_range_sorted(this->coord_dim);
1578 uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted =
1579 &(coord_dimension_range_sorted[0]);
1580 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1581 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1582
1583 // Need a device counter - how best to allocate?
1584 // Putting this allocation in the loops is very costly so moved out here.
1585 Kokkos::View<mj_part_t*, device_t>
1586 view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
1587 Kokkos::View<size_t*, device_t>
1588 view_total_reduction_size("view_total_reduction_size", 1);
1589
// One iteration per recursion level.
1590 for(int rd = 0; rd < this->recursion_depth; ++rd) {
1591 // next_future_num_parts_in_parts will be as the size of outnumParts,
1592 // and this will hold how many more parts that each output part
1593 // should be divided. this array will also be used to determine the weight
1594 // ratios of the parts.
1595 // swap the arrays to use iteratively..
1596 std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
1597 future_num_part_in_parts = next_future_num_parts_in_parts;
1598 next_future_num_parts_in_parts = tmpPartVect;
1599
1600 // clear next_future_num_parts_in_parts array as
1601 // getPartitionArrays expects it to be empty.
1602 next_future_num_parts_in_parts->clear();
1603
1604 // returns the total number of output parts for this dimension partitioning.
1605 mj_part_t output_part_count_in_dimension =
1606 this->update_part_num_arrays(
1607 future_num_part_in_parts,
1608 next_future_num_parts_in_parts,
1609 future_num_parts,
1610 current_num_parts,
1611 rd,
1612 t1,
1613 t2, num_ranks_per_node);
1614
1615 // if the number of obtained parts equal to current number of parts,
1616 // skip this dimension. For example, this happens when 1 is given in
1617 // the input part array, e.g. P=4,5,1,2
// The vectors are swapped back first so that the swap at the top of the next
// iteration restores the current state.
1618 if(output_part_count_in_dimension == current_num_parts) {
1619 tmpPartVect = future_num_part_in_parts;
1620 future_num_part_in_parts = next_future_num_parts_in_parts;
1621 next_future_num_parts_in_parts = tmpPartVect;
1622 continue;
1623 }
1624
1625 //convert i to string to be used for debugging purposes.
// NOTE(review): istring is not read anywhere in the code visible here.
1626 std::string istring = std::to_string(rd);
1627
1628 // alloc Memory to point the indices
1629 // of the parts in the permutation array.
1630 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
1631 "new part xadj", output_part_count_in_dimension);
1632
1633 // the index where in the outtotalCounts will be written.
1634
1635 mj_part_t output_part_index = 0;
1636
1637 // whatever is written to outTotalCounts will be added with previousEnd
1638 // so that the points will be shifted.
1639 mj_part_t output_coordinate_end_index = 0;
1640
// current_concurrent_num_parts is fixed at 1 here, so each pass of the loop
// over work parts handles exactly one part (kk is always 0).
1641 mj_part_t current_work_part = 0;
1642 mj_part_t current_concurrent_num_parts = 1;
1643
1644 mj_part_t obtained_part_index = 0;
1645
1646 // get the coordinate axis along which the partitioning will be done.
1647 int coordInd = rd % this->coord_dim;
1648
1649 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
1650 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1651
1652 auto host_process_local_min_max_coord_total_weight =
1653 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
1654 auto host_global_min_max_coord_total_weight =
1655 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
1656
1657 // run for all available parts.
1658 for(; current_work_part < current_num_parts;
1659 current_work_part += current_concurrent_num_parts) {
1660
1661 mj_part_t actual_work_part_count = 0;
1662
1663 // initialization for 1D partitioning.
1664 // get the min and max coordinates of each part
1665 // together with the part weights of each part.
1666 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1667 mj_part_t current_work_part_in_concurrent_parts =
1668 current_work_part + kk;
1669
1670 // if this part wont be partitioned any further
1671 // dont do any work for this part.
1672 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1673 current_work_part_in_concurrent_parts);
1674 if(partition_count == 1) {
1675 continue;
1676 }
1677 ++actual_work_part_count;
1678 if(partition_along_longest_dim) {
1679 auto local_process_local_min_max_coord_total_weight =
1680 this->process_local_min_max_coord_total_weight;
// Measure the extent of this work part along every axis; each pass
// overwrites process_local_min_max_coord_total_weight, so the values are
// copied to the host and saved per axis.
1681 for(int coord_traverse_ind = 0;
1682 coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {
1683
1684 Kokkos::View<mj_scalar_t *, device_t> coords =
1685 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);
1686
1687 this->mj_get_local_min_max_coord_totW(
1688 current_work_part,
1689 current_concurrent_num_parts,
1690 coords);
1691
1692 coord_dimension_range_sorted[coord_traverse_ind].id =
1693 coord_traverse_ind;
1694 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1695
1696 Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
1697 process_local_min_max_coord_total_weight);
1698
1699 coord_dim_mins[coord_traverse_ind] =
1700 host_process_local_min_max_coord_total_weight(kk);
1701 coord_dim_maxs[coord_traverse_ind] =
1702 host_process_local_min_max_coord_total_weight(
1703 kk + current_concurrent_num_parts);
1704 coord_dimension_range_sorted[coord_traverse_ind].val =
1705 host_process_local_min_max_coord_total_weight(
1706 kk + current_concurrent_num_parts) -
1707 host_process_local_min_max_coord_total_weight(kk);
1708 }
1709
// The axis is taken from the last item after sorting (presumably the one
// with the largest range -- depends on uqSignsort's ordering), and its saved
// min/max are written back to the device array, which the last loop pass
// left holding the values of the final axis.
1710 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1711 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;
1712 auto set_min = coord_dim_mins[coordInd];
1713 auto set_max = coord_dim_maxs[coordInd];
1714 Kokkos::parallel_for(
1715 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1716 (0, 1), KOKKOS_LAMBDA (int dummy) {
1717 local_process_local_min_max_coord_total_weight(kk) = set_min;
1718 local_process_local_min_max_coord_total_weight(
1719 kk + current_concurrent_num_parts) = set_max;
1720 });
1721
1722 mj_current_dim_coords =
1723 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1724 }
1725 else {
1726 Kokkos::View<mj_scalar_t *, device_t> coords =
1727 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1728 this->mj_get_local_min_max_coord_totW(
1729 current_work_part,
1730 current_concurrent_num_parts,
1731 coords);
1732 }
1733 }
1734
1735 // 1D partitioning
1736 if(actual_work_part_count > 0) {
1737 // obtain global Min max of the part.
1738 this->mj_get_global_min_max_coord_totW(
1739 current_concurrent_num_parts,
1740 this->process_local_min_max_coord_total_weight,
1741 this->global_min_max_coord_total_weight);
1742
1743 // update host copy
1744 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
1745 global_min_max_coord_total_weight);
1746
1747 // represents the total number of cutlines
1748 // whose coordinate should be determined.
1749 mj_part_t total_incomplete_cut_count = 0;
1750
1751 //Compute weight ratios for parts & cuts:
1752 //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1.0
1753 // part0 cut0 part1 cut1 part2 cut2 part3
1754 mj_part_t concurrent_part_cut_shift = 0;
1755 mj_part_t concurrent_part_part_shift = 0;
1756 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
// Layout of the host array as read here: [kk] = min,
// [kk + concurrent parts] = max, [kk + 2 * concurrent parts] = total weight.
1757 mj_scalar_t min_coordinate =
1758 host_global_min_max_coord_total_weight(kk);
1759 mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
1760 kk + current_concurrent_num_parts);
1761 mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
1762 kk + 2*current_concurrent_num_parts);
1763
1764 mj_part_t concurrent_current_part_index = current_work_part + kk;
1765
1766 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1767 concurrent_current_part_index);
1768
1769 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
1770 Kokkos::subview(current_cut_coordinates,
1771 std::pair<mj_lno_t, mj_lno_t>(
1772 concurrent_part_cut_shift,
1773 current_cut_coordinates.size()));
1774 Kokkos::View<mj_scalar_t *, device_t>
1775 current_target_part_weights =
1776 Kokkos::subview(target_part_weights,
1777 std::pair<mj_lno_t, mj_lno_t>(
1778 concurrent_part_part_shift,
1779 target_part_weights.size()));
1780
1781 // shift the usedCutCoordinate array as noCuts.
1782 concurrent_part_cut_shift += partition_count - 1;
1783 // shift the partRatio array as noParts.
1784 concurrent_part_part_shift += partition_count;
1785 // calculate only if part is not empty,
1786 // and part will be further partitioned.
1787 if(partition_count > 1 && min_coordinate <= max_coordinate) {
1788 // increase allDone by the number of cuts of the current
1789 // part's cut line number.
1790 total_incomplete_cut_count += partition_count - 1;
1791
1792 this->incomplete_cut_count(kk) = partition_count - 1;
1793
1794 // When num_first_level_parts != 1 we have
1795 // nonuniform partitioning on the first level, providing
1796 // requested number of parts (num_first_level_parts) and
1797 // requested distribution in parts (first_level_distribution)
1798
1799 // Get the target part weights given a desired distribution
1800 this->mj_get_initial_cut_coords_target_weights(
1801 min_coordinate,
1802 max_coordinate,
1803 partition_count - 1,
1804 global_total_weight,
1805 usedCutCoordinate,
1806 current_target_part_weights,
1807 future_num_part_in_parts,
1808 next_future_num_parts_in_parts,
1809 concurrent_current_part_index,
1810 obtained_part_index,
1811 rd == 0 ? this->num_first_level_parts : 1,
1812 this->first_level_distribution);
1813
// host_part_xadj(p) is the end index of part p; part 0 begins at 0.
1814 mj_lno_t coordinate_end_index =
1815 host_part_xadj(concurrent_current_part_index);
1816 mj_lno_t coordinate_begin_index =
1817 (concurrent_current_part_index==0) ? 0 :
1818 host_part_xadj[concurrent_current_part_index - 1];
1819
1820 // get the initial estimated part assignments of the coordinates.
1821 this->set_initial_coordinate_parts(
1822 max_coordinate,
1823 min_coordinate,
1824 coordinate_begin_index, coordinate_end_index,
1825 this->coordinate_permutations,
1826 mj_current_dim_coords,
1827 this->assigned_part_ids,
1828 partition_count);
1829 }
1830 else {
1831 // e.g., if have fewer coordinates than parts, don't need to do
1832 // next dim.
1833 this->incomplete_cut_count(kk) = 0;
1834 }
1835 obtained_part_index += partition_count;
1836 }
1837
1838 // used imbalance, it is always 0, as it is difficult
1839 // to estimate a range.
1840 double used_imbalance = 0;
1841
1842 // Determine cut lines for k parts here.
1843 this->mj_env->timerStart(MACRO_TIMERS,
1844 mj_timer_base_string + "mj_1D_part()");
1845
1846 this->mj_1D_part(
1847 mj_current_dim_coords,
1848 used_imbalance,
1849 current_work_part,
1850 current_concurrent_num_parts,
1851 current_cut_coordinates,
1852 total_incomplete_cut_count,
1853 view_rectilinear_cut_count,
1854 view_total_reduction_size);
1855
1856 this->mj_env->timerStop(MACRO_TIMERS,
1857 mj_timer_base_string + "mj_1D_part()");
1858 }
1859 else {
// Every concurrent part keeps its single part, so one output slot each.
1860 obtained_part_index += current_concurrent_num_parts;
1861 }
1862 // create part chunks
1863 {
1864 mj_part_t output_array_shift = 0;
1865 mj_part_t cut_shift = 0;
// NOTE(review): tlr_shift is only incremented, never read, in the code
// visible here.
1866 size_t tlr_shift = 0;
1867 size_t partweight_array_shift = 0;
1868
1869 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1870 mj_part_t current_concurrent_work_part = current_work_part + kk;
1871
1872 mj_part_t num_parts = host_num_partitioning_in_current_dim(
1873 current_concurrent_work_part);
1874
1875 // if the part is empty, skip the part.
// Emptiness is detected as min > max in the global min/max array.
1876 int coordinateA_bigger_than_coordinateB =
1877 host_global_min_max_coord_total_weight(kk) >
1878 host_global_min_max_coord_total_weight(
1879 kk + current_concurrent_num_parts);
1880
1881 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
1882 // we still need to write the begin and end point of the empty part.
1883 // simply set it zero, the array indices will be shifted later
1884 auto local_new_part_xadj = this->new_part_xadj;
1885 Kokkos::parallel_for(
1886 Kokkos::RangePolicy<typename mj_node_t::execution_space,
1887 mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
1888 local_new_part_xadj(
1889 output_part_index + output_array_shift + jj) = 0;
1890 });
1891
1892 cut_shift += num_parts - 1;
1893 tlr_shift += (4 *(num_parts - 1) + 1);
1894 output_array_shift += num_parts;
1895 partweight_array_shift += (2 * (num_parts - 1) + 1);
1896 continue;
1897 }
1898 mj_lno_t coordinate_end =
1899 host_part_xadj(current_concurrent_work_part);
1900 mj_lno_t coordinate_begin =
1901 current_concurrent_work_part==0 ? 0 :
1902 host_part_xadj(current_concurrent_work_part-1);
1903
1904 Kokkos::View<mj_scalar_t *, device_t>
1905 current_concurrent_cut_coordinate =
1906 Kokkos::subview(current_cut_coordinates,
1907 std::pair<mj_lno_t, mj_lno_t>(
1908 cut_shift,
1909 current_cut_coordinates.size()));
1910 Kokkos::View<mj_scalar_t *, device_t>
1911 used_local_cut_line_weight_to_left =
1912 Kokkos::subview(process_cut_line_weight_to_put_left,
1913 std::pair<mj_lno_t, mj_lno_t>(
1914 cut_shift,
1915 process_cut_line_weight_to_put_left.size()));
1916
1917 this->thread_part_weight_work =
1918 Kokkos::subview(
1919 this->thread_part_weights,
1920 std::pair<mj_lno_t, mj_lno_t>(
1921 partweight_array_shift,
1922 this->thread_part_weights.size()));
1923
1924 if(num_parts > 1) {
1925 // Rewrite the indices based on the computed cuts.
1926 Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
1927 Kokkos::subview(this->new_part_xadj,
1928 std::pair<mj_lno_t, mj_lno_t>(
1929 output_part_index + output_array_shift,
1930 this->new_part_xadj.size()));
1931
1932 this->create_consistent_chunks(
1933 num_parts,
1934 mj_current_dim_coords,
1935 current_concurrent_cut_coordinate,
1936 coordinate_begin,
1937 coordinate_end,
1938 used_local_cut_line_weight_to_left,
1939 subview_new_part_xadj,
1940 coordInd,
1941 partition_along_longest_dim,
1942 p_coord_dimension_range_sorted);
1943 }
1944 else {
1945 // if this part is partitioned into 1 then just copy
1946 // the old values.
1947 mj_lno_t part_size = coordinate_end - coordinate_begin;
1948
1949 auto local_new_part_xadj = this->new_part_xadj;
1950 Kokkos::parallel_for(
1951 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1952 (0, 1), KOKKOS_LAMBDA (int dummy) {
1953 local_new_part_xadj(output_part_index + output_array_shift)
1954 = part_size;
1955 });
1956
1957 auto subview_new_coordinate_permutations =
1958 Kokkos::subview(this->new_coordinate_permutations,
1959 std::pair<mj_lno_t, mj_lno_t>(
1960 coordinate_begin,
1961 coordinate_begin + part_size));
1962 auto subview_coordinate_permutations =
1963 Kokkos::subview(this->coordinate_permutations,
1964 std::pair<mj_lno_t, mj_lno_t>(
1965 coordinate_begin,
1966 coordinate_begin + part_size));
1967 Kokkos::deep_copy(subview_new_coordinate_permutations,
1968 subview_coordinate_permutations);
1969 }
1970
1971 cut_shift += num_parts - 1;
1972 tlr_shift += (4 *(num_parts - 1) + 1);
1973 output_array_shift += num_parts;
1974 partweight_array_shift += (2 * (num_parts - 1) + 1);
1975 }
1976
1977 // shift cut coordinates so that all cut coordinates are stored.
1978 // current_cut_coordinates += cutShift;
1979
1980 // getChunks from coordinates partitioned the parts and
1981 // wrote the indices as if there were a single part.
1982 // now we need to shift the beginning indices.
1983 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
1984 mj_part_t num_parts =
1985 host_num_partitioning_in_current_dim(current_work_part + kk);
1986 auto local_new_part_xadj = this->new_part_xadj;
1987 auto local_mj_current_dim_coords = mj_current_dim_coords;
1988 auto local_new_coordinate_permutations =
1989 new_coordinate_permutations;
// Each iteration shifts one new part offset; odd-numbered sub-parts also
// negate the current-axis coordinate of the points in
// [new_part_xadj(output_part_index), new_part_xadj(output_part_index+ii)).
// NOTE(review): the lower bound is element output_part_index, which
// iteration ii == 0 of this same parallel_for updates; no ordering between
// that write and the reads in the other iterations is established here.
// Ranges of different odd ii also overlap. Confirm this is intended.
1990 Kokkos::parallel_for(
1991 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
1992 0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
1993 //shift it by previousCount
1994 local_new_part_xadj(output_part_index+ii) +=
1995 output_coordinate_end_index;
1996
1997 if(ii % 2 == 1) {
1998 mj_lno_t coordinate_end =
1999 local_new_part_xadj(output_part_index+ii);
2000 mj_lno_t coordinate_begin =
2001 local_new_part_xadj(output_part_index);
2002
2003 for(mj_lno_t task_traverse = coordinate_begin;
2004 task_traverse < coordinate_end; ++task_traverse) {
2005 mj_lno_t l = local_new_coordinate_permutations(task_traverse);
2006 //MARKER: FLIPPED ZORDER BELOW
2007 local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
2008 }
2009 }
2010 });
2011
2012 // increase the previous count by current end.
// A one-iteration reduction is used to read a single device value back to
// the host.
// NOTE(review): the value read is an mj_lno_t offset but is carried in
// mj_part_t variables (get_single, output_coordinate_end_index) -- confirm
// mj_part_t is wide enough for coordinate counts.
2013 mj_part_t get_single;
2014 Kokkos::parallel_reduce("Read new_part_xadj",
2015 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
2016 KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
2017 set_single = local_new_part_xadj(output_part_index + num_parts - 1);
2018 }, get_single);;
2019
2020 output_coordinate_end_index = get_single;
2021 // increase the current out.
2022 output_part_index += num_parts;
2023 }
2024 }
2025 }
2026
2027 // end of this partitioning dimension
2028 // set the current num parts for next dim partitioning
2029 current_num_parts = output_part_count_in_dimension;
2030
2031 //swap the coordinate permutations for the next dimension.
2032 Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
2033 this->coordinate_permutations = this->new_coordinate_permutations;
2034 this->new_coordinate_permutations = tmp;
2035
// Promote the offsets just computed and rebuild the host mirror from them.
2036 this->part_xadj = this->new_part_xadj;
2037 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2038 Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
2039 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
2040 }
2041
// Hand the final permutation back through the in/out argument.
2042 Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);
2043
2044 // Return output_xadj in CSR format
// Writes num_global_parts + 1 entries; host_part_xadj(i) is the end offset
// of part i.
2045 output_xadj[0] = 0;
2046 for(size_t i = 0; i < this->num_global_parts ; ++i) {
2047 output_xadj[i+1] = host_part_xadj(i);
2048 }
2049
2050 delete future_num_part_in_parts;
2051 delete next_future_num_parts_in_parts;
2052}
2053
// Const accessor: returns the RCP stored in the global_box member (a copy of
// the handle, not of the box it refers to).
// NOTE(review): the class-qualifier line between the return type and the
// function name is absent from this listing (numbering jumps from 2060 to
// 2062).
2057template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2058 typename mj_part_t, typename mj_node_t>
2059RCP<typename AlgMJ
2060 <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
2062 get_global_box() const
2063{
2064 return this->global_box;
2065}
2066
2069template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2070 typename mj_part_t, typename mj_node_t>
2071void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2072 mj_node_t>::set_to_keep_part_boxes()
2073{
2074 this->mj_keep_part_boxes = true;
2075}
2076
2077/* \brief Either the mj array (part_no_array) or num_global_parts should be
2078 * provided in the input. part_no_array takes
2079 * precedence if both are provided.
2080 * Depending on these parameters, the total cut/part number, the
2081 * maximum part/cut number along a dimension, the estimated number of
2082 * reduceAlls, and the number of parts before the last dimension are calculated.
2083 * */
// Members written: total_num_cut, total_num_part, max_num_part_along_dim,
// max_num_cut_along_dim, max_num_total_part_along_dim,
// total_dim_num_reduce_all, last_dim_num_part, and possibly num_global_parts
// and max_concurrent_part_calculation.
// NOTE(review): the lines carrying the return type and the qualified function
// name are absent from this listing (numbering jumps from 2085 to 2088).
2084template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2085 typename mj_part_t, typename mj_node_t>
2088{
2089 this->total_num_cut = 0; //how many cuts will be totally
2090 this->total_num_part = 1; //how many parts will be totally
2091 this->max_num_part_along_dim = 0; // maximum part count along a dimension.
2092 this->total_dim_num_reduce_all = 0; // estimate on #reduceAlls can be done.
2093 this->last_dim_num_part = 1; //max no of parts that might occur
2094 //during the partition before the
2095 //last partitioning dimension.
2096 this->max_num_cut_along_dim = 0;
2097 this->max_num_total_part_along_dim = 0;
2098
// Case 1: an explicit per-level part-count array was supplied.
2099 if(this->part_no_array.size()) {
2100 auto local_recursion_depth = this->recursion_depth;
2101
// NOTE(review): total_num_part is still 1 at this point (it is only
// accumulated below), so this evaluates to recursion_depth -- confirm that is
// the intended estimate.
2102 this->total_dim_num_reduce_all =
2103 this->total_num_part * this->recursion_depth;
2104
// Total part count is the product of the per-level counts.
2105 this->total_num_part = 1;
2106 for(int i = 0; i < local_recursion_depth; ++i) {
2107 this->total_num_part *= this->part_no_array(i);
2108 }
2109
2110 mj_part_t track_max = 0;
2111 for(int i = 0; i < local_recursion_depth; ++i) {
2112 if(part_no_array(i) > track_max) {
2113 track_max = this->part_no_array(i);
2114 };
2115 }
2116
// Reads part_no_array(recursion_depth - 1), so this branch requires
// recursion_depth >= 1 and a non-zero last entry.
2117 this->last_dim_num_part = this->total_num_part /
2118 this->part_no_array(local_recursion_depth-1);
2119
2120 this->max_num_part_along_dim = track_max;
// The requested global part count is overwritten by the product.
2121 this->num_global_parts = this->total_num_part;
2122 } else {
// Case 2: only num_global_parts is known; derive per-level counts.
2123 mj_part_t future_num_parts = this->num_global_parts;
2124
2125 // If using nonuniform first level partitioning.
2126 // initial value max_num_part_along_dim == num_first_level_parts
2127 if (this->first_level_distribution.size() != 0 &&
2128 this->num_first_level_parts > 1) {
2129 this->max_num_part_along_dim = this->num_first_level_parts;
2130 }
2131
2132 // we need to calculate the part numbers now, to determine
2133 // the maximum along the dimensions.
2134 for(int rd = 0; rd < this->recursion_depth; ++rd) {
2135 mj_part_t maxNoPartAlongI = 0;
2136 mj_part_t nfutureNumParts = 0;
2137
2138 // Nonuniform first level partitioning sets part specifications for
2139 // rd == 0 only, given requested num of parts and distribution in parts
2140 // for the first level.
2141 if (rd == 0 &&
2142 this->first_level_distribution.size() != 0 &&
2143 this->num_first_level_parts > 1) {
2144
2145 maxNoPartAlongI = this->num_first_level_parts;
2146 this->max_num_part_along_dim = this->num_first_level_parts;
2147
2148 mj_part_t sum_first_level_dist = 0;
2149 mj_part_t max_part = 0;
2150
2151 // Cumulative sum of distribution of parts and size of largest part
2152 for (int i = 0; i < this->num_first_level_parts; ++i) {
2153 sum_first_level_dist += this->first_level_distribution(i);
2154 if (this->first_level_distribution(i) > max_part)
2155 max_part = this->first_level_distribution(i);
2156 }
2157
2158 // Total parts in largest nonuniform superpart from
2159 // first level partitioning
// Integer arithmetic: the product is formed first, then truncated by the
// division. NOTE(review): divides by sum_first_level_dist, which is zero if
// every distribution entry is zero -- confirm callers rule that out.
2160 nfutureNumParts =
2161 this->num_global_parts * max_part / sum_first_level_dist;
2162 }
2163 // Standard uniform partitioning this level
2164 else {
// Ask for the (recursion_depth - rd)-th root of the remaining part count,
// then take the ceiling of remaining / count as the next remaining count.
2165 maxNoPartAlongI = this->get_part_count(future_num_parts,
2166 1.0f / (this->recursion_depth - rd));
2167 if (maxNoPartAlongI > this->max_num_part_along_dim)
2168 this->max_num_part_along_dim = maxNoPartAlongI;
2169 nfutureNumParts = future_num_parts / maxNoPartAlongI;
2170 if (future_num_parts % maxNoPartAlongI) {
2171 ++nfutureNumParts;
2172 }
2173 }
2174 future_num_parts = nfutureNumParts;
2175 }
2176 this->total_num_part = this->num_global_parts;
2177
2178 if(this->divide_to_prime_first) {
2179 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2180 this->last_dim_num_part = this->num_global_parts;
2181 }
2182 else {
2183 //this is the lower bound.
2184 //estimate reduceAll Count here.
2185 //we find the upperbound instead.
// Sums 1 + m + m^2 + ... over recursion_depth levels, where m is
// max_num_part_along_dim; p ends as m^recursion_depth.
2186 size_t p = 1;
2187 for(int i = 0; i < this->recursion_depth; ++i) {
2188 this->total_dim_num_reduce_all += p;
2189 p *= this->max_num_part_along_dim;
2190 }
2191
// p / m is the part count one level before the last, capped at
// num_global_parts.
2192 if(p / this->max_num_part_along_dim > this->num_global_parts) {
2193 this->last_dim_num_part = this->num_global_parts;
2194 }
2195 else {
2196 this->last_dim_num_part = p / this->max_num_part_along_dim;
2197 }
2198 }
2199 }
2200
2201 this->total_num_cut = this->total_num_part - 1;
2202 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2203 this->max_num_total_part_along_dim = this->max_num_part_along_dim +
2204 size_t(this->max_num_cut_along_dim);
2205 // maxPartNo is P, maxCutNo = P-1, matTotalPartcount = 2P-1
2206
2207 // refine the concurrent part count, if it is given bigger than the maximum
2208 // possible part count.
// Rank 0 of mj_problemComm prints the warning; every rank applies the clamp.
2209 if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
2210 if(this->mj_problemComm->getRank() == 0) {
2211 std::cerr << "Warning: Concurrent part count (" <<
2212 this->max_concurrent_part_calculation <<
2213 ") has been set bigger than maximum amount that can be used." <<
2214 " Setting to:" << this->last_dim_num_part << "." << std::endl;
2215 }
2216 this->max_concurrent_part_calculation = this->last_dim_num_part;
2217 }
2218}
2219
2220/* \brief Tries to determine the part number for current dimension,
2221 * by trying to make the partitioning as square as possible.
2222 * \param num_total_future how many more partitionings are required.
2223 * \param root how many more recursion depth is left.
2224 */
2225template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2226 typename mj_part_t, typename mj_node_t>
2227inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2228 get_part_count(mj_part_t num_total_future, double root)
2229{
2230 double fp = pow(num_total_future, root);
2231 mj_part_t ip = mj_part_t(fp);
2232 if(fp - ip < std::numeric_limits<float>::epsilon() * 100) {
2233 return ip;
2234 }
2235 else {
2236 return ip + 1;
2237 }
2238}
2239
2240/* \brief Function returns how many parts that will be obtained after this
2241 * dimension partitioning. It sets how many parts each current part will be
2242 * partitioned into in this dimension to device_num_partitioning_in_current_dim
2243 * view, sets how many total future parts each obtained part will be
2244 * partitioned into in next_future_num_parts_in_parts vector. If part boxes are
2245 * kept, then initializes the output_part_boxes as their ancestors.
2246 * \param future_num_part_in_parts: input, how many future parts each current
2247 * part will be partitioned into.
2248 * \param next_future_num_parts_in_parts: output, how many future parts each
2249 * obtained part will be partitioned into.
2250 * \param future_num_parts: output, max number of future parts that will be
2251 * obtained from a single part.
2252 * \param current_num_parts: input, how many parts are there currently.
2253 * \param current_iteration: input, current dimension iteration number.
2254 * \param input_part_boxes: input, if boxes are kept, current boxes.
2255 * \param output_part_boxes: output, if boxes are kept, the initial box
2256 * boundaries for obtained parts.
2257 * \param atomic_part_count DOCWORK: Documentation
2258 */
2259template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2260 typename mj_part_t, typename mj_node_t>
2261mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2262 update_part_num_arrays(
2263 std::vector<mj_part_t> *future_num_part_in_parts,
2264 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2265 mj_part_t &future_num_parts,
2266 mj_part_t current_num_parts,
2267 int current_iteration,
2268 RCP<mj_partBoxVector_t> input_part_boxes,
2269 RCP<mj_partBoxVector_t> output_part_boxes,
2270 mj_part_t atomic_part_count)
2271{
2272 std::vector<mj_part_t> num_partitioning_in_current_dim;
2273
2274 // how many parts that will be obtained after this dimension.
2275 mj_part_t output_num_parts = 0;
2276 if(this->part_no_array.size()) {
2277 // when the partNo array is provided as input,
2278 // each current partition will be partition to the same number of parts.
2279 // we dont need to use the future_num_part_in_parts vector in this case.
2280 mj_part_t current_part_no_array =
2281 this->part_no_array(current_iteration);
2282
2283 if(current_part_no_array < 1) {
2284 std::cout << "Current recursive iteration: " << current_iteration <<
2285 " part_no_array[" << current_iteration << "] is given as:" <<
2286 current_part_no_array << std::endl;
2287 std::terminate();
2288 }
2289 if(current_part_no_array == 1) {
2290 return current_num_parts;
2291 }
2292
2293 // If using part_no_array, ensure compatibility with num_first_level_parts.
2294 if (this->first_level_distribution.size() != 0 &&
2295 current_iteration == 0 &&
2296 current_part_no_array != this->num_first_level_parts) {
2297 std::cout << "Current recursive iteration: " << current_iteration
2298 << " part_no_array[" << current_iteration << "] is given as: " <<
2299 current_part_no_array << " and contradicts num_first_level_parts: " <<
2300 this->num_first_level_parts << std::endl;
2301 std::terminate();
2302 }
2303
2304 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2305 num_partitioning_in_current_dim.push_back(current_part_no_array);
2306 }
2307
2308/*
2309 std::cout << "\n\nme: " << this->myRank << " current_iteration: " <<
2310 current_iteration << " current_num_parts: " <<
2311 current_num_parts << "\n\n";
2312
2313 std::cout << "\n\nnum_partitioning_in_current_dim[0]: " <<
2314 num_partitioning_in_current_dim[0] << "\n\n";
2315
2316 std::cout << "\n\nfuture_num_parts: " << future_num_parts
2317 << " num_partitioning_in_current_dim[0]: " <<
2318 num_partitioning_in_current_dim[0] << " " <<
2319 future_num_parts / num_partitioning_in_current_dim[0] << "\n\n";
2320*/
2321
2322 future_num_parts /= num_partitioning_in_current_dim[0];
2323 output_num_parts = current_num_parts *
2324 num_partitioning_in_current_dim[0];
2325 if(this->mj_keep_part_boxes) {
2326 for(mj_part_t k = 0; k < current_num_parts; ++k) {
2327 //initialized the output boxes as its ancestor.
2328 for(mj_part_t j = 0; j <
2329 num_partitioning_in_current_dim[0]; ++j) {
2330 output_part_boxes->push_back((*input_part_boxes)[k]);
2331 }
2332 }
2333 }
2334
2335 // set the how many more parts each part will be divided.
2336 // this is obvious when partNo array is provided as input.
2337 // however, fill this so weights will be calculated according to this array.
2338 for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2339 next_future_num_parts_in_parts->push_back(future_num_parts);
2340 }
2341 }
2342 else {
2343 // if partNo array is not provided as input, future_num_part_in_parts
2344 // holds how many parts each part should be divided. Initially it holds a
2345 // single number equal to the total number of global parts.
2346
2347 // calculate the future_num_parts from beginning,
2348 // since each part might be divided into different number of parts.
2349 future_num_parts = 1;
2350
2351 // cout << "i:" << i << std::endl;
2352 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2353 // get how many parts a part should be divided.
2354 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2355
2356 // get the ideal number of parts that is close to the
2357 // (recursion_depth - i) root of the future_num_parts_of_part_ii.
2358 mj_part_t num_partitions_in_current_dim =
2359 this->get_part_count(future_num_parts_of_part_ii,
2360 1.0 / (this->recursion_depth - current_iteration)
2361 );
2362 if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2363 std::cerr << "ERROR: maxPartNo calculation is wrong."
2364 " num_partitions_in_current_dim: "
2365 << num_partitions_in_current_dim << " this->max_num_part_along_dim: "
2366 << this->max_num_part_along_dim <<
2367 " this->recursion_depth: " << this->recursion_depth <<
2368 " current_iteration:" << current_iteration <<
2369 " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2370 " might need to fix max part no calculation for "
2371 "largest_prime_first partitioning." <<
2372 std::endl;
2373 std::terminate();
2374 }
2375 // add this number to vector_num_partitioning_in_current_dim vector.
2376 // num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2377 // mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2378
2379 // Update part num arrays when on current_iteration == 0 and
2380 // using nonuniform first level partitioning
2381 // with requested num parts (num_first_level_parts) and
2382 // a requested distribution in parts (first_level_distribution).
2383 if (current_iteration == 0 &&
2384 this->first_level_distribution.size() != 0 &&
2385 this->num_first_level_parts > 1) {
2386 // Only 1 current part to begin and partitions into
2387 // num_first_level_parts many parts
2388 num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2389
2390 // The output number of parts from first level partitioning
2391 output_num_parts = this->num_first_level_parts;
2392
2393 // Remaining parts left to partition for all future levels
2394 future_num_parts /= this->num_first_level_parts;
2395
2396 mj_part_t max_part = 0;
2397 mj_part_t sum_first_level_dist = 0;
2398
2399 // Cumulative sum of distribution of first level parts
2400 // and size of largest first level part
2401 for (int i = 0; i < this->num_first_level_parts; ++i) {
2402 sum_first_level_dist += this->first_level_distribution(i);
2403
2404 if (this->first_level_distribution(i) > max_part)
2405 max_part = this->first_level_distribution(i);
2406 }
2407
2408 // Maximum # of remaining parts left to partition for all future levels
2409 future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2410
2411 // Number of parts remaining left to partition for each future_part
2412 // The sum must exactly equal global_num_parts
2413 for (int i = 0; i < this->num_first_level_parts; ++i) {
2414 next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2415 this->num_global_parts / sum_first_level_dist);
2416 }
2417 }
2418 else if (this->divide_to_prime_first) {
2419 // Add this number to num_partitioning_in_current_dim vector.
2420 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2421
2422 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2423
2424 //increase the output number of parts.
2425 output_num_parts += num_partitions_in_current_dim;
2426
2427 if (future_num_parts_of_part_ii == atomic_part_count ||
2428 future_num_parts_of_part_ii % atomic_part_count != 0) {
2429 atomic_part_count = 1;
2430 }
2431
2432 largest_prime_factor =
2433 this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2434
2435 // We divide to num_partitions_in_current_dim. But we adjust the weights
2436 // based on largest prime/ if num_partitions_in_current_dim = 2,
2437 // largest prime = 5 --> we divide to 2 parts with weights 3x and 2x.
2438 // if the largest prime is less than part count, we use the part count
2439 // so that we divide uniformly.
2440 if (largest_prime_factor < num_partitions_in_current_dim) {
2441 largest_prime_factor = num_partitions_in_current_dim;
2442 }
2443 //ideal number of future partitions for each part.
2444 mj_part_t ideal_num_future_parts_in_part =
2445 (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2446 //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2447 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2448
2449/*
2450 std::cout << "\ncurrent num part: " << ii
2451 << " largest_prime_factor: " << largest_prime_factor
2452 << " To Partition: " << future_num_parts_of_part_ii << "\n\n";
2453*/
2454
2455 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2456 //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2457 mj_part_t my_ideal_primescale = ideal_prime_scale;
2458 //left over weighs. Left side is adjusted to be 3x, right side stays as 2x
2459 if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2460 ++my_ideal_primescale;
2461 }
2462 //scale with 'x';
2463 mj_part_t num_future_parts_for_part_iii =
2464 ideal_num_future_parts_in_part * my_ideal_primescale;
2465
2466 //if there is a remainder in the part increase the part weight.
2467 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2468 //if not uniform, add 1 for the extra parts.
2469 ++num_future_parts_for_part_iii;
2470 }
2471
2472 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2473
2474 //if part boxes are stored, initialize the box of the parts as the ancestor.
2475 if (this->mj_keep_part_boxes) {
2476 output_part_boxes->push_back((*input_part_boxes)[ii]);
2477 }
2478
2479 //set num future_num_parts to maximum in this part.
2480 if (num_future_parts_for_part_iii > future_num_parts)
2481 future_num_parts = num_future_parts_for_part_iii;
2482
2483 }
2484 }
2485 else {
2486 // Add this number to num_partitioning_in_current_dim vector.
2487 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2488
2489 //increase the output number of parts.
2490 output_num_parts += num_partitions_in_current_dim;
2491
2492 if((future_num_parts_of_part_ii == atomic_part_count) ||
2493 (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2494 atomic_part_count = 1;
2495 }
2496 //ideal number of future partitions for each part.
2497 mj_part_t ideal_num_future_parts_in_part =
2498 (future_num_parts_of_part_ii / atomic_part_count) /
2499 num_partitions_in_current_dim;
2500 for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2501 mj_part_t num_future_parts_for_part_iii =
2502 ideal_num_future_parts_in_part;
2503
2504 //if there is a remainder in the part increase the part weight.
2505 if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2506 num_partitions_in_current_dim) {
2507 // if not uniform, add 1 for the extra parts.
2508 ++num_future_parts_for_part_iii;
2509 }
2510
2511 next_future_num_parts_in_parts->push_back(
2512 num_future_parts_for_part_iii * atomic_part_count);
2513
2514 // if part boxes are stored, initialize the box of the parts as
2515 // the ancestor.
2516 if(this->mj_keep_part_boxes) {
2517 output_part_boxes->push_back((*input_part_boxes)[ii]);
2518 }
2519 //set num future_num_parts to maximum in this part.
2520 if(num_future_parts_for_part_iii > future_num_parts)
2521 future_num_parts = num_future_parts_for_part_iii;
2522 }
2523 }
2524 }
2525 }
2526 // move temp std::vector to host view
2527 device_num_partitioning_in_current_dim = Kokkos::View<
2528 mj_part_t*, device_t>("test", num_partitioning_in_current_dim.size());
2529 host_num_partitioning_in_current_dim =
2530 Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2531 for(size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2532 host_num_partitioning_in_current_dim(n) =
2533 num_partitioning_in_current_dim[n];
2534 }
2535 // setup device equivalent - this data is used on host and device and it's
2536 // more efficient to just setup array on both sides now rather than copy
2537 // values as needed later.
2538 Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2539 host_num_partitioning_in_current_dim);
2540 return output_num_parts;
2541}
2542
2543/* \brief Allocates and initializes the work memory that will be used by MJ.
2544 * */
2545template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2546 typename mj_part_t, typename mj_node_t>
2547void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2548 allocate_set_work_memory()
2549{
2550 // Throughout the partitioning execution,
2551 // instead of the moving the coordinates, hold a permutation array for parts.
2552 // coordinate_permutations holds the current permutation.
2553 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2554 Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
2555 this->num_local_coords);
2556 auto local_coordinate_permutations = coordinate_permutations;
2557 Kokkos::parallel_for(
2558 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2559 0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2560 local_coordinate_permutations(i) = i;
2561 });
2562
2563 // new_coordinate_permutations holds the current permutation.
2564 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2565 Kokkos::ViewAllocateWithoutInitializing("num_local_coords"),
2566 this->num_local_coords);
2567
2568 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2569 Kokkos::ViewAllocateWithoutInitializing("assigned parts"), 0);
2570 if(this->num_local_coords > 0) {
2571 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2572 Kokkos::ViewAllocateWithoutInitializing("assigned part ids"),
2573 this->num_local_coords);
2574 }
2575
2576 // single partition starts at index-0, and ends at numLocalCoords
2577 // inTotalCounts array holds the end points in coordinate_permutations array
2578 // for each partition. Initially sized 1, and single element is set to
2579 // numLocalCoords.
2580 this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2581 Kokkos::ViewAllocateWithoutInitializing("part xadj"), 1);
2582 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2583 host_part_xadj(0) = num_local_coords;
2584 Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2585
2586 // the ends points of the output, this is allocated later.
2587 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2588 Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2589
2590 // only store this much if cuts are needed to be stored.
2591 this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2592 Kokkos::ViewAllocateWithoutInitializing("all cut coordinates"),
2593 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2594
2595 // how much weight percentage should a MPI put left side of the each cutline
2596 this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2597 device_t>(Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2598
2599 // how much weight percentage should each thread in MPI put left side of
2600 // each outline
2601 this->thread_cut_line_weight_to_put_left =
2602 Kokkos::View<mj_scalar_t*, device_t>(
2603 Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2604
2605 if(this->distribute_points_on_cut_lines) {
2606 this->process_cut_line_weight_to_put_left =
2607 Kokkos::View<mj_scalar_t *, device_t>(
2608 Kokkos::ViewAllocateWithoutInitializing(
2609 "process_cut_line_weight_to_put_left"),
2610 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2611 this->thread_cut_line_weight_to_put_left =
2612 Kokkos::View<mj_scalar_t *, device_t>(
2613 Kokkos::ViewAllocateWithoutInitializing(
2614 "thread_cut_line_weight_to_put_left"),
2615 this->max_num_cut_along_dim);
2616 this->process_rectilinear_cut_weight =
2617 Kokkos::View<mj_scalar_t *, device_t>(
2618 Kokkos::ViewAllocateWithoutInitializing("process_rectilinear_cut_weight"),
2619 this->max_num_cut_along_dim);
2620 this->global_rectilinear_cut_weight =
2621 Kokkos::View<mj_scalar_t *, device_t>(
2622 Kokkos::ViewAllocateWithoutInitializing("global_rectilinear_cut_weight"),
2623 this->max_num_cut_along_dim);
2624 }
2625
2626 // work array to manipulate coordinate of cutlines in different iterations.
2627 // necessary because previous cut line information is used for determining
2628 // the next cutline information. therefore, cannot update the cut work array
2629 // until all cutlines are determined.
2630 this->cut_coordinates_work_array =
2631 Kokkos::View<mj_scalar_t *, device_t>(
2632 Kokkos::ViewAllocateWithoutInitializing("cut_coordinates_work_array"),
2633 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2634
2635 // cumulative part weight array.
2636 this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2637 Kokkos::ViewAllocateWithoutInitializing("target_part_weights"),
2638 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2639
2640 // upper bound coordinate of a cut line
2641 this->cut_upper_bound_coordinates =
2642 Kokkos::View<mj_scalar_t*, device_t>(
2643 Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_coordinates"),
2644 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2645
2646 // lower bound coordinate of a cut line
2647 this->cut_lower_bound_coordinates =
2648 Kokkos::View<mj_scalar_t*, device_t>(
2649 Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_coordinates"),
2650 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2651
2652 // lower bound weight of a cut line
2653 this->cut_lower_bound_weights =
2654 Kokkos::View<mj_scalar_t*, device_t>(
2655 Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_weights"),
2656 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2657
2658 //upper bound weight of a cut line
2659 this->cut_upper_bound_weights =
2660 Kokkos::View<mj_scalar_t*, device_t>(
2661 Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_weights"),
2662 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2663
2664 // combined array to exchange the min and max coordinate,
2665 // and total weight of part.
2666 this->process_local_min_max_coord_total_weight =
2667 Kokkos::View<mj_scalar_t*, device_t>(
2668 Kokkos::ViewAllocateWithoutInitializing(
2669 "process_local_min_max_coord_total_weight"),
2670 3 * this->max_concurrent_part_calculation);
2671
2672 // global combined array with the results for min, max and total weight.
2673 this->global_min_max_coord_total_weight =
2674 Kokkos::View<mj_scalar_t*, device_t>(
2675 Kokkos::ViewAllocateWithoutInitializing("global_min_max_coord_total_weight"),
2676 3 * this->max_concurrent_part_calculation);
2677
2678 // is_cut_line_determined is used to determine if a cutline is
2679 // determined already. If a cut line is already determined, the next
2680 // iterations will skip this cut line.
2681 this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2682 Kokkos::ViewAllocateWithoutInitializing("is_cut_line_determined"),
2683 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2684
2685 // incomplete_cut_count count holds the number of cutlines that have not
2686 // been finalized for each part when concurrentPartCount>1, using this
2687 // information, if incomplete_cut_count[x]==0, then no work is done for
2688 // this part.
2689 this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2690 Kokkos::ViewAllocateWithoutInitializing("device_incomplete_cut_count"),
2691 this->max_concurrent_part_calculation);
2692 this->incomplete_cut_count =
2693 Kokkos::create_mirror_view(device_incomplete_cut_count);
2694
2695 // local part weights of each thread.
2696 this->thread_part_weights = Kokkos::View<double *, device_t>(
2697 Kokkos::ViewAllocateWithoutInitializing("thread_part_weights"),
2698 this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2699
2700 this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2701 Kokkos::ViewAllocateWithoutInitializing("thread_cut_left_closest_point"),
2702 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2703
2704 // thread_cut_right_closest_point to hold the closest coordinate to a
2705 // cutline from right (for each thread)
2706 this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2707 Kokkos::ViewAllocateWithoutInitializing("thread_cut_right_closest_point"),
2708 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2709
2710 // to store how many points in each part a thread has.
2711 this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2712 Kokkos::ViewAllocateWithoutInitializing("thread_point_counts"),
2713 this->max_num_part_along_dim);
2714
2715 // for faster communication, concatanation of
2716 // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
2717 // leftClosest distances sized P-1, since P-1 cut lines
2718 // rightClosest distances size P-1, since P-1 cut lines.
2719 this->total_part_weight_left_right_closests =
2720 Kokkos::View<mj_scalar_t*, device_t>(
2721 Kokkos::ViewAllocateWithoutInitializing(
2722 "total_part_weight_left_right_closests"),
2723 (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2724 this->max_concurrent_part_calculation);
2725
2726 this->global_total_part_weight_left_right_closests =
2727 Kokkos::View<mj_scalar_t*, device_t>(
2728 Kokkos::ViewAllocateWithoutInitializing(
2729 "global_total_part_weight_left_right_closests"),
2730 (this->max_num_total_part_along_dim +
2731 this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2732
2733 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2734 Kokkos::ViewAllocateWithoutInitializing("gids"), num_local_coords);
2735
2736 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2737 Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
2738 num_local_coords);
2739
2740 // changes owners back to host - so we don't run them on device
2741 // this improves migration code but means we have to serial init here.
2742 // Note we might allow this to be OpenMP when available even for CUDA.
2743 Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2744
2745 auto local_current_mj_gnos = current_mj_gnos;
2746 auto local_initial_mj_gnos = initial_mj_gnos;
2747 Kokkos::parallel_for(
2748 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2749 (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2750 local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2751 });
2752}
2753
2754/* \brief compute the global bounding box
2755 */
2756template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2757 typename mj_part_t, typename mj_node_t>
2758void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2759 mj_node_t>::compute_global_box()
2760{
2761 //local min coords
2762 mj_scalar_t *mins = new mj_scalar_t[this->coord_dim];
2763 //global min coords
2764 mj_scalar_t *gmins = new mj_scalar_t[this->coord_dim];
2765 //local max coords
2766 mj_scalar_t *maxs = new mj_scalar_t[this->coord_dim];
2767 //global max coords
2768 mj_scalar_t *gmaxs = new mj_scalar_t[this->coord_dim];
2769
2770 auto local_mj_coordinates = this->mj_coordinates;
2771
2772 // If we are only doing 2 parts then we don't need these values
2773 // for y and z. Init them all to 0 first
2774 for(int i = 0; i < this->coord_dim; ++i) {
2775 mins[i] = 0;
2776 maxs[i] = 0;
2777 }
2778
2779 for(int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2780 Kokkos::parallel_reduce("MinReduce",
2781 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2782 (0, this->num_local_coords),
2783 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2784 if(local_mj_coordinates(j,i) < running_min) {
2785 running_min = local_mj_coordinates(j,i);
2786 }
2787 }, Kokkos::Min<mj_scalar_t>(mins[i]));
2788 Kokkos::parallel_reduce("MaxReduce",
2789 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2790 (0, this->num_local_coords),
2791 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2792 if(local_mj_coordinates(j,i) > running_max) {
2793 running_max = local_mj_coordinates(j,i);
2794 }
2795 }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2796 }
2797
2798 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2799 this->coord_dim, mins, gmins
2800 );
2801
2802 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2803 this->coord_dim, maxs, gmaxs
2804 );
2805
2806 //create single box with all areas.
2807 global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2808 //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2809 delete [] mins;
2810 delete [] gmins;
2811 delete [] maxs;
2812 delete [] gmaxs;
2813}
2814
2815/* \brief for part communication we keep track of the box boundaries.
2816 * This is performed when either asked specifically, or when geometric mapping
2817 * is performed afterwards.
2818 * This function initializes a single box with all global min, max coordinates.
2819 * \param initial_partitioning_boxes the input and output vector for boxes.
2820 */
2821template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2822 typename mj_part_t, typename mj_node_t>
2823void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2824 mj_node_t>::init_part_boxes(
2825 RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2826{
2827 mj_partBox_t tmp_box(*global_box);
2828 initial_partitioning_boxes->push_back(tmp_box);
2829}
2830
2835template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2836 typename mj_part_t,
2837 typename mj_node_t>
2838void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2839 mj_get_local_min_max_coord_totW(
2840 mj_part_t current_work_part,
2841 mj_part_t current_concurrent_num_parts,
2842 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2843{
2844 auto local_coordinate_permutations = this->coordinate_permutations;
2845 auto local_process_local_min_max_coord_total_weight =
2846 this->process_local_min_max_coord_total_weight;
2847 auto local_mj_weights = this->mj_weights;
2848
2849 bool bUniformWeights = mj_uniform_weights(0);
2850
2851 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2852
2853 mj_part_t concurrent_current_part = current_work_part + kk;
2854 mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2855 host_part_xadj(concurrent_current_part - 1);
2856 mj_lno_t coordinate_end_index =
2857 host_part_xadj(concurrent_current_part);
2858
2859 mj_scalar_t my_min_coord = 0;
2860 mj_scalar_t my_max_coord = 0;
2861 mj_scalar_t my_total_weight;
2862 //if the part is empty.
2863 //set the min and max coordinates as reverse.
2864 if(coordinate_begin_index >= coordinate_end_index)
2865 {
2866 my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2867 my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2868 my_total_weight = 0;
2869 }
2870 else {
2871 // get min
2872 Kokkos::parallel_reduce("get min",
2873 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2874 (coordinate_begin_index, coordinate_end_index),
2875 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2876 int i = local_coordinate_permutations(j);
2877 if(mj_current_dim_coords(i) < running_min)
2878 running_min = mj_current_dim_coords(i);
2879 }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2880 // get max
2881 Kokkos::parallel_reduce("get max",
2882 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2883 (coordinate_begin_index, coordinate_end_index),
2884 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2885 int i = local_coordinate_permutations(j);
2886 if(mj_current_dim_coords(i) > running_max)
2887 running_max = mj_current_dim_coords(i);
2888 }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2889 if(bUniformWeights) {
2890 my_total_weight = coordinate_end_index - coordinate_begin_index;
2891 }
2892 else {
2893 my_total_weight = 0;
2894 Kokkos::parallel_reduce("get weight",
2895 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2896 (coordinate_begin_index, coordinate_end_index),
2897 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2898 int i = local_coordinate_permutations(j);
2899 lsum += local_mj_weights(i,0);
2900 }, my_total_weight);
2901 }
2902 }
2903
2904 // single write
2905 Kokkos::parallel_for(
2906 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2907 (0, 1), KOKKOS_LAMBDA (int dummy) {
2908 local_process_local_min_max_coord_total_weight(kk) =
2909 my_min_coord;
2910 local_process_local_min_max_coord_total_weight(
2911 kk + current_concurrent_num_parts) = my_max_coord;
2912 local_process_local_min_max_coord_total_weight(
2913 kk + 2*current_concurrent_num_parts) = my_total_weight;
2914 });
2915 }
2916}
2917
2930template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2931 typename mj_part_t, typename mj_node_t>
2932void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2933 mj_node_t>::mj_get_global_min_max_coord_totW(
2934 mj_part_t current_concurrent_num_parts,
2935 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2936 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2937 // reduce min for first current_concurrent_num_parts elements, reduce
2938 // max for next concurrentPartCount elements, reduce sum for the last
2939 // concurrentPartCount elements.
2940 if(this->comm->getSize() > 1) {
2941 // We're using explicit host here as Spectrum MPI would fail
2942 // with the prior host_mirror_type UVMSpace to UVMSpace setup.
2943 auto host_local_min_max_total =
2944 Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2945 auto host_global_min_max_total =
2946 Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2947 Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
2949 reductionOp(current_concurrent_num_parts,
2950 current_concurrent_num_parts, current_concurrent_num_parts);
2951 try {
2952 reduceAll<int, mj_scalar_t>(
2953 *(this->comm),
2954 reductionOp,
2955 3 * current_concurrent_num_parts,
2956 host_local_min_max_total.data(),
2957 host_global_min_max_total.data());
2958 }
2959 Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2960 Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2961 }
2962 else {
2963 mj_part_t s = 3 * current_concurrent_num_parts;
2964 Kokkos::parallel_for(
2965 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2966 (0, s), KOKKOS_LAMBDA (mj_part_t i) {
2967 global_min_max_total(i) = local_min_max_total(i);
2968 });
2969 }
2970}
2971
3004template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3005 typename mj_part_t, typename mj_node_t>
3006void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3007 mj_get_initial_cut_coords_target_weights(
3008 mj_scalar_t min_coord,
3009 mj_scalar_t max_coord,
3010 mj_part_t num_cuts/*p-1*/ ,
3011 mj_scalar_t global_weight,
3012 /*p - 1 sized, coordinate of each cut line*/
3013 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3014 /*cumulative weights, at left side of each cut line. p-1 sized*/
3015 Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3016 std::vector <mj_part_t> *future_num_part_in_parts, //the vecto
3017 std::vector <mj_part_t> *next_future_num_parts_in_parts,
3018 mj_part_t concurrent_current_part,
3019 mj_part_t obtained_part_index,
3020 mj_part_t num_target_first_level_parts,
3021 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3022{
3023 mj_scalar_t coord_range = max_coord - min_coord;
3024
3025 // We decided we could keep some std::vectors around for now. Eventually
3026 // it would be nice to have everything just as views with some being device
3027 // and some host. This particular case needs a bit of work to get setup
3028 // in a cleaner way so not going to mess with it at the moment.
3029
3030 bool bUniformPartsCheck =
3031 num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3032
3033 if(!bUniformPartsCheck) {
3034 bool bValidNonUniformTargetWeights =
3035 (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3036 if(!bValidNonUniformTargetWeights) {
3037 std::cerr << "MJ does not support non uniform part weights beyond the first partition" << std::endl;
3038 std::terminate();
3039 }
3040 }
3041
3042 Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3043 "device_cumulative", num_cuts);
3044 auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3045
3046 mj_scalar_t cumulative = 0;
3047
3048 if(bUniformPartsCheck) {
3049 // How many total future parts the part will be partitioned into.
3050 mj_scalar_t total_future_part_count_in_part =
3051 static_cast<mj_scalar_t>((*future_num_part_in_parts)[concurrent_current_part]);
3052
3053 // How much each part should weigh in ideal case.
3054 mj_scalar_t unit_part_weight =
3055 global_weight / total_future_part_count_in_part;
3056
3057 for(mj_part_t i = 0; i < num_cuts; ++i) {
3058 cumulative += unit_part_weight * static_cast<mj_scalar_t>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3059 host_cumulative(i) = cumulative;
3060 }
3061 }
3062 else {
3063 // Sum of entries in the first level partition distribution vector
3064 mj_scalar_t sum_target_first_level_dist = 0.0;
3065 for (int i = 0; i < num_target_first_level_parts; ++i) {
3066 sum_target_first_level_dist += target_first_level_dist(i);
3067 }
3068
3069 for(mj_part_t i = 0; i < num_cuts; ++i) {
3070 cumulative += global_weight * target_first_level_dist(i) /
3071 sum_target_first_level_dist;
3072 host_cumulative(i) = cumulative;
3073 }
3074 }
3075
3076 Kokkos::deep_copy(device_cumulative, host_cumulative);
3077
3078 Kokkos::parallel_for("Write num in parts",
3079 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3080 (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3081 // set target part weight.
3082 current_target_part_weights(cut) = device_cumulative(cut);
3083 initial_cut_coords(cut) = min_coord +
3084 (coord_range * device_cumulative(cut)) / global_weight;
3085 // set this multiple times but here for device handling
3086 current_target_part_weights(num_cuts) = global_weight;
3087 });
3088
3089 // round the target part weights.
3090 // Note need to discuss regarding DragonFly commits and determine if we
3091 // would not simply check mj_uniform_weights here.
3092 if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3093 Kokkos::parallel_for(
3094 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3095 (0, num_cuts + 1),
3096 KOKKOS_LAMBDA (mj_part_t i) {
3097 current_target_part_weights(i) =
3098 long(current_target_part_weights(i) + 0.5);
3099 });
3100 }
3101}
3102
3119template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3120 typename mj_part_t, typename mj_node_t>
3121void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3122 set_initial_coordinate_parts(
3123 mj_scalar_t &max_coordinate,
3124 mj_scalar_t &min_coordinate,
3125 mj_lno_t coordinate_begin_index,
3126 mj_lno_t coordinate_end_index,
3127 Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3128 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3129 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3130 mj_part_t &partition_count)
3131{
3132 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3133
3134 // if there is single point, or if all points are along a line.
3135 // set initial part to 0 for all.
3136 if(std::abs(coordinate_range) < this->sEpsilon ) {
3137 Kokkos::parallel_for(
3138 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3139 (coordinate_begin_index, coordinate_end_index),
3140 KOKKOS_LAMBDA (mj_lno_t ii) {
3141 mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3142 });
3143 }
3144 else {
3145 // otherwise estimate an initial part for each coordinate.
3146 // assuming uniform distribution of points.
3147 mj_scalar_t slice = coordinate_range / partition_count;
3148 Kokkos::parallel_for(
3149 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3150 (coordinate_begin_index, coordinate_end_index),
3151 KOKKOS_LAMBDA (mj_lno_t ii) {
3152 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3153 mj_part_t pp =
3154 mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3155 if(pp >= partition_count) {
3156 pp = partition_count - 1; // don't want last coord in an invalid part
3157 }
3158 mj_part_ids[iii] = 2 * pp;
3159 });
3160 }
3161}
3162
// Iteratively determines the cut coordinates along one dimension for
// `current_concurrent_num_parts` parts, starting at part `current_work_part`.
//
// Each pass of the main loop (1) accumulates part weights and closest points
// via mj_1D_part_get_part_weights, (2) combines them via
// mj_combine_rightleft_and_weights, (3) reduces them across processes when
// more than one process is present, and (4) calls mj_get_new_cut_coordinates
// once per part that still has undetermined cuts. The loop ends when
// total_incomplete_cut_count reaches zero.
//
// Parameters:
//   mj_current_dim_coords       coordinates, forwarded to the weight kernel.
//   used_imbalance_tolerance    forwarded to mj_get_new_cut_coordinates.
//   current_work_part           index of the first part handled here.
//   current_concurrent_num_parts number of parts handled together.
//   current_cut_coordinates     cut coordinates; aliased by temp_cut_coords.
//   total_incomplete_cut_count  number of cuts not yet determined (by value).
//   view_rectilinear_cut_count  reset to 0 here, forwarded to the cut update.
//   view_total_reduction_size   receives the sum of (4 * num_cuts + 1) over
//                               the concurrent parts.
//
// NOTE(review): this listing extract is missing lines 3191 and 3201, which
// belong to the declaration and allocation of `reductionOp`; restore them
// from the upstream file before compiling.
3177template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3178 typename mj_part_t, typename mj_node_t>
3179void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3180 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3181 double used_imbalance_tolerance,
3182 mj_part_t current_work_part,
3183 mj_part_t current_concurrent_num_parts,
3184 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3185 mj_part_t total_incomplete_cut_count,
3186 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3187 Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3188{
3189 this->temp_cut_coords = current_cut_coordinates;
3190
// NOTE(review): listing line 3191 (the type of the pointer declared below)
// is absent from this extract.
3192 *reductionOp = NULL;
3193
3194 bool bSingleProcess = (this->comm->getSize() == 1);
3195
// The reduction operator is only created when more than one process takes
// part; it is given a std::vector copy of the host partitioning counts.
3196 std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3197 if(!bSingleProcess) {
3198 for(size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3199 temp[n] = host_num_partitioning_in_current_dim(n);
3200 }
// NOTE(review): listing line 3201 (start of the statement continued below)
// is absent from this extract.
3202 <mj_part_t, mj_scalar_t>(
3203 &temp,
3204 current_work_part,
3205 current_concurrent_num_parts);
3206 }
3207
// Local copies of the member views and flags so the device lambdas below
// capture plain values rather than `this`.
3208 auto local_cut_lower_bound_coordinates =
3209 cut_lower_bound_coordinates;
3210 auto local_cut_upper_bound_coordinates =
3211 cut_upper_bound_coordinates;
3212 auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3213 auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3214 bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3215 auto local_process_cut_line_weight_to_put_left =
3216 process_cut_line_weight_to_put_left;
3217 auto local_temp_cut_coords = temp_cut_coords;
3218 auto local_global_total_part_weight_left_right_closests =
3219 global_total_part_weight_left_right_closests;
3220 auto local_cut_coordinates_work_array =
3221 cut_coordinates_work_array;
3222 auto local_part_xadj = part_xadj;
3223 auto local_global_min_max_coord_total_weight =
3224 global_min_max_coord_total_weight;
3225 auto local_target_part_weights =
3226 target_part_weights;
3227 auto local_global_rectilinear_cut_weight =
3228 global_rectilinear_cut_weight;
3229 auto local_process_rectilinear_cut_weight =
3230 process_rectilinear_cut_weight;
3231
3232 auto local_is_cut_line_determined = this->is_cut_line_determined;
3233 auto local_device_num_partitioning_in_current_dim =
3234 device_num_partitioning_in_current_dim;
3235
// Single-iteration kernel: the initialisation is sequential but is run
// through parallel_for so the writes happen in the execution space.
3236 Kokkos::parallel_for(
3237 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3238 KOKKOS_LAMBDA (int dummy) {
3239
3240 // these need to be initialized
3241 view_rectilinear_cut_count(0) = 0;
3242 view_total_reduction_size(0) = 0;
3243
3244 // initialize the lower and upper bounds of the cuts.
3245 mj_part_t next = 0;
3246 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3247 mj_part_t num_part_in_dim =
3248 local_device_num_partitioning_in_current_dim(current_work_part + i);
3249 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
// 4 * cuts + 1 matches the per-part stride used for tlr_shift below:
// (parts + cuts) weight slots plus 2 * cuts closest-point slots.
3250 view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3251
3252 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3253 local_is_cut_line_determined(next) = false;
// global_min_max_coord_total_weight is read in three consecutive segments of
// current_concurrent_num_parts entries: min, max, total weight.
3254 // min coordinate
3255 local_cut_lower_bound_coordinates(next) =
3256 local_global_min_max_coord_total_weight(i);
3257 // max coordinate
3258 local_cut_upper_bound_coordinates(next) =
3259 local_global_min_max_coord_total_weight(
3260 i + current_concurrent_num_parts);
3261 // total weight
3262 local_cut_upper_bound_weights(next) =
3263 local_global_min_max_coord_total_weight(
3264 i + 2 * current_concurrent_num_parts);
3265 local_cut_lower_bound_weights(next) = 0;
3266 if(local_distribute_points_on_cut_lines) {
3267 local_process_cut_line_weight_to_put_left(next) = 0;
3268 }
3269 ++next;
3270 }
3271 }
3272 });
3273
3274 // loop_count allows the kernel to behave differently on the first loop
3275 // and subsequent loops. First loop we do a binary search and subsequent
3276 // loops we simply step towards our target.
3277 int loop_count = 0;
3278 while (total_incomplete_cut_count != 0) {
3279 this->mj_1D_part_get_part_weights(
3280 current_concurrent_num_parts,
3281 current_work_part,
3282 mj_current_dim_coords,
3283 loop_count);
3284 ++loop_count;
3285
3286 this->mj_combine_rightleft_and_weights(
3287 current_work_part,
3288 current_concurrent_num_parts);
3289
3290 // now sum up the results of mpi processors.
3291 if(!bSingleProcess) {
3292 // We're using explicit host here as Spectrum MPI would fail
3293 // with the prior host_mirror_type UVMSpace to UVMSpace setup.
3294 auto host_total_part_weight_left_right_closests =
3295 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3296 total_part_weight_left_right_closests);
3297 auto host_global_total_part_weight_left_right_closests =
3298 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3299 global_total_part_weight_left_right_closests);
3300
3301 Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3302 total_part_weight_left_right_closests);
3303
// Fetch the single device-side size value to the host through a
// one-iteration reduction.
3304 size_t host_view_total_reduction_size;
3305 Kokkos::parallel_reduce("Read single",
3306 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3307 KOKKOS_LAMBDA(int dummy, size_t & set_single) {
3308 set_single = view_total_reduction_size(0);
3309 }, host_view_total_reduction_size);
3310
3311 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3312 host_view_total_reduction_size,
3313 host_total_part_weight_left_right_closests.data(),
3314 host_global_total_part_weight_left_right_closests.data());
3315 Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3316 host_global_total_part_weight_left_right_closests);
3317 }
3318 else {
// Single process: the local totals already are the global totals, so the
// local handle is simply pointed at them (no copy of the data).
3319 local_global_total_part_weight_left_right_closests =
3320 this->total_part_weight_left_right_closests;
3321 }
3322
3323 // how much cut will be shifted for the next part in the concurrent
3324 // part calculation.
3325 mj_part_t cut_shift = 0;
3326
3327 // how much the concatenated array will be shifted for the next part
3328 // in concurrent part calculation.
3329 size_t tlr_shift = 0;
3330
// Host-side snapshot of each part's incomplete-cut count before the update;
// used after the kk loop to see how many cuts were completed in this pass.
3331 Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3332 save_initial_incomplete_cut_count("save_initial_incomplete_cut_count",
3333 current_concurrent_num_parts);
3334
3335 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3336
3337 mj_part_t num_parts =
3338 host_num_partitioning_in_current_dim(current_work_part + kk);
3339
3340 mj_part_t num_cuts = num_parts - 1;
3341 size_t num_total_part = num_parts + size_t (num_cuts);
3342
3343 //if the cuts of this cut has already been completed.
3344 //nothing to do for this part.
3345 //just update the shift amount and proceed.
3346 mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3347
3348 if(kk_incomplete_cut_count == 0) {
3349 cut_shift += num_cuts;
3350 tlr_shift += (num_total_part + 2 * num_cuts);
3351 continue;
3352 }
3353
// All subviews below start at this part's offset and extend to the end of
// the parent view; the callee is expected to touch only this part's entries.
3354 Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3355 Kokkos::subview(this->total_part_weight_left_right_closests,
3356 std::pair<mj_lno_t, mj_lno_t>(
3357 tlr_shift,
3358 this->total_part_weight_left_right_closests.size()));
3359
// Layout of one part's segment in the global array, as implied by the
// offsets used here: [num_total_part weights][num_cuts left closest]
// [num_cuts right closest].
3360 Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3361 Kokkos::subview(
3362 local_global_total_part_weight_left_right_closests,
3363 std::pair<mj_lno_t, mj_lno_t>(
3364 tlr_shift,
3365 local_global_total_part_weight_left_right_closests.size()));
3366 Kokkos::View<mj_scalar_t *, device_t>
3367 current_global_left_closest_points =
3368 Kokkos::subview(current_global_tlr,
3369 std::pair<mj_lno_t, mj_lno_t>(
3370 num_total_part,
3371 current_global_tlr.size()));
3372 Kokkos::View<mj_scalar_t *, device_t>
3373 current_global_right_closest_points =
3374 Kokkos::subview(current_global_tlr,
3375 std::pair<mj_lno_t, mj_lno_t>(
3376 num_total_part + num_cuts,
3377 current_global_tlr.size()));
3378 Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3379 current_global_tlr;
3380
3381 Kokkos::View<bool *, device_t> current_cut_line_determined =
3382 Kokkos::subview(this->is_cut_line_determined,
3383 std::pair<mj_lno_t, mj_lno_t>(
3384 cut_shift,
3385 this->is_cut_line_determined.size()));
// Target weights are offset by cut_shift + kk: one extra entry per
// preceding part on top of its cuts.
3386 Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3387 Kokkos::subview(local_target_part_weights,
3388 std::pair<mj_lno_t, mj_lno_t>(
3389 cut_shift + kk,
3390 local_target_part_weights.size()));
3391 Kokkos::View<mj_scalar_t *, device_t>
3392 current_part_cut_line_weight_to_put_left =
3393 Kokkos::subview(local_process_cut_line_weight_to_put_left,
3394 std::pair<mj_lno_t, mj_lno_t>(
3395 cut_shift,
3396 local_process_cut_line_weight_to_put_left.size()));
3397
3398 save_initial_incomplete_cut_count(kk) =
3399 kk_incomplete_cut_count;
3400
3401 Kokkos::View<mj_scalar_t *, device_t>
3402 current_cut_lower_bound_weights =
3403 Kokkos::subview(local_cut_lower_bound_weights,
3404 std::pair<mj_lno_t, mj_lno_t>(
3405 cut_shift,
3406 local_cut_lower_bound_weights.size()));
3407 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3408 Kokkos::subview(local_cut_upper_bound_weights,
3409 std::pair<mj_lno_t, mj_lno_t>(
3410 cut_shift,
3411 local_cut_upper_bound_weights.size()));
3412 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3413 Kokkos::subview(local_cut_upper_bound_coordinates,
3414 std::pair<mj_lno_t, mj_lno_t>(
3415 cut_shift,
3416 local_cut_upper_bound_coordinates.size()));
3417 Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3418 Kokkos::subview(local_cut_lower_bound_coordinates,
3419 std::pair<mj_lno_t, mj_lno_t>(
3420 cut_shift,
3421 local_cut_lower_bound_coordinates.size()));
3422
3423 // Now compute the new cut coordinates.
3424 Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3425 Kokkos::subview(this->temp_cut_coords,
3426 std::pair<mj_lno_t, mj_lno_t>(
3427 cut_shift, this->temp_cut_coords.size()));
3428 Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3429 Kokkos::subview(this->cut_coordinates_work_array,
3430 std::pair<mj_lno_t, mj_lno_t>(
3431 cut_shift, this->cut_coordinates_work_array.size()));
3432
3433 this->mj_get_new_cut_coordinates(
3434 current_concurrent_num_parts,
3435 kk,
3436 num_cuts,
3437 used_imbalance_tolerance,
3438 current_global_part_weights,
3439 current_local_part_weights,
3440 current_part_target_weights,
3441 current_cut_line_determined,
3442 sub_temp_cut_coords,
3443 current_cut_upper_bounds,
3444 current_cut_lower_bounds,
3445 current_global_left_closest_points,
3446 current_global_right_closest_points,
3447 current_cut_lower_bound_weights,
3448 current_cut_upper_weights,
3449 sub_cut_coordinates_work_array,
3450 current_part_cut_line_weight_to_put_left,
3451 view_rectilinear_cut_count);
3452
3453 cut_shift += num_cuts;
3454 tlr_shift += (num_total_part + 2 * num_cuts);
3455 } // end of kk loop
3456
// Subtract the cuts completed in this pass from the running total.
// NOTE(review): parts skipped above never get their snapshot written in this
// pass; this relies on the freshly allocated view holding 0 and on
// incomplete_cut_count(kk) being 0 for those parts - confirm.
3457 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3458 mj_part_t iteration_complete_cut_count =
3459 save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3460 total_incomplete_cut_count -= iteration_complete_cut_count;
3461 }
3462
// Element-wise swap of the cut coordinates and the work array, so the newly
// computed cuts become the current ones for the next pass.
3463 Kokkos::parallel_for(
3464 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3465 (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(int n) {
3466 auto t = local_temp_cut_coords(n);
3467 local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3468 local_cut_coordinates_work_array(n) = t;
3469 });
3470 } // end of the while loop
3471
3472 // Needed only if keep_cuts; otherwise can simply swap array pointers
3473 // cutCoordinates and cutCoordinatesWork.
3474 // (at first iteration, cutCoordinates == cutCoorindates_tmp).
3475 // computed cuts must be in cutCoordinates.
// NOTE(review): local_temp_cut_coords was copied from temp_cut_coords, which
// was assigned from current_cut_coordinates at the top of this function, and
// the swap above exchanges elements rather than handles. Unless
// current_cut_coordinates is re-seated elsewhere, this comparison looks
// always false - confirm whether this branch is reachable.
3476 if(current_cut_coordinates != local_temp_cut_coords) {
3477 Kokkos::parallel_for(
3478 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3479 (0, 1), KOKKOS_LAMBDA(int dummy) {
3480 mj_part_t next = 0;
3481 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3482 mj_part_t num_parts = -1;
3483 num_parts = local_device_num_partitioning_in_current_dim(
3484 current_work_part + i);
3485 mj_part_t num_cuts = num_parts - 1;
3486 for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3487 current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3488 }
3489 next += num_cuts;
3490 }
3491 for(int n = 0; n <
3492 static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3493 local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3494 }
3495 });
3496 }
3497
// reductionOp stays NULL in the single-process case; deleting a null
// pointer is a no-op.
3498 delete reductionOp;
3499}
3500
// Thin wrapper around a raw pointer to an array of scalar_t. It only stores
// the pointer; nothing visible here allocates or frees the memory behind it.
//
// NOTE(review): listing lines 3502 (the line that opens this type) and 3508
// (the body announced by the first KOKKOS_INLINE_FUNCTION below, presumably
// the empty constructor the comment refers to) are absent from this extract.
3501template<class scalar_t>
// Pointer to the first element of the wrapped array.
3503 scalar_t * ptr;
3504
3505 // With new kokkos setup parallel_reduce will call empty constructor and
3506 // we update the ptr in the init method.
3507 KOKKOS_INLINE_FUNCTION
3509
// Wraps an existing array; the pointer is stored as given.
3510 KOKKOS_INLINE_FUNCTION
3511 Zoltan2_MJArrayType(scalar_t * pSetPtr) : ptr(pSetPtr) {};
3512};
3513
// Host-only (neither CUDA nor HIP) reducer that combines arrays laid out as
// [value_count_weights sums][value_count_rightleft (max, min) pairs].
// The sums are added; in each pair the first slot keeps the maximum and the
// second slot keeps the minimum. The first and last pair are skipped by
// join(); the on-cut comments in the weights functor describe them as padding.
//
// NOTE(review): listing lines 3517, 3519-3520, 3522-3524, 3538 and 3584 are
// absent from this extract. Judging from the constructor's initializer list
// they include the struct declaration and the members value,
// value_count_rightleft and value_count_weights - restore from upstream.
3514#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3515
3516template<class policy_t, class scalar_t, class part_t>
3518
// Sentinel magnitude: init() writes -max_scalar / +max_scalar into the pairs.
3521 scalar_t max_scalar;
3525
// Stores the sentinel, the address of the result object `val`, and the
// lengths of the two array segments.
3526 KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(
3527 scalar_t mj_max_scalar,
3528 value_type &val,
3529 int mj_value_count_rightleft,
3530 int mj_value_count_weights) :
3531 max_scalar(mj_max_scalar),
3532 value(&val),
3533 value_count_rightleft(mj_value_count_rightleft),
3534 value_count_weights(mj_value_count_weights)
3535 {}
3536
// Returns the result object passed to the constructor.
// NOTE(review): the signature line (3538) is absent from this extract.
3537 KOKKOS_INLINE_FUNCTION
3539 return *value;
3540 }
3541
// Folds `src` into `dst`: adds the weight slots, then keeps the larger first
// slot and the smaller second slot of every non-padding pair.
3542 KOKKOS_INLINE_FUNCTION
3543 void join(value_type& dst, const value_type& src) const {
3544 for(int n = 0; n < value_count_weights; ++n) {
3545 dst.ptr[n] += src.ptr[n];
3546 }
3547
3548 for(int n = value_count_weights + 2;
3549 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3550 if(src.ptr[n] > dst.ptr[n]) {
3551 dst.ptr[n] = src.ptr[n];
3552 }
3553 if(src.ptr[n+1] < dst.ptr[n+1]) {
3554 dst.ptr[n+1] = src.ptr[n+1];
3555 }
3556 }
3557 }
3558
// Volatile overload with the same body as the join above.
3559 KOKKOS_INLINE_FUNCTION
3560 void join (volatile value_type& dst, const volatile value_type& src) const {
3561 for(int n = 0; n < value_count_weights; ++n) {
3562 dst.ptr[n] += src.ptr[n];
3563 }
3564
3565 for(int n = value_count_weights + 2;
3566 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3567 if(src.ptr[n] > dst.ptr[n]) {
3568 dst.ptr[n] = src.ptr[n];
3569 }
3570 if(src.ptr[n+1] < dst.ptr[n+1]) {
3571 dst.ptr[n+1] = src.ptr[n+1];
3572 }
3573 }
3574 }
3575
// Points `dst` at the same storage as the result object, then resets that
// storage: weight slots to 0, pairs to (-max_scalar, +max_scalar).
3576 KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
3577 dst.ptr = value->ptr; // must update ptr
3578
3579 for(int n = 0; n < value_count_weights; ++n) {
3580 dst.ptr[n] = 0;
3581 }
3582
// NOTE(review): the loop condition/increment line (3584) is absent from this
// extract.
3583 for(int n = value_count_weights;
3585 dst.ptr[n] = -max_scalar;
3586 dst.ptr[n+1] = max_scalar;
3587 }
3588 }
3589};
3590#endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP
3591
// Team-level functor that accumulates part weights and closest-point
// coordinates for one part (see operator()). On CUDA/HIP builds results are
// written into the three current_* views; on other builds they are returned
// through an array-typed reduction value (value_type).
//
// NOTE(review): several listing lines are absent from this extract: 3594
// (the struct declaration), 3602, 3605-3611 and 3618. Judging from the
// constructor's initializer list they declare loop_count,
// concurrent_current_part, num_cuts, current_work_part,
// current_concurrent_num_parts, value_count_rightleft, value_count_weights,
// value_count and uniform_weights0 - restore from upstream.
3592template<class policy_t, class scalar_t, class part_t, class index_t,
3593 class device_t, class array_t>
3595 typedef typename policy_t::member_type member_type;
3596 typedef Kokkos::View<scalar_t*> scalar_view_t;
3597
// Array-typed reduction value, only on non-CUDA/HIP builds.
3598#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3599 typedef array_t value_type[];
3600#endif
3601
// Sentinel magnitude used to initialise the closest-point slots and as the
// outer bound of the first/last part in operator().
3603 array_t max_scalar;
3604
// permutations: position -> coordinate index.
3612 Kokkos::View<index_t*, device_t> permutations;
// coordinates: coordinate value per coordinate index.
3613 Kokkos::View<scalar_t *, device_t> coordinates;
// weights: only column 0 is read, and only when uniform_weights0 is false.
3614 Kokkos::View<scalar_t**, device_t> weights;
// parts: per-coordinate part id; 2*part inside a part, 2*part+1 on a cut.
3615 Kokkos::View<part_t*, device_t> parts;
3616 Kokkos::View<scalar_t *, device_t> cut_coordinates;
// part_xadj: end position of each part in the permutation array.
3617 Kokkos::View<index_t *, device_t> part_xadj;
// Tolerance used when comparing a coordinate against a cut.
3619 scalar_t sEpsilon;
3620
// Device-side outputs, updated with atomics at the end of operator().
3621#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3622 Kokkos::View<double *, device_t> current_part_weights;
3623 Kokkos::View<scalar_t *, device_t> current_left_closest;
3624 Kokkos::View<scalar_t *, device_t> current_right_closest;
3625#endif // KOKKOS_ENABLE_CUDA || defined(KOKKOS_ENABLE_HIP)
3626
// Constructor: copies every argument into the member of the matching name.
// value_count is set to mj_weight_array_size + mj_left_right_array_size.
// The three output views are only parameters on CUDA/HIP builds.
//
// NOTE(review): listing line 3627 (the line carrying the constructor name
// and opening parenthesis) is absent from this extract.
3628 int mj_loop_count,
3629 array_t mj_max_scalar,
3630 part_t mj_concurrent_current_part,
3631 part_t mj_num_cuts,
3632 part_t mj_current_work_part,
3633 part_t mj_current_concurrent_num_parts,
3634 part_t mj_left_right_array_size,
3635 part_t mj_weight_array_size,
3636 Kokkos::View<index_t*, device_t> & mj_permutations,
3637 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3638 Kokkos::View<scalar_t**, device_t> & mj_weights,
3639 Kokkos::View<part_t*, device_t> & mj_parts,
3640 Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3641 Kokkos::View<index_t *, device_t> & mj_part_xadj,
3642 bool mj_uniform_weights0,
3643 scalar_t mj_sEpsilon
3644#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3645 ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3646 Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3647 Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3648#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3649 ) :
3650 loop_count(mj_loop_count),
3651 max_scalar(mj_max_scalar),
3652 concurrent_current_part(mj_concurrent_current_part),
3653 num_cuts(mj_num_cuts),
3654 current_work_part(mj_current_work_part),
3655 current_concurrent_num_parts(mj_current_concurrent_num_parts),
3656 value_count_rightleft(mj_left_right_array_size),
3657 value_count_weights(mj_weight_array_size),
3658 value_count(mj_weight_array_size+mj_left_right_array_size),
3659 permutations(mj_permutations),
3660 coordinates(mj_coordinates),
3661 weights(mj_weights),
3662 parts(mj_parts),
3663 cut_coordinates(mj_cut_coordinates),
3664 part_xadj(mj_part_xadj),
3665 uniform_weights0(mj_uniform_weights0),
3666 sEpsilon(mj_sEpsilon)
3667#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3668 ,current_part_weights(mj_current_part_weights),
3669 current_left_closest(mj_current_left_closest),
3670 current_right_closest(mj_current_right_closest)
3671#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3672 {
3673 }
3674
// Returns the number of bytes of team scratch memory requested for a team of
// `team_size` threads, rounded up to a multiple of 8.
//
// NOTE(review): the second half of both `result` expressions (listing lines
// 3678 and 3681) is absent from this extract, so the element count for each
// build flavour cannot be read from here. operator() requests
// (value_count_weights + value_count_rightleft) elements per thread on the
// non-CUDA/HIP path - confirm against upstream.
// NOTE(review): the size is computed in `int` and returned as size_t.
3675 size_t team_shmem_size (int team_size) const {
3676#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3677 int result = sizeof(array_t) *
3679#else
3680 int result = sizeof(array_t) *
3682#endif
3683
3684 // pad this to a multiple of 8 or it will run corrupt
3685 int remainder = result % 8;
3686 if(remainder != 0) {
3687 result += 8 - remainder;
3688 }
3689 return result;
3690 }
3691
// Team kernel. Each team takes one contiguous slice of the positions that
// belong to part `concurrent_current_part` and, for every point in it:
//   - finds the part whose two bounding cuts enclose the coordinate (with a
//     margin of sEpsilon) and adds the point's weight to slot part*2, or
//   - if the coordinate lies within sEpsilon of a cut, adds the weight to
//     slot part*2+1 (and to every neighbouring cut at the same coordinate);
//   - records the closest coordinates around the cuts in the (max, min)
//     pairs that follow the weight slots;
//   - writes the resulting id (part*2 or part*2+1) back into `parts`.
// CUDA/HIP builds accumulate into one shared array per team with atomics and
// then merge into the current_* views; other builds give each thread its own
// array, combine them with a reducer and merge into `teamSum`.
//
// NOTE(review): listing lines 3700, 3722, 3733, 3753, 3756 and 3758-3759 are
// absent from this extract (marked below); restore them from upstream.
3692 KOKKOS_INLINE_FUNCTION
3693#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3694 void operator() (const member_type & teamMember) const {
3695#else
3696 void operator() (const member_type & teamMember, value_type teamSum) const {
3697#endif
3698
// Position range of the part being processed.
// NOTE(review): the else-branch of this conditional (line 3700) is absent.
3699 index_t all_begin = (concurrent_current_part == 0) ? 0 :
3701 index_t all_end = part_xadj(concurrent_current_part);
3702
3703 index_t num_working_points = all_end - all_begin;
3704 int num_teams = teamMember.league_size();
3705
// Ceiling division so the slices cover every point.
3706 index_t stride = num_working_points / num_teams;
3707 if((num_working_points % num_teams) > 0) {
3708 stride += 1; // make sure we have coverage for the final points
3709 }
3710
3711 // the last team may have less work than the other teams
3712 // the last team can be empty (begin > end) if num_teams > stride
3713 // which is true for many teams and small numbers of coords (tests)
3714 index_t begin = all_begin + stride * teamMember.league_rank();
3715 index_t end = begin + stride;
3716 if(end > all_end) {
3717 end = all_end;
3718 }
3719
3720#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
// NOTE(review): the rest of this size expression (line 3722) is absent.
3721 size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3723
3724 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3725 sh_mem_size);
3726
3727 // init the shared array to 0
3728 Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
3729 for(int n = 0; n < value_count_weights; ++n) {
3730 shared_ptr[n] = 0;
3731 }
// NOTE(review): the loop condition/increment line (3733) is absent.
3732 for(int n = value_count_weights;
3734 shared_ptr[n] = -max_scalar;
3735 shared_ptr[n+1] = max_scalar;
3736 }
3737 });
3738 teamMember.team_barrier();
3739
3740 Kokkos::parallel_for(
3741 Kokkos::TeamThreadRange(teamMember, begin, end),
3742 [&] (index_t ii) {
3743#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3744 // create the team shared data - each thread gets one of the arrays
3745 size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3746 value_count_rightleft) * teamMember.team_size();
3747
3748 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3749 sh_mem_size);
3750
3751 // select the array for this thread
// NOTE(review): the rest of this index expression (line 3753) is absent.
3752 Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
3754
3755 // create reducer which handles the Zoltan2_MJArrayType class
// NOTE(review): the reducer declaration (line 3756) and the remaining
// constructor arguments (lines 3758-3759) are absent.
3757 max_scalar, array,
3760
3761 Kokkos::parallel_reduce(
3762 Kokkos::TeamThreadRange(teamMember, begin, end),
// C++20 no longer allows implicit capture of `this` with [=], hence the
// explicit form on newer standards.
3763#if (__cplusplus > 201703L)
3764 [=, this] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
3765#else
3766 [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
3767#endif
3768#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3769
// NOTE(review): the permutation entry (index_t) is narrowed to int here.
3770 int i = permutations(ii);
3771 scalar_t coord = coordinates(i);
3772 array_t w = uniform_weights0 ? 1 : (array_t) weights(i,0);
3773
3774 // now check each part and its right cut
// Start from the part recorded in the previous pass.
3775 index_t part = parts(i)/2;
3776
3777 int upper = num_cuts;
3778 int lower = 0;
3779
3780 // binary search - find matching part
3781 while(true) {
// a / b are the cuts to the left / right of `part`; the outermost parts are
// bounded by -max_scalar / +max_scalar.
3782 scalar_t a = (part == 0) ? -max_scalar : cut_coordinates(part-1);
3783 scalar_t b = (part == num_cuts) ? max_scalar : cut_coordinates(part);
3784
// Case 1: strictly inside the part, at least sEpsilon away from both cuts.
3785 if(coord >= a + sEpsilon && coord <= b - sEpsilon) {
3786#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3787 Kokkos::atomic_add(&shared_ptr[part*2], w);
3788#else
3789 threadSum.ptr[part*2] += w;
3790#endif
3791
3792 parts(i) = part*2;
3793
3794 // now handle the left/right closest part
3795#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
// Atomic min / max implemented as compare-exchange retry loops.
3796 array_t new_value = (array_t) coord;
3797 array_t prev_value = shared_ptr[value_count_weights + part * 2 + 1];
3798 while(new_value < prev_value) {
3799 prev_value = Kokkos::atomic_compare_exchange(
3800 &shared_ptr[value_count_weights + part * 2 + 1],
3801 prev_value, new_value);
3802 }
3803 prev_value = shared_ptr[value_count_weights + part * 2 + 2];
3804 while(new_value > prev_value) {
3805 prev_value = Kokkos::atomic_compare_exchange(
3806 &shared_ptr[value_count_weights + part * 2 + 2],
3807 prev_value, new_value);
3808 }
3809#else
3810 // note cut to left needs to set right closest and cut to right needs
3811 // to set left closest. It's index +1 and +2 instead of -1 and +0
3812 // because right/left segment is padded with an extra pair at
3813 // beginning and end to avoid branching with if checks.
3814 if(coord < threadSum.ptr[value_count_weights + part * 2 + 1]) {
3815 threadSum.ptr[value_count_weights + part * 2 + 1] = coord;
3816 }
3817 if(coord > threadSum.ptr[value_count_weights + part * 2 + 2]) {
3818 threadSum.ptr[value_count_weights + part * 2 + 2] = coord;
3819 }
3820#endif
3821
3822 break;
3823 }
// Case 2: within sEpsilon of the right cut of `part` (not possible for the
// last part, which has no right cut).
3824 else if(part != num_cuts) {
3825 if(coord < b + sEpsilon && coord > b - sEpsilon) {
3826 // Note if on cut we set right/left closest to the cut itself
3827 // but we add +2 because we buffered the area with an extra slot
3828 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3829#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3830 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3831 shared_ptr[value_count_weights + part * 2 + 2] = b;
3832 shared_ptr[value_count_weights + part * 2 + 3] = b;
3833#else
3834 threadSum.ptr[part*2+1] += w;
3835 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3836 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3837#endif
3838
3839 parts(i) = part*2+1;
3840
3841 // Need to scan up for any other cuts of same coordinate
3842 // This is costly but it's only relevant for the fix4785 test
3843 // which loads a lot of coordinates on the same point, so without
3844 // this our cuts would all just sit at 0.
3845 part_t base_b = part;
3846 scalar_t base_coord = cut_coordinates(base_b);
3847 part += 1;
// Upward scan: credit the weight to every following cut that coincides with
// this one; stop at the first cut that differs.
3848 while(part < num_cuts) {
3849 b = cut_coordinates(part);
3850 scalar_t delta = b - base_coord;
3851 if(delta < 0) delta = -delta;
3852 if(delta < sEpsilon) {
3853 // Note if on cut we set right/left closest to the cut itself
3854 // but we add +2 because we buffered the area with an extra slot
3855 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3856#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3857 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3858 shared_ptr[value_count_weights + part * 2 + 2] = b;
3859 shared_ptr[value_count_weights + part * 2 + 3] = b;
3860#else
3861 threadSum.ptr[part*2+1] += w;
3862 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3863 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3864#endif
3865 }
3866 else { break; }
3867 ++part;
3868 }
// Downward scan: same treatment for preceding coincident cuts.
// NOTE(review): `part >= 0` only terminates at the low end if index_t is a
// signed type - confirm.
3869 part = base_b - 1;
3870 while(part >= 0) {
3871 b = cut_coordinates(part);
3872 scalar_t delta = b - base_coord;
3873 if(delta < 0) delta = -delta;
3874 if(delta < sEpsilon) {
3875 // Note if on cut we set right/left closest to the cut itself
3876 // but we add +2 because we buffered the area with an extra slot
3877 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3878#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3879 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3880 shared_ptr[value_count_weights + part * 2 + 2] = b;
3881 shared_ptr[value_count_weights + part * 2 + 3] = b;
3882#else
3883 threadSum.ptr[part*2+1] += w;
3884 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3885 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3886#endif
3887 }
3888 else { break; }
3889 --part;
3890 }
3891
3892 break;
3893 }
3894 }
3895
// No match yet: move `part` towards the coordinate and try again.
3896 if(loop_count != 0) {
3897 // subsequent loops can just step towards target
3898 if(coord < b) {
3899 part -= 1;
3900 }
3901 else {
3902 part += 1;
3903 }
3904 }
3905 else {
3906 // initial loop binary search
3907 if(coord < b) {
3908 if(part == lower + 1) {
3909 part = lower;
3910 }
3911 else {
3912 upper = part - 1;
3913 part -= (part - lower)/2;
3914 }
3915 }
3916 else if(part == upper - 1) {
3917 part = upper;
3918 }
3919 else {
3920 lower = part + 1;
3921 part += (upper - part)/2;
3922 }
3923 }
3924 }
3925#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3926 });
3927#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3928 }, arraySumReducer);
3929#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3930
3931 teamMember.team_barrier();
3932
3933 // collect all the team's results
3934#if (__cplusplus > 201703L)
3935 Kokkos::single(Kokkos::PerTeam(teamMember), [=, this] () {
3936#else
3937 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3938#endif
// Weight slots are summed into the output.
3939 for(int n = 0; n < value_count_weights; ++n) {
3940#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3941 Kokkos::atomic_add(&current_part_weights(n),
3942 static_cast<double>(shared_ptr[n]));
3943#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3944 teamSum[n] += array.ptr[n];
3945#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3946 }
3947
3948#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3949 int insert_left = 0;
3950 int insert_right = 0;
3951#endif
3952
// Closest-point pairs, skipping the first and last (padding) pair: slot n
// keeps the maximum (left closest), slot n+1 the minimum (right closest).
3953 for(int n = 2 + value_count_weights;
3954 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3955#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3956 scalar_t new_value = shared_ptr[n+1];
3957 scalar_t prev_value = current_right_closest(insert_right);
3958 while(new_value < prev_value) {
3959 prev_value = Kokkos::atomic_compare_exchange(
3960 &current_right_closest(insert_right), prev_value, new_value);
3961 }
3962
3963 new_value = shared_ptr[n];
3964 prev_value = current_left_closest(insert_left);
3965 while(new_value > prev_value) {
3966 prev_value = Kokkos::atomic_compare_exchange(
3967 &current_left_closest(insert_left), prev_value, new_value);
3968 }
3969
3970 ++insert_left;
3971 ++insert_right;
3972#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3973 if(array.ptr[n] > teamSum[n]) {
3974 teamSum[n] = array.ptr[n];
3975 }
3976 if(array.ptr[n+1] < teamSum[n+1]) {
3977 teamSum[n+1] = array.ptr[n+1];
3978 }
3979#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3980 }
3981 });
3982
3983 teamMember.team_barrier();
3984 }
3985
3986#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3987 KOKKOS_INLINE_FUNCTION
3988 void join(value_type dst, const value_type src) const {
3989 for(int n = 0; n < value_count_weights; ++n) {
3990 dst[n] += src[n];
3991 }
3992
3993 for(int n = value_count_weights + 2;
3994 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3995 if(src[n] > dst[n]) {
3996 dst[n] = src[n];
3997 }
3998 if(src[n+1] < dst[n+1]) {
3999 dst[n+1] = src[n+1];
4000 }
4001 }
4002 }
4003
// Initializes a reduction buffer `dst` (host-only path): the first
// value_count_weights slots are set to 0, and the slots after them are
// filled two at a time with -max_scalar (dst[n]) and max_scalar (dst[n+1]).
// NOTE(review): in this listing the second for-header has no condition or
// increment line (the numbering jumps 4009 -> 4011), so the loop's upper
// bound is not visible here - confirm against the upstream header.
4004 KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4005 for(int n = 0; n < value_count_weights; ++n) {
4006 dst[n] = 0;
4007 }
4008
4009 for(int n = value_count_weights;
4011 dst[n] = -max_scalar;
4012 dst[n+1] = max_scalar;
4013 }
4014 }
4015#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4016};
4017
/*! \brief For every concurrently processed part that still has undetermined
 * cuts, runs ReduceWeightsFunctor to obtain the weight of each part/cut slot
 * and the closest point on the left and right of each cut, then turns the
 * slot weights into running (prefix) sums.
 *
 * Parts whose incomplete_cut_count(kk) is 0 are skipped. On CUDA/HIP builds
 * the functor is handed the device views and launched with parallel_for;
 * otherwise a host reduce_array is produced with parallel_reduce and copied
 * into the views through host mirrors.
 *
 * \param current_work_part index of the first part of the concurrent group.
 * \param mj_current_dim_coords coordinate view passed to ReduceWeightsFunctor.
 * \param loop_count passed unchanged to ReduceWeightsFunctor.
 *
 * NOTE(review): current_concurrent_num_parts is used below but is not in the
 * visible parameter list (numbering jumps 4028 -> 4030), and the teamFunctor
 * argument list skips 4144, 4146 and 4147. These look like lines dropped from
 * this listing - confirm against the upstream header.
 */
4025template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4026 typename mj_part_t, typename mj_node_t>
4027void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
4028 mj_1D_part_get_part_weights(
4030 mj_part_t current_work_part,
4031 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4032 int loop_count)
4033{
4034 auto local_is_cut_line_determined = is_cut_line_determined;
4035 auto local_thread_part_weights = thread_part_weights;
4036 auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
4037 auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;
4038
4039 // Create some locals so we don't use this inside the kernels
4040 // which causes problems
4041 auto local_sEpsilon = this->sEpsilon;
4042 auto local_assigned_part_ids = this->assigned_part_ids;
4043 auto local_coordinate_permutations = this->coordinate_permutations;
4044 auto local_mj_weights = this->mj_weights;
4045 auto local_part_xadj = this->part_xadj;
4046 auto local_global_min_max_coord_total_weight =
4047 this->global_min_max_coord_total_weight;
4048
4049 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4050
4051 auto local_device_num_partitioning_in_current_dim =
4052 device_num_partitioning_in_current_dim;
4053
// make the host-side incomplete cut counts visible to the final kernel
4054 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
4055 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
4056
// running offset (in part + cut slots) of part kk inside thread_part_weights
4057 mj_part_t total_part_shift = 0;
4058
// running offset (in cuts) of part kk inside temp_cut_coords
4059 mj_part_t concurrent_cut_shifts = 0;
4060 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
4061 Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
4062 Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
4063 concurrent_cut_shifts, temp_cut_coords.size()));
4064
4065 mj_part_t num_parts =
4066 host_num_partitioning_in_current_dim(current_work_part + kk);
4067 mj_part_t num_cuts = num_parts - 1;
4068 mj_part_t total_part_count = num_parts + num_cuts;
4069 mj_part_t weight_array_length = num_cuts + num_parts;
4070
4071 // for right/left closest + buffer cut on either side
4072 mj_part_t right_left_array_length = (num_cuts + 2) * 2;
4073
// nothing left to do for this part, only advance the offsets
4074 if(this->incomplete_cut_count(kk) == 0) {
4075 total_part_shift += total_part_count;
4076 concurrent_cut_shifts += num_cuts;
4077 continue;
4078 }
4079
4080 // if not set use 60 - was initial testing amount but somewhat arbitrary
4081 auto policy_ReduceWeightsFunctor = policy_t(
4082 mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);
4083
4084#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4085 int total_array_length =
4086 weight_array_length + right_left_array_length;
4087#endif
4088
4089 // Using float here caused some numerical errors for coord on cut calculations.
4090 // Probably that can be fixed with proper epsilon adjustment but since cuda
4091 // doesn't reduce right now the shared memory pressure is no longer relevant.
4092 // Just use scalar_t to match the original algorithm.
4093 typedef mj_scalar_t array_t;
4094
4095#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4096 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array("reduce_array", total_array_length);
4097#endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP
4098
// number of cuts that belong to the concurrent parts before kk
4099 int offset_cuts = 0;
4100 for(int kk2 = 0; kk2 < kk; ++kk2) {
4101 offset_cuts +=
4102 host_num_partitioning_in_current_dim(current_work_part + kk2) - 1;
4103 }
4104 Kokkos::View<double *, device_t> my_current_part_weights =
4105 Kokkos::subview(local_thread_part_weights,
4106 std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
4107 total_part_shift + total_part_count));
4108 Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
4109 Kokkos::subview(local_thread_cut_left_closest_point,
4110 std::pair<mj_lno_t, mj_lno_t>(
4111 offset_cuts,
4112 local_thread_cut_left_closest_point.size()));
4113 Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
4114 Kokkos::subview(local_thread_cut_right_closest_point,
4115 std::pair<mj_lno_t, mj_lno_t>(
4116 offset_cuts,
4117 local_thread_cut_right_closest_point.size()));
4118
4119 array_t max_scalar = std::numeric_limits<array_t>::max();
4120
4121#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4122 // initialize values
4123 Kokkos::parallel_for(
4124 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4125 KOKKOS_LAMBDA (int dummy) {
4126 for(int n = 0; n < weight_array_length; ++n) {
4127 my_current_part_weights(n) = 0;
4128 }
4129 for(int n = 0; n < num_cuts; ++n) {
4130 my_current_left_closest(n) = -max_scalar;
4131 my_current_right_closest(n) = max_scalar;
4132 }
4133 });
4134#endif
4135
4136 mj_part_t concurrent_current_part =
4137 current_work_part + kk;
4138
4139 ReduceWeightsFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4140 typename mj_node_t::device_type, array_t>
4141 teamFunctor(
4142 loop_count,
4143 max_scalar,
4145 num_cuts,
4148 right_left_array_length,
4149 weight_array_length,
4150 coordinate_permutations,
4151 mj_current_dim_coords,
4152 mj_weights,
4153 assigned_part_ids,
4154 local_temp_cut_coords,
4155 part_xadj,
4156 mj_uniform_weights(0), // host and currently only relevant to slot 0
4157 sEpsilon
4158#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4159 ,my_current_part_weights,
4160 my_current_left_closest,
4161 my_current_right_closest
4162#endif
4163 );
4164
4165#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4166 Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
4167#else
4168 Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
4169 teamFunctor, reduce_array);
4170 Kokkos::fence();
4171#endif
4172
4173#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4174 auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);
4175
// the first total_part_count entries of reduce_array are the slot weights
4176 for(int i = 0; i < static_cast<int>(total_part_count); ++i) {
4177 hostArray(i) = reduce_array[i];
4178 }
4179
4180 Kokkos::deep_copy(my_current_part_weights, hostArray);
4181
4182 auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
4183 auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
// the (left, right) pairs follow the weights; (cut+1) skips the first pair,
// i.e. the buffer entry mentioned where right_left_array_length is computed
4184 for(mj_part_t cut = 0; cut < num_cuts; ++cut) {
4185 hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
4186 hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
4187 }
4188 Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
4189 Kokkos::deep_copy(my_current_right_closest, hostRightArray);
4190#endif
4191
4192 total_part_shift += total_part_count;
4193 concurrent_cut_shifts += num_cuts;
4194 }
4195
4196 auto local_temp_cut_coords = temp_cut_coords;
4197
// second phase: one iteration per concurrent part, prefix-summing its slots
4198 Kokkos::parallel_for(
4199 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
4200 (0, current_concurrent_num_parts), KOKKOS_LAMBDA(mj_part_t kk) {
4201 mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
4202 current_work_part + kk);
4203 mj_part_t num_cuts = num_parts - 1;
4204 mj_part_t total_part_count = num_parts + num_cuts;
4205
4206 if(local_device_incomplete_cut_count(kk) > 0) {
4207 // get the prefix sum
4208 // This is an inefficiency but not sure if it matters much
4209 size_t offset = 0;
4210 size_t offset_cuts = 0;
4211 for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
4212 auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
4213 current_work_part + kk2);
4214 offset += num_parts_kk2 * 2 - 1;
4215 offset_cuts += num_parts_kk2 - 1;
4216 }
4217
4218 for(mj_part_t i = 1; i < total_part_count; ++i) {
4219 // check for cuts sharing the same position; all cuts sharing a position
4220 // have the same weight == total weight for all cuts sharing the
4221 // position. Don't want to accumulate that total weight more than once.
4222 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
4223 std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
4224 local_temp_cut_coords(offset_cuts + i /2 - 1))
4225 < local_sEpsilon) {
4226 // i % 2 = 0 when part i represents the cut coordinate.
4227 // if it is a cut, and if next cut also has the same coordinate, then
4228 // dont addup.
4229 local_thread_part_weights(offset + i)
4230 = local_thread_part_weights(offset + i-2);
4231 continue;
4232 }
4233
4234 // otherwise do the prefix sum.
4235 local_thread_part_weights(offset + i) +=
4236 local_thread_part_weights(offset + i-1);
4237 }
4238 }
4239 });
4240}
4241
/*! \brief Packs the slot weights and the left/right closest points of the
 * cuts into total_part_weight_left_right_closests.
 *
 * Runs as a single-iteration parallel_for. For each concurrent part the
 * output section is laid out as num_total_part_in_part weights, followed by
 * num_cuts_in_part left closest points, followed by num_cuts_in_part right
 * closest points. Entries of cuts already flagged in is_cut_line_determined
 * are not rewritten; the last weight slot of a part is always copied.
 *
 * \param current_work_part index of the first part of the concurrent group.
 *
 * NOTE(review): current_concurrent_num_parts is used below but is not in the
 * visible parameter list (numbering jumps 4253 -> 4255); this looks like a
 * line dropped from this listing - confirm against the upstream header.
 */
4249template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4250 typename mj_part_t, typename mj_node_t>
4251void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4252 mj_combine_rightleft_and_weights(
4253 mj_part_t current_work_part,
4255{
4256 auto local_thread_part_weights = this->thread_part_weights;
4257 auto local_is_cut_line_determined = this->is_cut_line_determined;
4258 auto local_thread_cut_left_closest_point =
4259 this->thread_cut_left_closest_point;
4260 auto local_thread_cut_right_closest_point =
4261 this->thread_cut_right_closest_point;
4262 auto local_total_part_weight_left_right_closests =
4263 this->total_part_weight_left_right_closests;
4264 auto local_device_num_partitioning_in_current_dim =
4265 device_num_partitioning_in_current_dim;
4266 Kokkos::parallel_for(
4267 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4268 KOKKOS_LAMBDA (int dummy) {
4269
// start of the current part's section in the combined output array
4270 size_t tlr_array_shift = 0;
// index of the current part's first cut in the per-cut arrays
4271 mj_part_t cut_shift = 0;
// start of the current part's section in thread_part_weights
4272 size_t total_part_array_shift = 0;
4273
4274 // iterate for all concurrent parts to find the left and right closest
4275 // points in the process.
4276 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
4277
4278 mj_part_t num_parts_in_part =
4279 local_device_num_partitioning_in_current_dim(current_work_part + i);
4280 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4281 size_t num_total_part_in_part =
4282 num_parts_in_part + size_t (num_cuts_in_part);
4283
4284 // iterate for cuts in a single part.
4285 for(int ii = 0; ii < num_cuts_in_part; ++ii) {
4286 mj_part_t next = tlr_array_shift + ii;
4287 mj_part_t cut_index = cut_shift + ii;
4288
4289 if(!local_is_cut_line_determined(cut_index)) {
4290 mj_scalar_t left_closest_in_process =
4291 local_thread_cut_left_closest_point(cut_index);
4292 mj_scalar_t right_closest_in_process =
4293 local_thread_cut_right_closest_point(cut_index);
4294
4295 // store the left and right closest points.
4296 local_total_part_weight_left_right_closests(
4297 num_total_part_in_part + next) = left_closest_in_process;
4298
4299 local_total_part_weight_left_right_closests(
4300 num_total_part_in_part + num_cuts_in_part + next) =
4301 right_closest_in_process;
4302 }
4303 }
4304
4305 for(size_t j = 0; j < num_total_part_in_part; ++j) {
4306 mj_part_t cut_ind = j / 2 + cut_shift;
4307
4308 // need to check j != num_total_part_in_part - 1
4309 // which is same as j/2 != num_cuts_in_part.
4310 // we cannot check it using cut_ind, because of the concurrent part
4311 // concatenation.
4312 if(j == num_total_part_in_part - 1 ||
4313 !local_is_cut_line_determined(cut_ind)) {
4314 double pwj = local_thread_part_weights(total_part_array_shift + j);
4315 local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4316 }
4317 }
4318
4319 // set the shift position in the arrays
4320 cut_shift += num_cuts_in_part;
4321 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4322 total_part_array_shift += num_total_part_in_part;
4323 }
4324 });
4325}
4326
4339template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4340 typename mj_part_t, typename mj_node_t>
4341KOKKOS_INLINE_FUNCTION
4342void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4343 mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4344 mj_scalar_t cut_lower_bound,
4345 mj_scalar_t cut_upper_weight,
4346 mj_scalar_t cut_lower_weight,
4347 mj_scalar_t expected_weight,
4348 mj_scalar_t &new_cut_position,
4349 mj_scalar_t sEpsilon) {
4350
4351 if(std::abs(cut_upper_bound - cut_lower_bound) < sEpsilon) {
4352 new_cut_position = cut_upper_bound; //or lower bound does not matter.
4353 }
4354
4355 if(std::abs(cut_upper_weight - cut_lower_weight) < sEpsilon) {
4356 new_cut_position = cut_lower_bound;
4357 }
4358
4359 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4360 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4361 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4362
4363 mj_scalar_t required_shift = (my_weight_diff / weight_range);
4364 int scale_constant = 20;
4365 int shiftint= int (required_shift * scale_constant);
4366 if(shiftint == 0) shiftint = 1;
4367 required_shift = mj_scalar_t (shiftint) / scale_constant;
4368 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4369}
4370
4371#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4372
// Reducer used on the host-only path for values that expose their storage
// through a `ptr` member: join() adds value_count entries element-wise and
// init() zeroes them.
// NOTE(review): the struct header, its typedefs/members and the signature of
// the accessor that returns *value are not present in this listing (the
// numbering skips 4374, 4376-4379 and 4389) - confirm against the upstream
// header.
4373template<class policy_t, class scalar_t>
4375
4380
// keeps a pointer to the caller's value object and the number of entries
4381 KOKKOS_INLINE_FUNCTION ArrayReducer(
4382 value_type &val,
4383 int mj_value_count) :
4384 value(&val),
4385 value_count(mj_value_count)
4386 {}
4387
4388 KOKKOS_INLINE_FUNCTION
4390 return *value;
4391 }
4392
// element-wise sum of src into dst over value_count entries
4393 KOKKOS_INLINE_FUNCTION
4394 void join(value_type& dst, const value_type& src) const {
4395 for(int n = 0; n < value_count; ++n) {
4396 dst.ptr[n] += src.ptr[n];
4397 }
4398 }
4399
// points dst at the storage held by `value`, then zeroes that storage
4400 KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
4401 dst.ptr = value->ptr; // must update ptr
4402 for(int n = 0; n < value_count; ++n) {
4403 dst.ptr[n] = 0;
4404 }
4405 }
4406};
4407
4408#endif
4409
// Team functor that walks the points of one part (the range taken from
// part_xadj) and, using the value stored in `parts` for each point:
//  - even value: counts the point for part value/2 and stores value/2 back
//    into `parts`;
//  - odd value: appends the point's position to track_on_cuts, using the
//    last element of track_on_cuts as the atomic insert counter.
// On CUDA/HIP builds counts are accumulated in team shared memory and added
// atomically to local_point_counts; otherwise they are reduced into teamSum.
// NOTE(review): the struct header line, two member declarations and the
// constructor's name line are not present in this listing (the numbering
// skips 4412, 4420, 4421 and 4432) - confirm against the upstream header.
4410template<class policy_t, class scalar_t, class part_t, class index_t,
4411 class device_t, class array_t>
4413 typedef typename policy_t::member_type member_type;
4414 typedef Kokkos::View<scalar_t*> scalar_view_t;
4415
4416#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4417 typedef array_t value_type[];
4418#endif
4419
4422 Kokkos::View<index_t*, device_t> permutations;
4423 Kokkos::View<scalar_t *, device_t> coordinates;
4424 Kokkos::View<part_t*, device_t> parts;
4425 Kokkos::View<index_t *, device_t> part_xadj;
4426 Kokkos::View<index_t *, device_t> track_on_cuts;
4427
4428#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4429 Kokkos::View<int *, device_t> local_point_counts;
4430#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4431
// constructor: stores the part index, the count-array length and the views
4433 part_t mj_concurrent_current_part,
4434 part_t mj_weight_array_size,
4435 Kokkos::View<index_t*, device_t> & mj_permutations,
4436 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4437 Kokkos::View<part_t*, device_t> & mj_parts,
4438 Kokkos::View<index_t *, device_t> & mj_part_xadj,
4439 Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4440#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4441 ,Kokkos::View<int *, device_t> & mj_local_point_counts
4442#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4443 ) :
4444 concurrent_current_part(mj_concurrent_current_part),
4445 value_count(mj_weight_array_size),
4446 permutations(mj_permutations),
4447 coordinates(mj_coordinates),
4448 parts(mj_parts),
4449 part_xadj(mj_part_xadj),
4450 track_on_cuts(mj_track_on_cuts)
4451#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4452 ,local_point_counts(mj_local_point_counts)
4453#endif
4454 {
4455 }
4456
// bytes of team shared memory requested: one array of value_count entries on
// CUDA/HIP, one such array per team thread otherwise, rounded up to 8 bytes
4457 size_t team_shmem_size (int team_size) const {
4458#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4459 int result = sizeof(array_t) * (value_count);
4460#else
4461 int result = sizeof(array_t) * (value_count) * team_size;
4462#endif
4463
4464 // pad this to a multiple of 8 or it will run corrupt
4465 int remainder = result % 8;
4466 if(remainder != 0) {
4467 result += 8 - remainder;
4468 }
4469 return result;
4470 }
4471
4472 KOKKOS_INLINE_FUNCTION
4473#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4474 void operator() (const member_type & teamMember) const {
4475#else
4476 void operator() (const member_type & teamMember, value_type teamSum) const {
4477#endif
// [all_begin, all_end) is the range of the part being processed
4478 index_t all_begin = (concurrent_current_part == 0) ? 0 :
4479 part_xadj(concurrent_current_part - 1);
4480 index_t all_end = part_xadj(concurrent_current_part);
4481
4482 index_t num_working_points = all_end - all_begin;
4483 int num_teams = teamMember.league_size();
4484
// each team takes a contiguous chunk of `stride` points (rounded up)
4485 index_t stride = num_working_points / num_teams;
4486 if((num_working_points % num_teams) > 0) {
4487 stride += 1; // make sure we have coverage for the final points
4488 }
4489
4490 index_t begin = all_begin + stride * teamMember.league_rank();
4491 index_t end = begin + stride;
4492 if(end > all_end) {
4493 end = all_end; // the last team may have less work than the other teams
4494 }
4495
// the last element of track_on_cuts is used as the insert counter below
4496 int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4497
4498 // create the team shared data - each thread gets one of the arrays
4499#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4500 size_t sh_mem_size = sizeof(array_t) * (value_count);
4501#else
4502 size_t sh_mem_size =
4503 sizeof(array_t) * (value_count) * teamMember.team_size();
4504#endif
4505
4506 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4507 sh_mem_size);
4508
4509#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4510 // init the shared array to 0
4511 Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
4512 for(int n = 0; n < value_count; ++n) {
4513 shared_ptr[n] = 0;
4514 }
4515 });
4516 teamMember.team_barrier();
4517
4518 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4519 [&] (index_t ii) {
4520#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4521 // select the array for this thread
4522 Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
4523 (value_count)]);
4524
4525 // create reducer which handles the Zoltan2_MJArrayType class
4526 ArrayReducer<policy_t, array_t> arrayReducer(array, value_count);
4527
4528 Kokkos::parallel_reduce(
4529 Kokkos::TeamThreadRange(teamMember, begin, end),
4530#if (__cplusplus > 201703L)
4531 [=, this] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
4532#else
4533 [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
4534#endif
4535#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4536
4537 index_t coordinate_index = permutations(ii);
4538 part_t place = parts(coordinate_index);
4539 part_t part = place / 2;
// even place: count the point for `part` and store the part back
4540 if(place % 2 == 0) {
4541#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4542 Kokkos::atomic_add(&shared_ptr[part], 1);
4543#else
4544 threadSum.ptr[part] += 1;
4545#endif
4546
4547 parts(coordinate_index) = part;
4548 }
4549 else {
4550 // fill a tracking array so we can process these slower points
4551 // in next cycle
4552 index_t set_index = Kokkos::atomic_fetch_add(
4553 &track_on_cuts(track_on_cuts_insert_index), 1);
4554 track_on_cuts(set_index) = ii;
4555 }
4556#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4557 });
4558#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4559 }, arrayReducer);
4560#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4561
4562 teamMember.team_barrier();
4563
4564 // collect all the team's results
4565#if (__cplusplus > 201703L)
4566 Kokkos::single(Kokkos::PerTeam(teamMember), [=, this] () {
4567#else
4568 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4569#endif
4570 for(int n = 0; n < value_count; ++n) {
4571#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4572 Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4573#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4574 teamSum[n] += array.ptr[n];
4575#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4576 }
4577 });
4578
4579 teamMember.team_barrier();
4580 }
4581
4582#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4583
// host-only: element-wise sum of the per-part counts in src into dst
4584 KOKKOS_INLINE_FUNCTION
4585 void join(value_type dst, const value_type src) const {
4586 for(int n = 0; n < value_count; ++n) {
4587 dst[n] += src[n];
4588 }
4589 }
4590
// host-only: zeroes the value_count per-part counts in dst
4591 KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4592 for(int n = 0; n < value_count; ++n) {
4593 dst[n] = 0;
4594 }
4595 }
4596#endif
4597};
4598
4614template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4615 typename mj_part_t, typename mj_node_t>
4616void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4617mj_create_new_partitions(
4618 mj_part_t num_parts,
4619 mj_part_t current_concurrent_work_part,
4620 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4621 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4622 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4623 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4624{
4625 // Get locals for cuda
4626 auto local_thread_part_weight_work = this->thread_part_weight_work;
4627 auto local_point_counts = this->thread_point_counts;
4628 auto local_distribute_points_on_cut_lines =
4629 this->distribute_points_on_cut_lines;
4630 auto local_thread_cut_line_weight_to_put_left =
4631 this->thread_cut_line_weight_to_put_left;
4632 auto local_sEpsilon = this->sEpsilon;
4633 auto local_coordinate_permutations = this->coordinate_permutations;
4634 auto local_mj_weights = this->mj_weights;
4635 auto local_assigned_part_ids = this->assigned_part_ids;
4636 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4637
4638 mj_part_t num_cuts = num_parts - 1;
4639
4640 Kokkos::parallel_for(
4641 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4642 KOKKOS_LAMBDA(int dummy) {
4643
4644 if(local_distribute_points_on_cut_lines) {
4645 for(int i = 0; i < num_cuts; ++i) {
4646 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4647 if(left_weight > local_sEpsilon) {
4648 // the weight of thread ii on cut.
4649 mj_scalar_t thread_ii_weight_on_cut =
4650 local_thread_part_weight_work(i * 2 + 1) -
4651 local_thread_part_weight_work(i * 2);
4652
4653 if(thread_ii_weight_on_cut < left_weight) {
4654 // if left weight is bigger than threads weight on cut.
4655 local_thread_cut_line_weight_to_put_left(i) =
4656 thread_ii_weight_on_cut;
4657 }
4658 else {
4659 // if thread's weight is bigger than space, then put only a portion.
4660 local_thread_cut_line_weight_to_put_left(i) = left_weight;
4661 }
4662 left_weight -= thread_ii_weight_on_cut;
4663 }
4664 else {
4665 local_thread_cut_line_weight_to_put_left(i) = 0;
4666 }
4667 }
4668
4669 // this is a special case. If cutlines share the same coordinate,
4670 // their weights are equal. We need to adjust the ratio for that.
4671 for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4672 if(std::abs(current_concurrent_cut_coordinate(i) -
4673 current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4674 local_thread_cut_line_weight_to_put_left(i) -=
4675 local_thread_cut_line_weight_to_put_left(i - 1);
4676 }
4677 local_thread_cut_line_weight_to_put_left(i) =
4678 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4679 least_signifiance) * significance_mul) /
4680 static_cast<mj_scalar_t>(significance_mul);
4681 }
4682 }
4683
4684 for(mj_part_t i = 0; i < num_parts; ++i) {
4685 local_point_counts(i) = 0;
4686 }
4687 });
4688
4689 mj_lno_t coordinate_begin_index =
4690 current_concurrent_work_part == 0 ? 0 :
4691 host_part_xadj(current_concurrent_work_part - 1);
4692 mj_lno_t coordinate_end_index =
4693 host_part_xadj(current_concurrent_work_part);
4694
4695 mj_lno_t total_on_cut;
4696 Kokkos::parallel_reduce("Get total_on_cut",
4697 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4698 coordinate_begin_index, coordinate_end_index),
4699 KOKKOS_LAMBDA(int ii, mj_lno_t & val) {
4700 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4701 mj_part_t coordinate_assigned_place =
4702 local_assigned_part_ids(coordinate_index);
4703 if(coordinate_assigned_place % 2 == 1) {
4704 val += 1;
4705 }
4706 }, total_on_cut);
4707
4708 Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4709 if(total_on_cut > 0) {
4710 track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4711 "track_on_cuts", // would do WithoutInitialization but need last init to 0
4712 total_on_cut + 1); // extra index to use for tracking
4713 }
4714
4715 // here we need to parallel reduce an array to count coords in each part
4716 // atomically adding, especially for low part count would kill us
4717 // in the original setup we kept arrays allocated for each thread but for
4718 // the cuda version we'd like to avoid allocating N arrays for the number
4719 // of teams/threads which would be complicated based on running openmp or
4720 // cuda.
4721 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4722
4723 // if not set use 60 - somewhat arbitrary based on initial performance tests
4724 int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4725
4726 auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4727 typedef int array_t;
4728
4729 // just need parts - on the cuts will be handled in a separate serial
4730 // call after this.
4731#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4732 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array("reduce_array", num_parts);
4733#endif
4734
4735 ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4736 typename mj_node_t::device_type, array_t>teamFunctor(
4737 current_concurrent_work_part,
4738 num_parts,
4739 coordinate_permutations,
4740 mj_current_dim_coords,
4741 assigned_part_ids,
4742 part_xadj,
4743 track_on_cuts
4744#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4745 ,local_point_counts
4746#endif
4747 );
4748
4749#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4750 Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4751#else
4752 Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4753 Kokkos::fence();
4754#endif
4755
4756#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4757 for(mj_part_t part = 0; part < num_parts; ++part) {
4758 local_point_counts(part) = reduce_array[part];
4759 }
4760#endif
4761
4762 // the last member is utility used for atomically inserting the values.
4763 // Sorting here avoids potential indeterminancy in the partitioning results
4764 if(track_on_cuts.size() > 0) { // size 0 unused, or size is minimum of 2
4765 auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4766 std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1)); // do not sort last element
4767 Kokkos::sort(track_on_cuts_sort);
4768 }
4769
4770 bool uniform_weights0 = this->mj_uniform_weights(0);
4771 Kokkos::parallel_for(
4772 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4773 KOKKOS_LAMBDA (int dummy) {
4774
4775 for(int j = 0; j < total_on_cut; ++j) {
4776 int ii = track_on_cuts(j);
4777 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4778 mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4779 local_mj_weights(coordinate_index,0);
4780 mj_part_t coordinate_assigned_place =
4781 local_assigned_part_ids(coordinate_index);
4782 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4783 // if it is on the cut.
4784 if(local_distribute_points_on_cut_lines &&
4785 local_thread_cut_line_weight_to_put_left(
4786 coordinate_assigned_part) > local_sEpsilon) {
4787 // if the rectilinear partitioning is allowed,
4788 // and the thread has still space to put on the left of the cut
4789 // then thread puts the vertex to left.
4790 local_thread_cut_line_weight_to_put_left(
4791 coordinate_assigned_part) -= coordinate_weight;
4792 // if putting the vertex to left increased the weight more
4793 // than expected, and if the next cut is on the same coordinate,
4794 // then we need to adjust how much weight next cut puts to its left as
4795 // well, in order to take care of the imbalance.
4796 if(local_thread_cut_line_weight_to_put_left(
4797 coordinate_assigned_part) < 0 && coordinate_assigned_part <
4798 num_cuts - 1 &&
4799 std::abs(current_concurrent_cut_coordinate(
4800 coordinate_assigned_part+1) -
4801 current_concurrent_cut_coordinate(
4802 coordinate_assigned_part)) < local_sEpsilon)
4803 {
4804 local_thread_cut_line_weight_to_put_left(
4805 coordinate_assigned_part + 1) +=
4806 local_thread_cut_line_weight_to_put_left(
4807 coordinate_assigned_part);
4808 }
4809 ++local_point_counts(coordinate_assigned_part);
4810 local_assigned_part_ids(coordinate_index) =
4811 coordinate_assigned_part;
4812 }
4813 else {
4814 // if there is no more space on the left, put the coordinate to the
4815 // right of the cut.
4816 ++coordinate_assigned_part;
4817 // this while loop is necessary when a line is partitioned into more
4818 // than 2 parts.
4819 while(local_distribute_points_on_cut_lines &&
4820 coordinate_assigned_part < num_cuts)
4821 {
4822 // traverse all the cut lines having the same partitiong
4823 if(std::abs(current_concurrent_cut_coordinate(
4824 coordinate_assigned_part) -
4825 current_concurrent_cut_coordinate(
4826 coordinate_assigned_part - 1)) < local_sEpsilon)
4827 {
4828 // if line has enough space on left, put it there.
4829 if(local_thread_cut_line_weight_to_put_left(
4830 coordinate_assigned_part) > local_sEpsilon &&
4831 local_thread_cut_line_weight_to_put_left(
4832 coordinate_assigned_part) >=
4833 std::abs(local_thread_cut_line_weight_to_put_left(
4834 coordinate_assigned_part) - coordinate_weight))
4835 {
4836 local_thread_cut_line_weight_to_put_left(
4837 coordinate_assigned_part) -= coordinate_weight;
4838 // Again if it put too much on left of the cut,
4839 // update how much the next cut sharing the same coordinate will
4840 // put to its left.
4841 if(local_thread_cut_line_weight_to_put_left(
4842 coordinate_assigned_part) < 0 &&
4843 coordinate_assigned_part < num_cuts - 1 &&
4844 std::abs(current_concurrent_cut_coordinate(
4845 coordinate_assigned_part+1) -
4846 current_concurrent_cut_coordinate(
4847 coordinate_assigned_part)) < local_sEpsilon)
4848 {
4849 local_thread_cut_line_weight_to_put_left(
4850 coordinate_assigned_part + 1) +=
4851 local_thread_cut_line_weight_to_put_left(
4852 coordinate_assigned_part);
4853 }
4854 break;
4855 }
4856 }
4857 else {
4858 break;
4859 }
4860 ++coordinate_assigned_part;
4861 }
4862 local_point_counts(coordinate_assigned_part) += 1;
4863 local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4864 }
4865 }
4866
4867 for(int j = 0; j < num_parts; ++j) {
4868 out_part_xadj(j) = local_point_counts(j);
4869 local_point_counts(j) = 0;
4870
4871 if(j != 0) {
4872 out_part_xadj(j) += out_part_xadj(j - 1);
4873 local_point_counts(j) += out_part_xadj(j - 1);
4874 }
4875 }
4876 });
4877
4878 // here we will determine insert indices for N teams
4879 // then all the teams can fill
4880
4881#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4882
4883 // This is the fastest so far - just straight atomic writes for CUDA
4884 // However this is not a deterministic result since it is atomic.
4885 // The final result will be deterministic.
4886 Kokkos::parallel_for(
4887 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4888 coordinate_begin_index, coordinate_end_index),
4889 KOKKOS_LAMBDA (mj_lno_t ii) {
4890 mj_lno_t i = local_coordinate_permutations(ii);
4891 mj_part_t p = local_assigned_part_ids(i);
4892 mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4893 local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4894 });
4895
4896#else
4897
4898#ifdef KOKKOS_ENABLE_OPENMP
4899 // will return and fix this - revert back to 1 for clear auto testing
4900 const int num_threads = 1; // Kokkos::OpenMP::impl_max_hardware_threads();
4901#else
4902 const int num_threads = 1;
4903#endif
4904
4905 const int num_teams = 1; // cuda is handled above using a different format
4906
4907 // allow init - we want all 0's first
4908 Kokkos::View<mj_lno_t*, device_t>
4909 point_counter("insert indices", num_teams * num_threads * num_parts);
4910
4911 // count how many coords per thread
4912 // then we will fill each independently
4913 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4914 block_policy(num_teams, num_threads);
4915 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
4916 member_type member_type;
4917 mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4918 mj_lno_t block_size = range / num_teams + 1;
4919 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4920 int team = team_member.league_rank();
4921 int team_offset = team * num_threads * num_parts;
4922 mj_lno_t begin = coordinate_begin_index + team * block_size;
4923 mj_lno_t end = begin + block_size;
4924 if(end > coordinate_end_index) {
4925 end = coordinate_end_index;
4926 }
4927
4928 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4929 [=] (mj_lno_t ii) {
4930 int thread = team_member.team_rank();
4931 mj_lno_t i = local_coordinate_permutations(ii);
4932 mj_part_t p = local_assigned_part_ids(i);
4933 int index = team_offset + thread * num_parts + p;
4934 ++point_counter(index);
4935 });
4936 });
4937
4938 // now prefix sum
4939 // we currently have the counts in the slots
4940 // we want the first counter for each part to be 0
4941 // then the rest should be the sum of all the priors
4942 Kokkos::parallel_for(
4943 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4944 KOKKOS_LAMBDA (int dummy) {
4945 int num_sets = point_counter.size() / num_parts;
4946 for(int set = num_sets - 1; set >= 1; set -=1) {
4947 int base = set * num_parts;
4948 for(int part = 0; part < num_parts; ++part) {
4949 point_counter(base + part) = point_counter(base + part - num_parts);
4950 }
4951 }
4952
4953 for(int part = 0; part < num_parts; ++part) {
4954 point_counter(part) = 0;
4955 }
4956
4957 for(int set = 1; set < num_sets; ++set) {
4958 int base = set * num_parts;
4959 for(int part = 0; part < num_parts; ++part) {
4960 point_counter(base + part) += point_counter(base + part - num_parts);
4961 }
4962 }
4963 });
4964
4965 // now permute
4966 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4967 int team = team_member.league_rank();
4968 int team_offset = team * num_threads * num_parts;
4969 mj_lno_t begin = coordinate_begin_index + team * block_size;
4970 mj_lno_t end = begin + block_size;
4971 if(end > coordinate_end_index) {
4972 end = coordinate_end_index;
4973 }
4974 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4975 [=] (mj_lno_t ii) {
4976 int thread = team_member.team_rank();
4977 mj_lno_t i = local_coordinate_permutations(ii);
4978 mj_part_t p = local_assigned_part_ids(i);
4979 int index = team_offset + thread * num_parts + p;
4980 int set_counter = (point_counter(index)++) + local_point_counts(p);
4981 local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
4982 });
4983 });
4984#endif
4985}
4986
/*! \brief Examines every cut line of one concurrent part and either marks
 * it as determined or proposes a new coordinate for it.
 *
 * The work is done in one single-team kernel followed, only when at least one
 * cut needs its on-line weight distributed, by a scan across processes and a
 * small finishing kernel. this->incomplete_cut_count is copied into
 * device_incomplete_cut_count on entry and copied back on exit; entry kk is
 * decremented once for every cut that becomes determined.
 *
 * \param current_concurrent_num_parts stride used to index
 *   global_min_max_coord_total_weight: the minimum coordinate is read at kk,
 *   the maximum at kk + stride and the total weight at kk + 2 * stride.
 * \param kk index of the part being processed in the arrays above and in
 *   device_incomplete_cut_count.
 * \param num_cuts number of cut lines that are examined.
 * \param used_imbalance_tolerance a cut is accepted as is when the absolute
 *   imbalance on both of its sides exceeds this value by less than sEpsilon.
 * \param current_global_part_weights entry 2*i is read as the weight on the
 *   left of cut i, entry 2*i+1 as the "line weight" of cut i (presumably the
 *   weight including the points lying on the cut - confirm with the caller).
 * \param current_local_part_weights the difference of entries 2*i+1 and 2*i
 *   is used as this process's weight lying on cut i.
 * \param current_part_target_weights expected weight on the left of cut i.
 * \param current_cut_line_determined set to true for every cut finalized here.
 * \param current_cut_coordinates cut coordinates of the current iteration.
 * \param current_cut_upper_bounds upper coordinate bound per cut (updated).
 * \param current_cut_lower_bounds lower coordinate bound per cut (updated).
 * \param current_global_left_closest_points per cut; reset to the cut
 *   coordinate when it lies below the minimum coordinate by more than
 *   sEpsilon, and used as a new upper bound.
 * \param current_global_right_closest_points per cut; reset to the cut
 *   coordinate when it lies above the maximum coordinate by more than
 *   sEpsilon, and used as a new lower bound.
 * \param current_cut_lower_bound_weights weight belonging to the lower bound.
 * \param current_cut_upper_weights weight belonging to the upper bound.
 * \param new_current_cut_coordinates output: coordinate of every cut for the
 *   next iteration (the unchanged coordinate for determined cuts).
 * \param current_part_cut_line_weight_to_put_left output: how much of this
 *   process's on-cut weight is to be placed on the left of the cut.
 * \param view_rectilinear_cut_count element 0 is incremented for each cut
 *   whose on-line weight must be split; it is reset to 0 by the finishing
 *   kernel, which only runs when the count was positive.
 */
5030template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5031 typename mj_part_t, typename mj_node_t>
5032void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5033 mj_node_t>::mj_get_new_cut_coordinates(
5034 mj_part_t current_concurrent_num_parts,
5035 mj_part_t kk,
5036 const mj_part_t &num_cuts,
5037 const double &used_imbalance_tolerance,
5038 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5039 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5040 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5041 Kokkos::View<bool *, device_t> & current_cut_line_determined,
5042 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5043 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5044 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5045 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5046 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5047 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5048 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5049 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5050 Kokkos::View<mj_scalar_t *, device_t> &
5051 current_part_cut_line_weight_to_put_left,
5052 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5053{
5054 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5055
// Copy the members used below into locals so the lambdas capture these
// copies by value.
5056 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5057 auto local_sEpsilon = sEpsilon;
5058 auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5059 auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5060 auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5061 auto local_global_min_max_coord_total_weight =
5062 global_min_max_coord_total_weight;
5063
// Second copy of sEpsilon (same value as local_sEpsilon); this is the one
// handed to mj_calculate_new_cut_position.
5064 const auto _sEpsilon = this->sEpsilon;
5065 // Note for a 22 part system I tried removing the outer loop
5066 // and doing each sub loop as a simple parallel_for over num_cuts.
5067 // But that was about twice as slow (10ms) as the current form (5ms)
5068 // so I think the overhead of launching the new global parallel kernels
5069 // is costly. This form is just running one team so effectively using
5070 // a single warp to process the cuts. I expect with a lot of parts this
5071 // might need changing.
5072 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5073 policy_one_team(1, Kokkos::AUTO());
5074 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
5075 member_type member_type;
5076 Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {
5077
5078 mj_scalar_t min_coordinate =
5079 local_global_min_max_coord_total_weight(kk);
5080 mj_scalar_t max_coordinate =
5081 local_global_min_max_coord_total_weight(
5082 kk + current_concurrent_num_parts);
5083 mj_scalar_t global_total_weight =
5084 local_global_min_max_coord_total_weight(
5085 kk + current_concurrent_num_parts * 2);
5086
5087 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5088 [=] (mj_part_t i) {
5089 // if left and right closest points are not set yet,
5090 // set it to the cut itself.
5091 if(min_coordinate -
5092 current_global_left_closest_points(i) > local_sEpsilon) {
5093 current_global_left_closest_points(i) =
5094 current_cut_coordinates(i);
5095 }
5096 if(current_global_right_closest_points(i) -
5097 max_coordinate > local_sEpsilon) {
5098 current_global_right_closest_points(i) =
5099 current_cut_coordinates(i);
5100 }
5101 });
5102 team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
5103
// Second pass: one iteration per cut. Each iteration writes only the
// entries of its own cut i, but reads weights, coordinates and closest
// points of neighbouring cuts ii.
5104 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5105 [=] (mj_part_t i) {
5106 using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5107 mj_node_t>;
5108 // seen weight in the part
5109 mj_scalar_t seen_weight_in_part = 0;
5110 // expected weight for part.
5111 mj_scalar_t expected_weight_in_part = 0;
5112 // imbalance for the left and right side of the cut.
5113 double imbalance_on_left = 0, imbalance_on_right = 0;
5114 if(local_distribute_points_on_cut_lines) {
5115 // init the weight on the cut.
5116 local_global_rectilinear_cut_weight(i) = 0;
5117 local_process_rectilinear_cut_weight(i) = 0;
5118 }
// bContinue stands in for a "continue" statement: once it is true the
// remaining work for this cut is skipped.
5119 bool bContinue = false;
5120 // if already determined at previous iterations,
5121 // then just write the coordinate to new array, and proceed.
5122 if(current_cut_line_determined(i)) {
5123 new_current_cut_coordinates(i) =
5124 current_cut_coordinates(i);
5125 bContinue = true;
5126 }
5127 if(!bContinue) {
5128 //current weight of the part at the left of the cut line.
5129 seen_weight_in_part = current_global_part_weights(i * 2);
5130
5131 //expected ratio
5132 expected_weight_in_part = current_part_target_weights(i);
5133
5134 //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
5135 imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5136 expected_weight_in_part);
5137 // rightImbalance = imbalanceOf(globalTotalWeight - seenW,
5138 // globalTotalWeight, 1 - expected);
5139 imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5140 seen_weight_in_part, global_total_weight - expected_weight_in_part);
5141 bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5142 used_imbalance_tolerance < local_sEpsilon ;
5143 bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5144 used_imbalance_tolerance < local_sEpsilon;
5145 //if the cut line reaches to desired imbalance.
5146 if(is_left_imbalance_valid && is_right_imbalance_valid) {
5147 current_cut_line_determined(i) = true;
5148 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5149 new_current_cut_coordinates(i) = current_cut_coordinates(i);
5150 }
5151 else if(imbalance_on_left < 0) {
5152 //if left imbalance < 0 then we need to move the cut to right.
5153 if(local_distribute_points_on_cut_lines) {
5154 // if it is okay to distribute the coordinate on
5155 // the same coordinate to left and right.
5156 // then check if we can reach to the target weight by including the
5157 // coordinates in the part.
5158 if(current_global_part_weights(i * 2 + 1) ==
5159 expected_weight_in_part) {
5160 // if it is we are done.
5161 current_cut_line_determined(i) = true;
5162 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5163
5164 //then assign everything on the cut to the left of the cut.
5165 new_current_cut_coordinates(i) =
5166 current_cut_coordinates(i);
5167 //for this cut all the weight on cut will be put to left.
5168 current_part_cut_line_weight_to_put_left(i) =
5169 current_local_part_weights(i * 2 + 1) -
5170 current_local_part_weights(i * 2);
5171 bContinue = true;
5172 }
5173 else if(current_global_part_weights(i * 2 + 1) >
5174 expected_weight_in_part) {
5175 // if the weight is larger than the expected weight,
5176 // then we need to distribute some points to left, some to right.
5177 current_cut_line_determined(i) = true;
5178 Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5179
5180 // increase the num cuts to be determined with rectilinear
5181 // partitioning.
5182 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5183 new_current_cut_coordinates(i) =
5184 current_cut_coordinates(i);
5185 local_process_rectilinear_cut_weight[i] =
5186 current_local_part_weights(i * 2 + 1) -
5187 current_local_part_weights(i * 2);
5188 bContinue = true;
5189 }
5190 }
5191
5192 if(!bContinue) {
5193
5194 // we need to move further right, so set lower bound to current line,
5195 // and shift it to the closest point from right.
5196 current_cut_lower_bounds(i) =
5197 current_global_right_closest_points(i);
5198
5199 //set the lower bound weight to the weight we have seen.
5200 current_cut_lower_bound_weights(i) = seen_weight_in_part;
5201
5202 // compare the upper bound with what has been found in the
5203 // last iteration.
5204 // we try to make more strict bounds for the cut here.
5205 for(mj_part_t ii = i + 1; ii < num_cuts ; ++ii) {
5206 mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5207 mj_scalar_t line_weight =
5208 current_global_part_weights(ii * 2 + 1);
5209 if(p_weight >= expected_weight_in_part) {
5210 // if a cut on the right has the expected weight, then we found
5211 // our cut position. Set up and low coordinates to this
5212 // new cut coordinate, but we need one more iteration to
5213 // finalize the cut position, as we need to update the part ids.
5214 if(p_weight == expected_weight_in_part) {
5215 current_cut_upper_bounds(i) =
5216 current_cut_coordinates(ii);
5217 current_cut_upper_weights(i) = p_weight;
5218 current_cut_lower_bounds(i) =
5219 current_cut_coordinates(ii);
5220 current_cut_lower_bound_weights(i) = p_weight;
5221 } else if(p_weight < current_cut_upper_weights(i)) {
5222 // if a part weight is larger then my expected weight,
5223 // but lower than my upper bound weight, update upper bound.
5224 current_cut_upper_bounds(i) =
5225 current_global_left_closest_points(ii);
5226 current_cut_upper_weights(i) = p_weight;
5227 }
5228 break;
5229 }
5230 // if comes here then pw < ew
5231 // then compare the weight against line weight.
5232 if(line_weight >= expected_weight_in_part) {
5233 // if the line is larger than the expected weight, then we need
5234 // to reach to the balance by distributing coordinates on
5235 // this line.
5236 current_cut_upper_bounds(i) =
5237 current_cut_coordinates(ii);
5238 current_cut_upper_weights(i) = line_weight;
5239 current_cut_lower_bounds(i) =
5240 current_cut_coordinates(ii);
5241 current_cut_lower_bound_weights(i) = p_weight;
5242 break;
5243 }
5244 // if a stricter lower bound is found,
5245 // update the lower bound.
5246 if(p_weight <= expected_weight_in_part && p_weight >=
5247 current_cut_lower_bound_weights(i)) {
5248 current_cut_lower_bounds(i) =
5249 current_global_right_closest_points(ii);
5250 current_cut_lower_bound_weights(i) = p_weight;
5251 }
5252 }
5253
5254 mj_scalar_t new_cut_position = 0;
5255 algMJ_t::mj_calculate_new_cut_position(
5256 current_cut_upper_bounds(i),
5257 current_cut_lower_bounds(i),
5258 current_cut_upper_weights(i),
5259 current_cut_lower_bound_weights(i),
5260 expected_weight_in_part, new_cut_position,
5261 _sEpsilon);
5262
5263 // if cut line does not move significantly.
5264 // then finalize the search.
5265 if(std::abs(current_cut_coordinates(i) -
5266 new_cut_position) < local_sEpsilon) {
5267 current_cut_line_determined(i) = true;
5268 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5269
5270 //set the cut coordinate and proceed.
5271 new_current_cut_coordinates(i) =
5272 current_cut_coordinates(i);
5273 } else {
5274 new_current_cut_coordinates(i) = new_cut_position;
5275 }
5276 } // end of inner if(!bContinue)
5277 } else {
5278 // need to move the cut line to left.
5279 // set upper bound to current line.
5280 current_cut_upper_bounds(i) =
5281 current_global_left_closest_points(i);
5282 current_cut_upper_weights(i) =
5283 seen_weight_in_part;
5284 // compare the current cut line weights with
5285 // previous upper and lower bounds.
5286 for(int ii = i - 1; ii >= 0; --ii) {
5287 mj_scalar_t p_weight =
5288 current_global_part_weights(ii * 2);
5289 mj_scalar_t line_weight =
5290 current_global_part_weights(ii * 2 + 1);
5291 if(p_weight <= expected_weight_in_part) {
5292 if(p_weight == expected_weight_in_part) {
5293 // if the weight of the part is my expected weight
5294 // then we find the solution.
5295 current_cut_upper_bounds(i) =
5296 current_cut_coordinates(ii);
5297 current_cut_upper_weights(i) = p_weight;
5298 current_cut_lower_bounds(i) =
5299 current_cut_coordinates(ii);
5300 current_cut_lower_bound_weights(i) = p_weight;
5301 }
5302 else if(p_weight > current_cut_lower_bound_weights(i)) {
5303 // if found weight is bigger than the lower bound
5304 // then update the lower bound.
5305 current_cut_lower_bounds(i) =
5306 current_global_right_closest_points(ii);
5307 current_cut_lower_bound_weights(i) = p_weight;
5308
5309 // at the same time, if weight of line is bigger than the
5310 // expected weight, then update the upper bound as well.
5311 // in this case the balance will be obtained by distributing
5312 // weights on this cut position.
5313 if(line_weight > expected_weight_in_part) {
5314 current_cut_upper_bounds(i) =
5315 current_global_right_closest_points(ii);
5316 current_cut_upper_weights(i) = line_weight;
5317 }
5318 }
5319 break;
5320 }
5321 // if the weight of the cut on the left is still bigger than
5322 // my weight, and also if the weight is smaller than the current
5323 // upper weight, or if the weight is equal to current upper
5324 // weight, but on the left of the upper weight, then update
5325 // upper bound.
5326 if(p_weight >= expected_weight_in_part &&
5327 (p_weight < current_cut_upper_weights(i) ||
5328 (p_weight == current_cut_upper_weights(i) &&
5329 current_cut_upper_bounds(i) >
5330 current_global_left_closest_points(ii)))) {
5331 current_cut_upper_bounds(i) =
5332 current_global_left_closest_points(ii);
5333 current_cut_upper_weights(i) = p_weight;
5334 }
5335 }
5336 mj_scalar_t new_cut_position = 0;
5337 algMJ_t::mj_calculate_new_cut_position(
5338 current_cut_upper_bounds(i),
5339 current_cut_lower_bounds(i),
5340 current_cut_upper_weights(i),
5341 current_cut_lower_bound_weights(i),
5342 expected_weight_in_part,
5343 new_cut_position,
5344 _sEpsilon);
5345
5346 // if cut line does not move significantly.
5347 if(std::abs(current_cut_coordinates(i) -
5348 new_cut_position) < local_sEpsilon) {
5349 current_cut_line_determined(i) = true;
5350 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5351 //set the cut coordinate and proceed.
5352 new_current_cut_coordinates(i) =
5353 current_cut_coordinates(i);
5354 } else {
5355 new_current_cut_coordinates(i) =
5356 new_cut_position;
5357 }
5358 }
5359 }; // end of outer if(!bContinue)
5360 });
5361
5362 team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
5363 });
5364
5365 // view_rectilinear_cut_count
// A single-iteration reduce is used to bring view_rectilinear_cut_count(0)
// into the host variable rectilinear_cut_count.
5366 mj_part_t rectilinear_cut_count;
5367 Kokkos::parallel_reduce("Read bDoingWork",
5368 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5369 KOKKOS_LAMBDA(int dummy, int & set_single) {
5370 set_single = view_rectilinear_cut_count(0);
5371 }, rectilinear_cut_count);
5372
5373 if(rectilinear_cut_count > 0) {
// Teuchos::scan works on raw host pointers, so both weight arrays are
// copied to host mirrors, scanned, and copied back.
5374 auto host_local_process_rectilinear_cut_weight =
5375 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5376 local_process_rectilinear_cut_weight);
5377 auto host_local_global_rectilinear_cut_weight =
5378 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5379 local_global_rectilinear_cut_weight);
5380 Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5381 local_process_rectilinear_cut_weight);
5382 Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5383 local_global_rectilinear_cut_weight);
5384 Teuchos::scan<int,mj_scalar_t>(
5385 *comm, Teuchos::REDUCE_SUM,
5386 num_cuts,
5387 host_local_process_rectilinear_cut_weight.data(),
5388 host_local_global_rectilinear_cut_weight.data());
5389 Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5390 host_local_process_rectilinear_cut_weight);
5391 Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5392 host_local_global_rectilinear_cut_weight);
5393
5394 Kokkos::parallel_for("finish up mj_get_new_cut_coordinates",
5395 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5396 KOKKOS_LAMBDA(int dummy) {
5397 for(mj_part_t i = 0; i < num_cuts; ++i) {
5398 // if cut line weight to be distributed.
5399 if(local_global_rectilinear_cut_weight(i) > 0) {
5400 // expected weight to go to left of the cut.
5401 mj_scalar_t expected_part_weight = current_part_target_weights(i);
5402 // the weight that should be put to left of the cut.
5403 mj_scalar_t necessary_weight_on_line_for_left =
5404 expected_part_weight - current_global_part_weights(i * 2);
5405
5406 // the weight of the cut in the process
5407 mj_scalar_t my_weight_on_line =
5408 local_process_rectilinear_cut_weight(i);
5409
5410 // the sum of the cut weights upto this process,
5411 // including the weight of this process.
5412 mj_scalar_t weight_on_line_upto_process_inclusive =
5413 local_global_rectilinear_cut_weight(i);
5414 // the space on the left side of the cut after all processes
5415 // before this process (including this process)
5416 // puts their weights on cut to left.
5417 mj_scalar_t space_to_put_left =
5418 necessary_weight_on_line_for_left -
5419 weight_on_line_upto_process_inclusive;
5420 // add my weight to this space to find out how much space
5421 // is left to me.
5422 mj_scalar_t space_left_to_me =
5423 space_to_put_left + my_weight_on_line;
5424
5425 /*
5426 cout << "expected_part_weight:" << expected_part_weight
5427 << " necessary_weight_on_line_for_left:"
5428 << necessary_weight_on_line_for_left
5429 << " my_weight_on_line" << my_weight_on_line
5430 << " weight_on_line_upto_process_inclusive:"
5431 << weight_on_line_upto_process_inclusive
5432 << " space_to_put_left:" << space_to_put_left
5433 << " space_left_to_me" << space_left_to_me << endl;
5434 */
5435
5436 if(space_left_to_me < 0) {
5437 // space_left_to_me is negative and i dont need to put
5438 // anything to left.
5439 current_part_cut_line_weight_to_put_left(i) = 0;
5440 }
5441 else if(space_left_to_me >= my_weight_on_line) {
5442 // space left to me is bigger than the weight of the
5443 // processor on cut.
5444 // so put everything to left.
5445 current_part_cut_line_weight_to_put_left(i) =
5446 my_weight_on_line;
5447 // cout << "setting current_part_cut_line_weight_to_put_left
5448 // to my_weight_on_line:" << my_weight_on_line << endl;
5449 }
5450 else {
5451 // put only the weight as much as the space.
5452 current_part_cut_line_weight_to_put_left(i) =
5453 space_left_to_me;
5454 // cout << "setting current_part_cut_line_weight_to_put_left
5455 // to space_left_to_me:" << space_left_to_me << endl;
5456 }
5457 }
5458 }
5459 view_rectilinear_cut_count(0) = 0;
5460 });
5461 }
5462
5463 Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5464}
5465
5465
5475template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5476 typename mj_part_t, typename mj_node_t>
5477void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5478 get_processor_num_points_in_parts(
5479 mj_part_t num_procs,
5480 mj_part_t num_parts,
5481 mj_gno_t *&num_points_in_all_processor_parts)
5482{
5483 // initially allocation_size is num_parts
5484 size_t allocation_size = num_parts * (num_procs + 1);
5485
5486 // this will be output
5487 // holds how many each processor has in each part.
5488 // last portion is the sum of all processor points in each part.
5489
5490 // allocate memory for the local num coordinates in each part.
5491 mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5492 new mj_gno_t[allocation_size];
5493
5494 // this is the portion of the memory which will be used
5495 // at the summation to obtain total number of processors' points in each part.
5496 mj_gno_t *my_local_points_to_reduce_sum =
5497 num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5498
5499 // this is the portion of the memory where each stores its local number.
5500 // this information is needed by other processors.
5501 mj_gno_t *my_local_point_counts_in_each_part =
5502 num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5503
5504 // initialize the array with 0's.
5505 memset(num_local_points_in_each_part_to_reduce_sum, 0,
5506 sizeof(mj_gno_t)*allocation_size);
5507
5508 auto local_new_part_xadj = this->new_part_xadj;
5509 Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5510 Kokkos::ViewAllocateWithoutInitializing("points per part"), num_parts);
5511 Kokkos::parallel_for("get vals on device",
5512 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5513 (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5514 points_per_part(i) =
5515 local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5516 });
5517 auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5518 Kokkos::deep_copy(host_points_per_part, points_per_part);
5519 for(int i = 0; i < num_parts; ++i) {
5520 my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5521 }
5522
5523 // copy the local num parts to the last portion of array, so that this portion
5524 // will represent the global num points in each part after the reduction.
5525 memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5526 sizeof(mj_gno_t) * (num_parts) );
5527
5528 // reduceAll operation.
5529 // the portion that belongs to a processor with index p
5530 // will start from myRank * num_parts.
5531 // the global number of points will be held at the index
5532 try{
5533 reduceAll<int, mj_gno_t>(
5534 *(this->comm),
5535 Teuchos::REDUCE_SUM,
5536 allocation_size,
5537 num_local_points_in_each_part_to_reduce_sum,
5538 num_points_in_all_processor_parts);
5539 }
5540 Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
5541
5542 delete [] num_local_points_in_each_part_to_reduce_sum;
5543}
5544
5560template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5561 typename mj_part_t, typename mj_node_t>
5562bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5563 mj_check_to_migrate(
5564 size_t migration_reduce_all_population,
5565 mj_lno_t num_coords_for_last_dim_part,
5566 mj_part_t num_procs,
5567 mj_part_t num_parts,
5568 mj_gno_t *num_points_in_all_processor_parts)
5569{
5570 // if reduce all count and population in the last dim is too high
5571 if(migration_reduce_all_population > future_reduceall_cutoff) {
5572 return true;
5573 }
5574
5575 // if the work in a part per processor in the last dim is too low.
5576 if(num_coords_for_last_dim_part < min_work_last_dim) {
5577 return true;
5578 }
5579
5580 // if migration is to be checked and the imbalance is too high
5581 if(this->check_migrate_avoid_migration_option == 0) {
5582 double global_imbalance = 0;
5583 // global shift to reach the sum of coordiante count in each part.
5584 size_t global_shift = num_procs * num_parts;
5585
5586 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5587 for(mj_part_t i = 0; i < num_parts; ++i) {
5588 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5589 / double(num_procs);
5590
5591 global_imbalance += std::abs(ideal_num -
5592 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5593 }
5594 }
5595 global_imbalance /= num_parts;
5596 global_imbalance /= num_procs;
5597
5598 if(global_imbalance <= this->minimum_migration_imbalance) {
5599 return false;
5600 }
5601 else {
5602 return true;
5603 }
5604 }
5605 else {
5606 // if migration is forced
5607 return true;
5608 }
5609}
5610
5624template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5625 typename mj_part_t, typename mj_node_t>
5626void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5627 assign_send_destinations(
5628 mj_part_t num_parts,
5629 mj_part_t *part_assignment_proc_begin_indices,
5630 mj_part_t *processor_chains_in_parts,
5631 mj_lno_t *send_count_to_each_proc,
5632 int *coordinate_destinations) {
5633
5634 auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5635 deep_copy(host_new_part_xadj, this->new_part_xadj);
5636
5637 auto host_new_coordinate_permutations =
5638 Kokkos::create_mirror_view(this->new_coordinate_permutations);
5639 deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5640
5641 for(mj_part_t p = 0; p < num_parts; ++p) {
5642 mj_lno_t part_begin = 0;
5643 if(p > 0) part_begin = host_new_part_xadj(p - 1);
5644 mj_lno_t part_end = host_new_part_xadj(p);
5645 // get the first part that current processor will send its part-p.
5646 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5647 // initialize how many point I sent to this processor.
5648 mj_lno_t num_total_send = 0;
5649 for(mj_lno_t j=part_begin; j < part_end; j++) {
5650 mj_lno_t local_ind = host_new_coordinate_permutations(j);
5651 while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5652 // then get the next processor to send the points in part p.
5653 num_total_send = 0;
5654 // assign new processor to part_assign_begin[p]
5655 part_assignment_proc_begin_indices[p] =
5656 processor_chains_in_parts[proc_to_sent];
5657 // remove the previous processor
5658 processor_chains_in_parts[proc_to_sent] = -1;
5659 // choose the next processor as the next one to send.
5660 proc_to_sent = part_assignment_proc_begin_indices[p];
5661 }
5662 // write the gno index to corresponding position in sendBuf.
5663 coordinate_destinations[local_ind] = proc_to_sent;
5664 ++num_total_send;
5665 }
5666 }
5667}
5668
5689template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5690 typename mj_part_t, typename mj_node_t>
5691void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5692 mj_assign_proc_to_parts(
5693 mj_gno_t * num_points_in_all_processor_parts,
5694 mj_part_t num_parts,
5695 mj_part_t num_procs,
5696 mj_lno_t *send_count_to_each_proc,
5697 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5698 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5699 mj_part_t &out_part_index,
5700 mj_part_t &output_part_numbering_begin_index,
5701 int * coordinate_destinations) {
5702 mj_gno_t *global_num_points_in_parts =
5703 num_points_in_all_processor_parts + num_procs * num_parts;
5704 mj_part_t *num_procs_assigned_to_each_part = new mj_part_t[num_parts];
5705
5706 // boolean variable if the process finds its part to be assigned.
5707 bool did_i_find_my_group = false;
5708
5709 mj_part_t num_free_procs = num_procs;
5710 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5711
5712 double max_imbalance_difference = 0;
5713 mj_part_t max_differing_part = 0;
5714
5715 // find how many processor each part requires.
5716 for(mj_part_t i = 0; i < num_parts; i++) {
5717
5718 // scalar portion of the required processors
5719 double scalar_required_proc = num_procs *
5720 (double (global_num_points_in_parts[i]) /
5721 double (this->num_global_coords));
5722
5723 // round it to closest integer; make sure have at least one proc.
5724 mj_part_t required_proc =
5725 static_cast<mj_part_t> (0.5 + scalar_required_proc);
5726 if(required_proc == 0) required_proc = 1;
5727
5728 // if assigning the required num procs, creates problems for the rest
5729 // of the parts, then only assign {num_free_procs -
5730 // (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
5731 if(num_free_procs -
5732 required_proc < minimum_num_procs_required_for_rest_of_parts) {
5733 required_proc = num_free_procs -
5734 (minimum_num_procs_required_for_rest_of_parts);
5735 }
5736
5737 // reduce the free processor count
5738 num_free_procs -= required_proc;
5739
5740 // reduce the free minimum processor count required for the rest of the
5741 // part by 1.
5742 --minimum_num_procs_required_for_rest_of_parts;
5743
5744 // part (i) is assigned to (required_proc) processors.
5745 num_procs_assigned_to_each_part[i] = required_proc;
5746
5747 // because of the roundings some processors might be left as unassigned.
5748 // we want to assign those processors to the part with most imbalance.
5749 // find the part with the maximum imbalance here.
5750 double imbalance_wrt_ideal =
5751 (scalar_required_proc - required_proc) / required_proc;
5752 if(imbalance_wrt_ideal > max_imbalance_difference) {
5753 max_imbalance_difference = imbalance_wrt_ideal;
5754 max_differing_part = i;
5755 }
5756 }
5757
5758 // assign extra processors to the part with maximum imbalance
5759 // than the ideal.
5760 if(num_free_procs > 0) {
5761 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5762 }
5763
5764 // now find what are the best processors with least migration for each part.
5765
5766 // part_assignment_proc_begin_indices ([i]) is the array that holds the
5767 // beginning index of a processor that processor sends its data for part - i
5768 mj_part_t *part_assignment_proc_begin_indices = new mj_part_t[num_parts];
5769
5770 // the next processor send is found in processor_chains_in_parts,
5771 // in linked list manner.
5772 mj_part_t *processor_chains_in_parts = new mj_part_t [num_procs];
5773 mj_part_t *processor_part_assignments = new mj_part_t[num_procs];
5774
5775 // initialize the assignment of each processor.
5776 // this has a linked list implementation.
5777 // the beginning of processors assigned
5778 // to each part is hold at part_assignment_proc_begin_indices[part].
5779 // then the next processor assigned to that part is located at
5780 // proc_part_assignments[part_assign_begins[part]], this is a chain
5781 // until the value of -1 is reached.
5782 for(int i = 0; i < num_procs; ++i ) {
5783 processor_part_assignments[i] = -1;
5784 processor_chains_in_parts[i] = -1;
5785 }
5786 for(int i = 0; i < num_parts; ++i ) {
5787 part_assignment_proc_begin_indices[i] = -1;
5788 }
5789
5790 // std::cout << "Before migration: mig type:" <<
5791 // this->migration_type << std::endl;
5792 // Allocate memory for sorting data structure.
5793 uSignedSortItem<mj_part_t, mj_gno_t, char> *
5794 sort_item_num_part_points_in_procs =
5795 new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5796
5797 for(mj_part_t i = 0; i < num_parts; ++i) {
5798 // the algorithm tries to minimize the cost of migration, by assigning the
5799 // processors with highest number of coordinates on that part.
5800 // here we might want to implement a maximum weighted bipartite matching
5801 // algorithm.
5802 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5803 sort_item_num_part_points_in_procs[ii].id = ii;
5804 // if processor is not assigned yet.
5805 // add its num points to the sort data structure.
5806 if(processor_part_assignments[ii] == -1) {
5807 sort_item_num_part_points_in_procs[ii].val =
5808 num_points_in_all_processor_parts[ii * num_parts + i];
5809 // indicate that the processor has positive weight.
5810 sort_item_num_part_points_in_procs[ii].signbit = 1;
5811 }
5812 else {
5813 // if processor is already assigned, insert -nLocal - 1 so that it
5814 // won't be selected again.
5815 // would be same if we simply set it to -1, but more information with
5816 // no extra cost (which is used later) is provided.
5817 // sort_item_num_part_points_in_procs[ii].val =
5818 // -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
5819
5820 // UPDATE: Since above gets warning when unsigned is used to
5821 // represent, we added extra bit to as sign bit to the sort item.
5822 // It is 1 for positives, 0 for negatives.
5823 sort_item_num_part_points_in_procs[ii].val =
5824 num_points_in_all_processor_parts[ii * num_parts + i];
5825 sort_item_num_part_points_in_procs[ii].signbit = 0;
5826 }
5827 }
5828
5829 // sort the processors in the part.
5830 uqSignsort<mj_part_t, mj_gno_t,char>
5831 (num_procs, sort_item_num_part_points_in_procs);
5832
5833 /*
5834 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5835 std::cout << "ii:" << ii << " " <<
5836 sort_item_num_part_points_in_procs[ii].id <<
5837 " " << sort_item_num_part_points_in_procs[ii].val <<
5838 " " << int(sort_item_num_part_points_in_procs[ii].signbit) <<
5839 std::endl;
5840 }
5841 */
5842
5843 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5844 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5845 mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5846 ceil(total_num_points_in_part / double (required_proc_count)));
5847
5848 // starts sending to least heaviest part.
5849 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5850 mj_part_t next_proc_to_send_id =
5851 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5852 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5853 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5854
5855 // find the processors that will be assigned to this part, which are the
5856 // heaviest non assigned processors.
5857 for(mj_part_t ii = num_procs - 1;
5858 ii >= num_procs - required_proc_count; --ii) {
5859 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5860 // assign processor to part - i.
5861 processor_part_assignments[proc_id] = i;
5862 }
5863
5864 bool did_change_sign = false;
5865 // if processor has a minus count, reverse it.
5866 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5867 // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
5868 // TODO: SEE BUG 6194
5869 if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5870 did_change_sign = true;
5871 sort_item_num_part_points_in_procs[ii].signbit = 1;
5872 }
5873 else {
5874 break;
5875 }
5876 }
5877
5878 if(did_change_sign) {
5879 // resort the processors in the part for the rest of the processors that
5880 // is not assigned.
5881 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5882 sort_item_num_part_points_in_procs);
5883 }
5884
5885 /*
5886 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5887 std::cout << "after resort ii:" << ii << " " <<
5888 sort_item_num_part_points_in_procs[ii].id <<
5889 " " << sort_item_num_part_points_in_procs[ii].val <<
5890 " " << int(sort_item_num_part_points_in_procs[ii].signbit ) <<
5891 std::endl;
5892 }
5893 */
5894
5895 // check if this processors is one of the procs assigned to this part.
5896 // if it is, then get the group.
5897 if(!did_i_find_my_group) {
5898 for(mj_part_t ii = num_procs - 1; ii >=
5899 num_procs - required_proc_count; --ii) {
5900
5901 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5902
5903 // add the proc to the group.
5904 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5905
5906 if(proc_id_to_assign == this->myRank) {
5907 // if the assigned process is me, then I find my group.
5908 did_i_find_my_group = true;
5909
5910 // set the beginning of part i to my rank.
5911 part_assignment_proc_begin_indices[i] = this->myRank;
5912 processor_chains_in_parts[this->myRank] = -1;
5913
5914 // set send count to myself to the number of points that I have
5915 // in part i.
5916 send_count_to_each_proc[this->myRank] =
5917 sort_item_num_part_points_in_procs[ii].val;
5918
5919 // calculate the shift required for the
5920 // output_part_numbering_begin_index
5921 for(mj_part_t in = 0; in < i; ++in) {
5922 output_part_numbering_begin_index +=
5923 (*next_future_num_parts_in_parts)[in];
5924 }
5925 out_part_index = i;
5926 }
5927 }
5928
5929 // if these was not my group,
5930 // clear the subcomminicator processor array.
5931 if(!did_i_find_my_group) {
5932 processor_ranks_for_subcomm.clear();
5933 }
5934 }
5935
5936 // send points of the nonassigned coordinates to the assigned coordinates.
5937 // starts from the heaviest nonassigned processor.
5938 // TODO we might want to play with this part, that allows more
5939 // computational imbalance but having better communication balance.
5940 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5941 mj_part_t nonassigned_proc_id =
5942 sort_item_num_part_points_in_procs[ii].id;
5943 mj_lno_t num_points_to_sent =
5944 sort_item_num_part_points_in_procs[ii].val;
5945
5946 // we set number of points to -to_sent - 1 for the assigned processors.
5947 // we reverse it here. This should not happen, as we have already
5948 // reversed them above.
5949#ifdef MJ_DEBUG
5950 if(num_points_to_sent < 0) {
5951 cout << "Migration - processor assignments - for part:" << i
5952 << "from proc:" << nonassigned_proc_id << " num_points_to_sent:"
5953 << num_points_to_sent << std::endl;
5954 std::terminate();
5955 }
5956#endif
5957
5958 switch (migration_type) {
5959 case 0:
5960 {
5961 // now sends the points to the assigned processors.
5962 while (num_points_to_sent > 0) {
5963 // if the processor has enough space.
5964 if(num_points_to_sent <= space_left_in_sent_proc) {
5965 // reduce the space left in the processor.
5966 space_left_in_sent_proc -= num_points_to_sent;
5967 // if my rank is the one that is sending the coordinates.
5968 if(this->myRank == nonassigned_proc_id) {
5969 // set my sent count to the sent processor.
5970 send_count_to_each_proc[next_proc_to_send_id] =
5971 num_points_to_sent;
5972 // save the processor in the list (processor_chains_in_parts
5973 // and part_assignment_proc_begin_indices)
5974 // that the processor will send its point in part-i.
5975 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
5976 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
5977 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
5978 }
5979 num_points_to_sent = 0;
5980 }
5981 else {
5982 // there might be no space left in the processor.
5983 if(space_left_in_sent_proc > 0) {
5984 num_points_to_sent -= space_left_in_sent_proc;
5985
5986 //send as the space left in the processor.
5987 if(this->myRank == nonassigned_proc_id) {
5988 // send as much as the space in this case.
5989 send_count_to_each_proc[next_proc_to_send_id] =
5990 space_left_in_sent_proc;
5991 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
5992 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
5993 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
5994 }
5995 }
5996 // change the sent part
5997 ++next_proc_to_send_index;
5998
5999#ifdef MJ_DEBUG
6000 if(next_part_to_send_index < nprocs - required_proc_count ) {
6001 cout << "Migration - processor assignments - for part:"
6002 << i
6003 << " next_part_to_send :" << next_part_to_send_index
6004 << " nprocs:" << nprocs
6005 << " required_proc_count:" << required_proc_count
6006 << " Error: next_part_to_send_index <" <<
6007 << " nprocs - required_proc_count" << std::endl;
6008 std::terminate();
6009 }
6010#endif
6011 // send the new id.
6012 next_proc_to_send_id =
6013 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6014 // set the new space in the processor.
6015 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6016 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6017 }
6018 }
6019 }
6020 break;
6021 default:
6022 {
6023 // to minimize messages, we want each processor to send its
6024 // coordinates to only a single point.
6025 // we do not respect imbalances here, we send all points to the
6026 // next processor.
6027 if(this->myRank == nonassigned_proc_id) {
6028 // set my sent count to the sent processor.
6029 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6030 // save the processor in the list (processor_chains_in_parts and
6031 // part_assignment_proc_begin_indices)
6032 // that the processor will send its point in part-i.
6033 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6034 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6035 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6036 }
6037 num_points_to_sent = 0;
6038 ++next_proc_to_send_index;
6039
6040 // if we made it to the heaviest processor we round robin and
6041 // go to beginning
6042 if(next_proc_to_send_index == num_procs) {
6043 next_proc_to_send_index = num_procs - required_proc_count;
6044 }
6045 // send the new id.
6046 next_proc_to_send_id =
6047 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6048 // set the new space in the processor.
6049 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6050 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6051 }
6052 }
6053 }
6054 }
6055
6056 /*
6057 for(int i = 0; i < num_procs;++i) {
6058 std::cout << "me:" << this->myRank << " to part:" << i << " sends:" <<
6059 send_count_to_each_proc[i] << std::endl;
6060 }
6061 */
6062
6063 this->assign_send_destinations(
6064 num_parts,
6065 part_assignment_proc_begin_indices,
6066 processor_chains_in_parts,
6067 send_count_to_each_proc,
6068 coordinate_destinations);
6069 delete [] part_assignment_proc_begin_indices;
6070 delete [] processor_chains_in_parts;
6071 delete [] processor_part_assignments;
6072 delete [] sort_item_num_part_points_in_procs;
6073 delete [] num_procs_assigned_to_each_part;
6074}
6075
6091template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6092 typename mj_part_t, typename mj_node_t>
6093void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6094 assign_send_destinations2(
6095 mj_part_t num_parts,
6096 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6097 int *coordinate_destinations,
6098 mj_part_t &output_part_numbering_begin_index,
6099 std::vector<mj_part_t> *next_future_num_parts_in_parts)
6100{
6101 mj_part_t part_shift_amount = output_part_numbering_begin_index;
6102 mj_part_t previous_processor = -1;
6103
6104 auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6105 Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6106
6107 auto local_new_coordinate_permutations =
6108 Kokkos::create_mirror_view(this->new_coordinate_permutations);
6109 Kokkos::deep_copy(local_new_coordinate_permutations,
6110 this->new_coordinate_permutations);
6111
6112 for(mj_part_t i = 0; i < num_parts; ++i) {
6113 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6114
6115 // assigned processors are sorted.
6116 mj_lno_t part_begin_index = 0;
6117
6118 if(p > 0) {
6119 part_begin_index = local_new_part_xadj(p - 1);
6120 }
6121
6122 mj_lno_t part_end_index = local_new_part_xadj(p);
6123
6124 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6125 if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6126 output_part_numbering_begin_index = part_shift_amount;
6127 }
6128 previous_processor = assigned_proc;
6129 part_shift_amount += (*next_future_num_parts_in_parts)[p];
6130
6131 for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6132 mj_lno_t localInd = local_new_coordinate_permutations(j);
6133 coordinate_destinations[localInd] = assigned_proc;
6134 }
6135 }
6136}
6137
6159template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6160 typename mj_part_t, typename mj_node_t>
6161void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6162 mj_assign_parts_to_procs(
6163 mj_gno_t * num_points_in_all_processor_parts,
6164 mj_part_t num_parts,
6165 mj_part_t num_procs,
6166 mj_lno_t *send_count_to_each_proc,
6167 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6168 mj_part_t &out_num_part,
6169 std::vector<mj_part_t> &out_part_indices,
6170 mj_part_t &output_part_numbering_begin_index,
6171 int *coordinate_destinations) {
6172
6173 out_num_part = 0;
6174 mj_gno_t *global_num_points_in_parts =
6175 num_points_in_all_processor_parts + num_procs * num_parts;
6176 out_part_indices.clear();
6177
6178 // to sort the parts that is assigned to the processors.
6179 // id is the part number, sort value is the assigned processor id.
6180 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
6181 new uSortItem<mj_part_t, mj_part_t>[num_parts];
6182 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
6183 new uSortItem<mj_part_t, mj_gno_t>[num_procs];
6184
6185 // calculate the optimal number of coordinates that should be assigned
6186 // to each processor.
6187 mj_lno_t work_each =
6188 mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);
6189
6190 // to hold the left space as the number of coordinates to the optimal
6191 // number in each proc.
6192 mj_lno_t *space_in_each_processor = new mj_lno_t[num_procs];
6193
6194 // initialize left space in each.
6195 for(mj_part_t i = 0; i < num_procs; ++i) {
6196 space_in_each_processor[i] = work_each;
6197 }
6198
6199 // we keep track of how many parts each processor is assigned to.
6200 // because in some weird inputs, it might be possible that some
6201 // processors is not assigned to any part. Using these variables,
6202 // we force each processor to have at least one part.
6203 mj_part_t *num_parts_proc_assigned = new mj_part_t[num_procs];
6204 memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
6205 int empty_proc_count = num_procs;
6206
6207 // to sort the parts with decreasing order of their coordiantes.
6208 // id are the part numbers, sort value is the number of points in each.
6209 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
6210 new uSortItem<mj_part_t, mj_gno_t>[num_parts];
6211
6212 // initially we will sort the parts according to the number of coordinates
6213 // they have, so that we will start assigning with the part that has the most
6214 // number of coordinates.
6215 for(mj_part_t i = 0; i < num_parts; ++i) {
6216 sort_item_point_counts_in_parts[i].id = i;
6217 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
6218 }
6219
6220 // sort parts with increasing order of loads.
6221 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
6222
6223 // assigning parts to the processors
6224 // traverse the part with decreasing order of load.
6225 // first assign the heaviest part.
6226 for(mj_part_t j = 0; j < num_parts; ++j) {
6227 // sorted with increasing order, traverse inverse.
6228 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
6229
6230 // load of the part
6231 mj_gno_t load = global_num_points_in_parts[i];
6232
6233 // assigned processors
6234 mj_part_t assigned_proc = -1;
6235
6236 // sort processors with increasing number of points in this part.
6237 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
6238 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
6239
6240 // if there are still enough parts to fill empty processors, than proceed
6241 // normally, but if empty processor count is equal to the number of part,
6242 // then we force to part assignments only to empty processors.
6243 if(empty_proc_count < num_parts - j ||
6244 num_parts_proc_assigned[ii] == 0) {
6245 // how many points processor ii has in part i?
6246 sort_item_num_points_of_proc_in_part_i[ii].val =
6247 num_points_in_all_processor_parts[ii * num_parts + i];
6248 }
6249 else {
6250 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
6251 }
6252 }
6253
6254 uqsort<mj_part_t, mj_gno_t>(num_procs,
6255 sort_item_num_points_of_proc_in_part_i);
6256
6257 // traverse all processors with decreasing load.
6258 for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
6259 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
6260 if(assigned_proc == -1 ||
6261 (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
6262 assigned_proc = ii;
6263 }
6264 else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
6265 if(ii < assigned_proc) {
6266 // ties go to lower proc
6267 // not necessary for a valid result but allows testing to compare
6268 // MPI results and have parts numbers assigned to the same boxes.
6269 // We don't break here because we may have more ties still to check.
6270 // The indeterminate state before this is due to Cuda using
6271 // atomics to refill the permutation array. So non-cuda runs don't
6272 // actualy need this since they will always have the same pattern.
6273 assigned_proc = ii;
6274 }
6275 }
6276 else {
6277 break; // now we can break - we have our part and no more ties.
6278 }
6279 }
6280
6281 if(num_parts_proc_assigned[assigned_proc]++ == 0) {
6282 --empty_proc_count;
6283 }
6284
6285 space_in_each_processor[assigned_proc] -= load;
6286 //to sort later, part-i is assigned to the proccessor - assignment.
6287 sort_item_part_to_proc_assignment[j].id = i; //part i
6288
6289 // assigned to processor - assignment.
6290 sort_item_part_to_proc_assignment[j].val = assigned_proc;
6291
6292 // if assigned processor is me, increase the number.
6293 if(assigned_proc == this->myRank) {
6294 out_num_part++;//assigned_part_count;
6295 out_part_indices.push_back(i);
6296 }
6297
6298 // increase the send to that processor by the number of points in that
6299 // part, as everyone send their coordiantes in this part to the
6300 // processor assigned to this part.
6301 send_count_to_each_proc[assigned_proc] +=
6302 num_points_in_all_processor_parts[this->myRank * num_parts + i];
6303 }
6304
6305 delete [] num_parts_proc_assigned;
6306 delete [] sort_item_num_points_of_proc_in_part_i;
6307 delete [] sort_item_point_counts_in_parts;
6308 delete [] space_in_each_processor;
6309
6310 // sort assignments with respect to the assigned processors.
6311 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
6312
6313 // fill sendBuf.
6314 this->assign_send_destinations2(
6315 num_parts,
6316 sort_item_part_to_proc_assignment,
6317 coordinate_destinations,
6318 output_part_numbering_begin_index,
6319 next_future_num_parts_in_parts);
6320
6321 delete [] sort_item_part_to_proc_assignment;
6322}
6323
6324
6348template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6349 typename mj_part_t, typename mj_node_t>
6350void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6351 mj_migration_part_proc_assignment(
6352 mj_gno_t * num_points_in_all_processor_parts,
6353 mj_part_t num_parts,
6354 mj_part_t num_procs,
6355 mj_lno_t *send_count_to_each_proc,
6356 std::vector<mj_part_t> &processor_ranks_for_subcomm,
6357 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6358 mj_part_t &out_num_part,
6359 std::vector<mj_part_t> &out_part_indices,
6360 mj_part_t &output_part_numbering_begin_index,
6361 int *coordinate_destinations)
6362{
6363 processor_ranks_for_subcomm.clear();
6364 // if(this->num_local_coords > 0)
6365 if(num_procs > num_parts) {
6366 // if there are more processors than the number of current part
6367 // then processors share the existing parts.
6368 // at the end each processor will have a single part,
6369 // but a part will be shared by a group of processors.
6370 mj_part_t out_part_index = 0;
6371
6372 this->mj_assign_proc_to_parts(
6373 num_points_in_all_processor_parts,
6374 num_parts,
6375 num_procs,
6376 send_count_to_each_proc,
6377 processor_ranks_for_subcomm,
6378 next_future_num_parts_in_parts,
6379 out_part_index,
6380 output_part_numbering_begin_index,
6381 coordinate_destinations
6382 );
6383
6384 out_num_part = 1;
6385 out_part_indices.clear();
6386 out_part_indices.push_back(out_part_index);
6387 }
6388 else {
6389
6390 // there are more parts than the processors.
6391 // therefore a processor will be assigned multiple parts,
6392 // the subcommunicators will only have a single processor.
6393 processor_ranks_for_subcomm.push_back(this->myRank);
6394
6395 // since there are more parts then procs,
6396 // assign multiple parts to processors.
6397
6398 this->mj_assign_parts_to_procs(
6399 num_points_in_all_processor_parts,
6400 num_parts,
6401 num_procs,
6402 send_count_to_each_proc,
6403 next_future_num_parts_in_parts,
6404 out_num_part,
6405 out_part_indices,
6406 output_part_numbering_begin_index,
6407 coordinate_destinations);
6408 }
6409}
6410
6424template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6425 typename mj_part_t, typename mj_node_t>
6426void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6427 mj_migrate_coords(
6428 mj_part_t num_procs,
6429 mj_lno_t &num_new_local_points,
6430 std::string iteration,
6431 int *coordinate_destinations,
6432 mj_part_t num_parts)
6433{
6434
6435#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6436 if(sizeof(mj_lno_t) <= sizeof(int)) {
6437 // Cannot use Zoltan_Comm with local ordinals larger than ints.
6438 // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
6439 // may overflow.
6440 ZOLTAN_COMM_OBJ *plan = NULL;
6441 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6442 int num_incoming_gnos = 0;
6443 int message_tag = 7859;
6444
6445 this->mj_env->timerStart(MACRO_TIMERS,
6446 mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6447 int ierr = Zoltan_Comm_Create(
6448 &plan,
6449 int(this->num_local_coords),
6450 coordinate_destinations,
6451 mpi_comm,
6452 message_tag,
6453 &num_incoming_gnos);
6454
6455 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6456 this->mj_env->timerStop(MACRO_TIMERS,
6457 mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6458
6459 this->mj_env->timerStart(MACRO_TIMERS,
6460 mj_timer_base_string + "Migration Z1Migration-" + iteration);
6461
6462 // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
6463 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
6464 // view; need the explicit Host creation and deep_copy.
6465
6466 // migrate gnos.
6467 {
6468 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6469 Kokkos::HostSpace(), this->current_mj_gnos);
6470 Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6471 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6472 Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), num_incoming_gnos);
6473 auto host_dst_gnos = Kokkos::create_mirror_view(
6474 Kokkos::HostSpace(), dst_gnos);
6475 message_tag++;
6476 ierr = Zoltan_Comm_Do(
6477 plan,
6478 message_tag,
6479 (char *) host_current_mj_gnos.data(),
6480 sizeof(mj_gno_t),
6481 (char *) host_dst_gnos.data());
6482 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6483 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6484 this->current_mj_gnos = dst_gnos;
6485 }
6486
6487 //migrate coordinates
6488 {
6489 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
6490 auto host_src_coordinates = Kokkos::create_mirror_view(
6491 Kokkos::HostSpace(), this->mj_coordinates);
6492 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6493 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6494 dst_coordinates(Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
6495 num_incoming_gnos, this->coord_dim);
6496 auto host_dst_coordinates = Kokkos::create_mirror_view(
6497 Kokkos::HostSpace(), dst_coordinates);
6498 for(int i = 0; i < this->coord_dim; ++i) {
6499 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6500 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6501 Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6502 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6503 // Note Layout Left means we can do these in contiguous blocks
6504 message_tag++;
6505 ierr = Zoltan_Comm_Do(
6506 plan,
6507 message_tag,
6508 (char *) sub_host_src_coordinates.data(),
6509 sizeof(mj_scalar_t),
6510 (char *) sub_host_dst_coordinates.data());
6511 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6512 }
6513 deep_copy(dst_coordinates, host_dst_coordinates);
6514 this->mj_coordinates = dst_coordinates;
6515 }
6516
6517 // migrate weights.
6518 {
6519 auto host_src_weights = Kokkos::create_mirror_view(
6520 Kokkos::HostSpace(), this->mj_weights);
6521 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6522 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6523 Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
6524 num_incoming_gnos, this->num_weights_per_coord);
6525 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6526 for(int i = 0; i < this->num_weights_per_coord; ++i) {
6527 auto sub_host_src_weights
6528 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6529 auto sub_host_dst_weights
6530 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6531 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6532 // Copy because of layout
6533 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6534 sent_weight[n] = sub_host_src_weights(n);
6535 }
6536 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6537 message_tag++;
6538 ierr = Zoltan_Comm_Do(
6539 plan,
6540 message_tag,
6541 (char *) sent_weight.getRawPtr(),
6542 sizeof(mj_scalar_t),
6543 (char *) received_weight.getRawPtr());
6544 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6545 // Again we copy by index due to layout
6546 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6547 sub_host_dst_weights(n) = received_weight[n];
6548 }
6549 }
6550 deep_copy(dst_weights, host_dst_weights);
6551 this->mj_weights = dst_weights;
6552 }
6553
6554 // migrate owners.
6555 {
6556 // Note that owners we kept on Serial
6557 Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6558 Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6559 num_incoming_gnos);
6560 message_tag++;
6561 ierr = Zoltan_Comm_Do(
6562 plan,
6563 message_tag,
6564 (char *) owner_of_coordinate.data(),
6565 sizeof(int),
6566 (char *) dst_owners_of_coordinate.data());
6567 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6568 this->owner_of_coordinate = dst_owners_of_coordinate;
6569 }
6570
6571 // if num procs is less than num parts,
6572 // we need the part assigment arrays as well, since
6573 // there will be multiple parts in processor.
6574 {
6575 auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6576 Kokkos::HostSpace(), this->assigned_part_ids);
6577 Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6578 Kokkos::View<int *, device_t> dst_assigned_part_ids(
6579 Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
6580 num_incoming_gnos);
6581 auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6582 Kokkos::HostSpace(), dst_assigned_part_ids);
6583 mj_part_t *new_parts = new mj_part_t[num_incoming_gnos];
6584 if(num_procs < num_parts) {
6585 message_tag++;
6586 ierr = Zoltan_Comm_Do(
6587 plan,
6588 message_tag,
6589 (char *) host_src_assigned_part_ids.data(),
6590 sizeof(mj_part_t),
6591 (char *) host_dst_assigned_part_ids.data());
6592 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6593 Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6594 }
6595 // In original code this would just assign to an uninitialized array
6596 // if num_procs < num_parts. We're doing the same here.
6597 this->assigned_part_ids = dst_assigned_part_ids;
6598 }
6599
6600 ierr = Zoltan_Comm_Destroy(&plan);
6601 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6602 num_new_local_points = num_incoming_gnos;
6603 this->mj_env->timerStop(MACRO_TIMERS,
6604 mj_timer_base_string + "Migration Z1Migration-" + iteration);
6605 }
6606 else
6607#endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6608 {
6609 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6610 "Migration DistributorPlanCreating-" + iteration);
6611
6612 Tpetra::Distributor distributor(this->comm);
6613 ArrayView<const mj_part_t> destinations( coordinate_destinations,
6614 this->num_local_coords);
6615 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6616 this->mj_env->timerStop(MACRO_TIMERS, mj_timer_base_string +
6617 "Migration DistributorPlanCreating-" + iteration);
6618 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6619 "Migration DistributorMigration-" + iteration);
6620
6621 // note MPI buffers should all be on Kokkos::HostSpace and not
6622 // Kokkos::CudaUVMSpace.
6623 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
6624 // view; need the explicit Host creation and deep_copy.
6625 // migrate gnos.
6626 {
6627 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
6628 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
6629 num_incoming_gnos);
6630
6631 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
6632 Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
6633 this->current_mj_gnos.extent(0));
6634 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
6635
6636 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
6637
6638 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6639 Kokkos::ViewAllocateWithoutInitializing("gids"), num_incoming_gnos);
6640
6641 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
6642 }
6643
6644 // migrate coordinates
6645 // coordinates in MJ are LayoutLeft since Tpetra Multivector is LayoutLeft
6646 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6647 dst_coordinates("mj_coordinates", num_incoming_gnos, this->coord_dim);
6648
6649 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
6650 host_src_coordinates(
6651 Kokkos::ViewAllocateWithoutInitializing("host_coords"),
6652 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
6653 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6654
6655 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
6656 Kokkos::ViewAllocateWithoutInitializing("received_coord"),
6657 num_incoming_gnos);
6658
6659 for(int i = 0; i < this->coord_dim; ++i) {
6660
6661 // Note Layout Left means we can do these in contiguous blocks
6662
6663 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_coord
6664 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6665
6666 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
6667
6668 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
6669 received_coord);
6670
6671 // Kokkos::deep_copy will fence, I think, so it should be safe
6672 // to reuse received_coord in the next loop iteration
6673 }
6674 this->mj_coordinates = dst_coordinates;
6675
6676 // migrate weights.
6677 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6678 "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6679 auto host_dst_weights = Kokkos::create_mirror_view(Kokkos::HostSpace(),
6680 dst_weights);
6681
6682 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
6683 Kokkos::HostSpace(), this->mj_weights);
6684
6685 // contiguous buffers to gather potentially strided data
6686 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
6687 Kokkos::ViewAllocateWithoutInitializing("send_weight_buffer"),
6688 this->num_local_coords);
6689
6690 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
6691 Kokkos::ViewAllocateWithoutInitializing("received_weight_buffer"),
6692 num_incoming_gnos);
6693
6694 for(int i = 0; i < this->num_weights_per_coord; ++i) {
6695
6696 auto sub_host_src_weights
6697 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6698
6699 auto sub_host_dst_weights
6700 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6701
6702
6703 // Layout Right means the weights are not contiguous
6704 // However we don't have any systems setup with more than 1 weight so
6705 // really I have not tested any of this code with num weights > 1.
6706 // I think this is the right thing to do.
6707 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6708 sent_weight[n] = sub_host_src_weights(n);
6709 }
6710
6711 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
6712
6713 // Again we copy by index due to layout
6714 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6715 sub_host_dst_weights(n) = received_weight[n];
6716 }
6717 }
6718 Kokkos::deep_copy(dst_weights, host_dst_weights);
6719 this->mj_weights = dst_weights;
6720
6721 // migrate owners
6722 {
6723 // Note owners we kept on Serial
6724 Kokkos::View<int *, Kokkos::HostSpace> received_owners(
6725 Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6726 num_incoming_gnos);
6727
6728 distributor.doPostsAndWaits(owner_of_coordinate, 1, received_owners);
6729
6730 this->owner_of_coordinate = received_owners;
6731 }
6732
6733 // if num procs is less than num parts,
6734 // we need the part assignment arrays as well, since
6735 // there will be multiple parts in processor.
6736 if(num_procs < num_parts) {
6737 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partids(
6738 Kokkos::ViewAllocateWithoutInitializing("host_parts"),
6739 this->assigned_part_ids.extent(0));
6740 Kokkos::deep_copy(sent_partids, assigned_part_ids);
6741
6742 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
6743 Kokkos::ViewAllocateWithoutInitializing("received_partids"),
6744 num_incoming_gnos);
6745
6746 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
6747
6748 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6749 ("assigned_part_ids", num_incoming_gnos);
6750 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
6751 }
6752 else {
6753 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6754 ("assigned_part_ids", num_incoming_gnos);
6755 }
6756 this->mj_env->timerStop(MACRO_TIMERS, "" + mj_timer_base_string +
6757 "Migration DistributorMigration-" + iteration);
6758
6759 num_new_local_points = num_incoming_gnos;
6760 }
6761}
6762
6768template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6769 typename mj_part_t, typename mj_node_t>
6770void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6771 create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6772{
6773 mj_part_t group_size = processor_ranks_for_subcomm.size();
6774 mj_part_t *ids = new mj_part_t[group_size];
6775 for(mj_part_t i = 0; i < group_size; ++i) {
6776 ids[i] = processor_ranks_for_subcomm[i];
6777 }
6778 ArrayView<const mj_part_t> idView(ids, group_size);
6779 this->comm = this->comm->createSubcommunicator(idView);
6780 delete [] ids;
6781}
6782
6788template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6789 typename mj_part_t, typename mj_node_t>
6790void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6791 fill_permutation_array(
6792 mj_part_t output_num_parts,
6793 mj_part_t num_parts)
6794{
6795 // if there is single output part, then simply fill the permutation array.
6796 if(output_num_parts == 1) {
6797 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6798 Kokkos::parallel_for(
6799 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
6800 (0, this->num_local_coords),
6801 KOKKOS_LAMBDA(mj_lno_t i) {
6802 local_new_coordinate_permutations(i) = i;
6803 });
6804 auto local_new_part_xadj = this->new_part_xadj;
6805 auto local_num_local_coords = this->num_local_coords;
6806 Kokkos::parallel_for(
6807 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6808 KOKKOS_LAMBDA(int dummy) {
6809 local_new_part_xadj(0) = local_num_local_coords;
6810 });
6811 }
6812 else {
6813 auto local_num_local_coords = this->num_local_coords;
6814 auto local_assigned_part_ids = this->assigned_part_ids;
6815 auto local_new_part_xadj = this->new_part_xadj;
6816 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6817
6818 // part shift holds the which part number an old part number corresponds to.
6819 Kokkos::View<mj_part_t*, device_t> part_shifts("part_shifts", num_parts);
6820
6821 // otherwise we need to count how many points are there in each part.
6822 // we allocate here as num_parts, because the sent partids are up to
6823 // num_parts, although there are outout_num_parts different part.
6824 Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
6825 "num_points_in_parts", num_parts);
6826
6827 Kokkos::parallel_for(
6828 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6829 KOKKOS_LAMBDA(int dummy) {
6830
6831 for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
6832 mj_part_t ii = local_assigned_part_ids(i);
6833 ++num_points_in_parts(ii);
6834 }
6835
6836 // write the end points of the parts.
6837 mj_part_t p = 0;
6838 mj_lno_t prev_index = 0;
6839 for(mj_part_t i = 0; i < num_parts; ++i) {
6840 if(num_points_in_parts(i) > 0) {
6841 local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
6842 prev_index += num_points_in_parts(i);
6843 part_shifts(i) = p++;
6844 }
6845 }
6846
6847 // for the rest of the parts write the end index as end point.
6848 mj_part_t assigned_num_parts = p - 1;
6849 for(;p < num_parts; ++p) {
6850 local_new_part_xadj(p) =
6851 local_new_part_xadj(assigned_num_parts);
6852 }
6853 for(mj_part_t i = 0; i < output_num_parts; ++i) {
6854 num_points_in_parts(i) = local_new_part_xadj(i);
6855 }
6856
6857 // write the permutation array here.
6858 // get the part of the coordinate i, shift it to obtain the new part number.
6859 // assign it to the end of the new part numbers pointer.
6860 for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
6861 mj_part_t part =
6862 part_shifts[mj_part_t(local_assigned_part_ids(i))];
6863 local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
6864 }
6865 });
6866 }
6867}
6868
6893template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6894 typename mj_part_t, typename mj_node_t>
6895bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6896 mj_perform_migration(
6897 mj_part_t input_num_parts,
6898 mj_part_t &output_num_parts,
6899 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6900 mj_part_t &output_part_begin_index,
6901 size_t migration_reduce_all_population,
6902 mj_lno_t num_coords_for_last_dim_part,
6903 std::string iteration,
6904 RCP<mj_partBoxVector_t> &input_part_boxes,
6905 RCP<mj_partBoxVector_t> &output_part_boxes)
6906{
6907 mj_part_t num_procs = this->comm->getSize();
6908 this->myRank = this->comm->getRank();
6909
6910 // this array holds how many points each processor has in each part.
6911 // to access how many points processor i has on part j,
6912 // num_points_in_all_processor_parts[i * num_parts + j]
6913 mj_gno_t *num_points_in_all_processor_parts =
6914 new mj_gno_t[input_num_parts * (num_procs + 1)];
6915
6916 // get the number of coordinates in each part in each processor.
6917 this->get_processor_num_points_in_parts(
6918 num_procs,
6919 input_num_parts,
6920 num_points_in_all_processor_parts);
6921
6922 // check if migration will be performed or not.
6923 if(!this->mj_check_to_migrate(
6924 migration_reduce_all_population,
6925 num_coords_for_last_dim_part,
6926 num_procs,
6927 input_num_parts,
6928 num_points_in_all_processor_parts)) {
6929 delete [] num_points_in_all_processor_parts;
6930 return false;
6931 }
6932
6933 mj_lno_t *send_count_to_each_proc = NULL;
6934 int *coordinate_destinations = new int[this->num_local_coords];
6935 send_count_to_each_proc = new mj_lno_t[num_procs];
6936
6937 for(int i = 0; i < num_procs; ++i) {
6938 send_count_to_each_proc[i] = 0;
6939 }
6940
6941 std::vector<mj_part_t> processor_ranks_for_subcomm;
6942 std::vector<mj_part_t> out_part_indices;
6943
6944 // determine which processors are assigned to which parts
6945 this->mj_migration_part_proc_assignment(
6946 num_points_in_all_processor_parts,
6947 input_num_parts,
6948 num_procs,
6949 send_count_to_each_proc,
6950 processor_ranks_for_subcomm,
6951 next_future_num_parts_in_parts,
6952 output_num_parts,
6953 out_part_indices,
6954 output_part_begin_index,
6955 coordinate_destinations);
6956
6957 delete [] send_count_to_each_proc;
6958 std::vector <mj_part_t> tmpv;
6959
6960 std::sort (out_part_indices.begin(), out_part_indices.end());
6961 mj_part_t outP = out_part_indices.size();
6962 mj_gno_t new_global_num_points = 0;
6963 mj_gno_t *global_num_points_in_parts =
6964 num_points_in_all_processor_parts + num_procs * input_num_parts;
6965
6966 if(this->mj_keep_part_boxes) {
6967 input_part_boxes->clear();
6968 }
6969
6970 // now we calculate the new values for next_future_num_parts_in_parts.
6971 // same for the part boxes.
6972 for(mj_part_t i = 0; i < outP; ++i) {
6973 mj_part_t ind = out_part_indices[i];
6974 new_global_num_points += global_num_points_in_parts[ind];
6975 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
6976 if(this->mj_keep_part_boxes) {
6977 input_part_boxes->push_back((*output_part_boxes)[ind]);
6978 }
6979 }
6980
6981 // swap the input and output part boxes.
6982 if(this->mj_keep_part_boxes) {
6983 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6984 input_part_boxes = output_part_boxes;
6985 output_part_boxes = tmpPartBoxes;
6986 }
6987 next_future_num_parts_in_parts->clear();
6988 for(mj_part_t i = 0; i < outP; ++i) {
6989 mj_part_t p = tmpv[i];
6990 next_future_num_parts_in_parts->push_back(p);
6991 }
6992
6993 delete [] num_points_in_all_processor_parts;
6994
6995 mj_lno_t num_new_local_points = 0;
6996 //perform the actual migration operation here.
6997 this->mj_migrate_coords(
6998 num_procs,
6999 num_new_local_points,
7000 iteration,
7001 coordinate_destinations,
7002 input_num_parts);
7003
7004 delete [] coordinate_destinations;
7005 if(this->num_local_coords != num_new_local_points) {
7006 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7007 (Kokkos::ViewAllocateWithoutInitializing("new_coordinate_permutations"),
7008 num_new_local_points);
7009 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7010 (Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
7011 num_new_local_points);
7012 }
7013 this->num_local_coords = num_new_local_points;
7014 this->num_global_coords = new_global_num_points;
7015
7016 // create subcommunicator.
7017 this->create_sub_communicator(processor_ranks_for_subcomm);
7018
7019 processor_ranks_for_subcomm.clear();
7020
7021 // fill the new permutation arrays.
7022 this->fill_permutation_array(output_num_parts, input_num_parts);
7023
7024 return true;
7025}
7026
7045template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7046 typename mj_part_t, typename mj_node_t>
7047void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7048 create_consistent_chunks(
7049 mj_part_t num_parts,
7050 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7051 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7052 mj_lno_t coordinate_begin,
7053 mj_lno_t coordinate_end,
7054 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7055 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7056 int coordInd,
7057 bool longest_dim_part,
7058 uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7059{
7060 // Note that this method is only used by task mapper
7061 // All code in this file has been verified to run with UVM off by running
7062 // mj tests and task mapper tests with UVM off. However for this particular
7063 // method I did not do much for UVM off. I heavily use device to host copies
7064 // and more or less preserve the original logic. Due to the handling of
7065 // arrays it will be a bit of work to convert this to as better form.
7066 // Since it's only relevant to task mapper and I wasn't sure how much priority
7067 // to give it, I put that on hold until further discussion.
7068 mj_part_t no_cuts = num_parts - 1;
7069
7070 // now if the rectilinear partitioning is allowed we decide how
7071 // much weight each thread should put to left and right.
7072 if(this->distribute_points_on_cut_lines) {
7073 auto local_thread_cut_line_weight_to_put_left =
7074 this->thread_cut_line_weight_to_put_left;
7075 auto local_thread_part_weight_work =
7076 this->thread_part_weight_work;
7077 auto local_sEpsilon = this->sEpsilon;
7078
7079 Kokkos::parallel_for(
7080 Kokkos::RangePolicy<typename mj_node_t::execution_space,
7081 mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7082 // the left to be put on the left of the cut.
7083 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7084 if(left_weight > local_sEpsilon) {
7085 // the weight of thread ii on cut.
7086 mj_scalar_t thread_ii_weight_on_cut =
7087 local_thread_part_weight_work(i * 2 + 1) -
7088 local_thread_part_weight_work(i * 2);
7089 if(thread_ii_weight_on_cut < left_weight) {
7090 local_thread_cut_line_weight_to_put_left(i) =
7091 thread_ii_weight_on_cut;
7092 }
7093 else {
7094 local_thread_cut_line_weight_to_put_left(i) = left_weight;
7095 }
7096 }
7097 else {
7098 local_thread_cut_line_weight_to_put_left(i) = 0;
7099 }
7100 });
7101
7102 if(no_cuts > 0) {
7103 auto local_least_signifiance = least_signifiance;
7104 auto local_significance_mul = significance_mul;
7105 Kokkos::parallel_for(
7106 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7107 (0, 1), KOKKOS_LAMBDA (int dummy) {
7108 // this is a special case. If cutlines share the same coordinate,
7109 // their weights are equal.
7110 // we need to adjust the ratio for that.
7111 for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7112 mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7113 mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7114 mj_scalar_t delta = cut2 - cut1;
7115 mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7116 if(abs_delta < local_sEpsilon) {
7117 local_thread_cut_line_weight_to_put_left(i) -=
7118 local_thread_cut_line_weight_to_put_left(i - 1);
7119 }
7120 local_thread_cut_line_weight_to_put_left(i) =
7121 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7122 local_least_signifiance) * local_significance_mul) /
7123 static_cast<mj_scalar_t>(local_significance_mul);
7124 }
7125 });
7126 }
7127 }
7128
7129 auto local_thread_point_counts = this->thread_point_counts;
7130 Kokkos::parallel_for(
7131 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7132 (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7133 local_thread_point_counts(i) = 0;
7134 });
7135
7136 // for this specific case we dont want to distribute the points along the
7137 // cut position randomly, as we need a specific ordering of them. Instead,
7138 // we put the coordinates into a sort item, where we sort those
7139 // using the coordinates of points on other dimensions and the index.
7140
7141 // some of the cuts might share the same position.
7142 // in this case, if cut i and cut j share the same position
7143 // cut_map[i] = cut_map[j] = sort item index.
7144 mj_part_t *cut_map = new mj_part_t[no_cuts];
7145
7146 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7147 typedef std::vector< multiSItem > multiSVector;
7148 typedef std::vector<multiSVector> multiS2Vector;
7149
7150 // to keep track of the memory allocated.
7151 std::vector<mj_scalar_t *>allocated_memory;
7152
7153 // vector for which the coordinates will be sorted.
7154 multiS2Vector sort_vector_points_on_cut;
7155
7156 // the number of cuts that have different coordinates.
7157 mj_part_t different_cut_count = 1;
7158 cut_map[0] = 0;
7159
7160 // now we insert 1 sort vector for all cuts on the different
7161 // positins.if multiple cuts are on the same position,
7162 // they share sort vectors.
7163 multiSVector tmpMultiSVector;
7164 sort_vector_points_on_cut.push_back(tmpMultiSVector);
7165
7166 auto local_current_concurrent_cut_coordinate =
7167 current_concurrent_cut_coordinate;
7168 auto host_current_concurrent_cut_coordinate =
7169 Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7170 Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7171 local_current_concurrent_cut_coordinate);
7172
7173 for(mj_part_t i = 1; i < no_cuts ; ++i) {
7174 // if cuts share the same cut coordinates
7175 // set the cutmap accordingly.
7176 if(std::abs(host_current_concurrent_cut_coordinate(i) -
7177 host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7178 cut_map[i] = cut_map[i-1];
7179 }
7180 else {
7181 cut_map[i] = different_cut_count++;
7182 multiSVector tmp2MultiSVector;
7183 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7184 }
7185 }
7186 Kokkos::deep_copy(current_concurrent_cut_coordinate,
7187 host_current_concurrent_cut_coordinate);
7188
7189 // now the actual part assigment.
7190 auto host_coordinate_permutations =
7191 Kokkos::create_mirror_view(coordinate_permutations);
7192 Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7193
7194 auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7195 Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7196
7197 auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7198 Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7199
7200 auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7201 Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7202
7203 auto local_coord_dim = this->coord_dim;
7204
7205 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7206 mj_lno_t i = host_coordinate_permutations(ii);
7207 mj_part_t pp = host_assigned_part_ids(i);
7208 mj_part_t p = pp / 2;
7209 // if the coordinate is on a cut.
7210 if(pp % 2 == 1 ) {
7211 mj_scalar_t *vals = new mj_scalar_t[local_coord_dim -1];
7212 allocated_memory.push_back(vals);
7213
7214 // we insert the coordinates to the sort item here.
7215 int val_ind = 0;
7216
7217 if(longest_dim_part) {
7218 // std::cout << std::endl << std::endl;
7219 for(int dim = local_coord_dim - 2; dim >= 0; --dim) {
7220 // uSignedSortItem<int, mj_scalar_t, char>
7221 // *p_coord_dimension_range_sorted
7222 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7223 // std::cout << "next_largest_coord_dim: " <<
7224 // next_largest_coord_dim << " ";
7225 // Note refactor in progress
7226 vals[val_ind++] =
7227 host_mj_coordinates(i,next_largest_coord_dim);
7228 }
7229 }
7230 else {
7231 for(int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7232 vals[val_ind++] = host_mj_coordinates(i,dim);
7233 }
7234 for(int dim = 0; dim < coordInd; ++dim) {
7235 vals[val_ind++] = host_mj_coordinates(i,dim);
7236 }
7237 }
7238
7239 multiSItem tempSortItem(i, local_coord_dim -1, vals);
7240 //insert the point to the sort vector pointed by the cut_map[p].
7241 mj_part_t cmap = cut_map[p];
7242 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7243 }
7244 else {
7245 //if it is not on the cut, simple sorting.
7246 ++host_thread_point_counts(p);
7247 host_assigned_part_ids(i) = p;
7248 }
7249 }
7250
7251 // sort all the sort vectors.
7252 for(mj_part_t i = 0; i < different_cut_count; ++i) {
7253 std::sort (sort_vector_points_on_cut[i].begin(),
7254 sort_vector_points_on_cut[i].end());
7255 }
7256
7257 mj_part_t previous_cut_map = cut_map[0];
7258
7259 auto host_thread_cut_line_weight_to_put_left =
7260 Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7261 Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7262 thread_cut_line_weight_to_put_left);
7263
7264 auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7265 Kokkos::deep_copy(host_mj_weights, mj_weights);
7266
7267 // this is how much previous part owns the weight of the current part.
7268 // when target part weight is 1.6, and the part on the left is given 2,
7269 // the left has an extra 0.4, while the right has missing 0.4 from the
7270 // previous cut.
7271 // This parameter is used to balance this issues.
7272 // in the above example weight_stolen_from_previous_part will be 0.4.
7273 // if the left part target is 2.2 but it is given 2,
7274 // then weight_stolen_from_previous_part will be -0.2.
7275 mj_scalar_t weight_stolen_from_previous_part = 0;
7276 for(mj_part_t p = 0; p < no_cuts; ++p) {
7277 mj_part_t mapped_cut = cut_map[p];
7278
7279 // if previous cut map is done, and it does not have the same index,
7280 // then assign all points left on that cut to its right.
7281 if(previous_cut_map != mapped_cut) {
7282 mj_lno_t sort_vector_end = (mj_lno_t)
7283 sort_vector_points_on_cut[previous_cut_map].size() - 1;
7284 for(; sort_vector_end >= 0; --sort_vector_end) {
7285 multiSItem t =
7286 sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7287 mj_lno_t i = t.index;
7288 ++host_thread_point_counts(p);
7289 host_assigned_part_ids(i) = p;
7290 }
7291 sort_vector_points_on_cut[previous_cut_map].clear();
7292 }
7293
7294 // TODO: MD: I dont remember why I have it reverse order here.
7295 mj_lno_t sort_vector_end = (mj_lno_t)
7296 sort_vector_points_on_cut[mapped_cut].size() - 1;
7297 // mj_lno_t sort_vector_begin= 0;
7298 // mj_lno_t sort_vector_size =
7299 // (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size();
7300
7301 // TODO commented for reverse order
7302 for(; sort_vector_end >= 0; --sort_vector_end) {
7303 // for(; sort_vector_begin < sort_vector_size; ++sort_vector_begin) {
7304 // TODO COMMENTED FOR REVERSE ORDER
7305 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7306 //multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_begin];
7307 mj_lno_t i = t.index;
7308 mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7309 this->mj_weights(i,0);
7310 // part p has enough space for point i, then put it to point i.
7311 if(host_thread_cut_line_weight_to_put_left(p) +
7312 weight_stolen_from_previous_part> this->sEpsilon &&
7313 host_thread_cut_line_weight_to_put_left(p) +
7314 weight_stolen_from_previous_part -
7315 std::abs(host_thread_cut_line_weight_to_put_left(p) +
7316 weight_stolen_from_previous_part - w)> this->sEpsilon)
7317 {
7318 host_thread_cut_line_weight_to_put_left(p) -= w;
7319
7320 sort_vector_points_on_cut[mapped_cut].pop_back();
7321
7322 ++host_thread_point_counts(p);
7323 host_assigned_part_ids(i) = p;
7324 // if putting this weight to left overweights the left cut, then
7325 // increase the space for the next cut using
7326 // weight_stolen_from_previous_part.
7327 if(p < no_cuts - 1 &&
7328 host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7329 if(mapped_cut == cut_map[p + 1] ) {
7330 // if the cut before the cut indexed at p was also at the same
7331 // position special case, as we handle the weight differently here.
7332 if(previous_cut_map != mapped_cut) {
7333 weight_stolen_from_previous_part =
7334 host_thread_cut_line_weight_to_put_left(p);
7335 }
7336 else {
7337 // if the cut before the cut indexed at p was also at the same
7338 // position we assign extra weights cumulatively in this case.
7339 weight_stolen_from_previous_part +=
7340 host_thread_cut_line_weight_to_put_left(p);
7341 }
7342 }
7343 else{
7344 weight_stolen_from_previous_part =
7345 -host_thread_cut_line_weight_to_put_left(p);
7346 }
7347 // end assignment for part p
7348 break;
7349 }
7350 } else {
7351 // if part p does not have enough space for this point
7352 // and if there is another cut sharing the same positon,
7353 // again increase the space for the next
7354 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7355 if(previous_cut_map != mapped_cut) {
7356 weight_stolen_from_previous_part =
7357 host_thread_cut_line_weight_to_put_left(p);
7358 }
7359 else {
7360 weight_stolen_from_previous_part +=
7361 host_thread_cut_line_weight_to_put_left(p);
7362 }
7363 }
7364 else{
7365 weight_stolen_from_previous_part =
7366 -host_thread_cut_line_weight_to_put_left(p);
7367 }
7368 // end assignment for part p
7369 break;
7370 }
7371 }
7372 previous_cut_map = mapped_cut;
7373 }
7374
7375 // TODO commented for reverse order
7376 // put everything left on the last cut to the last part.
7377 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7378 previous_cut_map].size() - 1;
7379
7380 // mj_lno_t sort_vector_begin= 0;
7381 // mj_lno_t sort_vector_size = (mj_lno_t)
7382 // sort_vector_points_on_cut[previous_cut_map].size();
7383 // TODO commented for reverse order
7384 for(; sort_vector_end >= 0; --sort_vector_end) {
7385 // TODO commented for reverse order
7386 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7387 // multiSItem t =
7388 // sort_vector_points_on_cut[previous_cut_map][sort_vector_begin];
7389 mj_lno_t i = t.index;
7390 ++host_thread_point_counts(no_cuts);
7391 host_assigned_part_ids(i) = no_cuts;
7392 }
7393
7394 sort_vector_points_on_cut[previous_cut_map].clear();
7395 delete [] cut_map;
7396
7397 //free the memory allocated for vertex sort items .
7398 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7399 for(mj_lno_t i = 0; i < vSize; ++i) {
7400 delete [] allocated_memory[i];
7401 }
7402
7403 auto local_out_part_xadj = out_part_xadj;
7404 auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7405 Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7406
7407 // creation of part_xadj as in usual case.
7408 for(mj_part_t j = 0; j < num_parts; ++j) {
7409 host_out_part_xadj(j) = host_thread_point_counts(j);
7410 host_thread_point_counts(j) = 0;
7411 }
7412
7413 // perform prefix sum for num_points in parts.
7414 for(mj_part_t j = 1; j < num_parts; ++j) {
7415 host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7416 }
7417
7418 // shift the num points in threads thread to obtain the
7419 // beginning index of each thread's private space.
7420 for(mj_part_t j = 1; j < num_parts; ++j) {
7421 host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7422 }
7423
7424 auto host_new_coordinate_permutations =
7425 Kokkos::create_mirror_view(new_coordinate_permutations);
7426 Kokkos::deep_copy(host_new_coordinate_permutations,
7427 new_coordinate_permutations);
7428
7429 // now thread gets the coordinate and writes the index of coordinate to
7430 // the permutation array using the part index we calculated.
7431 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7432 mj_lno_t i = host_coordinate_permutations(ii);
7433 mj_part_t p = host_assigned_part_ids(i);
7434 host_new_coordinate_permutations(coordinate_begin +
7435 host_thread_point_counts(p)++) = i;
7436 }
7437
7438 Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7439 Kokkos::deep_copy(new_coordinate_permutations,
7440 host_new_coordinate_permutations);
7441 Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7442}
7443
7453template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7454 typename mj_part_t, typename mj_node_t>
7455void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7456 set_final_parts(
7457 mj_part_t current_num_parts,
7458 mj_part_t output_part_begin_index,
7459 RCP<mj_partBoxVector_t> &output_part_boxes,
7460 bool is_data_ever_migrated)
7461{
7462 this->mj_env->timerStart(MACRO_TIMERS,
7463 mj_timer_base_string + "Part_Assignment");
7464
7465 auto local_part_xadj = part_xadj;
7466 auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7467 auto local_coordinate_permutations = coordinate_permutations;
7468 auto local_assigned_part_ids = assigned_part_ids;
7469
7470 if(local_mj_keep_part_boxes) {
7471 for(int i = 0; i < current_num_parts; ++i) {
7472 (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7473 }
7474 }
7475
7476 Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7477 current_num_parts, Kokkos::AUTO());
7478 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7479 member_type member_type;
7480 Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7481 int i = team_member.league_rank();
7482 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7483 local_part_xadj(i-1) : 0, local_part_xadj(i)),
7484 [=] (mj_lno_t ii) {
7485 mj_lno_t k = local_coordinate_permutations(ii);
7486 local_assigned_part_ids(k) = i + output_part_begin_index;
7487 });
7488 });
7489
7490 if(is_data_ever_migrated) {
7491#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7492 if(sizeof(mj_lno_t) <= sizeof(int)) {
7493
7494 // Cannot use Zoltan_Comm with local ordinals larger than ints.
7495 // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
7496 // may overflow.
7497
7498 // if data is migrated, then send part numbers to the original owners.
7499 ZOLTAN_COMM_OBJ *plan = NULL;
7500 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7501
7502 int incoming = 0;
7503 int message_tag = 7856;
7504
7505 this->mj_env->timerStart(MACRO_TIMERS,
7506 mj_timer_base_string + "Final Z1PlanCreating");
7507
7508 // setup incoming count
7509 int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
7510 this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7511
7512 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7513 this->mj_env->timerStop(MACRO_TIMERS,
7514 mj_timer_base_string + "Final Z1PlanCreating" );
7515
7516 this->mj_env->timerStart(MACRO_TIMERS,
7517 mj_timer_base_string + "Final Z1PlanComm");
7518
7519 // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
7520 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
7521 // view; need the explicit Host creation and deep_copy.
7522
7523 // migrate gnos to actual owners.
7524 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7525 Kokkos::HostSpace(), this->current_mj_gnos);
7526 deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7527 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7528 Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), incoming);
7529 auto host_dst_gnos = Kokkos::create_mirror_view(
7530 Kokkos::HostSpace(), dst_gnos);
7531 message_tag++;
7532 ierr = Zoltan_Comm_Do( plan, message_tag,
7533 (char *) host_current_mj_gnos.data(),
7534 sizeof(mj_gno_t), (char *) host_dst_gnos.data());
7535 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7536 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7537 this->current_mj_gnos = dst_gnos;
7538
7539 // migrate part ids to actual owners.
7540 auto host_src_part_ids = Kokkos::create_mirror_view(
7541 Kokkos::HostSpace(), this->assigned_part_ids);
7542 deep_copy(host_src_part_ids, this->assigned_part_ids);
7543 Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7544 Kokkos::ViewAllocateWithoutInitializing("dst_part_ids"), incoming);
7545 auto host_dst_part_ids = Kokkos::create_mirror_view(
7546 Kokkos::HostSpace(), dst_part_ids);
7547 message_tag++;
7548 ierr = Zoltan_Comm_Do( plan, message_tag,
7549 (char *) host_src_part_ids.data(),
7550 sizeof(mj_part_t), (char *) host_dst_part_ids.data());
7551 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7552 Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7553 this->assigned_part_ids = dst_part_ids;
7554
7555 ierr = Zoltan_Comm_Destroy(&plan);
7556 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7557
7558 this->num_local_coords = incoming;
7559
7560 this->mj_env->timerStop(MACRO_TIMERS,
7561 mj_timer_base_string + "Final Z1PlanComm");
7562 }
7563 else
7564#endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7565 {
7566 // setup incoming count
7567 this->mj_env->timerStart(MACRO_TIMERS,
7568 mj_timer_base_string + "Final DistributorPlanCreating");
7569 Tpetra::Distributor distributor(this->mj_problemComm);
7570 ArrayView<const mj_part_t> owners_of_coords(
7571 this->owner_of_coordinate.data(), this->num_local_coords);
7572 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7573 this->mj_env->timerStop(MACRO_TIMERS,
7574 mj_timer_base_string + "Final DistributorPlanCreating" );
7575
7576 this->mj_env->timerStart(MACRO_TIMERS,
7577 mj_timer_base_string + "Final DistributorPlanComm");
7578
7579 // migrate gnos to actual owners.
7580 // MPI buffers should be Kokkos::HostSpace, not Kokkos::CudaUVMSpace
7581 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
7582 // view; need the explicit Host creation and deep_copy.
7583 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
7584 Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
7585 this->current_mj_gnos.extent(0));
7586 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
7587
7588 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
7589 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
7590 incoming);
7591
7592 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
7593
7594 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7595 Kokkos::ViewAllocateWithoutInitializing("current_mj_gnos"), incoming);
7596
7597 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
7598
7599 // migrate part ids to actual owners.
7600 Kokkos::View<mj_part_t *, Kokkos::HostSpace> sent_partids(
7601 Kokkos::ViewAllocateWithoutInitializing("sent_partids"),
7602 this->assigned_part_ids.extent(0));
7603 Kokkos::deep_copy(sent_partids, this->assigned_part_ids);
7604
7605 Kokkos::View<mj_part_t *, Kokkos::HostSpace> received_partids(
7606 Kokkos::ViewAllocateWithoutInitializing("received_partids"),
7607 incoming);
7608
7609 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
7610
7611 this->assigned_part_ids =
7612 Kokkos::View<mj_part_t*, device_t>(
7613 Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
7614 incoming);
7615
7616 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
7617 this->num_local_coords = incoming;
7618
7619 this->mj_env->timerStop(MACRO_TIMERS,
7620 mj_timer_base_string + "Final DistributorPlanComm");
7621 }
7622 }
7623
7624 this->mj_env->timerStop(MACRO_TIMERS,
7625 mj_timer_base_string + "Part_Assignment");
7626
7627 this->mj_env->timerStart(MACRO_TIMERS,
7628 mj_timer_base_string + "Solution_Part_Assignment");
7629
7630 // ArrayRCP<mj_part_t> partId;
7631 // partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
7632
7633 if(this->mj_keep_part_boxes) {
7634 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7635 }
7636
7637 this->mj_env->timerStop(MACRO_TIMERS,
7638 mj_timer_base_string + "Solution_Part_Assignment");
7639}
7640
/*! \brief Stores the tunable partitioning options on this object.
 *
 * Each argument is copied unchanged into the member of the same name
 * (without the trailing underscore); nothing else is done here.
 *
 * NOTE(review): the listing drops the original lines between 7654 and 7657,
 * which presumably hold the return type and the qualified function name --
 * confirm against the real header.
 *
 * \param distribute_points_on_cut_lines_ stored in
 *   distribute_points_on_cut_lines.
 * \param max_concurrent_part_calculation_ stored in
 *   max_concurrent_part_calculation.
 * \param check_migrate_avoid_migration_option_ stored in
 *   check_migrate_avoid_migration_option.
 * \param minimum_migration_imbalance_ stored in minimum_migration_imbalance.
 * \param migration_type_ stored in migration_type.
 */
7653template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7654 typename mj_part_t, typename mj_node_t>
7657 bool distribute_points_on_cut_lines_,
7658 int max_concurrent_part_calculation_,
7659 int check_migrate_avoid_migration_option_,
7660 double minimum_migration_imbalance_,
7661 int migration_type_)
7662{
7663 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7664 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7665 this->check_migrate_avoid_migration_option =
7666 check_migrate_avoid_migration_option_;
7667 this->minimum_migration_imbalance = minimum_migration_imbalance_;
7668 this->migration_type = migration_type_;
7669}
7670
/*! \brief Top-level driver of the Multi-Jagged partitioner: cuts the
 * coordinates along one axis per recursion level and reports a part id for
 * every coordinate.
 *
 * NOTE(review): this listing drops some original lines (the embedded
 * numbering jumps 7699 -> 7702, 7723 -> 7725 and 8276 -> 8278). The return
 * type and qualified function name, and presumably the statement that sets
 * execute_counter, are therefore not visible here -- confirm against the
 * real header.
 *
 * Visible flow:
 *  - copy the input arguments into the matching members, then call
 *    set_part_specifications() and allocate_set_work_memory();
 *  - duplicate mj_problemComm into this->comm;
 *  - for each of recursion_depth levels: compute the number of output parts
 *    (update_part_num_arrays), pick the axis as (level modulo coord_dim),
 *    process the current parts in batches of at most
 *    max_concurrent_part_calculation (mj_1D_part, then
 *    mj_create_new_partitions), optionally migrate (mj_perform_migration),
 *    and swap the permutation / part_xadj buffers for the next level;
 *  - call set_final_parts() and hand the results back through the two
 *    result_* arguments.
 *
 * \param env stored in mj_env; used for timers and debug output.
 * \param problemComm stored in mj_problemComm and duplicated into comm.
 * \param imbalance_tolerance_ stored in imbalance_tolerance.
 * \param num_teams_ stored in mj_num_teams.
 * \param num_global_parts_ stored in num_global_parts; also seeds the
 *   per-part "future part count" list.
 * \param part_no_array_ stored in part_no_array.
 * \param recursion_depth_ number of iterations of the outer loop.
 * \param coord_dim_ stored in coord_dim; the cut axis cycles through it.
 * \param num_local_coords_ stored in num_local_coords.
 * \param num_global_coords_ stored in num_global_coords.
 * \param initial_mj_gnos_ stored in initial_mj_gnos.
 * \param mj_coordinates_ stored in mj_coordinates; one column is taken per
 *   level with subview(ALL, coordInd).
 * \param num_weights_per_coord_ stored in num_weights_per_coord.
 * \param mj_uniform_weights_ stored in mj_uniform_weights.
 * \param mj_weights_ stored in mj_weights.
 * \param mj_uniform_parts_ stored in mj_uniform_parts.
 * \param result_assigned_part_ids_ output; set to this->assigned_part_ids
 *   after set_final_parts().
 * \param result_mj_gnos_ output; set to this->current_mj_gnos after
 *   set_final_parts().
 */
7698template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7699 typename mj_part_t, typename mj_node_t>
7702 const RCP<const Environment> &env,
7703 RCP<const Comm<int> > &problemComm,
7704 double imbalance_tolerance_,
7705 int num_teams_,
7706 size_t num_global_parts_,
7707 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7708 int recursion_depth_,
7709 int coord_dim_,
7710 mj_lno_t num_local_coords_,
7711 mj_gno_t num_global_coords_,
7712 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7713 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
7714 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7715 int num_weights_per_coord_,
7716 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7717 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7718 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7719 Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7720 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7721{
7722
7723 // see comment above for Zoltan2_AlgMJ_TrackCallsCounter
7725 this->mj_timer_base_string = "MJ(" + std::to_string(execute_counter) + ") - ";
7726
7727 this->mj_env = env;
7728 this->mj_problemComm = problemComm;
7729 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7730 this->mj_env->timerStart(MACRO_TIMERS,
7731 mj_timer_base_string + "Total");
7732 this->mj_env->debug(3, "In MultiJagged Jagged");
7733 this->imbalance_tolerance = imbalance_tolerance_;
7734 this->mj_num_teams = num_teams_;
7735 this->num_global_parts = num_global_parts_;
7736 this->part_no_array = part_no_array_;
7737 this->recursion_depth = recursion_depth_;
7738 this->coord_dim = coord_dim_;
7739 this->num_local_coords = num_local_coords_;
7740 this->num_global_coords = num_global_coords_;
7741 this->mj_coordinates = mj_coordinates_;
7742 this->initial_mj_gnos = initial_mj_gnos_;
7743 this->num_weights_per_coord = num_weights_per_coord_;
7744 this->mj_uniform_weights = mj_uniform_weights_;
7745 this->mj_weights = mj_weights_;
7746 this->mj_uniform_parts = mj_uniform_parts_;
7747
7748 // this->set_input_data();
7749
7750 this->set_part_specifications();
7751
7752 this->mj_env->timerStart(MACRO_TIMERS,
7753 mj_timer_base_string + "Allocate Views");
7754 this->allocate_set_work_memory();
7755 this->mj_env->timerStop(MACRO_TIMERS,
7756 mj_timer_base_string + "Allocate Views");
7757
7758 // We duplicate the comm as we create subcommunicators during migration.
7759 // We keep the problemComm as it is, while comm changes after each migration.
7760 this->comm = this->mj_problemComm->duplicate();
7761
7762#ifdef print_debug
7763 if(comm->getRank() == 0) {
7764 std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
7765 std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
7766 std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
7767 }
7768#endif
7769
7770 // initially there is a single partition
7771 mj_part_t current_num_parts = 1;
7772 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7773 this->all_cut_coordinates;
7774 this->mj_env->timerStart(MACRO_TIMERS,
7775 mj_timer_base_string + "Problem_Partitioning");
7776 mj_part_t output_part_begin_index = 0;
7777 mj_part_t future_num_parts = this->total_num_part;
7778 bool is_data_ever_migrated = false;
7779
 // NOTE(review): both vectors are raw owning pointers that are only released
 // by the two delete statements after the recursion loop; if anything in
 // between throws they are leaked. They are pointers so the loop can swap
 // them cheaply each level.
7780 std::vector<mj_part_t> *future_num_part_in_parts =
7781 new std::vector<mj_part_t> ();
7782 std::vector<mj_part_t> *next_future_num_parts_in_parts =
7783 new std::vector<mj_part_t> ();
7784
7785 next_future_num_parts_in_parts->push_back(this->num_global_parts);
7786
7787 RCP<mj_partBoxVector_t> input_part_boxes;
7788 RCP<mj_partBoxVector_t> output_part_boxes;
7789
7790 if(this->mj_keep_part_boxes) {
7791 input_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7792 output_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7793 compute_global_box();
7794 this->init_part_boxes(output_part_boxes);
7795 }
7796
7797 auto local_part_xadj = this->part_xadj;
7798
7799 // Need a device counter - how best to allocate?
7800 // Putting this allocation in the loops is very costly so moved out here.
7801 Kokkos::View<mj_part_t*, device_t>
7802 view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
7803 Kokkos::View<size_t*, device_t>
7804 view_total_reduction_size("view_total_reduction_size", 1);
7805
 // One iteration per recursion level; each level cuts along a single axis.
7806 for(int i = 0; i < this->recursion_depth; ++i) {
7807
7808 // convert i to string to be used for debugging purposes.
7809 std::string istring = std::to_string(i);
7810
7811 // next_future_num_parts_in_parts will be as the size of outnumParts,
7812 // and this will hold how many more parts that each output part
7813 // should be divided. this array will also be used to determine the weight
7814 // ratios of the parts. swap the arrays to use iteratively.
7815 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7816 future_num_part_in_parts = next_future_num_parts_in_parts;
7817 next_future_num_parts_in_parts = tmpPartVect;
7818
7819 // clear next_future_num_parts_in_parts array as
7820 // getPartitionArrays expects it to be empty.
7821 next_future_num_parts_in_parts->clear();
7822 if(this->mj_keep_part_boxes) {
7823 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7824 input_part_boxes = output_part_boxes;
7825 output_part_boxes = tmpPartBoxes;
7826 output_part_boxes->clear();
7827 }
7828
7829 // returns the total no. of output parts for this dimension partitioning.
7830 mj_part_t output_part_count_in_dimension =
7831 this->update_part_num_arrays(
7832 future_num_part_in_parts,
7833 next_future_num_parts_in_parts,
7834 future_num_parts,
7835 current_num_parts,
7836 i,
7837 input_part_boxes,
7838 output_part_boxes, 1);
7839
7840 // if the number of obtained parts equal to current number of parts,
7841 // skip this dimension. For example, this happens when 1 is given in the
7842 // input part array is given. P=4,5,1,2
7843 if(output_part_count_in_dimension == current_num_parts) {
 // The next iteration swaps input/output again at its top, so swap here
 // first to make the freshly written "next" data the input of that level.
7844 //still need to swap the input output arrays.
7845 tmpPartVect= future_num_part_in_parts;
7846 future_num_part_in_parts = next_future_num_parts_in_parts;
7847 next_future_num_parts_in_parts = tmpPartVect;
7848
7849 if(this->mj_keep_part_boxes) {
7850 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7851 input_part_boxes = output_part_boxes;
7852 output_part_boxes = tmpPartBoxes;
7853 }
7854 continue;
7855 }
7856
7857 // get the coordinate axis along which the partitioning will be done.
7858 int coordInd = i % this->coord_dim;
7859
7860 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7861 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7862
7863 this->mj_env->timerStart(MACRO_TIMERS,
7864 mj_timer_base_string + "Problem_Partitioning_" + istring);
7865
7866 // alloc Memory to point the indices
7867 // of the parts in the permutation array.
7868 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7869 "new part xadj", output_part_count_in_dimension);
7870
7871 // the index where in the new_part_xadj will be written.
7872 mj_part_t output_part_index = 0;
7873
7874 // whatever is written to output_part_index will be added with
7875 // output_coordinate_end_index so that the points will be shifted.
 // NOTE(review): this holds a coordinate offset read from new_part_xadj
 // (a view of mj_lno_t) but is typed mj_part_t; confirm the value cannot
 // exceed the range of mj_part_t.
7876 mj_part_t output_coordinate_end_index = 0;
7877
7878 mj_part_t current_work_part = 0;
7879 mj_part_t current_concurrent_num_parts =
7880 std::min(current_num_parts - current_work_part,
7881 this->max_concurrent_part_calculation);
7882
7883 mj_part_t obtained_part_index = 0;
7884
7885 auto host_process_local_min_max_coord_total_weight =
7886 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7887 auto host_global_min_max_coord_total_weight =
7888 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7889
7890 // run for all available parts.
7891 for(; current_work_part < current_num_parts;
7892 current_work_part += current_concurrent_num_parts) {
7893
7894 current_concurrent_num_parts =
7895 std::min(current_num_parts - current_work_part,
7896 this->max_concurrent_part_calculation);
7897
 // Single-iteration device kernel: sets the flag to 1 when any part of the
 // current batch is split into a count other than 1 in this dimension.
7898 int bDoingWork_int; // Can't reduce on bool so use int
7899 auto local_device_num_partitioning_in_current_dim =
7900 device_num_partitioning_in_current_dim;
7901 Kokkos::parallel_reduce("Read bDoingWork",
7902 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7903 KOKKOS_LAMBDA(int dummy, int & set_single) {
7904 set_single = 0;
7905 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7906 if(local_device_num_partitioning_in_current_dim(
7907 current_work_part + kk) != 1) {
7908 set_single = 1;
7909 break;
7910 }
7911 }
7912 }, bDoingWork_int);
7913 bool bDoingWork = (bDoingWork_int != 0) ? true : false;
7914
7915 this->mj_get_local_min_max_coord_totW(
7916 current_work_part,
7917 current_concurrent_num_parts,
7918 mj_current_dim_coords);
7919
7920 // 1D partitioning
7921 if(bDoingWork) {
7922 // obtain global Min max of the part.
7923 this->mj_get_global_min_max_coord_totW(
7924 current_concurrent_num_parts,
7925 this->process_local_min_max_coord_total_weight,
7926 this->global_min_max_coord_total_weight);
7927
7928 // represents the total number of cutlines
7929 // whose coordinate should be determined.
7930 mj_part_t total_incomplete_cut_count = 0;
7931
7932 // Compute weight ratios for parts & cuts:
7933 // e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
7934 // part0 cut0 part1 cut1 part2 cut2 part3
7935 mj_part_t concurrent_part_cut_shift = 0;
7936 mj_part_t concurrent_part_part_shift = 0;
7937
7938 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7939
 // NOTE(review): this copy is repeated for every kk; it looks
 // loop-invariant unless a callee below rewrites
 // global_min_max_coord_total_weight -- TODO confirm before hoisting.
7940 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7941 global_min_max_coord_total_weight);
7942
 // Layout read here: [0, n) minima, [n, 2n) maxima, [2n, 3n) total
 // weights, with n = current_concurrent_num_parts.
7943 mj_scalar_t min_coordinate =
7944 host_global_min_max_coord_total_weight(kk);
7945 mj_scalar_t max_coordinate =
7946 host_global_min_max_coord_total_weight(
7947 kk + current_concurrent_num_parts);
7948
7949 mj_scalar_t global_total_weight =
7950 host_global_min_max_coord_total_weight(
7951 kk + 2 * current_concurrent_num_parts);
7952
7953 mj_part_t concurrent_current_part_index = current_work_part + kk;
7954
7955 mj_part_t partition_count = host_num_partitioning_in_current_dim(
7956 concurrent_current_part_index);
7957
7958 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
7959 Kokkos::subview(current_cut_coordinates,
7960 std::pair<mj_lno_t, mj_lno_t>(
7961 concurrent_part_cut_shift, current_cut_coordinates.size()));
7962 Kokkos::View<mj_scalar_t *, device_t>
7963 current_target_part_weights =
7964 Kokkos::subview(target_part_weights,
7965 std::pair<mj_lno_t, mj_lno_t>(
7966 concurrent_part_part_shift, target_part_weights.size()));
7967
7968 // shift the usedCutCoordinate array as noCuts.
7969 concurrent_part_cut_shift += partition_count - 1;
7970 // shift the partRatio array as noParts.
7971 concurrent_part_part_shift += partition_count;
7972
7973 // calculate only if part is not empty,
7974 // and part will be further partitioned.
7975 if(partition_count > 1 && min_coordinate <= max_coordinate) {
7976
7977 // increase num_cuts_do_be_determined by the number of cuts of the
7978 // current part's cut line number.
7979 total_incomplete_cut_count += partition_count - 1;
7980
7981 this->incomplete_cut_count(kk) = partition_count - 1;
7982
7983 // get the target weights of the parts
7984 this->mj_get_initial_cut_coords_target_weights(
7985 min_coordinate,
7986 max_coordinate,
7987 partition_count - 1,
7988 global_total_weight,
7989 usedCutCoordinate,
7990 current_target_part_weights,
7991 future_num_part_in_parts,
7992 next_future_num_parts_in_parts,
7993 concurrent_current_part_index,
7994 obtained_part_index);
7995
 // host_part_xadj(p) is used as the end offset of part p; its begin is
 // the previous entry (0 for the first part).
7996 mj_lno_t coordinate_end_index =
7997 host_part_xadj(concurrent_current_part_index);
7998 mj_lno_t coordinate_begin_index =
7999 concurrent_current_part_index==0 ? 0 :
8000 host_part_xadj(concurrent_current_part_index - 1);
8001
8002 this->set_initial_coordinate_parts(
8003 max_coordinate,
8004 min_coordinate,
8005 coordinate_begin_index, coordinate_end_index,
8006 this->coordinate_permutations,
8007 mj_current_dim_coords,
8008 this->assigned_part_ids,
8009 partition_count);
8010 }
8011 else {
8012 // e.g., if have fewer coordinates than parts, don't need to do
8013 // next dim.
8014 this->incomplete_cut_count(kk) = 0;
8015 }
8016
8017 obtained_part_index += partition_count;
8018 }
8019
8020 // used imbalance, it is always 0, as it is difficult to
8021 // estimate a range.
8022 double used_imbalance = 0;
8023 // Determine cut lines for all concurrent parts parts here.
8024 this->mj_env->timerStart(MACRO_TIMERS,
8025 mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8026
8027 this->mj_1D_part(
8028 mj_current_dim_coords,
8029 used_imbalance,
8030 current_work_part,
8031 current_concurrent_num_parts,
8032 current_cut_coordinates,
8033 total_incomplete_cut_count,
8034 view_rectilinear_cut_count,
8035 view_total_reduction_size);
8036
8037 this->mj_env->timerStop(MACRO_TIMERS,
8038 mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8039 }
8040
8041 // create new part chunks
8042 {
8043 mj_part_t output_array_shift = 0;
8044 mj_part_t cut_shift = 0;
 // NOTE(review): tlr_shift is only incremented in the empty-part branch
 // below and is never read in this function; it looks like a leftover.
8045 size_t tlr_shift = 0;
8046 size_t partweight_array_shift = 0;
8047 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
8048
8049 mj_part_t current_concurrent_work_part = current_work_part + kk;
8050
8051 mj_part_t num_parts = host_num_partitioning_in_current_dim(
8052 current_concurrent_work_part);
8053
8054 // if the part is empty, skip the part.
 // "empty" is detected as global min > global max for this part.
8055 int coordinateA_bigger_than_coordinateB =
8056 host_global_min_max_coord_total_weight(kk) >
8057 host_global_min_max_coord_total_weight(
8058 kk + current_concurrent_num_parts);
8059
8060 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8061 // we still need to write the begin and end point of the empty part.
8062 // simply set it zero, the array indices will be shifted later
8063 auto local_new_part_xadj = this->new_part_xadj;
8064 Kokkos::parallel_for(
8065 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8066 (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8067 local_new_part_xadj(
8068 output_part_index + output_array_shift + jj) = 0;
8069 });
8070
8071 cut_shift += num_parts - 1;
8072 tlr_shift += (4 *(num_parts - 1) + 1);
8073 output_array_shift += num_parts;
8074 partweight_array_shift += (2 * (num_parts - 1) + 1);
8075 continue;
8076 }
8077
8078 Kokkos::View<mj_scalar_t *, device_t>
8079 current_concurrent_cut_coordinate =
8080 Kokkos::subview(current_cut_coordinates,
8081 std::pair<mj_lno_t, mj_lno_t>(
8082 cut_shift,
8083 current_cut_coordinates.size()));
8084 Kokkos::View<mj_scalar_t *, device_t>
8085 used_local_cut_line_weight_to_left =
8086 Kokkos::subview(process_cut_line_weight_to_put_left,
8087 std::pair<mj_lno_t, mj_lno_t>(
8088 cut_shift,
8089 process_cut_line_weight_to_put_left.size()));
8090
8091 this->thread_part_weight_work =
8092 Kokkos::subview(
8093 this->thread_part_weights,
8094 std::pair<mj_lno_t, mj_lno_t>(
8095 partweight_array_shift,
8096 this->thread_part_weights.extent(0)));
8097
8098 if(num_parts > 1) {
8099 if(this->mj_keep_part_boxes) {
8100 // if part boxes are to be stored update the boundaries.
 // Each cut j is read back from the device with a one-element reduce
 // and applied to box j (flag 1) and box j + 1 (flag 0).
8101 for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8102 mj_scalar_t temp_get_val;
8103 Kokkos::parallel_reduce("Read single",
8104 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8105 KOKKOS_LAMBDA(int dummy, mj_scalar_t & set_single) {
8106 set_single = current_concurrent_cut_coordinate(j);
8107 }, temp_get_val);
8108 (*output_part_boxes)
8109 [output_array_shift + output_part_index + j].
8110 updateMinMax(temp_get_val, 1 /*update max*/, coordInd);
 // NOTE(review): the flag below is 0 although the inline comment
 // still says "update max"; presumably this call updates the min of
 // the next box -- confirm against updateMinMax.
8111 (*output_part_boxes)
8112 [output_array_shift + output_part_index + j + 1].
8113 updateMinMax(temp_get_val, 0 /*update max*/, coordInd);
8114 }
8115 }
8116
8117 // Rewrite the indices based on the computed cuts.
8118 Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8119 Kokkos::subview(this->new_part_xadj,
8120 std::pair<mj_lno_t, mj_lno_t>(
8121 output_part_index + output_array_shift,
8122 this->new_part_xadj.size()));
8123
8124 this->mj_create_new_partitions(
8125 num_parts,
8126 current_concurrent_work_part,
8127 mj_current_dim_coords,
8128 current_concurrent_cut_coordinate,
8129 used_local_cut_line_weight_to_left,
8130 sub_new_part_xadj);
8131 }
8132 else {
8133
8134 mj_lno_t coordinate_end = host_part_xadj(
8135 current_concurrent_work_part);
8136 mj_lno_t coordinate_begin =
8137 current_concurrent_work_part==0 ? 0 : host_part_xadj(
8138 current_concurrent_work_part - 1);
8139
8140 // if this part is partitioned into 1 then just copy
8141 // the old values.
8142 mj_lno_t part_size = coordinate_end - coordinate_begin;
8143
8144 // Awkward here to set one value - need some broader
8145 // refactoring to improve this one.
8146 auto local_new_part_xadj = this->new_part_xadj;
8147 Kokkos::parallel_for(
8148 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8149 (0, 1), KOKKOS_LAMBDA (int dummy) {
8150 local_new_part_xadj(
8151 output_part_index + output_array_shift) = part_size;
8152 });
8153
8154 auto subview_new_coordinate_permutations =
8155 Kokkos::subview(this->new_coordinate_permutations,
8156 std::pair<mj_lno_t, mj_lno_t>(
8157 coordinate_begin,
8158 coordinate_begin + part_size));
8159 auto subview_coordinate_permutations =
8160 Kokkos::subview(this->coordinate_permutations,
8161 std::pair<mj_lno_t, mj_lno_t>(
8162 coordinate_begin,
8163 coordinate_begin + part_size));
8164 Kokkos::deep_copy(subview_new_coordinate_permutations,
8165 subview_coordinate_permutations);
8166 }
8167 cut_shift += num_parts - 1;
8168 output_array_shift += num_parts;
8169 partweight_array_shift += (2 * (num_parts - 1) + 1);
8170 }
8171
8172 // shift cut coordinates so that all cut coordinates are stored.
8173 // no shift now because we dont keep the cuts.
8174 // current_cut_coordinates += cut_shift;
8175 // mj_create_new_partitions from coordinates partitioned the parts
8176 // and write the indices as if there were a single part.
8177 // now we need to shift the beginning indices.
8178 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
8179 mj_part_t num_parts =
8180 host_num_partitioning_in_current_dim(current_work_part + kk);
8181
8182 // These two kernels are a bit awkward but need broader redesign to
8183 // avoid this situation.
8184 auto local_new_part_xadj = this->new_part_xadj;
8185 Kokkos::parallel_for(
8186 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8187 (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8188 local_new_part_xadj(output_part_index+ii) +=
8189 output_coordinate_end_index;
8190 });
8191
8192 // increase the previous count by current end.
 // NOTE(review): the value read is an mj_lno_t entry of new_part_xadj but
 // is reduced into an mj_part_t; confirm no narrowing can occur.
8193 mj_part_t temp_get;
8194 Kokkos::parallel_reduce("Read single",
8195 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8196 KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
8197 set_single =
8198 local_new_part_xadj(output_part_index + num_parts - 1);
8199 }, temp_get);
8200 output_coordinate_end_index = temp_get;
8201 //increase the current out.
8202 output_part_index += num_parts;
8203 }
8204 }
8205 }
8206
8207 // end of this partitioning dimension
8208 int current_world_size = this->comm->getSize();
8209 long migration_reduce_all_population =
8210 this->total_dim_num_reduce_all * current_world_size;
8211 bool is_migrated_in_current_dimension = false;
8212
8213 // we migrate if there are more partitionings to be done after this step
8214 // and if the migration is not forced to be avoided.
8215 // and the operation is not sequential.
8216 if(future_num_parts > 1 &&
8217 this->check_migrate_avoid_migration_option >= 0 &&
8218 current_world_size > 1) {
8219 this->mj_env->timerStart(MACRO_TIMERS,
8220 mj_timer_base_string + "Problem_Migration-" + istring);
8221 mj_part_t num_parts = output_part_count_in_dimension;
8222
8223 if(this->mj_perform_migration(
8224 num_parts,
8225 current_num_parts, //output
8226 next_future_num_parts_in_parts, //output
8227 output_part_begin_index,
8228 migration_reduce_all_population,
8229 this->num_global_coords / (future_num_parts * current_num_parts),
8230 istring,
8231 input_part_boxes, output_part_boxes) )
8232 {
8233 is_migrated_in_current_dimension = true;
8234 is_data_ever_migrated = true;
8235 this->mj_env->timerStop(MACRO_TIMERS,
8236 mj_timer_base_string + "Problem_Migration-" + istring);
8237 // since data is migrated, we reduce the number of reduceAll
8238 // operations for the last part.
8239 this->total_dim_num_reduce_all /= num_parts;
8240 }
8241 else {
8242 is_migrated_in_current_dimension = false;
8243 this->mj_env->timerStop(MACRO_TIMERS,
8244 mj_timer_base_string + "Problem_Migration-" + istring);
8245 }
8246 }
8247
8248 // swap the coordinate permutations for the next dimension.
8249 Kokkos::View<mj_lno_t*, device_t> tmp =
8250 this->coordinate_permutations;
8251 this->coordinate_permutations =
8252 this->new_coordinate_permutations;
8253
8254 this->new_coordinate_permutations = tmp;
 // Without a migration the part count is advanced here; when a migration
 // happened, current_num_parts was passed to mj_perform_migration as an
 // output argument instead.
8255 if(!is_migrated_in_current_dimension) {
8256 this->total_dim_num_reduce_all -= current_num_parts;
8257 current_num_parts = output_part_count_in_dimension;
8258 }
8259
8260 {
8261 this->part_xadj = this->new_part_xadj;
8262 local_part_xadj = this->new_part_xadj;
8263 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
8264 Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
8265
8266 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
8267 this->mj_env->timerStop(MACRO_TIMERS,
8268 mj_timer_base_string + "Problem_Partitioning_" + istring);
8269 }
8270 }
8271
8272 // Partitioning is done
8273 delete future_num_part_in_parts;
8274 delete next_future_num_parts_in_parts;
8275 this->mj_env->timerStop(MACRO_TIMERS,
8276 mj_timer_base_string + "Problem_Partitioning");
8278
8279 //get the final parts of each initial coordinate
8280 //the results will be written to
8281 //this->assigned_part_ids for gnos given in this->current_mj_gnos
8282 this->set_final_parts(
8283 current_num_parts,
8284 output_part_begin_index,
8285 output_part_boxes,
8286 is_data_ever_migrated);
8287
 // Hand the results back to the caller through the output arguments.
8288 result_assigned_part_ids_ = this->assigned_part_ids;
8289 result_mj_gnos_ = this->current_mj_gnos;
8290 this->mj_env->timerStop(MACRO_TIMERS,
8291 mj_timer_base_string + "Total");
8292 this->mj_env->debug(3, "Out of MultiJagged");
8293}
8294
/*! \brief Returns the part boxes saved in kept_boxes.
 *
 * NOTE(review): the listing drops original line 8299, which presumably
 * carries the class qualifier in front of get_kept_boxes -- confirm against
 * the real header.
 *
 * \return this->kept_boxes when mj_keep_part_boxes is set.
 * \throws std::logic_error when mj_keep_part_boxes is not set.
 */
8295template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8296 typename mj_part_t, typename mj_node_t>
8297RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8298 mj_partBoxVector_t>
8300 get_kept_boxes() const
8301{
8302 if(this->mj_keep_part_boxes) {
8303 return this->kept_boxes;
8304 }
8305 else {
 // Boxes are only available when the keep-part-boxes option was on.
8306 throw std::logic_error("Error: part boxes are not stored.");
8307 }
8308}
8309
/*! \brief Combines the part boxes held on each rank into one list with a
 * box for every global part.
 *
 * The local box bounds are scattered by part id into a zero-filled flat
 * buffer (all minima first, then all maxima), combined over mj_problemComm
 * with the Zoltan2_BoxBoundaries reduction, and turned back into boxes.
 *
 * NOTE(review): the listing drops original lines 8314 and 8362; presumably
 * the class qualifier of this function and the declaration of tpb -- confirm
 * against the real header.
 *
 * \param localPartBoxes boxes known on this rank. Element 0 is read
 *   unconditionally to obtain the dimension, so the vector is assumed to be
 *   non-empty -- TODO confirm callers guarantee this.
 * \return a new vector that receives num_global_parts boxes, pushed in part
 *   order.
 */
8310template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8311 typename mj_part_t, typename mj_node_t>
8312RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8313 mj_partBoxVector_t>
8315 compute_global_box_boundaries(RCP<mj_partBoxVector_t> &localPartBoxes) const
8316{
8317 typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
8318 mj_part_t ntasks = this->num_global_parts;
8319 int dim = (*localPartBoxes)[0].getDim();
 // NOTE(review): both buffers are raw new[] allocations released only at
 // the end of the function; they leak if reduceAll throws.
8320 coord_t *localPartBoundaries = new coord_t[ntasks * 2 *dim];
8321
8322 memset(localPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8323
8324 coord_t *globalPartBoundaries = new coord_t[ntasks * 2 *dim];
8325 memset(globalPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8326
 // Buffer layout: the first ntasks * dim entries hold minima, the next
 // ntasks * dim entries hold maxima; entry (dim * pId + j) is axis j of
 // part pId.
8327 coord_t *localPartMins = localPartBoundaries;
8328 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8329
8330 coord_t *globalPartMins = globalPartBoundaries;
8331 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8332
8333 mj_part_t boxCount = localPartBoxes->size();
8334 for(mj_part_t i = 0; i < boxCount; ++i) {
8335 mj_part_t pId = (*localPartBoxes)[i].getpId();
8336
8337 // cout << "me:" << comm->getRank() << " has:" << pId << endl;
8338
8339 coord_t *lmins = (*localPartBoxes)[i].getlmins();
8340 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8341
8342 for(int j = 0; j < dim; ++j) {
8343 localPartMins[dim * pId + j] = lmins[j];
8344 localPartMaxs[dim * pId + j] = lmaxs[j];
8345
8346 /*
8347 std::cout << "me:" << comm->getRank() <<
8348 " dim * pId + j:"<< dim * pId + j <<
8349 " localMin:" << localPartMins[dim * pId + j] <<
8350 " localMax:" << localPartMaxs[dim * pId + j] << std::endl;
8351 */
8352 }
8353 }
8354
 // The reduction copies an incoming entry over the accumulated one only
 // when its magnitude exceeds epsilon, so the zero fill of parts this rank
 // does not hold is ignored.
 // NOTE(review): a genuine bound of (near) zero is dropped the same way --
 // confirm this is acceptable.
8355 Teuchos::Zoltan2_BoxBoundaries<int, coord_t> reductionOp(ntasks * 2 *dim);
8356
8357 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8358 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8359
8360 RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
8361 for(mj_part_t i = 0; i < ntasks; ++i) {
8363 globalPartMins + dim * i,
8364 globalPartMaxs + dim * i);
8365
8366 /*
8367 for(int j = 0; j < dim; ++j) {
8368 std::cout << "me:" << comm->getRank() <<
8369 " dim * pId + j:"<< dim * i + j <<
8370 " globalMin:" << globalPartMins[dim * i + j] <<
8371 " globalMax:" << globalPartMaxs[dim * i + j] << std::endl;
8372 }
8373 */
8374
8375 pB->push_back(tpb);
8376 }
8377 delete []localPartBoundaries;
8378 delete []globalPartBoundaries;
8379 //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
8380 return pB;
8381}
8382
// Zoltan2 Algorithm implementation that drives the MultiJagged (MJ) geometric
// partitioner. partition() loads ids/coordinates/weights from the adapter,
// reads the "mj_*" parameters, optionally gathers the data onto a subset of
// ranks (premigration), runs mj_partitioner.multi_jagged_part() and stores the
// resulting part ids in the PartitioningSolution.
8385template <typename Adapter>
8386class Zoltan2_AlgMJ : public Algorithm<Adapter>
8387{
8388
8389private:
8390
8391#ifndef DOXYGEN_SHOULD_SKIP_THIS
8392 // For coordinates and weights, MJ needs floats or doubles.
8393 // But the Adapter can provide other scalars, e.g., ints.
8394 // So MJ and the adapter have separate scalar types.
8395 typedef typename Adapter::scalar_t adapter_scalar_t;
8396
8397 // Fallback scalar type used by MJ when the adapter scalar is not float/double.
8398 typedef float default_mj_scalar_t;
8399
8400 // If the Adapter provides float or double scalar_t, use it (prevents copies).
8401 // Otherwise, use default_mj_scalar_t.
8402 typedef typename
8403 std::conditional<
8404 (std::is_same<adapter_scalar_t, float>::value ||
8405 std::is_same<adapter_scalar_t, double>::value),
8406 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8407
 // Ordinal, part and node types are taken directly from the Adapter.
8408 typedef typename Adapter::gno_t mj_gno_t;
8409 typedef typename Adapter::lno_t mj_lno_t;
8410 typedef typename Adapter::part_t mj_part_t;
8411 typedef typename Adapter::node_t mj_node_t;
8412 typedef coordinateModelPartBox mj_partBox_t;
8413 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8414 typedef typename mj_node_t::device_type device_t;
8415#endif
8416
8418
8419 RCP<const Environment> mj_env; // the environment object
8420 RCP<const Comm<int> > mj_problemComm; // initial comm object
8421 RCP<const typename Adapter::base_adapter_t> mj_adapter; // coordinate adapter
8422
8423 // PARAMETERS
8424 double imbalance_tolerance; // input imbalance tolerance.
8425
8426 int num_teams; // how many teams to run main loop with
8427
8428 size_t num_global_parts; // the targeted number of parts
8429
8430 // input part array specifying num part to divide along each dim.
8431 Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8432
8433 // the number of steps that partitioning will be solved in.
8434 int recursion_depth;
8435
8436 int coord_dim; // coordinate dimension.
8437 mj_lno_t num_local_coords; //number of local coords.
8438 mj_gno_t num_global_coords; //number of global coords.
8439
8440 // initial global ids of the coordinates.
8441 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8442
8443 // two dimension coordinate array.
8444 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8445 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8446 mj_coordinates;
8447
8448 int num_weights_per_coord; // number of weights per coordinate
8449
8450 // per criterion: whether the coordinate weights are uniform
 // (entry 0 is set to true when the adapter provides no weights).
8451 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8452
8453 // two dimensional weight array.
8454 Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8455
8456 // per criterion: whether the target part sizes are uniform
8457 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8458
8459 // Nonuniform first level partitioning
8460 // Currently used for Dragonfly task mapping by partitioning Dragonfly RCA
8461 // machine coordinates and application coordinates.
8462 // An optimization that completely partitions the most important machine
8463 // dimension first (i.e. the Dragonfly group coordinate, or RCA's x
8464 // coordinate). The standard MJ alg follows after the nonuniform first level
8465 // partitioning.
8466 // If used, number of parts for the first level partitioning
8467 mj_part_t num_first_level_parts;
8468
8469 // If used, the distribution of parts for the nonuniform
8470 // first level partitioning
8471 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8472
8473 // if partitioning can distribute points on the same coordinate to
8474 // different parts.
8475 bool distribute_points_on_cut_lines;
8476
8477 // how many parts we can calculate concurrently.
8478 mj_part_t max_concurrent_part_calculation;
8479
8480 // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
8481 int check_migrate_avoid_migration_option;
8482
8483 // when doing the migration, 0 will aim for perfect load-imbalance,
8484 int migration_type;
8485
8486 // 1 for minimized messages (continuation of the migration_type note above)
8487
8488 // when MJ decides whether to migrate, the minimum imbalance for migration.
8489 double minimum_migration_imbalance;
8490 bool mj_keep_part_boxes; //if the boxes need to be kept.
8491
8492 // if this is set, then recursion depth is adjusted to its maximum value.
8493 bool mj_run_as_rcb;
 // premigration selector ("mj_premigration_option"); 0 disables premigration.
8494 int mj_premigration_option;
 // target number of coordinates per rank after premigration
 // ("mj_premigration_coordinate_count").
8495 int min_coord_per_rank_for_premigration;
8496
8497 // communication graph xadj
8498 ArrayRCP<mj_part_t> comXAdj_;
8499
8500 // communication graph adj.
8501 ArrayRCP<mj_part_t> comAdj_;
8502
8503 void copy(
8504 const RCP<PartitioningSolution<Adapter> >&solution);
8505
 // Reads the MJ related entries of the parameter list into the members above.
8506 void set_input_parameters(const Teuchos::ParameterList &p);
8507
8508 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
8509
 // Sends all local ids/coordinates/weights to the first rank of this rank's
 // group so that MJ can run on fewer ranks. Inputs carry a trailing
 // underscore, outputs are the result_* references. Returns whether this
 // rank is one of the receiving ranks.
8510 bool mj_premigrate_to_subset(
8511 int used_num_ranks,
8512 int migration_selection_option,
8513 RCP<const Environment> mj_env_,
8514 RCP<const Comm<int> > mj_problemComm_,
8515 int coord_dim_,
8516 mj_lno_t num_local_coords_,
8517 mj_gno_t num_global_coords_, size_t num_global_parts_,
8518 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8519 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8520 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8521 mj_coordinates_,
8522 int num_weights_per_coord_,
8523 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8524 //results
8525 RCP<const Comm<int> > &result_problemComm_,
8526 mj_lno_t & result_num_local_coords_,
8527 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8528 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8529 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8530 result_mj_coordinates_,
8531 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8532 int * &result_actual_owner_rank_);
8533
8534public:
8535
 // Stores the environment, communicator and adapter and gives every
 // parameter member a default value; the actual values are read later by
 // set_input_parameters().
 // NOTE(review): minimum_migration_imbalance defaults to 0.30 here while
 // set_input_parameters() resets it to 0.35 - confirm which is intended.
8536 Zoltan2_AlgMJ(const RCP<const Environment> &env,
8537 RCP<const Comm<int> > &problemComm,
8538 const RCP<const typename Adapter::base_adapter_t> &adapter) :
8539 mj_partitioner(),
8540 mj_env(env),
8541 mj_problemComm(problemComm),
8542 mj_adapter(adapter),
8543 imbalance_tolerance(0),
8544 num_teams(0),
8545 num_global_parts(1),
8546 recursion_depth(0),
8547 coord_dim(0),
8548 num_local_coords(0),
8549 num_global_coords(0),
8550 num_weights_per_coord(0),
8551 num_first_level_parts(1),
8552 distribute_points_on_cut_lines(true),
8553 max_concurrent_part_calculation(1),
8554 check_migrate_avoid_migration_option(0),
8555 migration_type(0),
8556 minimum_migration_imbalance(0.30),
8557 mj_keep_part_boxes(false),
8558 mj_run_as_rcb(false),
8559 mj_premigration_option(0),
8560 min_coord_per_rank_for_premigration(32000),
8561 comXAdj_(),
8562 comAdj_()
8563 {
8564 }
8565
8567 {
8568 }
8569
 // Registers the MJ specific parameters (name, default value, documentation
 // string and validator) in the given parameter list.
8572 static void getValidParameters(ParameterList & pl)
8573 {
8574 const bool bUnsorted = true; // this clarifies the flag is for unsorted
8575 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8576 Teuchos::rcp( new Zoltan2::IntegerRangeListValidator<int>(bUnsorted) );
8577 pl.set("mj_parts", "0", "list of parts for multiJagged partitioning "
8578 "algorithm. As many as the dimension count.", mj_parts_Validator);
8579
8580 pl.set("mj_concurrent_part_count", 1, "The number of parts whose cut "
8581 "coordinates will be calculated concurently.",
8583
8584 pl.set("mj_minimum_migration_imbalance", 1.1,
8585 "mj_minimum_migration_imbalance, the minimum imbalance of the "
8586 "processors to avoid migration",
8588
 // accepted values: 0, 1, 2
8589 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8590 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8591 pl.set("mj_migration_option", 1, "Migration option, 0 for decision "
8592 "depending on the imbalance, 1 for forcing migration, 2 for "
8593 "avoiding migration", mj_migration_option_validator);
8594
 // accepted values: 0, 1
 // NOTE(review): mj_migration_type_validator is created here but the
 // registration below passes mj_migration_option_validator (range 0..2), so
 // "mj_migration_type" = 2 passes validation. Looks like the wrong validator
 // is used - confirm and switch to mj_migration_type_validator.
8595 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8596 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8597 pl.set("mj_migration_type", 0,
8598 "Migration type, 0 for migration to minimize the imbalance "
8599 "1 for migration to minimize messages exchanged the migration.",
8600 mj_migration_option_validator);
8601
8602 // bool parameter
8603 pl.set("mj_keep_part_boxes", false, "Keep the part boundaries of the "
8604 "geometric partitioning.", Environment::getBoolValidator());
8605
8606 // bool parameter
8607 pl.set("mj_enable_rcb", false, "Use MJ as RCB.",
8609
8610 pl.set("mj_recursion_depth", -1, "Recursion depth for MJ: Must be "
8611 "greater than 0.", Environment::getAnyIntValidator());
8612
 // any non-negative int is accepted
8613 RCP<Teuchos::EnhancedNumberValidator<int>>
8614 mj_num_teams_validator =
8615 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(
8616 0, Teuchos::EnhancedNumberTraits<int>::max()) );
8617 pl.set("mj_num_teams", 0,
8618 "How many teams for the main kernel loop"
8619 , mj_num_teams_validator);
8620
 // accepted values: 0..1024
8621 RCP<Teuchos::EnhancedNumberValidator<int>>
8622 mj_premigration_option_validator =
8623 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8624
8625 pl.set("mj_premigration_option", 0,
8626 "Whether to do premigration or not. 0 for no migration "
8627 "x > 0 for migration to consecutive processors, "
8628 "the subset will be 0,x,2x,3x,...subset ranks."
8629 , mj_premigration_option_validator);
8630
8631 pl.set("mj_premigration_coordinate_count", 32000, "How many coordinate to "
8632 "assign each rank in multijagged after premigration"
8634 }
8635
 // Computes the partition and stores the part ids in the solution.
8641 void partition(const RCP<PartitioningSolution<Adapter> > &solution);
8642
 // Returns a reference to the vector held by the RCP that
 // getGlobalBoxBoundaries() returns.
 // NOTE(review): the local RCP goes out of scope on return; the reference
 // stays valid only if another owner keeps the vector alive - confirm
 // against getGlobalBoxBoundaries().
8643 mj_partBoxVector_t &getPartBoxesView() const
8644 {
8645 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8646 return *pBoxes;
8647 }
8648
8649 mj_part_t pointAssign(int dim, adapter_scalar_t *point) const;
8650
8651 void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8652 size_t &nPartsFound, mj_part_t **partsFound) const;
8653
8657 const PartitioningSolution<Adapter> *solution,
8658 ArrayRCP<mj_part_t> &comXAdj,
8659 ArrayRCP<mj_part_t> &comAdj);
8660
 // Loads sizes, ids, coordinates and weights from the adapter into the
 // members used by partition().
8661 void set_up_partitioning_data( // public for CUDA
8662 const RCP<PartitioningSolution<Adapter> >&solution);
8663
8664 private:
8665 std::string timer_base_string; // used for making timers
8666
8667 // After loading views from coordinate adapter we may need to copy them
8668 // if mj type is different, but otherwise we just want to assign the view.
8669 // So purpose of this code is to make that assign only happen when the types
8670 // match. The empty case would otherwise not compile.
8671 // If they don't match the internal code handles allocating the new view
8672 // and copying the elements. See the test Zoltan2_mj_int_coordinates.
8673 template<class dst_t, class src_t> // version for same types
8674 typename std::enable_if<std::is_same<typename dst_t::value_type,
8675 typename src_t::value_type>::value>::type
8676 assign_if_same(dst_t & dst, const src_t & src) {
8677 dst = src;
8678 }
8679 template<class dst_t, class src_t> // version for different types
8680 typename std::enable_if<!std::is_same<typename dst_t::value_type,
8681 typename src_t::value_type>::value>::type
8682 assign_if_same(dst_t & dst, const src_t & src) {
8683 // do nothing - handled manually
8684 }
8685};
8686
8687template <typename Adapter>
8688bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8689 int used_num_ranks,
8690 int migration_selection_option,
8691 RCP<const Environment> mj_env_,
8692 RCP<const Comm<int> > mj_problemComm_,
8693 int coord_dim_,
8694 mj_lno_t num_local_coords_,
8695 mj_gno_t num_global_coords_, size_t num_global_parts_,
8696 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8697 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8698 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8699 int num_weights_per_coord_,
8700 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8701 //results
8702 RCP<const Comm<int> > & result_problemComm_,
8703 mj_lno_t &result_num_local_coords_,
8704 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8705 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8706 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8707 result_mj_coordinates_,
8708 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8709 int * &result_actual_owner_rank_)
8710{
8711 mj_env_->timerStart(MACRO_TIMERS,
8712 timer_base_string + "PreMigration DistributorPlanCreating");
8713
8714 int myRank = mj_problemComm_->getRank();
8715 int worldSize = mj_problemComm_->getSize();
8716
8717 mj_part_t groupsize = worldSize / used_num_ranks;
8718
8719 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8720
8721 mj_part_t i_am_sending_to = 0;
8722 bool am_i_a_receiver = false;
8723
8724 for(int i = 0; i < used_num_ranks; ++i) {
8725 group_begins[i+ 1] = group_begins[i] + groupsize;
8726 if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
8727 if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
8728 if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8729 i_am_sending_to = group_begins[i];
8730 }
8731 if(myRank == group_begins[i]) {
8732 am_i_a_receiver = true;
8733 }
8734 }
8735
8736 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8737 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8738
8739 Tpetra::Distributor distributor(mj_problemComm_);
8740
8741 std::vector<mj_part_t>
8742 coordinate_destinations(num_local_coords_, i_am_sending_to);
8743
8744 ArrayView<const mj_part_t>
8745 destinations(&(coordinate_destinations[0]), num_local_coords_);
8746 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8747 result_num_local_coords_ = num_incoming_gnos;
8748 mj_env_->timerStop(MACRO_TIMERS,
8749 timer_base_string + "PreMigration DistributorPlanCreating");
8750
8751 mj_env_->timerStart(MACRO_TIMERS,
8752 timer_base_string + "PreMigration DistributorMigration");
8753
8754
8755 // migrate gnos.
8756 // MPI buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
8757 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
8758 // view; need the explicit Host creation and deep_copy.
8759 {
8760 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
8761 Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
8762 initial_mj_gnos_.size()); // initial_mj_gnos_ is const mj_gno_t *
8763 Kokkos::deep_copy(sent_gnos, initial_mj_gnos_);
8764
8765 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos (
8766 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
8767 num_incoming_gnos);
8768
8769 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
8770
8771 result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8772 Kokkos::ViewAllocateWithoutInitializing("result_initial_mj_gnos_"),
8773 num_incoming_gnos);
8774 Kokkos::deep_copy(result_initial_mj_gnos_, received_gnos);
8775 }
8776
8777 // migrate coordinates
8778 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8779
8780 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
8781 host_src_coordinates(
8782 Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
8783 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
8784
8785 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8786
8787 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8788 Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
8789 num_incoming_gnos, this->coord_dim);
8790
8791 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
8792 Kokkos::ViewAllocateWithoutInitializing("received_coord"),
8793 num_incoming_gnos);
8794
8795 for(int i = 0; i < this->coord_dim; ++i) {
8796
8797 auto sent_coord = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8798
8799 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
8800
8801 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
8802 received_coord);
8803 Kokkos::fence();
8804 }
8805 result_mj_coordinates_ = dst_coordinates;
8806
8807 // migrate weights.
8808
8809 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8810 Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
8811 num_incoming_gnos, this->num_weights_per_coord);
8812 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8813
8814 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
8815 Kokkos::HostSpace(), this->mj_weights);
8816
8817 // contiguous buffers to gather potentially strided data
8818 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
8819 Kokkos::ViewAllocateWithoutInitializing("send_weight_buffer"),
8820 this->num_local_coords);
8821
8822 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
8823 Kokkos::ViewAllocateWithoutInitializing("received_weight_buffer"),
8824 num_incoming_gnos);
8825
8826 for(int i = 0; i < this->num_weights_per_coord; ++i) {
8827
8828 auto sub_host_src_weights
8829 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8830 auto sub_host_dst_weights
8831 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8832
8833 // Layout Right means these weights are not contiguous
8834 // However we don't have any systems setup with more than 1 weight so
8835 // really I have not tested any of this code with num weights > 1.
8836 // I think this is the right thing to do. Note that there are other
8837 // places in the code which don't handle the possibility of more weights.
8838 // So evaluating all that and adding tests would be another project.
8839 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8840 sent_weight[n] = sub_host_src_weights(n);
8841 }
8842
8843 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
8844
8845 // Again we copy by index due to layout
8846 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8847 sub_host_dst_weights(n) = received_weight[n];
8848 }
8849 }
8850 Kokkos::deep_copy(dst_weights, host_dst_weights);
8851 result_mj_weights_ = dst_weights;
8852
8853 // migrate the owners of the coordinates
8854 {
8855 Kokkos::View<int*, Kokkos::HostSpace> sent_owners(
8856 Kokkos::ViewAllocateWithoutInitializing("sent_owners"),
8857 num_local_coords_);
8858 Kokkos::deep_copy(sent_owners, myRank);
8859
8860 Kokkos::View<int*, Kokkos::HostSpace> received_owners(
8861 Kokkos::ViewAllocateWithoutInitializing("received_owners"),
8862 num_incoming_gnos);
8863
8864 distributor.doPostsAndWaits(sent_owners, 1, received_owners);
8865
8866 result_actual_owner_rank_ = new int[num_incoming_gnos];
8867 memcpy(
8868 result_actual_owner_rank_,
8869 received_owners.data(),
8870 num_incoming_gnos * sizeof(int));
8871 }
8872
8873 mj_env_->timerStop(MACRO_TIMERS,
8874 timer_base_string + "PreMigration DistributorMigration");
8875 return am_i_a_receiver;
8876}
8877
// Runs one MultiJagged partitioning for the given solution object:
//  1. loads ids/coordinates/weights from the adapter and reads the parameters,
//  2. optionally gathers all data onto a subset of ranks (premigration),
//  3. runs mj_partitioner.multi_jagged_part() on the ranks of that subset,
//  4. reorders the part ids into the order of the input ids and, if
//     premigration happened, sends them back to the original owners,
//  5. hands the part ids to solution->setParts().
// Throws std::logic_error when premigration is requested together with
// mj_keep_part_boxes, or when the post-migration receive count does not match
// the local coordinate count.
8885template <typename Adapter>
8887 const RCP<PartitioningSolution<Adapter> > &solution)
8888{
8889 // purpose of this code is to validate node and UVM status for the tests
8890 // std::cout << "Memory Space: " << mj_node_t::memory_space::name() << " "
8891 // << "Execution Space: " << mj_node_t::execution_space::name()
8892 // << std::endl;
8893
 // the counter is embedded in every timer name of this call
8894 int execute_counter =
8896 timer_base_string = "partition(" + std::to_string(execute_counter) + ") - ";
8897
8898 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "all");
8899 {
8900 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "setup");
8901
8902 this->set_up_partitioning_data(solution);
8903
8904 this->set_input_parameters(this->mj_env->getParameters());
8905 if(this->mj_keep_part_boxes) {
8906 this->mj_partitioner.set_to_keep_part_boxes();
8907 }
8908
8909 this->mj_partitioner.set_partitioning_parameters(
8910 this->distribute_points_on_cut_lines,
8911 this->max_concurrent_part_calculation,
8912 this->check_migrate_avoid_migration_option,
8913 this->minimum_migration_imbalance, this->migration_type);
8914
 // The result_* variables describe the data MJ actually runs on. They start
 // out as the local input and are replaced if premigration happens.
8915 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8916 mj_lno_t result_num_local_coords = this->num_local_coords;
8917 Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8918 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8919 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8920 result_mj_coordinates = this->mj_coordinates;
8921 Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8922 this->mj_weights;
 // NOTE(review): raw array allocated by mj_premigrate_to_subset() and
 // released near the end of this function; it is not released if an
 // exception is thrown in between.
8923 int *result_actual_owner_rank = NULL;
8924
 // const view of the ids that is passed to MJ (input ids unless premigrated)
8925 Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8926 this->initial_mj_gnos;
8927
8928 // TODO: MD 08/2017: Further discussion is required.
8929 // MueLu calls MJ when it has very few coordinates per processors,
8930 // such as 10. For example, it begins with 1K processor with 1K coordinate
8931 // in each. Then with coarsening this reduces to 10 coordinate per processor.
8932 // It calls MJ to repartition these to 10 coordinates.
8933 // MJ runs with 1K processor, 10 coordinate in each, and partitions to
8934 // 10 parts. As expected strong scaling is problem here, because
8935 // computation is almost 0, and communication cost of MJ linearly increases.
8936 // Premigration option gathers the coordinates to 10 parts before MJ starts
8937 // therefore MJ will run with a smaller subset of the problem.
8938 // Below, I am migrating the coordinates if mj_premigration_option is set,
8939 // and the result parts are less than the current part count, and the
8940 // average number of local coordinates is less than some threshold.
8941 // For example, premigration may not help if 1000 processors are
8942 // partitioning data to 10, but each of them already have 1M coordinate.
8943 // In that case, premigration would not help.
8944 int current_world_size = this->mj_problemComm->getSize();
8945 mj_lno_t threshold_num_local_coords =
8946 this->min_coord_per_rank_for_premigration;
8947 bool is_pre_migrated = false;
8948 bool am_i_in_subset = true;
8949
8950 // Note that we need to add testing for migration and should also cover the
8951 // zoltan case when ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION is defined.
8952 // Currently did a minimal test of this code by running mjTest with
8953 // PM=1, TB=0 then run again with C=3 instead of C=4 (numProcs is 4).
 // NOTE(review): the product below is evaluated before the conversion to
 // mj_gno_t; if mj_lno_t is a 32-bit type it can overflow for large rank
 // counts (e.g. 100000 ranks * 32000) - confirm and widen before multiplying.
8954 if(mj_premigration_option > 0 &&
8955 size_t (current_world_size) > this->num_global_parts &&
8956 this->num_global_coords < mj_gno_t (
8957 current_world_size * threshold_num_local_coords))
8958 {
8959 if(this->mj_keep_part_boxes) {
8960 throw std::logic_error("Multijagged: mj_keep_part_boxes and "
8961 "mj_premigration_option are not supported together yet.");
8962 }
8963
8964 is_pre_migrated =true;
 // clamp the selection option so that option * parts <= world size
8965 int migration_selection_option = mj_premigration_option;
8966 if(migration_selection_option * this->num_global_parts >
8967 (size_t) (current_world_size)) {
8968 migration_selection_option =
8969 current_world_size / this->num_global_parts;
8970 }
8971
 // number of receiving ranks: global coordinate count divided by the
 // per-rank threshold, rounded to nearest, at least 1
8972 int used_num_ranks = int (this->num_global_coords /
8973 float (threshold_num_local_coords) + 0.5);
8974
8975 if(used_num_ranks == 0) {
8976 used_num_ranks = 1;
8977 }
8978
8979 am_i_in_subset = this->mj_premigrate_to_subset(
8980 used_num_ranks,
8981 migration_selection_option,
8982 this->mj_env,
8983 this->mj_problemComm,
8984 this->coord_dim,
8985 this->num_local_coords,
8986 this->num_global_coords,
8987 this->num_global_parts,
8988 this->initial_mj_gnos,
8989 this->mj_coordinates,
8990 this->num_weights_per_coord,
8991 this->mj_weights,
8992 //results
8993 result_problemComm,
8994 result_num_local_coords,
8995 result_initial_mj_gnos,
8996 result_mj_coordinates,
8997 result_mj_weights,
8998 result_actual_owner_rank);
8999
9000 result_initial_mj_gnos_ = result_initial_mj_gnos;
9001 }
9002
 // outputs of MJ: part id and global id per coordinate, in MJ's own order
9003 Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9004 Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9005
9006 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "setup");
9007
 // ranks outside the receiving subset skip the partitioner entirely
9008 if(am_i_in_subset) {
9009 this->mj_partitioner.multi_jagged_part(
9010 this->mj_env,
9011 result_problemComm, //this->mj_problemComm,
9012 this->imbalance_tolerance,
9013 this->num_teams,
9014 this->num_global_parts,
9015 this->part_no_array,
9016 this->recursion_depth,
9017 this->coord_dim,
9018 result_num_local_coords, //this->num_local_coords,
9019 this->num_global_coords,
9020 result_initial_mj_gnos_,
9021 result_mj_coordinates,
9022 this->num_weights_per_coord,
9023 this->mj_uniform_weights,
9024 result_mj_weights,
9025 this->mj_uniform_parts,
9026 result_assigned_part_ids,
9027 result_mj_gnos
9028 );
9029 }
9030
9031 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "cleanup");
9032
9033 // Reorder results so that they match the order of the input
 // map: global id -> position in the id list that was given to MJ
9034 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9035 localGidToLid.reserve(result_num_local_coords);
9036 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9037 Kokkos::ViewAllocateWithoutInitializing("host_result_initial_mj_gnos"),
9038 result_initial_mj_gnos_.size());
9039 Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9040 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9041 localGidToLid[host_result_initial_mj_gnos(i)] = i;
9042 }
9043
 // partId[k] is the part of the k-th id in the list that was given to MJ
9044 ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
9045 0, result_num_local_coords, true);
9046 auto host_result_assigned_part_ids =
9047 Kokkos::create_mirror_view(result_assigned_part_ids);
9048 Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9049 auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9050 Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9051 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9052 mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9053 partId[origLID] = host_result_assigned_part_ids(i);
9054 }
9055
9056 //now the results are reordered. but if premigration occurred,
9057 //then we need to send these ids to actual owners again.
9058 if(is_pre_migrated) {
9059 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9060 "PostMigration DistributorPlanCreating");
9061 Tpetra::Distributor distributor(this->mj_problemComm);
9062
 // each gathered coordinate is sent back to the rank recorded as its owner
9063 ArrayView<const mj_part_t> actual_owner_destinations(
9064 result_actual_owner_rank , result_num_local_coords);
9065
9066 mj_lno_t num_incoming_gnos = distributor.createFromSends(
9067 actual_owner_destinations);
9068
 // every rank must get back exactly the coordinates it originally owned
9069 if(num_incoming_gnos != this->num_local_coords) {
9070 throw std::logic_error("Zoltan2 - Multijagged Post Migration - "
9071 "num incoming is not equal to num local coords");
9072 }
9073
9074 mj_env->timerStop(MACRO_TIMERS, timer_base_string +
9075 "PostMigration DistributorPlanCreating");
9076 mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9077 "PostMigration DistributorMigration");
9078
9079 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
9080 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
9081 num_incoming_gnos);
9082 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
9083 Kokkos::ViewAllocateWithoutInitializing("received_partids"),
9084 num_incoming_gnos);
9085
9086 distributor.doPostsAndWaits(host_result_initial_mj_gnos, 1,
9087 received_gnos);
9088 {
 // wrap partId's storage without copying; left empty when partId is empty
9089 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partnos;
9090 if (partId.size() > 0) {
9091 sent_partnos = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9092 partId.getRawPtr(), partId.size()); //unmanaged
9093 }
9094 distributor.doPostsAndWaits(sent_partnos, 1, received_partids);
9095 }
9096
 // from here on partId is sized and ordered like the original local input
9097 partId = arcp(new mj_part_t[this->num_local_coords],
9098 0, this->num_local_coords, true);
9099
9100 {
 // map: global id -> position in the original local input
9101 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9102 localGidToLid2.reserve(this->num_local_coords);
9103 auto host_initial_mj_gnos =
9104 Kokkos::create_mirror_view(this->initial_mj_gnos);
9105 Kokkos::deep_copy(host_initial_mj_gnos,
9106 this->initial_mj_gnos);
9107 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9108 localGidToLid2[host_initial_mj_gnos(i)] = i;
9109 }
9110
9111 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9112 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9113 partId[origLID] = received_partids[i];
9114 }
9115 }
9116
9117 {
9118 delete [] result_actual_owner_rank;
9119 }
9120 mj_env->timerStop(MACRO_TIMERS,
9121 timer_base_string + "PostMigration DistributorMigration");
9122 }
9123 solution->setParts(partId);
9124 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "cleanup");
9125 }
9126
9127 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "all");
9128
9129 // reset the view (release the reference to device data)
9130 this->mj_coordinates = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>();
9131}
9132
9133/* \brief Sets the partitioning data for the multijagged algorithm.
 * Builds a CoordinateModel from the adapter and fills the members used by
 * partition(): coordinate dimension, weight count, local/global coordinate
 * counts, target part count, global ids, coordinates and weights.
 * \param solution: provides the target number of parts and whether the part
 * sizes are uniform. Non-uniform part sizes are not supported: an error is
 * printed and std::terminate() is called.
9134 * */
9135template <typename Adapter>
9137 const RCP<PartitioningSolution<Adapter> > &solution
9138)
9139{
9140 modelFlag_t flags;
9141 CoordinateModel<Adapter> mj_coords(mj_adapter, mj_env, mj_problemComm, flags);
9142
9143 this->coord_dim = mj_coords.getCoordinateDim();
9144 this->num_weights_per_coord = mj_coords.getNumWeightsPerCoordinate();
9145 this->num_local_coords = mj_coords.getLocalNumCoordinates();
9146 this->num_global_coords = mj_coords.getGlobalNumCoordinates();
9147
 // at least one criterion, even when the adapter provides no weights
9148 int criteria_dim = (this->num_weights_per_coord ?
9149 this->num_weights_per_coord : 1);
9150 // From the Solution we get part information.
9151 // If the part sizes for a given criteria are not uniform,
9152 // then they are values that sum to 1.0.
9153 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9154 // allocate the per-criterion flag arrays (one entry per criterion);
9155 // the coordinate and weight data itself is obtained from the model below.
9156 this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9157 "uniform parts", criteria_dim);
9158 this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9159 "uniform weights", criteria_dim);
9160
 // views in the adapter's scalar type, filled by the coordinate model
9161 Kokkos::View<const mj_gno_t *, device_t> gnos;
9162 Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9163 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9164 Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9165 mj_coords.getCoordinatesKokkos(gnos, xyz_adapter, wgts_adapter);
 // views in MJ's scalar type
9166 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9167 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9168 Kokkos::View<mj_scalar_t **, device_t> wgts;
9169
9170 // Now we must get the data from the adapter.
9171 // If the types match we point to the view but if not, we must copy.
9172 if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9173 // we can just point the views but we must specialize because this code
9174 // only compiles in this case - for is_same false assign does nothing.
9175 assign_if_same(xyz, xyz_adapter);
9176 assign_if_same(wgts, wgts_adapter);
9177 }
9178 else {
9179 // we only allocate a new view if we are going to copy
9180 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9181 xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9182 (Kokkos::ViewAllocateWithoutInitializing(
9183 "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9184 wgts = Kokkos::View<mj_scalar_t **, device_t>(
9185 Kokkos::ViewAllocateWithoutInitializing("wgts"),
9186 wgts_adapter.extent(0), wgts_adapter.extent(1));
9187
 // element-wise conversion from adapter_scalar_t to mj_scalar_t, one
 // parallel iteration per row
9188 typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9189 Kokkos::parallel_for(
9190 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9191 (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (int i) {
9192 for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9193 xyz(i, n) = static_cast<mj_scalar_t>(xyz_adapter(i, n));
9194 }
9195 });
9196 Kokkos::parallel_for(
9197 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9198 (0, wgts.extent(0)), KOKKOS_LAMBDA (int i) {
9199 for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9200 wgts(i, n) = static_cast<mj_scalar_t>(wgts_adapter(i, n));
9201 }
9202 });
9203 }
9204
9205 // obtain global ids.
9206 this->initial_mj_gnos = gnos;
9207 // store the coordinates (assigned or converted above).
9208 this->mj_coordinates = xyz;
9209 // if no weights are provided set uniform weight.
9210
9211 if(this->num_weights_per_coord == 0) {
9212 this->mj_uniform_weights(0) = true;
9213 Kokkos::resize(this->mj_weights, 0, 0);
9214 }
9215 else{
9216 this->mj_weights = wgts;
9217 for(int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9218 this->mj_uniform_weights(wdim) = false;
9219 }
9220 }
9221
 // every criterion must have uniform target part sizes; anything else aborts
9222 for(int wdim = 0; wdim < criteria_dim; ++wdim) {
9223 if(solution->criteriaHasUniformPartSizes(wdim)) {
9224 this->mj_uniform_parts(wdim) = true;
9225 }
9226 else {
9227 printf("Error: MJ does not support non uniform target part weights\n");
9228 std::terminate();
9229 }
9230 }
9231}
9232
9233/* \brief Sets the partitioning parameters for multijagged algorithm.
9234 * \param pl: is the parameter list provided to zoltan2 call
9235 * */
9236template <typename Adapter>
9238 const Teuchos::ParameterList &pl)
9239{
9240 const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
9241 if(pe) {
9242 double tol;
9243 tol = pe->getValue(&tol);
9244 this->imbalance_tolerance = tol - 1.0;
9245 }
9246
9247 // TODO: May be a more relaxed tolerance is needed. RCB uses 10%
9248 if(this->imbalance_tolerance <= 0) {
9249 this->imbalance_tolerance= 10e-4;
9250 }
9251
9252 // if an input partitioning array is provided.
9253 Kokkos::resize(this->part_no_array, 0);
9254
9255 // the length of the input partitioning array.
9256 this->recursion_depth = 0;
9257
9258 if(pl.getPtr<int>("mj_num_teams")) {
9259 this->num_teams = pl.get<int>("mj_num_teams");
9260 }
9261
9262 if(pl.getPtr<Array <mj_part_t> >("mj_parts")) {
9263 auto mj_parts = pl.get<Array <mj_part_t> >("mj_parts");
9264 int mj_parts_size = static_cast<int>(mj_parts.size());
9265
9266 // build the view we'll have data on and copy values from host
9267 this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9268 "part_no_array", mj_parts_size);
9269 for(int i = 0; i < mj_parts_size; ++i) {
9270 this->part_no_array(i) = mj_parts.getRawPtr()[i];
9271 }
9272
9273 this->recursion_depth = mj_parts_size - 1;
9274 this->mj_env->debug(2, "mj_parts provided by user");
9275 }
9276
9277 // get mj specific parameters.
9278 this->distribute_points_on_cut_lines = true;
9279 this->max_concurrent_part_calculation = 1;
9280
9281 this->mj_run_as_rcb = false;
9282 this->mj_premigration_option = 0;
9283 this->min_coord_per_rank_for_premigration = 32000;
9284
9285 int mj_user_recursion_depth = -1;
9286 this->mj_keep_part_boxes = false;
9287 this->check_migrate_avoid_migration_option = 0;
9288 this->migration_type = 0;
9289 this->minimum_migration_imbalance = 0.35;
9290
9291 pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
9292 if(pe) {
9293 double imb;
9294 imb = pe->getValue(&imb);
9295 this->minimum_migration_imbalance = imb - 1.0;
9296 }
9297
9298 pe = pl.getEntryPtr("mj_migration_option");
9299 if(pe) {
9300 this->check_migrate_avoid_migration_option =
9301 pe->getValue(&this->check_migrate_avoid_migration_option);
9302 } else {
9303 this->check_migrate_avoid_migration_option = 0;
9304 }
9305 if(this->check_migrate_avoid_migration_option > 1) {
9306 this->check_migrate_avoid_migration_option = -1;
9307 }
9308
9310 pe = pl.getEntryPtr("mj_migration_type");
9311 if(pe) {
9312 this->migration_type = pe->getValue(&this->migration_type);
9313 } else {
9314 this->migration_type = 0;
9315 }
9316
9317 //std::cout << " this->migration_type:" << this->migration_type << std::endl;
9319
9320 pe = pl.getEntryPtr("mj_concurrent_part_count");
9321 if(pe) {
9322 this->max_concurrent_part_calculation =
9323 pe->getValue(&this->max_concurrent_part_calculation);
9324 } else {
9325 this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
9326 }
9327
9328 pe = pl.getEntryPtr("mj_keep_part_boxes");
9329 if(pe) {
9330 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9331 } else {
9332 this->mj_keep_part_boxes = false; // Set to invalid value
9333 }
9334
9335 // For now, need keep_part_boxes to do pointAssign and boxAssign.
9336 // pe = pl.getEntryPtr("keep_cuts");
9337 // if(pe) {
9338 // int tmp = pe->getValue(&tmp);
9339 // if(tmp) this->mj_keep_part_boxes = true;
9340 // }
9341
9342 //need to keep part boxes if mapping type is geometric.
9343 if(this->mj_keep_part_boxes == false) {
9344 pe = pl.getEntryPtr("mapping_type");
9345 if(pe) {
9346 int mapping_type = -1;
9347 mapping_type = pe->getValue(&mapping_type);
9348 if(mapping_type == 0) {
9349 mj_keep_part_boxes = true;
9350 }
9351 }
9352 }
9353
9354 // need to keep part boxes if mapping type is geometric.
9355 pe = pl.getEntryPtr("mj_enable_rcb");
9356 if(pe) {
9357 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9358 } else {
9359 this->mj_run_as_rcb = false; // Set to invalid value
9360 }
9361
9362 pe = pl.getEntryPtr("mj_premigration_option");
9363 if(pe) {
9364 mj_premigration_option = pe->getValue(&mj_premigration_option);
9365 } else {
9366 mj_premigration_option = 0;
9367 }
9368
9369 pe = pl.getEntryPtr("mj_premigration_coordinate_count");
9370 if(pe) {
9371 min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9372 } else {
9373 min_coord_per_rank_for_premigration = 32000;
9374 }
9375
9376 pe = pl.getEntryPtr("mj_recursion_depth");
9377 if(pe) {
9378 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9379 } else {
9380 mj_user_recursion_depth = -1; // Set to invalid value
9381 }
9382
9383 bool val = false;
9384 pe = pl.getEntryPtr("rectilinear");
9385 if(pe) {
9386 val = pe->getValue(&val);
9387 }
9388 if(val) {
9389 this->distribute_points_on_cut_lines = false;
9390 } else {
9391 this->distribute_points_on_cut_lines = true;
9392 }
9393
9394 if(this->mj_run_as_rcb) {
9395 mj_user_recursion_depth =
9396 (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9397 }
9398 if(this->recursion_depth < 1) {
9399 if(mj_user_recursion_depth > 0) {
9400 this->recursion_depth = mj_user_recursion_depth;
9401 }
9402 else {
9403 this->recursion_depth = this->coord_dim;
9404 }
9405 }
9406}
9407
9409template <typename Adapter>
9411 int dim,
9412 adapter_scalar_t *lower,
9413 adapter_scalar_t *upper,
9414 size_t &nPartsFound,
9415 typename Adapter::part_t **partsFound) const
9416{
9417 // TODO: Implement with cuts rather than boxes to reduce algorithmic
9418 // TODO: complexity. Or at least do a search through the boxes, using
9419 // TODO: p x q x r x ... if possible.
9420
9421 nPartsFound = 0;
9422 *partsFound = NULL;
9423
9424 if(this->mj_keep_part_boxes) {
9425
9426 // Get vector of part boxes
9427 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9428
9429 size_t nBoxes = (*partBoxes).size();
9430 if(nBoxes == 0) {
9431 throw std::logic_error("no part boxes exist");
9432 }
9433
9434 // Determine whether the box overlaps the globalBox at all
9435 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9436
9437 if(globalBox->boxesOverlap(dim, lower, upper)) {
9438
9439 std::vector<typename Adapter::part_t> partlist;
9440
9441 // box overlaps the global box; find specific overlapping boxes
9442 for(size_t i = 0; i < nBoxes; i++) {
9443 try {
9444 if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9445 nPartsFound++;
9446 partlist.push_back((*partBoxes)[i].getpId());
9447 /*
9448 std::cout << "Given box (";
9449 for(int j = 0; j < dim; j++)
9450 std::cout << lower[j] << " ";
9451 std::cout << ") x (";
9452 for(int j = 0; j < dim; j++)
9453 std::cout << upper[j] << " ";
9454 std::cout << ") overlaps PartBox "
9455 << (*partBoxes)[i].getpId() << " (";
9456 for(int j = 0; j < dim; j++)
9457 std::cout << (*partBoxes)[i].getlmins()[j] << " ";
9458 std::cout << ") x (";
9459 for(int j = 0; j < dim; j++)
9460 std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
9461 std::cout << ")" << std::endl;
9462 */
9463 }
9464 }
9466 }
9467 if(nPartsFound) {
9468 *partsFound = new mj_part_t[nPartsFound];
9469 for(size_t i = 0; i < nPartsFound; i++)
9470 (*partsFound)[i] = partlist[i];
9471 }
9472 }
9473 else {
9474 // Box does not overlap the domain at all. Find the closest part
9475 // Not sure how to perform this operation for MJ without having the
9476 // cuts. With the RCB cuts, the concept of a part extending to
9477 // infinity was natural. With the boxes, it is much more difficult.
9478 // TODO: For now, return information indicating NO OVERLAP.
9479 }
9480 }
9481 else {
9482 throw std::logic_error("need to use keep_cuts parameter for boxAssign");
9483 }
9484}
9485
9487template <typename Adapter>
9489 int dim,
9490 adapter_scalar_t *point) const
9491{
9492 // TODO: Implement with cuts rather than boxes to reduce algorithmic
9493 // TODO: complexity. Or at least do a search through the boxes, using
9494 // TODO: p x q x r x ... if possible.
9495
9496 if(this->mj_keep_part_boxes) {
9497 typename Adapter::part_t foundPart = -1;
9498
9499 // Get vector of part boxes
9500 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9501
9502 size_t nBoxes = (*partBoxes).size();
9503 if(nBoxes == 0) {
9504 throw std::logic_error("no part boxes exist");
9505 }
9506
9507 // Determine whether the point is within the global domain
9508 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9509
9510 if(globalBox->pointInBox(dim, point)) {
9511
9512 // point is in the global domain; determine in which part it is.
9513 size_t i;
9514 for(i = 0; i < nBoxes; i++) {
9515 try {
9516 if((*partBoxes)[i].pointInBox(dim, point)) {
9517 foundPart = (*partBoxes)[i].getpId();
9518 // std::cout << "Point (";
9519 // for(int j = 0; j < dim; j++) std::cout << point[j] << " ";
9520 // std::cout << ") found in box " << i << " part " << foundPart
9521 // << std::endl;
9522 // (*partBoxes)[i].print();
9523 break;
9524 }
9525 }
9527 }
9528
9529 if(i == nBoxes) {
9530 // This error should never occur
9531 std::ostringstream oss;
9532 oss << "Point (";
9533 for(int j = 0; j < dim; j++) oss << point[j] << " ";
9534 oss << ") not found in domain";
9535 throw std::logic_error(oss.str());
9536 }
9537 }
9538
9539 else {
9540 // Point is outside the global domain.
9541 // Determine to which part it is closest.
9542 // TODO: with cuts, would not need this special case
9543
9544 typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
9545 size_t closestBox = 0;
9546 coord_t minDistance = std::numeric_limits<coord_t>::max();
9547 coord_t *centroid = new coord_t[dim];
9548 for(size_t i = 0; i < nBoxes; i++) {
9549 (*partBoxes)[i].computeCentroid(centroid);
9550 coord_t sum = 0.;
9551 coord_t diff;
9552 for(int j = 0; j < dim; j++) {
9553 diff = centroid[j] - point[j];
9554 sum += diff * diff;
9555 }
9556 if(sum < minDistance) {
9557 minDistance = sum;
9558 closestBox = i;
9559 }
9560 }
9561 foundPart = (*partBoxes)[closestBox].getpId();
9562 delete [] centroid;
9563 }
9564
9565 return foundPart;
9566 }
9567 else {
9568 throw std::logic_error("need to use keep_cuts parameter for pointAssign");
9569 }
9570}
9571
9572template <typename Adapter>
9574 const PartitioningSolution<Adapter> *solution,
9575 ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
9576 ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
9577{
9578 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9579 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9580 mj_part_t ntasks = (*pBoxes).size();
9581 int dim = (*pBoxes)[0].getDim();
9582 GridHash grid(pBoxes, ntasks, dim);
9583 grid.getAdjArrays(comXAdj_, comAdj_);
9584 }
9585 comAdj = comAdj_;
9586 comXAdj = comXAdj_;
9587}
9588
9589template <typename Adapter>
9590RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9592{
9593 return this->mj_partitioner.get_kept_boxes();
9594}
9595} // namespace Zoltan2
9596
9597#endif
typename node_t::device_type device_t
Defines the CoordinateModel classes.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
Define IntegerRangeList validator.
Contains Teuchos reduction operators for the Multi-jagged algorithm.
Defines Parameter related enumerators, declares functions.
A gathering of useful namespace methods.
Zoltan2_BoxBoundaries is a reduction operation that all-reduces all of the box boundaries.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Multi Jagged coordinate partitioning algorithm.
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
Algorithm defines the base class for all algorithms.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
global_size_t getGlobalNumCoordinates() const
Returns the global number coordinates.
size_t getCoordinatesKokkos(Kokkos::View< const gno_t *, typename node_t::device_type > &Ids, Kokkos::View< scalar_t **, Kokkos::LayoutLeft, typename node_t::device_type > &xyz, Kokkos::View< scalar_t **, typename node_t::device_type > &wgts) const
Returns the coordinate ids, values and optional weights.
int getCoordinateDim() const
Returns the dimension of the coordinates.
size_t getLocalNumCoordinates() const
Returns the number of coordinates on this process.
int getNumWeightsPerCoordinate() const
Returns the number (0 or greater) of weights per coordinate.
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
GridHash Class, Hashing Class for part boxes.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
A ParameterList validator for integer range lists.
A PartitioningSolution is a solution to a partitioning problem.
Multi Jagged coordinate partitioning algorithm.
void set_up_partitioning_data(const RCP< PartitioningSolution< Adapter > > &solution)
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const typename Adapter::base_adapter_t > &adapter)
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
Class for sorting items with multiple values. First sorting with respect to val[0],...
void set(IT index_, CT count_, WT *vals_)
bool operator<(const uMultiSortItem< IT, CT, WT > &other) const
uMultiSortItem(IT index_, CT count_, WT *vals_)
Created by mbenlioglu on Aug 31, 2020.
Tpetra::global_size_t global_size_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
@ MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals....
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals.
#define epsilon
Definition nd.cpp:47
static RCP< tMVector_t > coordinates
SparseMatrixAdapter_t::part_t part_t
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< part_t *, device_t > parts
Kokkos::View< scalar_t * > scalar_view_t
Kokkos::View< index_t *, device_t > part_xadj
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
Kokkos::View< index_t *, device_t > track_on_cuts
Kokkos::View< scalar_t *, device_t > coordinates
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > cut_coordinates
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< scalar_t **, device_t > weights
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > coordinates
Kokkos::View< part_t *, device_t > parts
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > part_xadj
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< scalar_t * > scalar_view_t
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
bool operator<=(const uSignedSortItem< IT, WT, SIGN > &rhs)
bool operator<(const uSignedSortItem< IT, WT, SIGN > &rhs) const
Sort items for quick sort function.