14#ifndef _ZOLTAN2_ALGMultiJagged_HPP_
15#define _ZOLTAN2_ALGMultiJagged_HPP_
24#include <Tpetra_Distributor.hpp>
25#include <Teuchos_StandardParameterEntryValidators.hpp>
26#include <Teuchos_ParameterList.hpp>
27#include <Kokkos_Sort.hpp>
31#include <unordered_map>
33#ifdef ZOLTAN2_USEZOLTANCOMM
34#ifdef HAVE_ZOLTAN2_MPI
35#define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
36#include "zoltan_comm_cpp.h"
37#include "zoltan_types.h"
46template <
typename Ordinal,
typename T>
70 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const {
71 for(Ordinal i = 0; i < count; i++) {
73 inoutBuffer[i] = inBuffer[i];
89template <
typename IT,
typename CT,
typename WT>
104 this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
109 this->index = index_;
110 this->count = count_;
112 this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
118 void set(IT index_ ,CT count_, WT *vals_) {
119 this->index = index_;
120 this->count = count_;
125 assert(this->count == other.
count);
126 for(CT i = 0; i < this->
count; ++i) {
128 if(std::abs(this->val[i] - other.
val[i]) < this->epsilon) {
132 if(this->val[i] < other.
val[i]) {
141 return this->index < other.
index;
147template <
class IT,
class WT>
158template <
class IT,
class WT>
160 const int NSTACK = 50;
162 IT i, ir=n, j, k, l=1;
163 IT jstack=0, istack[NSTACK];
170 for(j=l+1;j<=ir;j++) {
173 for(i=j-1;i>=1;i--) {
174 if(arr[i].val <= aval)
187 std::swap(arr[k],arr[l+1]);
188 if(arr[l+1].val > arr[ir].val) {
189 std::swap(arr[l+1],arr[ir]);
191 if(arr[l].val > arr[ir].val) {
192 std::swap(arr[l],arr[ir]);
194 if(arr[l+1].val > arr[l].val) {
195 std::swap(arr[l+1],arr[l]);
202 do i++;
while (arr[i].val < aval);
203 do j--;
while (arr[j].val > aval);
205 std::swap(arr[i],arr[j]);
210 if(jstack > NSTACK) {
211 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
228template <
class IT,
class WT,
class SIGN>
236 if(this->signbit < rhs.
signbit) {
240 else if(this->signbit == rhs.
signbit) {
241 if(this->val < rhs.
val) {
245 else if(this->val > rhs.
val) {
260 return (this->val == rhs.
val && this->signbit == rhs.
signbit) || (*
this < rhs);
267template <
class IT,
class WT,
class SIGN>
269 const IT NSTACK = 50;
271 IT i, ir=n, j, k, l=1;
272 IT jstack=0, istack[NSTACK];
278 for(j=l+1;j<=ir;j++) {
280 for(i=j-1;i>=1;i--) {
296 std::swap(arr[k],arr[l+1]);
297 if(arr[ir] < arr[l+1]) {
298 std::swap(arr[l+1],arr[ir]);
300 if(arr[ir] < arr[l] ) {
301 std::swap(arr[l],arr[ir]);
303 if(arr[l] < arr[l+1]) {
304 std::swap(arr[l+1],arr[l]);
310 do i++;
while (arr[i] < a);
311 do j--;
while (a < arr[j]);
313 std::swap(arr[i],arr[j]);
318 if(jstack > NSTACK) {
319 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
355 static int counter = 0;
359 static int counter = 0;
366template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
367 typename mj_part_t,
typename mj_node_t>
371 typedef typename mj_node_t::device_type device_t;
373 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
378 static constexpr size_t future_reduceall_cutoff = 1500000;
382 static constexpr mj_lno_t min_work_last_dim = 1000;
384 static constexpr mj_scalar_t least_signifiance = 0.0001;
385 static constexpr int significance_mul = 1000;
387 std::string mj_timer_base_string;
389 RCP<const Environment> mj_env;
390 RCP<const Comm<int> > mj_problemComm;
391 RCP<Comm<int> > comm;
392 double imbalance_tolerance;
395 int num_weights_per_coord;
396 size_t initial_num_loc_coords;
398 mj_lno_t num_local_coords;
399 mj_gno_t num_global_coords;
400 mj_scalar_t sEpsilon;
403 bool distribute_points_on_cut_lines;
406 mj_part_t max_concurrent_part_calculation;
409 int mj_user_recursion_depth;
410 bool mj_keep_part_boxes;
413 int check_migrate_avoid_migration_option;
420 double minimum_migration_imbalance;
437 mj_part_t num_first_level_parts;
441 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
443 mj_part_t total_num_cut ;
444 mj_part_t total_num_part;
446 mj_part_t max_num_part_along_dim ;
447 mj_part_t max_num_cut_along_dim;
450 size_t max_num_total_part_along_dim;
452 mj_part_t total_dim_num_reduce_all;
456 mj_part_t last_dim_num_part;
459 Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
463 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
467 Kokkos::View<mj_scalar_t **, device_t> mj_weights;
470 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
473 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
477 size_t num_global_parts;
480 RCP<mj_partBoxVector_t> kept_boxes;
482 RCP<mj_partBox_t> global_box;
487 bool divide_to_prime_first;
490 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
493 Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
496 Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
499 Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
502 Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
505 Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
508 Kokkos::View<mj_lno_t *, device_t> part_xadj;
511 Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
513 Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
516 Kokkos::View<mj_scalar_t *, device_t>
517 process_cut_line_weight_to_put_left;
520 Kokkos::View<mj_scalar_t *, device_t>
521 thread_cut_line_weight_to_put_left;
527 Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
530 Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
533 Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
536 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
539 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
542 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
545 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
549 Kokkos::View<mj_scalar_t *, device_t>
550 process_local_min_max_coord_total_weight;
553 Kokkos::View<mj_scalar_t *, device_t>
554 global_min_max_coord_total_weight;
558 Kokkos::View<bool *, device_t> is_cut_line_determined;
564 Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
565 typename decltype(device_incomplete_cut_count)::host_mirror_type
566 incomplete_cut_count;
569 typename decltype (part_xadj)::host_mirror_type host_part_xadj;
572 Kokkos::View<double *, device_t>
576 Kokkos::View<double *, device_t>
577 thread_part_weight_work;
581 Kokkos::View<mj_scalar_t *, device_t>
582 thread_cut_left_closest_point;
586 Kokkos::View<mj_scalar_t *, device_t>
587 thread_cut_right_closest_point;
590 Kokkos::View<mj_lno_t *, device_t>
593 Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
594 Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
600 Kokkos::View<mj_scalar_t *, device_t>
601 total_part_weight_left_right_closests;
602 Kokkos::View<mj_scalar_t *, device_t>
603 global_total_part_weight_left_right_closests;
605 Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
606 typename decltype(device_num_partitioning_in_current_dim)::host_mirror_type
607 host_num_partitioning_in_current_dim;
614 KOKKOS_INLINE_FUNCTION
615 double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
616 return static_cast<double>(achieved) /
static_cast<double>(expected) - 1.0;
625 void set_part_specifications();
632 inline mj_part_t get_part_count(
633 mj_part_t num_total_future,
642 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
663 mj_part_t update_part_num_arrays(
664 std::vector<mj_part_t> *future_num_part_in_parts,
665 std::vector<mj_part_t> *next_future_num_parts_in_parts,
666 mj_part_t &future_num_parts,
667 mj_part_t current_num_parts,
668 int current_iteration,
669 RCP<mj_partBoxVector_t> input_part_boxes,
670 RCP<mj_partBoxVector_t> output_part_boxes,
671 mj_part_t atomic_part_count);
685 KOKKOS_INLINE_FUNCTION
686 void mj_calculate_new_cut_position (
687 mj_scalar_t cut_upper_bound,
688 mj_scalar_t cut_lower_bound,
689 mj_scalar_t cut_upper_weight,
690 mj_scalar_t cut_lower_weight,
691 mj_scalar_t expected_weight,
692 mj_scalar_t &new_cut_position,
693 mj_scalar_t sEpsilon);
719 bool mj_perform_migration(
720 mj_part_t in_num_parts,
721 mj_part_t &out_num_parts,
722 std::vector<mj_part_t> *next_future_num_parts_in_parts,
723 mj_part_t &output_part_begin_index,
724 size_t migration_reduce_all_population,
725 mj_lno_t num_coords_for_last_dim_part,
726 std::string iteration,
727 RCP<mj_partBoxVector_t> &input_part_boxes,
728 RCP<mj_partBoxVector_t> &output_part_boxes);
747 bool mj_check_to_migrate(
748 size_t migration_reduce_all_population,
749 mj_lno_t num_coords_for_last_dim_part,
752 mj_gno_t *num_points_in_all_processor_parts);
778 void mj_migration_part_proc_assignment(
779 mj_gno_t * num_points_in_all_processor_parts,
782 mj_lno_t *send_count_to_each_proc,
783 std::vector<mj_part_t> &processor_ranks_for_subcomm,
784 std::vector<mj_part_t> *next_future_num_parts_in_parts,
785 mj_part_t &out_num_part,
786 std::vector<mj_part_t> &out_part_indices,
787 mj_part_t &output_part_numbering_begin_index,
788 int *coordinate_destinations);
815 void mj_assign_proc_to_parts(
816 mj_gno_t * num_points_in_all_processor_parts,
819 mj_lno_t *send_count_to_each_proc,
820 std::vector<mj_part_t> &processor_ranks_for_subcomm,
821 std::vector<mj_part_t> *next_future_num_parts_in_parts,
822 mj_part_t &out_part_index,
823 mj_part_t &output_part_numbering_begin_index,
824 int *coordinate_destinations);
841 void assign_send_destinations(
843 mj_part_t *part_assignment_proc_begin_indices,
844 mj_part_t *processor_chains_in_parts,
845 mj_lno_t *send_count_to_each_proc,
846 int *coordinate_destinations);
862 void assign_send_destinations2(
865 int *coordinate_destinations,
866 mj_part_t &output_part_numbering_begin_index,
867 std::vector<mj_part_t> *next_future_num_parts_in_parts);
891 void mj_assign_parts_to_procs(
892 mj_gno_t * num_points_in_all_processor_parts,
895 mj_lno_t *send_count_to_each_proc,
896 std::vector<mj_part_t> *next_future_num_parts_in_parts,
897 mj_part_t &out_num_part,
898 std::vector<mj_part_t> &out_part_indices,
899 mj_part_t &output_part_numbering_begin_index,
900 int *coordinate_destinations);
915 void mj_migrate_coords(
917 mj_lno_t &num_new_local_points,
918 std::string iteration,
919 int *coordinate_destinations,
920 mj_part_t num_parts);
927 void create_sub_communicator(
928 std::vector<mj_part_t> &processor_ranks_for_subcomm);
934 mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
935 mj_part_t largest_factor = 1;
936 mj_part_t n = num_parts;
937 mj_part_t divisor = 2;
939 while (n % divisor == 0) {
941 largest_factor = divisor;
944 if(divisor * divisor > n) {
951 return largest_factor;
984 const RCP<const Environment> &env,
985 RCP<
const Comm<int> > &problemComm,
986 double imbalance_tolerance,
988 size_t num_global_parts,
989 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
992 mj_lno_t num_local_coords,
993 mj_gno_t num_global_coords,
994 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
996 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
997 int num_weights_per_coord,
998 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
999 Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1000 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1001 Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1002 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1017 bool distribute_points_on_cut_lines_,
1018 int max_concurrent_part_calculation_,
1019 int check_migrate_avoid_migration_option_,
1020 double minimum_migration_imbalance_,
1021 int migration_type_ = 0);
1038 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1079 const RCP<const Environment> &env,
1080 mj_lno_t num_total_coords,
1081 mj_lno_t num_selected_coords,
1082 size_t num_target_part,
1085 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1086 Kokkos::View<mj_lno_t *, device_t> &
1087 initial_selected_coords_output_permutation,
1088 mj_lno_t *output_xadj,
1089 int recursion_depth_,
1090 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1091 bool partition_along_longest_dim,
1092 int num_ranks_per_node,
1093 bool divide_to_prime_first_,
1094 mj_part_t num_first_level_parts_ = 1,
1095 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1096 = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1098#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
1106 void allocate_set_work_memory();
1109 void compute_global_box();
1118 void mj_get_local_min_max_coord_totW(
1119 mj_part_t current_work_part,
1120 mj_part_t current_concurrent_num_parts,
1121 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);
1135 void mj_get_global_min_max_coord_totW(
1136 mj_part_t current_concurrent_num_parts,
1137 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
1138 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);
1170 void mj_get_initial_cut_coords_target_weights(
1171 mj_scalar_t min_coord,
1172 mj_scalar_t max_coord,
1173 mj_part_t num_cuts ,
1174 mj_scalar_t global_weight,
1175 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
1176 Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
1177 std::vector <mj_part_t> *future_num_part_in_parts,
1178 std::vector <mj_part_t> *next_future_num_parts_in_parts,
1179 mj_part_t concurrent_current_part,
1180 mj_part_t obtained_part_index,
1181 mj_part_t num_target_first_level_parts = 1,
1182 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
1183 Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1201 void set_initial_coordinate_parts(
1202 mj_scalar_t &max_coordinate,
1203 mj_scalar_t &min_coordinate,
1204 mj_lno_t coordinate_begin_index,
1205 mj_lno_t coordinate_end_index,
1206 Kokkos::View<mj_lno_t *, device_t> &
1207 mj_current_coordinate_permutations,
1208 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1209 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
1210 mj_part_t &partition_count);
1229 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1230 double imbalanceTolerance,
1231 mj_part_t current_work_part,
1232 mj_part_t current_concurrent_num_parts,
1233 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1234 mj_part_t total_incomplete_cut_count,
1235 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
1236 Kokkos::View<size_t*, device_t> & view_total_reduction_size);
1243 void mj_1D_part_get_part_weights(
1244 mj_part_t current_concurrent_num_parts,
1245 mj_part_t current_work_part,
1246 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1256 void mj_combine_rightleft_and_weights(
1257 mj_part_t current_work_part,
1258 mj_part_t current_concurrent_num_parts);
1272 void mj_create_new_partitions(
1273 mj_part_t num_parts,
1274 mj_part_t current_concurrent_work_part,
1275 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1276 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1277 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1278 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);
1315 void mj_get_new_cut_coordinates(
1316 mj_part_t current_concurrent_num_parts,
1318 const mj_part_t &num_cuts,
1319 const double &used_imbalance_tolerance,
1320 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
1321 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
1322 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
1323 Kokkos::View<bool *, device_t> & current_cut_line_determined,
1324 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1325 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
1326 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
1327 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
1328 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
1329 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
1330 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
1331 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
1332 Kokkos::View<mj_scalar_t *, device_t> &
1333 current_part_cut_line_weight_to_put_left,
1334 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);
1345 void get_processor_num_points_in_parts(
1346 mj_part_t num_procs,
1347 mj_part_t num_parts,
1348 mj_gno_t *&num_points_in_all_processor_parts);
1354 void fill_permutation_array(
1355 mj_part_t output_num_parts,
1356 mj_part_t num_parts);
1379 void create_consistent_chunks(
1380 mj_part_t num_parts,
1381 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1382 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1383 mj_lno_t coordinate_begin,
1384 mj_lno_t coordinate_end,
1385 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1386 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
1388 bool longest_dim_part,
1399 void set_final_parts(
1400 mj_part_t current_num_parts,
1401 mj_part_t output_part_begin_index,
1402 RCP<mj_partBoxVector_t> &output_part_boxes,
1403 bool is_data_ever_migrated);
1408template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1409 typename mj_part_t,
typename mj_node_t>
1411 mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
1412 recursion_depth(0), coord_dim(0),
1413 num_weights_per_coord(0), initial_num_loc_coords(0),
1414 initial_num_glob_coords(0),
1415 num_local_coords(0), num_global_coords(0),
1416 sEpsilon(std::numeric_limits<mj_scalar_t>::
epsilon() * 100),
1417 distribute_points_on_cut_lines(true),
1418 max_concurrent_part_calculation(1),
1419 mj_run_as_rcb(false), mj_user_recursion_depth(0),
1420 mj_keep_part_boxes(false),
1421 check_migrate_avoid_migration_option(0), migration_type(0),
1422 minimum_migration_imbalance(0.30),
1423 num_first_level_parts(1),
1424 total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1425 max_num_cut_along_dim(0),
1426 max_num_total_part_along_dim(0),
1427 total_dim_num_reduce_all(0),
1428 last_dim_num_part(0),
1430 num_global_parts(1),
1431 kept_boxes(), global_box(),
1432 myRank(0), myActualRank(0),
1433 divide_to_prime_first(false)
1480template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1481 typename mj_part_t,
typename mj_node_t>
1484 const RCP<const Environment> &env,
1485 mj_lno_t num_total_coords,
1486 mj_lno_t num_selected_coords,
1487 size_t num_target_part,
1490 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
1492 Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
1493 mj_lno_t *output_xadj,
1494 int recursion_depth_,
1495 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
1496 bool partition_along_longest_dim,
1497 int num_ranks_per_node,
1498 bool divide_to_prime_first_,
1499 mj_part_t num_first_level_parts_,
1500 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
1503 const RCP<Comm<int> > commN;
1504 this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1505 this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1506 this->myActualRank = this->myRank = 1;
1508 this->divide_to_prime_first = divide_to_prime_first_;
1513 this->imbalance_tolerance = 0;
1514 this->num_global_parts = num_target_part;
1515 this->part_no_array = part_no_array_;
1516 this->recursion_depth = recursion_depth_;
1520 this->num_first_level_parts = num_first_level_parts_;
1522 this->first_level_distribution = first_level_distribution_;
1524 this->coord_dim = coord_dim_;
1525 this->num_local_coords = num_total_coords;
1527 this->num_global_coords = num_total_coords;
1528 this->mj_coordinates = mj_coordinates_;
1531 this->initial_mj_gnos =
1532 Kokkos::View<mj_gno_t*, device_t>(
"gids", this->num_local_coords);
1534 this->num_weights_per_coord = 0;
1536 this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
1537 "uniform weights", 1);
1538 this->mj_uniform_weights(0) =
true;
1540 this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
1543 this->mj_uniform_parts =
1544 Kokkos::View<bool*, Kokkos::HostSpace>(
"uniform parts", 1);
1545 this->mj_uniform_parts(0) =
true;
1547 this->set_part_specifications();
1549 this->allocate_set_work_memory();
1552 auto local_part_xadj = this->part_xadj;
1553 Kokkos::parallel_for(
1554 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
1555 KOKKOS_LAMBDA (
int dummy) {
1556 local_part_xadj(0) =
static_cast<mj_lno_t
>(num_selected_coords);
1559 Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);
1561 mj_part_t current_num_parts = 1;
1563 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
1564 this->all_cut_coordinates;
1566 mj_part_t future_num_parts = this->total_num_part;
1568 std::vector<mj_part_t> *future_num_part_in_parts =
1569 new std::vector<mj_part_t>();
1570 std::vector<mj_part_t> *next_future_num_parts_in_parts =
1571 new std::vector<mj_part_t>();
1572 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1573 RCP<mj_partBoxVector_t> t1;
1574 RCP<mj_partBoxVector_t> t2;
1576 std::vector <uSignedSortItem<int, mj_scalar_t, char>>
1577 coord_dimension_range_sorted(this->coord_dim);
1579 &(coord_dimension_range_sorted[0]);
1580 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1581 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1585 Kokkos::View<mj_part_t*, device_t>
1586 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
1587 Kokkos::View<size_t*, device_t>
1588 view_total_reduction_size(
"view_total_reduction_size", 1);
1590 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
1596 std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
1597 future_num_part_in_parts = next_future_num_parts_in_parts;
1598 next_future_num_parts_in_parts = tmpPartVect;
1602 next_future_num_parts_in_parts->clear();
1605 mj_part_t output_part_count_in_dimension =
1606 this->update_part_num_arrays(
1607 future_num_part_in_parts,
1608 next_future_num_parts_in_parts,
1613 t2, num_ranks_per_node);
1618 if(output_part_count_in_dimension == current_num_parts) {
1619 tmpPartVect = future_num_part_in_parts;
1620 future_num_part_in_parts = next_future_num_parts_in_parts;
1621 next_future_num_parts_in_parts = tmpPartVect;
1626 std::string istring = std::to_string(rd);
1630 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
1631 "new part xadj", output_part_count_in_dimension);
1635 mj_part_t output_part_index = 0;
1639 mj_part_t output_coordinate_end_index = 0;
1641 mj_part_t current_work_part = 0;
1642 mj_part_t current_concurrent_num_parts = 1;
1644 mj_part_t obtained_part_index = 0;
1647 int coordInd = rd % this->coord_dim;
1649 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
1650 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1652 auto host_process_local_min_max_coord_total_weight =
1653 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
1654 auto host_global_min_max_coord_total_weight =
1655 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
1658 for(; current_work_part < current_num_parts;
1659 current_work_part += current_concurrent_num_parts) {
1661 mj_part_t actual_work_part_count = 0;
1666 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1667 mj_part_t current_work_part_in_concurrent_parts =
1668 current_work_part + kk;
1672 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1673 current_work_part_in_concurrent_parts);
1674 if(partition_count == 1) {
1677 ++actual_work_part_count;
1678 if(partition_along_longest_dim) {
1679 auto local_process_local_min_max_coord_total_weight =
1680 this->process_local_min_max_coord_total_weight;
1681 for(
int coord_traverse_ind = 0;
1682 coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {
1684 Kokkos::View<mj_scalar_t *, device_t> coords =
1685 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);
1687 this->mj_get_local_min_max_coord_totW(
1689 current_concurrent_num_parts,
1692 coord_dimension_range_sorted[coord_traverse_ind].id =
1694 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1696 Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
1697 process_local_min_max_coord_total_weight);
1699 coord_dim_mins[coord_traverse_ind] =
1700 host_process_local_min_max_coord_total_weight(kk);
1701 coord_dim_maxs[coord_traverse_ind] =
1702 host_process_local_min_max_coord_total_weight(
1703 kk + current_concurrent_num_parts);
1704 coord_dimension_range_sorted[coord_traverse_ind].val =
1705 host_process_local_min_max_coord_total_weight(
1706 kk + current_concurrent_num_parts) -
1707 host_process_local_min_max_coord_total_weight(kk);
1710 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1711 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].
id;
1712 auto set_min = coord_dim_mins[coordInd];
1713 auto set_max = coord_dim_maxs[coordInd];
1714 Kokkos::parallel_for(
1715 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1716 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1717 local_process_local_min_max_coord_total_weight(kk) = set_min;
1718 local_process_local_min_max_coord_total_weight(
1719 kk + current_concurrent_num_parts) = set_max;
1722 mj_current_dim_coords =
1723 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1726 Kokkos::View<mj_scalar_t *, device_t> coords =
1727 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1728 this->mj_get_local_min_max_coord_totW(
1730 current_concurrent_num_parts,
1736 if(actual_work_part_count > 0) {
1738 this->mj_get_global_min_max_coord_totW(
1739 current_concurrent_num_parts,
1740 this->process_local_min_max_coord_total_weight,
1741 this->global_min_max_coord_total_weight);
1744 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
1745 global_min_max_coord_total_weight);
1749 mj_part_t total_incomplete_cut_count = 0;
1754 mj_part_t concurrent_part_cut_shift = 0;
1755 mj_part_t concurrent_part_part_shift = 0;
1756 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1757 mj_scalar_t min_coordinate =
1758 host_global_min_max_coord_total_weight(kk);
1759 mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
1760 kk + current_concurrent_num_parts);
1761 mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
1762 kk + 2*current_concurrent_num_parts);
1764 mj_part_t concurrent_current_part_index = current_work_part + kk;
1766 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1767 concurrent_current_part_index);
1769 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
1770 Kokkos::subview(current_cut_coordinates,
1771 std::pair<mj_lno_t, mj_lno_t>(
1772 concurrent_part_cut_shift,
1773 current_cut_coordinates.size()));
1774 Kokkos::View<mj_scalar_t *, device_t>
1775 current_target_part_weights =
1776 Kokkos::subview(target_part_weights,
1777 std::pair<mj_lno_t, mj_lno_t>(
1778 concurrent_part_part_shift,
1779 target_part_weights.size()));
1782 concurrent_part_cut_shift += partition_count - 1;
1784 concurrent_part_part_shift += partition_count;
1787 if(partition_count > 1 && min_coordinate <= max_coordinate) {
1790 total_incomplete_cut_count += partition_count - 1;
1792 this->incomplete_cut_count(kk) = partition_count - 1;
1800 this->mj_get_initial_cut_coords_target_weights(
1803 partition_count - 1,
1804 global_total_weight,
1806 current_target_part_weights,
1807 future_num_part_in_parts,
1808 next_future_num_parts_in_parts,
1809 concurrent_current_part_index,
1810 obtained_part_index,
1811 rd == 0 ? this->num_first_level_parts : 1,
1812 this->first_level_distribution);
1814 mj_lno_t coordinate_end_index =
1815 host_part_xadj(concurrent_current_part_index);
1816 mj_lno_t coordinate_begin_index =
1817 (concurrent_current_part_index==0) ? 0 :
1818 host_part_xadj[concurrent_current_part_index - 1];
1821 this->set_initial_coordinate_parts(
1824 coordinate_begin_index, coordinate_end_index,
1825 this->coordinate_permutations,
1826 mj_current_dim_coords,
1827 this->assigned_part_ids,
1833 this->incomplete_cut_count(kk) = 0;
1835 obtained_part_index += partition_count;
1840 double used_imbalance = 0;
1844 mj_timer_base_string +
"mj_1D_part()");
1847 mj_current_dim_coords,
1850 current_concurrent_num_parts,
1851 current_cut_coordinates,
1852 total_incomplete_cut_count,
1853 view_rectilinear_cut_count,
1854 view_total_reduction_size);
1857 mj_timer_base_string +
"mj_1D_part()");
1860 obtained_part_index += current_concurrent_num_parts;
1864 mj_part_t output_array_shift = 0;
1865 mj_part_t cut_shift = 0;
1866 size_t tlr_shift = 0;
1867 size_t partweight_array_shift = 0;
1869 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1870 mj_part_t current_concurrent_work_part = current_work_part + kk;
1872 mj_part_t num_parts = host_num_partitioning_in_current_dim(
1873 current_concurrent_work_part);
1876 int coordinateA_bigger_than_coordinateB =
1877 host_global_min_max_coord_total_weight(kk) >
1878 host_global_min_max_coord_total_weight(
1879 kk + current_concurrent_num_parts);
1881 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
1884 auto local_new_part_xadj = this->new_part_xadj;
1885 Kokkos::parallel_for(
1886 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
1887 mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
1888 local_new_part_xadj(
1889 output_part_index + output_array_shift + jj) = 0;
1892 cut_shift += num_parts - 1;
1893 tlr_shift += (4 *(num_parts - 1) + 1);
1894 output_array_shift += num_parts;
1895 partweight_array_shift += (2 * (num_parts - 1) + 1);
1898 mj_lno_t coordinate_end =
1899 host_part_xadj(current_concurrent_work_part);
1900 mj_lno_t coordinate_begin =
1901 current_concurrent_work_part==0 ? 0 :
1902 host_part_xadj(current_concurrent_work_part-1);
1904 Kokkos::View<mj_scalar_t *, device_t>
1905 current_concurrent_cut_coordinate =
1906 Kokkos::subview(current_cut_coordinates,
1907 std::pair<mj_lno_t, mj_lno_t>(
1909 current_cut_coordinates.size()));
1910 Kokkos::View<mj_scalar_t *, device_t>
1911 used_local_cut_line_weight_to_left =
1912 Kokkos::subview(process_cut_line_weight_to_put_left,
1913 std::pair<mj_lno_t, mj_lno_t>(
1915 process_cut_line_weight_to_put_left.size()));
1917 this->thread_part_weight_work =
1919 this->thread_part_weights,
1920 std::pair<mj_lno_t, mj_lno_t>(
1921 partweight_array_shift,
1922 this->thread_part_weights.size()));
1926 Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
1927 Kokkos::subview(this->new_part_xadj,
1928 std::pair<mj_lno_t, mj_lno_t>(
1929 output_part_index + output_array_shift,
1930 this->new_part_xadj.size()));
1932 this->create_consistent_chunks(
1934 mj_current_dim_coords,
1935 current_concurrent_cut_coordinate,
1938 used_local_cut_line_weight_to_left,
1939 subview_new_part_xadj,
1941 partition_along_longest_dim,
1942 p_coord_dimension_range_sorted);
1947 mj_lno_t part_size = coordinate_end - coordinate_begin;
1949 auto local_new_part_xadj = this->new_part_xadj;
1950 Kokkos::parallel_for(
1951 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1952 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1953 local_new_part_xadj(output_part_index + output_array_shift)
1957 auto subview_new_coordinate_permutations =
1958 Kokkos::subview(this->new_coordinate_permutations,
1959 std::pair<mj_lno_t, mj_lno_t>(
1961 coordinate_begin + part_size));
1962 auto subview_coordinate_permutations =
1963 Kokkos::subview(this->coordinate_permutations,
1964 std::pair<mj_lno_t, mj_lno_t>(
1966 coordinate_begin + part_size));
1967 Kokkos::deep_copy(subview_new_coordinate_permutations,
1968 subview_coordinate_permutations);
1971 cut_shift += num_parts - 1;
1972 tlr_shift += (4 *(num_parts - 1) + 1);
1973 output_array_shift += num_parts;
1974 partweight_array_shift += (2 * (num_parts - 1) + 1);
1983 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
1984 mj_part_t num_parts =
1985 host_num_partitioning_in_current_dim(current_work_part + kk);
1986 auto local_new_part_xadj = this->new_part_xadj;
1987 auto local_mj_current_dim_coords = mj_current_dim_coords;
1988 auto local_new_coordinate_permutations =
1989 new_coordinate_permutations;
1990 Kokkos::parallel_for(
1991 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
1992 0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
1994 local_new_part_xadj(output_part_index+ii) +=
1995 output_coordinate_end_index;
1998 mj_lno_t coordinate_end =
1999 local_new_part_xadj(output_part_index+ii);
2000 mj_lno_t coordinate_begin =
2001 local_new_part_xadj(output_part_index);
2003 for(mj_lno_t task_traverse = coordinate_begin;
2004 task_traverse < coordinate_end; ++task_traverse) {
2005 mj_lno_t l = local_new_coordinate_permutations(task_traverse);
2007 local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
2013 mj_part_t get_single;
2014 Kokkos::parallel_reduce(
"Read new_part_xadj",
2015 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
2016 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
2017 set_single = local_new_part_xadj(output_part_index + num_parts - 1);
2020 output_coordinate_end_index = get_single;
2022 output_part_index += num_parts;
2029 current_num_parts = output_part_count_in_dimension;
2032 Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
2033 this->coordinate_permutations = this->new_coordinate_permutations;
2034 this->new_coordinate_permutations = tmp;
2036 this->part_xadj = this->new_part_xadj;
2037 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2038 Kokkos::deep_copy(host_part_xadj, part_xadj);
2039 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
2042 Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);
2046 for(
size_t i = 0; i < this->num_global_parts ; ++i) {
2047 output_xadj[i+1] = host_part_xadj(i);
2050 delete future_num_part_in_parts;
2051 delete next_future_num_parts_in_parts;
2057template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2058 typename mj_part_t,
typename mj_node_t>
2060 <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
2064 return this->global_box;
2069template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2070 typename mj_part_t,
typename mj_node_t>
2071void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2072 mj_node_t>::set_to_keep_part_boxes()
2074 this->mj_keep_part_boxes =
true;
2084template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2085 typename mj_part_t,
typename mj_node_t>
2089 this->total_num_cut = 0;
2090 this->total_num_part = 1;
2091 this->max_num_part_along_dim = 0;
2092 this->total_dim_num_reduce_all = 0;
2093 this->last_dim_num_part = 1;
2096 this->max_num_cut_along_dim = 0;
2097 this->max_num_total_part_along_dim = 0;
2099 if(this->part_no_array.size()) {
2100 auto local_recursion_depth = this->recursion_depth;
2102 this->total_dim_num_reduce_all =
2103 this->total_num_part * this->recursion_depth;
2105 this->total_num_part = 1;
2106 for(
int i = 0; i < local_recursion_depth; ++i) {
2107 this->total_num_part *= this->part_no_array(i);
2110 mj_part_t track_max = 0;
2111 for(
int i = 0; i < local_recursion_depth; ++i) {
2112 if(part_no_array(i) > track_max) {
2113 track_max = this->part_no_array(i);
2117 this->last_dim_num_part = this->total_num_part /
2118 this->part_no_array(local_recursion_depth-1);
2120 this->max_num_part_along_dim = track_max;
2121 this->num_global_parts = this->total_num_part;
2123 mj_part_t future_num_parts = this->num_global_parts;
2127 if (this->first_level_distribution.size() != 0 &&
2128 this->num_first_level_parts > 1) {
2129 this->max_num_part_along_dim = this->num_first_level_parts;
2134 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
2135 mj_part_t maxNoPartAlongI = 0;
2136 mj_part_t nfutureNumParts = 0;
2142 this->first_level_distribution.size() != 0 &&
2143 this->num_first_level_parts > 1) {
2145 maxNoPartAlongI = this->num_first_level_parts;
2146 this->max_num_part_along_dim = this->num_first_level_parts;
2148 mj_part_t sum_first_level_dist = 0;
2149 mj_part_t max_part = 0;
2152 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2153 sum_first_level_dist += this->first_level_distribution(i);
2154 if (this->first_level_distribution(i) > max_part)
2155 max_part = this->first_level_distribution(i);
2161 this->num_global_parts * max_part / sum_first_level_dist;
2165 maxNoPartAlongI = this->get_part_count(future_num_parts,
2166 1.0f / (this->recursion_depth - rd));
2167 if (maxNoPartAlongI > this->max_num_part_along_dim)
2168 this->max_num_part_along_dim = maxNoPartAlongI;
2169 nfutureNumParts = future_num_parts / maxNoPartAlongI;
2170 if (future_num_parts % maxNoPartAlongI) {
2174 future_num_parts = nfutureNumParts;
2176 this->total_num_part = this->num_global_parts;
2178 if(this->divide_to_prime_first) {
2179 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2180 this->last_dim_num_part = this->num_global_parts;
2187 for(
int i = 0; i < this->recursion_depth; ++i) {
2188 this->total_dim_num_reduce_all += p;
2189 p *= this->max_num_part_along_dim;
2192 if(p / this->max_num_part_along_dim > this->num_global_parts) {
2193 this->last_dim_num_part = this->num_global_parts;
2196 this->last_dim_num_part = p / this->max_num_part_along_dim;
2201 this->total_num_cut = this->total_num_part - 1;
2202 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2203 this->max_num_total_part_along_dim = this->max_num_part_along_dim +
2204 size_t(this->max_num_cut_along_dim);
2209 if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
2210 if(this->mj_problemComm->getRank() == 0) {
2211 std::cerr <<
"Warning: Concurrent part count (" <<
2212 this->max_concurrent_part_calculation <<
2213 ") has been set bigger than maximum amount that can be used." <<
2214 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
2216 this->max_concurrent_part_calculation = this->last_dim_num_part;
2225template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2226 typename mj_part_t,
typename mj_node_t>
2227inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2228 get_part_count(mj_part_t num_total_future,
double root)
2230 double fp = pow(num_total_future, root);
2231 mj_part_t ip = mj_part_t(fp);
2232 if(fp - ip < std::numeric_limits<float>::epsilon() * 100) {
2259template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2260 typename mj_part_t,
typename mj_node_t>
2261mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2262 update_part_num_arrays(
2263 std::vector<mj_part_t> *future_num_part_in_parts,
2264 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2265 mj_part_t &future_num_parts,
2266 mj_part_t current_num_parts,
2267 int current_iteration,
2268 RCP<mj_partBoxVector_t> input_part_boxes,
2269 RCP<mj_partBoxVector_t> output_part_boxes,
2270 mj_part_t atomic_part_count)
2272 std::vector<mj_part_t> num_partitioning_in_current_dim;
2275 mj_part_t output_num_parts = 0;
2276 if(this->part_no_array.size()) {
2280 mj_part_t current_part_no_array =
2281 this->part_no_array(current_iteration);
2283 if(current_part_no_array < 1) {
2284 std::cout <<
"Current recursive iteration: " << current_iteration <<
2285 " part_no_array[" << current_iteration <<
"] is given as:" <<
2286 current_part_no_array << std::endl;
2289 if(current_part_no_array == 1) {
2290 return current_num_parts;
2294 if (this->first_level_distribution.size() != 0 &&
2295 current_iteration == 0 &&
2296 current_part_no_array != this->num_first_level_parts) {
2297 std::cout <<
"Current recursive iteration: " << current_iteration
2298 <<
" part_no_array[" << current_iteration <<
"] is given as: " <<
2299 current_part_no_array <<
" and contradicts num_first_level_parts: " <<
2300 this->num_first_level_parts << std::endl;
2304 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2305 num_partitioning_in_current_dim.push_back(current_part_no_array);
2322 future_num_parts /= num_partitioning_in_current_dim[0];
2323 output_num_parts = current_num_parts *
2324 num_partitioning_in_current_dim[0];
2325 if(this->mj_keep_part_boxes) {
2326 for(mj_part_t k = 0; k < current_num_parts; ++k) {
2328 for(mj_part_t j = 0; j <
2329 num_partitioning_in_current_dim[0]; ++j) {
2330 output_part_boxes->push_back((*input_part_boxes)[k]);
2338 for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2339 next_future_num_parts_in_parts->push_back(future_num_parts);
2349 future_num_parts = 1;
2352 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2354 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2358 mj_part_t num_partitions_in_current_dim =
2359 this->get_part_count(future_num_parts_of_part_ii,
2360 1.0 / (this->recursion_depth - current_iteration)
2362 if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2363 std::cerr <<
"ERROR: maxPartNo calculation is wrong."
2364 " num_partitions_in_current_dim: "
2365 << num_partitions_in_current_dim <<
" this->max_num_part_along_dim: "
2366 << this->max_num_part_along_dim <<
2367 " this->recursion_depth: " << this->recursion_depth <<
2368 " current_iteration:" << current_iteration <<
2369 " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2370 " might need to fix max part no calculation for "
2371 "largest_prime_first partitioning." <<
2383 if (current_iteration == 0 &&
2384 this->first_level_distribution.size() != 0 &&
2385 this->num_first_level_parts > 1) {
2388 num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2391 output_num_parts = this->num_first_level_parts;
2394 future_num_parts /= this->num_first_level_parts;
2396 mj_part_t max_part = 0;
2397 mj_part_t sum_first_level_dist = 0;
2401 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2402 sum_first_level_dist += this->first_level_distribution(i);
2404 if (this->first_level_distribution(i) > max_part)
2405 max_part = this->first_level_distribution(i);
2409 future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2413 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2414 next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2415 this->num_global_parts / sum_first_level_dist);
2418 else if (this->divide_to_prime_first) {
2420 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2422 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2425 output_num_parts += num_partitions_in_current_dim;
2427 if (future_num_parts_of_part_ii == atomic_part_count ||
2428 future_num_parts_of_part_ii % atomic_part_count != 0) {
2429 atomic_part_count = 1;
2432 largest_prime_factor =
2433 this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2440 if (largest_prime_factor < num_partitions_in_current_dim) {
2441 largest_prime_factor = num_partitions_in_current_dim;
2444 mj_part_t ideal_num_future_parts_in_part =
2445 (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2447 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2455 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2457 mj_part_t my_ideal_primescale = ideal_prime_scale;
2459 if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2460 ++my_ideal_primescale;
2463 mj_part_t num_future_parts_for_part_iii =
2464 ideal_num_future_parts_in_part * my_ideal_primescale;
2467 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2469 ++num_future_parts_for_part_iii;
2472 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2475 if (this->mj_keep_part_boxes) {
2476 output_part_boxes->push_back((*input_part_boxes)[ii]);
2480 if (num_future_parts_for_part_iii > future_num_parts)
2481 future_num_parts = num_future_parts_for_part_iii;
2487 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2490 output_num_parts += num_partitions_in_current_dim;
2492 if((future_num_parts_of_part_ii == atomic_part_count) ||
2493 (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2494 atomic_part_count = 1;
2497 mj_part_t ideal_num_future_parts_in_part =
2498 (future_num_parts_of_part_ii / atomic_part_count) /
2499 num_partitions_in_current_dim;
2500 for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2501 mj_part_t num_future_parts_for_part_iii =
2502 ideal_num_future_parts_in_part;
2505 if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2506 num_partitions_in_current_dim) {
2508 ++num_future_parts_for_part_iii;
2511 next_future_num_parts_in_parts->push_back(
2512 num_future_parts_for_part_iii * atomic_part_count);
2516 if(this->mj_keep_part_boxes) {
2517 output_part_boxes->push_back((*input_part_boxes)[ii]);
2520 if(num_future_parts_for_part_iii > future_num_parts)
2521 future_num_parts = num_future_parts_for_part_iii;
2527 device_num_partitioning_in_current_dim = Kokkos::View<
2528 mj_part_t*,
device_t>(
"test", num_partitioning_in_current_dim.size());
2529 host_num_partitioning_in_current_dim =
2530 Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2531 for(
size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2532 host_num_partitioning_in_current_dim(n) =
2533 num_partitioning_in_current_dim[n];
2538 Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2539 host_num_partitioning_in_current_dim);
2540 return output_num_parts;
2545template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2546 typename mj_part_t,
typename mj_node_t>
2547void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2548 allocate_set_work_memory()
2553 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2554 Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
2555 this->num_local_coords);
2556 auto local_coordinate_permutations = coordinate_permutations;
2557 Kokkos::parallel_for(
2558 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2559 0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2560 local_coordinate_permutations(i) = i;
2564 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2565 Kokkos::ViewAllocateWithoutInitializing(
"num_local_coords"),
2566 this->num_local_coords);
2568 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2569 Kokkos::ViewAllocateWithoutInitializing(
"assigned parts"), 0);
2570 if(this->num_local_coords > 0) {
2571 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2572 Kokkos::ViewAllocateWithoutInitializing(
"assigned part ids"),
2573 this->num_local_coords);
2580 this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2581 Kokkos::ViewAllocateWithoutInitializing(
"part xadj"), 1);
2582 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2583 host_part_xadj(0) = num_local_coords;
2584 Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2587 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2588 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2591 this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2592 Kokkos::ViewAllocateWithoutInitializing(
"all cut coordinates"),
2593 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2596 this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2597 device_t>(Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2601 this->thread_cut_line_weight_to_put_left =
2602 Kokkos::View<mj_scalar_t*, device_t>(
2603 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2605 if(this->distribute_points_on_cut_lines) {
2606 this->process_cut_line_weight_to_put_left =
2607 Kokkos::View<mj_scalar_t *, device_t>(
2608 Kokkos::ViewAllocateWithoutInitializing(
2609 "process_cut_line_weight_to_put_left"),
2610 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2611 this->thread_cut_line_weight_to_put_left =
2612 Kokkos::View<mj_scalar_t *, device_t>(
2613 Kokkos::ViewAllocateWithoutInitializing(
2614 "thread_cut_line_weight_to_put_left"),
2615 this->max_num_cut_along_dim);
2616 this->process_rectilinear_cut_weight =
2617 Kokkos::View<mj_scalar_t *, device_t>(
2618 Kokkos::ViewAllocateWithoutInitializing(
"process_rectilinear_cut_weight"),
2619 this->max_num_cut_along_dim);
2620 this->global_rectilinear_cut_weight =
2621 Kokkos::View<mj_scalar_t *, device_t>(
2622 Kokkos::ViewAllocateWithoutInitializing(
"global_rectilinear_cut_weight"),
2623 this->max_num_cut_along_dim);
2630 this->cut_coordinates_work_array =
2631 Kokkos::View<mj_scalar_t *, device_t>(
2632 Kokkos::ViewAllocateWithoutInitializing(
"cut_coordinates_work_array"),
2633 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2636 this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2637 Kokkos::ViewAllocateWithoutInitializing(
"target_part_weights"),
2638 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2641 this->cut_upper_bound_coordinates =
2642 Kokkos::View<mj_scalar_t*, device_t>(
2643 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_coordinates"),
2644 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2647 this->cut_lower_bound_coordinates =
2648 Kokkos::View<mj_scalar_t*, device_t>(
2649 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_coordinates"),
2650 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2653 this->cut_lower_bound_weights =
2654 Kokkos::View<mj_scalar_t*, device_t>(
2655 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_weights"),
2656 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2659 this->cut_upper_bound_weights =
2660 Kokkos::View<mj_scalar_t*, device_t>(
2661 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_weights"),
2662 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2666 this->process_local_min_max_coord_total_weight =
2667 Kokkos::View<mj_scalar_t*, device_t>(
2668 Kokkos::ViewAllocateWithoutInitializing(
2669 "process_local_min_max_coord_total_weight"),
2670 3 * this->max_concurrent_part_calculation);
2673 this->global_min_max_coord_total_weight =
2674 Kokkos::View<mj_scalar_t*, device_t>(
2675 Kokkos::ViewAllocateWithoutInitializing(
"global_min_max_coord_total_weight"),
2676 3 * this->max_concurrent_part_calculation);
2681 this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2682 Kokkos::ViewAllocateWithoutInitializing(
"is_cut_line_determined"),
2683 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2689 this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2690 Kokkos::ViewAllocateWithoutInitializing(
"device_incomplete_cut_count"),
2691 this->max_concurrent_part_calculation);
2692 this->incomplete_cut_count =
2693 Kokkos::create_mirror_view(device_incomplete_cut_count);
2696 this->thread_part_weights = Kokkos::View<double *, device_t>(
2697 Kokkos::ViewAllocateWithoutInitializing(
"thread_part_weights"),
2698 this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2700 this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2701 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_left_closest_point"),
2702 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2706 this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2707 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_right_closest_point"),
2708 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2711 this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2712 Kokkos::ViewAllocateWithoutInitializing(
"thread_point_counts"),
2713 this->max_num_part_along_dim);
2719 this->total_part_weight_left_right_closests =
2720 Kokkos::View<mj_scalar_t*, device_t>(
2721 Kokkos::ViewAllocateWithoutInitializing(
2722 "total_part_weight_left_right_closests"),
2723 (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2724 this->max_concurrent_part_calculation);
2726 this->global_total_part_weight_left_right_closests =
2727 Kokkos::View<mj_scalar_t*, device_t>(
2728 Kokkos::ViewAllocateWithoutInitializing(
2729 "global_total_part_weight_left_right_closests"),
2730 (this->max_num_total_part_along_dim +
2731 this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2733 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2734 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_local_coords);
2736 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2737 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
2743 Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2745 auto local_current_mj_gnos = current_mj_gnos;
2746 auto local_initial_mj_gnos = initial_mj_gnos;
2747 Kokkos::parallel_for(
2748 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2749 (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2750 local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2756template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2757 typename mj_part_t,
typename mj_node_t>
2758void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2759 mj_node_t>::compute_global_box()
2762 mj_scalar_t *mins =
new mj_scalar_t[this->coord_dim];
2764 mj_scalar_t *gmins =
new mj_scalar_t[this->coord_dim];
2766 mj_scalar_t *maxs =
new mj_scalar_t[this->coord_dim];
2768 mj_scalar_t *gmaxs =
new mj_scalar_t[this->coord_dim];
2770 auto local_mj_coordinates = this->mj_coordinates;
2774 for(
int i = 0; i < this->coord_dim; ++i) {
2779 for(
int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2780 Kokkos::parallel_reduce(
"MinReduce",
2781 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2782 (0, this->num_local_coords),
2783 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2784 if(local_mj_coordinates(j,i) < running_min) {
2785 running_min = local_mj_coordinates(j,i);
2787 }, Kokkos::Min<mj_scalar_t>(mins[i]));
2788 Kokkos::parallel_reduce(
"MaxReduce",
2789 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2790 (0, this->num_local_coords),
2791 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2792 if(local_mj_coordinates(j,i) > running_max) {
2793 running_max = local_mj_coordinates(j,i);
2795 }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2798 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2799 this->coord_dim, mins, gmins
2802 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2803 this->coord_dim, maxs, gmaxs
2807 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2821template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2822 typename mj_part_t,
typename mj_node_t>
2823void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2824 mj_node_t>::init_part_boxes(
2825 RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2827 mj_partBox_t tmp_box(*global_box);
2828 initial_partitioning_boxes->push_back(tmp_box);
2835template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2838void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2839 mj_get_local_min_max_coord_totW(
2840 mj_part_t current_work_part,
2841 mj_part_t current_concurrent_num_parts,
2842 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2844 auto local_coordinate_permutations = this->coordinate_permutations;
2845 auto local_process_local_min_max_coord_total_weight =
2846 this->process_local_min_max_coord_total_weight;
2847 auto local_mj_weights = this->mj_weights;
2849 bool bUniformWeights = mj_uniform_weights(0);
2851 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2853 mj_part_t concurrent_current_part = current_work_part + kk;
2854 mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2855 host_part_xadj(concurrent_current_part - 1);
2856 mj_lno_t coordinate_end_index =
2857 host_part_xadj(concurrent_current_part);
2859 mj_scalar_t my_min_coord = 0;
2860 mj_scalar_t my_max_coord = 0;
2861 mj_scalar_t my_total_weight;
2864 if(coordinate_begin_index >= coordinate_end_index)
2866 my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2867 my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2868 my_total_weight = 0;
2872 Kokkos::parallel_reduce(
"get min",
2873 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2874 (coordinate_begin_index, coordinate_end_index),
2875 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2876 int i = local_coordinate_permutations(j);
2877 if(mj_current_dim_coords(i) < running_min)
2878 running_min = mj_current_dim_coords(i);
2879 }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2881 Kokkos::parallel_reduce(
"get max",
2882 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2883 (coordinate_begin_index, coordinate_end_index),
2884 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2885 int i = local_coordinate_permutations(j);
2886 if(mj_current_dim_coords(i) > running_max)
2887 running_max = mj_current_dim_coords(i);
2888 }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2889 if(bUniformWeights) {
2890 my_total_weight = coordinate_end_index - coordinate_begin_index;
2893 my_total_weight = 0;
2894 Kokkos::parallel_reduce(
"get weight",
2895 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2896 (coordinate_begin_index, coordinate_end_index),
2897 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2898 int i = local_coordinate_permutations(j);
2899 lsum += local_mj_weights(i,0);
2900 }, my_total_weight);
2905 Kokkos::parallel_for(
2906 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2907 (0, 1), KOKKOS_LAMBDA (
int dummy) {
2908 local_process_local_min_max_coord_total_weight(kk) =
2910 local_process_local_min_max_coord_total_weight(
2911 kk + current_concurrent_num_parts) = my_max_coord;
2912 local_process_local_min_max_coord_total_weight(
2913 kk + 2*current_concurrent_num_parts) = my_total_weight;
2930template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2931 typename mj_part_t,
typename mj_node_t>
2932void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2933 mj_node_t>::mj_get_global_min_max_coord_totW(
2934 mj_part_t current_concurrent_num_parts,
2935 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2936 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2940 if(this->comm->getSize() > 1) {
2943 auto host_local_min_max_total =
2944 Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2945 auto host_global_min_max_total =
2946 Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2947 Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
2949 reductionOp(current_concurrent_num_parts,
2950 current_concurrent_num_parts, current_concurrent_num_parts);
2952 reduceAll<int, mj_scalar_t>(
2955 3 * current_concurrent_num_parts,
2956 host_local_min_max_total.data(),
2957 host_global_min_max_total.data());
2960 Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2963 mj_part_t s = 3 * current_concurrent_num_parts;
2964 Kokkos::parallel_for(
2965 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2966 (0, s), KOKKOS_LAMBDA (mj_part_t i) {
2967 global_min_max_total(i) = local_min_max_total(i);
3004template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3005 typename mj_part_t,
typename mj_node_t>
3006void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3007 mj_get_initial_cut_coords_target_weights(
3008 mj_scalar_t min_coord,
3009 mj_scalar_t max_coord,
3010 mj_part_t num_cuts ,
3011 mj_scalar_t global_weight,
3013 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3015 Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3016 std::vector <mj_part_t> *future_num_part_in_parts,
3017 std::vector <mj_part_t> *next_future_num_parts_in_parts,
3018 mj_part_t concurrent_current_part,
3019 mj_part_t obtained_part_index,
3020 mj_part_t num_target_first_level_parts,
3021 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3023 mj_scalar_t coord_range = max_coord - min_coord;
3030 bool bUniformPartsCheck =
3031 num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3033 if(!bUniformPartsCheck) {
3034 bool bValidNonUniformTargetWeights =
3035 (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3036 if(!bValidNonUniformTargetWeights) {
3037 std::cerr <<
"MJ does not support non uniform part weights beyond the first partition" << std::endl;
3042 Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3043 "device_cumulative", num_cuts);
3044 auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3046 mj_scalar_t cumulative = 0;
3048 if(bUniformPartsCheck) {
3050 mj_scalar_t total_future_part_count_in_part =
3051 static_cast<mj_scalar_t
>((*future_num_part_in_parts)[concurrent_current_part]);
3054 mj_scalar_t unit_part_weight =
3055 global_weight / total_future_part_count_in_part;
3057 for(mj_part_t i = 0; i < num_cuts; ++i) {
3058 cumulative += unit_part_weight *
static_cast<mj_scalar_t
>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3059 host_cumulative(i) = cumulative;
3064 mj_scalar_t sum_target_first_level_dist = 0.0;
3065 for (
int i = 0; i < num_target_first_level_parts; ++i) {
3066 sum_target_first_level_dist += target_first_level_dist(i);
3069 for(mj_part_t i = 0; i < num_cuts; ++i) {
3070 cumulative += global_weight * target_first_level_dist(i) /
3071 sum_target_first_level_dist;
3072 host_cumulative(i) = cumulative;
3076 Kokkos::deep_copy(device_cumulative, host_cumulative);
3078 Kokkos::parallel_for(
"Write num in parts",
3079 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3080 (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3082 current_target_part_weights(cut) = device_cumulative(cut);
3083 initial_cut_coords(cut) = min_coord +
3084 (coord_range * device_cumulative(cut)) / global_weight;
3086 current_target_part_weights(num_cuts) = global_weight;
3092 if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3093 Kokkos::parallel_for(
3094 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3096 KOKKOS_LAMBDA (mj_part_t i) {
3097 current_target_part_weights(i) =
3098 long(current_target_part_weights(i) + 0.5);
3119template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3120 typename mj_part_t,
typename mj_node_t>
3121void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3122 set_initial_coordinate_parts(
3123 mj_scalar_t &max_coordinate,
3124 mj_scalar_t &min_coordinate,
3125 mj_lno_t coordinate_begin_index,
3126 mj_lno_t coordinate_end_index,
3127 Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3128 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3129 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3130 mj_part_t &partition_count)
3132 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3136 if(std::abs(coordinate_range) < this->sEpsilon ) {
3137 Kokkos::parallel_for(
3138 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3139 (coordinate_begin_index, coordinate_end_index),
3140 KOKKOS_LAMBDA (mj_lno_t ii) {
3141 mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3147 mj_scalar_t slice = coordinate_range / partition_count;
3148 Kokkos::parallel_for(
3149 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3150 (coordinate_begin_index, coordinate_end_index),
3151 KOKKOS_LAMBDA (mj_lno_t ii) {
3152 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3154 mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3155 if(pp >= partition_count) {
3156 pp = partition_count - 1;
3158 mj_part_ids[iii] = 2 * pp;
3177template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3178 typename mj_part_t,
typename mj_node_t>
3179void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3180 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3181 double used_imbalance_tolerance,
3182 mj_part_t current_work_part,
3183 mj_part_t current_concurrent_num_parts,
3184 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3185 mj_part_t total_incomplete_cut_count,
3186 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3187 Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3189 this->temp_cut_coords = current_cut_coordinates;
3192 *reductionOp = NULL;
3194 bool bSingleProcess = (this->comm->getSize() == 1);
3196 std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3197 if(!bSingleProcess) {
3198 for(
size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3199 temp[n] = host_num_partitioning_in_current_dim(n);
3202 <mj_part_t, mj_scalar_t>(
3205 current_concurrent_num_parts);
3208 auto local_cut_lower_bound_coordinates =
3209 cut_lower_bound_coordinates;
3210 auto local_cut_upper_bound_coordinates =
3211 cut_upper_bound_coordinates;
3212 auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3213 auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3214 bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3215 auto local_process_cut_line_weight_to_put_left =
3216 process_cut_line_weight_to_put_left;
3217 auto local_temp_cut_coords = temp_cut_coords;
3218 auto local_global_total_part_weight_left_right_closests =
3219 global_total_part_weight_left_right_closests;
3220 auto local_cut_coordinates_work_array =
3221 cut_coordinates_work_array;
3222 auto local_part_xadj = part_xadj;
3223 auto local_global_min_max_coord_total_weight =
3224 global_min_max_coord_total_weight;
3225 auto local_target_part_weights =
3226 target_part_weights;
3227 auto local_global_rectilinear_cut_weight =
3228 global_rectilinear_cut_weight;
3229 auto local_process_rectilinear_cut_weight =
3230 process_rectilinear_cut_weight;
3232 auto local_is_cut_line_determined = this->is_cut_line_determined;
3233 auto local_device_num_partitioning_in_current_dim =
3234 device_num_partitioning_in_current_dim;
3236 Kokkos::parallel_for(
3237 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3238 KOKKOS_LAMBDA (
int dummy) {
3241 view_rectilinear_cut_count(0) = 0;
3242 view_total_reduction_size(0) = 0;
3246 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3247 mj_part_t num_part_in_dim =
3248 local_device_num_partitioning_in_current_dim(current_work_part + i);
3249 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3250 view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3252 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3253 local_is_cut_line_determined(next) =
false;
3255 local_cut_lower_bound_coordinates(next) =
3256 local_global_min_max_coord_total_weight(i);
3258 local_cut_upper_bound_coordinates(next) =
3259 local_global_min_max_coord_total_weight(
3260 i + current_concurrent_num_parts);
3262 local_cut_upper_bound_weights(next) =
3263 local_global_min_max_coord_total_weight(
3264 i + 2 * current_concurrent_num_parts);
3265 local_cut_lower_bound_weights(next) = 0;
3266 if(local_distribute_points_on_cut_lines) {
3267 local_process_cut_line_weight_to_put_left(next) = 0;
3278 while (total_incomplete_cut_count != 0) {
3279 this->mj_1D_part_get_part_weights(
3280 current_concurrent_num_parts,
3282 mj_current_dim_coords,
3286 this->mj_combine_rightleft_and_weights(
3288 current_concurrent_num_parts);
3291 if(!bSingleProcess) {
3294 auto host_total_part_weight_left_right_closests =
3295 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3296 total_part_weight_left_right_closests);
3297 auto host_global_total_part_weight_left_right_closests =
3298 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3299 global_total_part_weight_left_right_closests);
3301 Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3302 total_part_weight_left_right_closests);
3304 size_t host_view_total_reduction_size;
3305 Kokkos::parallel_reduce(
"Read single",
3306 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3307 KOKKOS_LAMBDA(
int dummy,
size_t & set_single) {
3308 set_single = view_total_reduction_size(0);
3309 }, host_view_total_reduction_size);
3311 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3312 host_view_total_reduction_size,
3313 host_total_part_weight_left_right_closests.data(),
3314 host_global_total_part_weight_left_right_closests.data());
3315 Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3316 host_global_total_part_weight_left_right_closests);
3319 local_global_total_part_weight_left_right_closests =
3320 this->total_part_weight_left_right_closests;
3325 mj_part_t cut_shift = 0;
3329 size_t tlr_shift = 0;
3331 Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3332 save_initial_incomplete_cut_count(
"save_initial_incomplete_cut_count",
3333 current_concurrent_num_parts);
3335 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3337 mj_part_t num_parts =
3338 host_num_partitioning_in_current_dim(current_work_part + kk);
3340 mj_part_t num_cuts = num_parts - 1;
3341 size_t num_total_part = num_parts + size_t (num_cuts);
3346 mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3348 if(kk_incomplete_cut_count == 0) {
3349 cut_shift += num_cuts;
3350 tlr_shift += (num_total_part + 2 * num_cuts);
3354 Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3355 Kokkos::subview(this->total_part_weight_left_right_closests,
3356 std::pair<mj_lno_t, mj_lno_t>(
3358 this->total_part_weight_left_right_closests.size()));
3360 Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3362 local_global_total_part_weight_left_right_closests,
3363 std::pair<mj_lno_t, mj_lno_t>(
3365 local_global_total_part_weight_left_right_closests.size()));
3366 Kokkos::View<mj_scalar_t *, device_t>
3367 current_global_left_closest_points =
3368 Kokkos::subview(current_global_tlr,
3369 std::pair<mj_lno_t, mj_lno_t>(
3371 current_global_tlr.size()));
3372 Kokkos::View<mj_scalar_t *, device_t>
3373 current_global_right_closest_points =
3374 Kokkos::subview(current_global_tlr,
3375 std::pair<mj_lno_t, mj_lno_t>(
3376 num_total_part + num_cuts,
3377 current_global_tlr.size()));
3378 Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3381 Kokkos::View<bool *, device_t> current_cut_line_determined =
3382 Kokkos::subview(this->is_cut_line_determined,
3383 std::pair<mj_lno_t, mj_lno_t>(
3385 this->is_cut_line_determined.size()));
3386 Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3387 Kokkos::subview(local_target_part_weights,
3388 std::pair<mj_lno_t, mj_lno_t>(
3390 local_target_part_weights.size()));
3391 Kokkos::View<mj_scalar_t *, device_t>
3392 current_part_cut_line_weight_to_put_left =
3393 Kokkos::subview(local_process_cut_line_weight_to_put_left,
3394 std::pair<mj_lno_t, mj_lno_t>(
3396 local_process_cut_line_weight_to_put_left.size()));
3398 save_initial_incomplete_cut_count(kk) =
3399 kk_incomplete_cut_count;
3401 Kokkos::View<mj_scalar_t *, device_t>
3402 current_cut_lower_bound_weights =
3403 Kokkos::subview(local_cut_lower_bound_weights,
3404 std::pair<mj_lno_t, mj_lno_t>(
3406 local_cut_lower_bound_weights.size()));
3407 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3408 Kokkos::subview(local_cut_upper_bound_weights,
3409 std::pair<mj_lno_t, mj_lno_t>(
3411 local_cut_upper_bound_weights.size()));
3412 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3413 Kokkos::subview(local_cut_upper_bound_coordinates,
3414 std::pair<mj_lno_t, mj_lno_t>(
3416 local_cut_upper_bound_coordinates.size()));
3417 Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3418 Kokkos::subview(local_cut_lower_bound_coordinates,
3419 std::pair<mj_lno_t, mj_lno_t>(
3421 local_cut_lower_bound_coordinates.size()));
3424 Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3425 Kokkos::subview(this->temp_cut_coords,
3426 std::pair<mj_lno_t, mj_lno_t>(
3427 cut_shift, this->temp_cut_coords.size()));
3428 Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3429 Kokkos::subview(this->cut_coordinates_work_array,
3430 std::pair<mj_lno_t, mj_lno_t>(
3431 cut_shift, this->cut_coordinates_work_array.size()));
3433 this->mj_get_new_cut_coordinates(
3434 current_concurrent_num_parts,
3437 used_imbalance_tolerance,
3438 current_global_part_weights,
3439 current_local_part_weights,
3440 current_part_target_weights,
3441 current_cut_line_determined,
3442 sub_temp_cut_coords,
3443 current_cut_upper_bounds,
3444 current_cut_lower_bounds,
3445 current_global_left_closest_points,
3446 current_global_right_closest_points,
3447 current_cut_lower_bound_weights,
3448 current_cut_upper_weights,
3449 sub_cut_coordinates_work_array,
3450 current_part_cut_line_weight_to_put_left,
3451 view_rectilinear_cut_count);
3453 cut_shift += num_cuts;
3454 tlr_shift += (num_total_part + 2 * num_cuts);
3457 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3458 mj_part_t iteration_complete_cut_count =
3459 save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3460 total_incomplete_cut_count -= iteration_complete_cut_count;
3463 Kokkos::parallel_for(
3464 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3465 (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(
int n) {
3466 auto t = local_temp_cut_coords(n);
3467 local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3468 local_cut_coordinates_work_array(n) = t;
3476 if(current_cut_coordinates != local_temp_cut_coords) {
3477 Kokkos::parallel_for(
3478 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3479 (0, 1), KOKKOS_LAMBDA(
int dummy) {
3481 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3482 mj_part_t num_parts = -1;
3483 num_parts = local_device_num_partitioning_in_current_dim(
3484 current_work_part + i);
3485 mj_part_t num_cuts = num_parts - 1;
3486 for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3487 current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3492 static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3493 local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3501template<
class scalar_t>
3507 KOKKOS_INLINE_FUNCTION
3510 KOKKOS_INLINE_FUNCTION
3514#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3516template<
class policy_t,
class scalar_t,
class part_t>
3527 scalar_t mj_max_scalar,
3529 int mj_value_count_rightleft,
3530 int mj_value_count_weights) :
3537 KOKKOS_INLINE_FUNCTION
3542 KOKKOS_INLINE_FUNCTION
3545 dst.ptr[n] += src.ptr[n];
3550 if(src.ptr[n] > dst.ptr[n]) {
3551 dst.ptr[n] = src.ptr[n];
3553 if(src.ptr[n+1] < dst.ptr[n+1]) {
3554 dst.ptr[n+1] = src.ptr[n+1];
3559 KOKKOS_INLINE_FUNCTION
3562 dst.ptr[n] += src.ptr[n];
3567 if(src.ptr[n] > dst.ptr[n]) {
3568 dst.ptr[n] = src.ptr[n];
3570 if(src.ptr[n+1] < dst.ptr[n+1]) {
3571 dst.ptr[n+1] = src.ptr[n+1];
3577 dst.ptr =
value->ptr;
3592template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
3598#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3621#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3622 Kokkos::View<double *, device_t> current_part_weights;
3623 Kokkos::View<scalar_t *, device_t> current_left_closest;
3624 Kokkos::View<scalar_t *, device_t> current_right_closest;
3629 array_t mj_max_scalar,
3630 part_t mj_concurrent_current_part,
3632 part_t mj_current_work_part,
3633 part_t mj_current_concurrent_num_parts,
3634 part_t mj_left_right_array_size,
3635 part_t mj_weight_array_size,
3636 Kokkos::View<index_t*, device_t> & mj_permutations,
3637 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3638 Kokkos::View<scalar_t**, device_t> & mj_weights,
3639 Kokkos::View<part_t*, device_t> & mj_parts,
3640 Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3641 Kokkos::View<index_t *, device_t> & mj_part_xadj,
3642 bool mj_uniform_weights0,
3643 scalar_t mj_sEpsilon
3644#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3645 ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3646 Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3647 Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3658 value_count(mj_weight_array_size+mj_left_right_array_size),
3667#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3668 ,current_part_weights(mj_current_part_weights),
3669 current_left_closest(mj_current_left_closest),
3670 current_right_closest(mj_current_right_closest)
3676#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3677 int result =
sizeof(array_t) *
3680 int result =
sizeof(array_t) *
3685 int remainder = result % 8;
3686 if(remainder != 0) {
3687 result += 8 - remainder;
3692 KOKKOS_INLINE_FUNCTION
3693#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3703 index_t num_working_points = all_end - all_begin;
3704 int num_teams = teamMember.league_size();
3706 index_t stride = num_working_points / num_teams;
3707 if((num_working_points % num_teams) > 0) {
3714 index_t begin = all_begin + stride * teamMember.league_rank();
3715 index_t end = begin + stride;
3720#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3724 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3728 Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
3738 teamMember.team_barrier();
3740 Kokkos::parallel_for(
3741 Kokkos::TeamThreadRange(teamMember, begin, end),
3748 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3761 Kokkos::parallel_reduce(
3762 Kokkos::TeamThreadRange(teamMember, begin, end),
3763#
if (__cplusplus > 201703L)
3775 index_t part =
parts(i)/2;
3786#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3787 Kokkos::atomic_add(&shared_ptr[part*2], w);
3789 threadSum.
ptr[part*2] += w;
3795#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3796 array_t new_value = (array_t) coord;
3798 while(new_value < prev_value) {
3799 prev_value = Kokkos::atomic_compare_exchange(
3801 prev_value, new_value);
3804 while(new_value > prev_value) {
3805 prev_value = Kokkos::atomic_compare_exchange(
3807 prev_value, new_value);
3825 if(coord < b + sEpsilon && coord > b -
sEpsilon) {
3829#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3830 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3834 threadSum.
ptr[part*2+1] += w;
3839 parts(i) = part*2+1;
3850 scalar_t delta = b - base_coord;
3851 if(delta < 0) delta = -delta;
3856#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3857 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3861 threadSum.
ptr[part*2+1] += w;
3872 scalar_t delta = b - base_coord;
3873 if(delta < 0) delta = -delta;
3878#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3879 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3883 threadSum.
ptr[part*2+1] += w;
3908 if(part == lower + 1) {
3913 part -= (part - lower)/2;
3916 else if(part == upper - 1) {
3921 part += (upper - part)/2;
3925#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3928 }, arraySumReducer);
3931 teamMember.team_barrier();
3934#if (__cplusplus > 201703L)
3935 Kokkos::single(Kokkos::PerTeam(teamMember), [=,
this] () {
3937 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3940#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3941 Kokkos::atomic_add(¤t_part_weights(n),
3942 static_cast<double>(shared_ptr[n]));
3944 teamSum[n] += array.
ptr[n];
3948#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3949 int insert_left = 0;
3950 int insert_right = 0;
3955#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3956 scalar_t new_value = shared_ptr[n+1];
3957 scalar_t prev_value = current_right_closest(insert_right);
3958 while(new_value < prev_value) {
3959 prev_value = Kokkos::atomic_compare_exchange(
3960 ¤t_right_closest(insert_right), prev_value, new_value);
3963 new_value = shared_ptr[n];
3964 prev_value = current_left_closest(insert_left);
3965 while(new_value > prev_value) {
3966 prev_value = Kokkos::atomic_compare_exchange(
3967 ¤t_left_closest(insert_left), prev_value, new_value);
3973 if(array.
ptr[n] > teamSum[n]) {
3974 teamSum[n] = array.
ptr[n];
3976 if(array.
ptr[n+1] < teamSum[n+1]) {
3977 teamSum[n+1] = array.
ptr[n+1];
3983 teamMember.team_barrier();
3986#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3987 KOKKOS_INLINE_FUNCTION
3995 if(src[n] > dst[n]) {
3998 if(src[n+1] < dst[n+1]) {
3999 dst[n+1] = src[n+1];
4025template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4026 typename mj_part_t,
typename mj_node_t>
4027void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
4028 mj_1D_part_get_part_weights(
4031 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4034 auto local_is_cut_line_determined = is_cut_line_determined;
4035 auto local_thread_part_weights = thread_part_weights;
4036 auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
4037 auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;
4041 auto local_sEpsilon = this->
sEpsilon;
4042 auto local_assigned_part_ids = this->assigned_part_ids;
4043 auto local_coordinate_permutations = this->coordinate_permutations;
4044 auto local_mj_weights = this->mj_weights;
4046 auto local_global_min_max_coord_total_weight =
4047 this->global_min_max_coord_total_weight;
4049 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4051 auto local_device_num_partitioning_in_current_dim =
4052 device_num_partitioning_in_current_dim;
4054 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
4055 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
4057 mj_part_t total_part_shift = 0;
4059 mj_part_t concurrent_cut_shifts = 0;
4061 Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
4062 Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
4063 concurrent_cut_shifts, temp_cut_coords.size()));
4065 mj_part_t num_parts =
4067 mj_part_t
num_cuts = num_parts - 1;
4068 mj_part_t total_part_count = num_parts +
num_cuts;
4069 mj_part_t weight_array_length =
num_cuts + num_parts;
4072 mj_part_t right_left_array_length = (
num_cuts + 2) * 2;
4074 if(this->incomplete_cut_count(kk) == 0) {
4075 total_part_shift += total_part_count;
4081 auto policy_ReduceWeightsFunctor = policy_t(
4082 mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);
4084#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4085 int total_array_length =
4086 weight_array_length + right_left_array_length;
4093 typedef mj_scalar_t array_t;
4095#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4096 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array(
"reduce_array", total_array_length);
4099 int offset_cuts = 0;
4100 for(
int kk2 = 0; kk2 < kk; ++kk2) {
4104 Kokkos::View<double *, device_t> my_current_part_weights =
4105 Kokkos::subview(local_thread_part_weights,
4106 std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
4107 total_part_shift + total_part_count));
4108 Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
4109 Kokkos::subview(local_thread_cut_left_closest_point,
4110 std::pair<mj_lno_t, mj_lno_t>(
4112 local_thread_cut_left_closest_point.size()));
4113 Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
4114 Kokkos::subview(local_thread_cut_right_closest_point,
4115 std::pair<mj_lno_t, mj_lno_t>(
4117 local_thread_cut_right_closest_point.size()));
4119 array_t
max_scalar = std::numeric_limits<array_t>::max();
4121#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4123 Kokkos::parallel_for(
4124 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4125 KOKKOS_LAMBDA (
int dummy) {
4126 for(
int n = 0; n < weight_array_length; ++n) {
4127 my_current_part_weights(n) = 0;
4129 for(
int n = 0; n <
num_cuts; ++n) {
4140 typename mj_node_t::device_type, array_t>
4148 right_left_array_length,
4149 weight_array_length,
4150 coordinate_permutations,
4151 mj_current_dim_coords,
4154 local_temp_cut_coords,
4156 mj_uniform_weights(0),
4158#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4159 ,my_current_part_weights,
4160 my_current_left_closest,
4161 my_current_right_closest
4165#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4166 Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
4168 Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
4169 teamFunctor, reduce_array);
4173#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4174 auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);
4176 for(
int i = 0; i < static_cast<int>(total_part_count); ++i) {
4177 hostArray(i) = reduce_array[i];
4180 Kokkos::deep_copy(my_current_part_weights, hostArray);
4182 auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
4183 auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
4184 for(mj_part_t cut = 0; cut <
num_cuts; ++cut) {
4185 hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
4186 hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
4188 Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
4189 Kokkos::deep_copy(my_current_right_closest, hostRightArray);
4192 total_part_shift += total_part_count;
4196 auto local_temp_cut_coords = temp_cut_coords;
4198 Kokkos::parallel_for(
4199 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
4201 mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
4203 mj_part_t
num_cuts = num_parts - 1;
4204 mj_part_t total_part_count = num_parts +
num_cuts;
4206 if(local_device_incomplete_cut_count(kk) > 0) {
4210 size_t offset_cuts = 0;
4211 for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
4212 auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
4214 offset += num_parts_kk2 * 2 - 1;
4215 offset_cuts += num_parts_kk2 - 1;
4218 for(mj_part_t i = 1; i < total_part_count; ++i) {
4222 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
4223 std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
4224 local_temp_cut_coords(offset_cuts + i /2 - 1))
4229 local_thread_part_weights(offset + i)
4230 = local_thread_part_weights(offset + i-2);
4235 local_thread_part_weights(offset + i) +=
4236 local_thread_part_weights(offset + i-1);
4249template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4250 typename mj_part_t,
typename mj_node_t>
4251void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4252 mj_combine_rightleft_and_weights(
4256 auto local_thread_part_weights = this->thread_part_weights;
4257 auto local_is_cut_line_determined = this->is_cut_line_determined;
4258 auto local_thread_cut_left_closest_point =
4259 this->thread_cut_left_closest_point;
4260 auto local_thread_cut_right_closest_point =
4261 this->thread_cut_right_closest_point;
4262 auto local_total_part_weight_left_right_closests =
4263 this->total_part_weight_left_right_closests;
4264 auto local_device_num_partitioning_in_current_dim =
4265 device_num_partitioning_in_current_dim;
4266 Kokkos::parallel_for(
4267 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4268 KOKKOS_LAMBDA (
int dummy) {
4270 size_t tlr_array_shift = 0;
4271 mj_part_t cut_shift = 0;
4272 size_t total_part_array_shift = 0;
4278 mj_part_t num_parts_in_part =
4280 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4281 size_t num_total_part_in_part =
4282 num_parts_in_part + size_t (num_cuts_in_part);
4285 for(
int ii = 0; ii < num_cuts_in_part; ++ii) {
4286 mj_part_t next = tlr_array_shift + ii;
4287 mj_part_t cut_index = cut_shift + ii;
4289 if(!local_is_cut_line_determined(cut_index)) {
4290 mj_scalar_t left_closest_in_process =
4291 local_thread_cut_left_closest_point(cut_index);
4292 mj_scalar_t right_closest_in_process =
4293 local_thread_cut_right_closest_point(cut_index);
4296 local_total_part_weight_left_right_closests(
4297 num_total_part_in_part + next) = left_closest_in_process;
4299 local_total_part_weight_left_right_closests(
4300 num_total_part_in_part + num_cuts_in_part + next) =
4301 right_closest_in_process;
4305 for(
size_t j = 0; j < num_total_part_in_part; ++j) {
4306 mj_part_t cut_ind = j / 2 + cut_shift;
4312 if(j == num_total_part_in_part - 1 ||
4313 !local_is_cut_line_determined(cut_ind)) {
4314 double pwj = local_thread_part_weights(total_part_array_shift + j);
4315 local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4320 cut_shift += num_cuts_in_part;
4321 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4322 total_part_array_shift += num_total_part_in_part;
4339template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4340 typename mj_part_t,
typename mj_node_t>
4341KOKKOS_INLINE_FUNCTION
4342void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4343 mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4344 mj_scalar_t cut_lower_bound,
4345 mj_scalar_t cut_upper_weight,
4346 mj_scalar_t cut_lower_weight,
4347 mj_scalar_t expected_weight,
4348 mj_scalar_t &new_cut_position,
4351 if(std::abs(cut_upper_bound - cut_lower_bound) <
sEpsilon) {
4352 new_cut_position = cut_upper_bound;
4355 if(std::abs(cut_upper_weight - cut_lower_weight) <
sEpsilon) {
4356 new_cut_position = cut_lower_bound;
4359 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4360 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4361 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4363 mj_scalar_t required_shift = (my_weight_diff / weight_range);
4364 int scale_constant = 20;
4365 int shiftint= int (required_shift * scale_constant);
4366 if(shiftint == 0) shiftint = 1;
4367 required_shift = mj_scalar_t (shiftint) / scale_constant;
4368 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4371#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4373template<
class policy_t,
class scalar_t>
4383 int mj_value_count) :
4385 value_count(mj_value_count)
4388 KOKKOS_INLINE_FUNCTION
4393 KOKKOS_INLINE_FUNCTION
4395 for(
int n = 0; n < value_count; ++n) {
4396 dst.ptr[n] += src.ptr[n];
4401 dst.ptr = value->ptr;
4402 for(
int n = 0; n < value_count; ++n) {
4410template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
4416#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4417 typedef array_t value_type[];
4428#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4429 Kokkos::View<int *, device_t> local_point_counts;
4433 part_t mj_concurrent_current_part,
4434 part_t mj_weight_array_size,
4435 Kokkos::View<index_t*, device_t> & mj_permutations,
4436 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4437 Kokkos::View<part_t*, device_t> & mj_parts,
4438 Kokkos::View<index_t *, device_t> & mj_part_xadj,
4439 Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4440#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4441 ,Kokkos::View<int *, device_t> & mj_local_point_counts
4444 concurrent_current_part(mj_concurrent_current_part),
4445 value_count(mj_weight_array_size),
4446 permutations(mj_permutations),
4449 part_xadj(mj_part_xadj),
4450 track_on_cuts(mj_track_on_cuts)
4451#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4452 ,local_point_counts(mj_local_point_counts)
4458#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4459 int result =
sizeof(array_t) * (value_count);
4461 int result =
sizeof(array_t) * (value_count) * team_size;
4465 int remainder = result % 8;
4466 if(remainder != 0) {
4467 result += 8 - remainder;
4472 KOKKOS_INLINE_FUNCTION
4473#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4474 void operator() (
const member_type & teamMember)
const {
4476 void operator() (
const member_type & teamMember, value_type teamSum)
const {
4478 index_t all_begin = (concurrent_current_part == 0) ? 0 :
4479 part_xadj(concurrent_current_part - 1);
4480 index_t all_end = part_xadj(concurrent_current_part);
4482 index_t num_working_points = all_end - all_begin;
4483 int num_teams = teamMember.league_size();
4485 index_t stride = num_working_points / num_teams;
4486 if((num_working_points % num_teams) > 0) {
4490 index_t begin = all_begin + stride * teamMember.league_rank();
4491 index_t end = begin + stride;
4496 int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4499#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4500 size_t sh_mem_size =
sizeof(array_t) * (value_count);
4502 size_t sh_mem_size =
4503 sizeof(array_t) * (value_count) * teamMember.team_size();
4506 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4509#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4511 Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
4512 for(
int n = 0; n < value_count; ++n) {
4516 teamMember.team_barrier();
4518 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4528 Kokkos::parallel_reduce(
4529 Kokkos::TeamThreadRange(teamMember, begin, end),
4530#
if (__cplusplus > 201703L)
4537 index_t coordinate_index = permutations(ii);
4538 part_t place = parts(coordinate_index);
4540 if(place % 2 == 0) {
4541#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4542 Kokkos::atomic_add(&shared_ptr[part], 1);
4544 threadSum.
ptr[part] += 1;
4547 parts(coordinate_index) = part;
4552 index_t set_index = Kokkos::atomic_fetch_add(
4553 &track_on_cuts(track_on_cuts_insert_index), 1);
4554 track_on_cuts(set_index) = ii;
4556#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4562 teamMember.team_barrier();
4565#if (__cplusplus > 201703L)
4566 Kokkos::single(Kokkos::PerTeam(teamMember), [=,
this] () {
4568 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4570 for(
int n = 0; n < value_count; ++n) {
4571#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4572 Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4574 teamSum[n] += array.
ptr[n];
4579 teamMember.team_barrier();
4582#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4584 KOKKOS_INLINE_FUNCTION
4585 void join(value_type dst,
const value_type src)
const {
4586 for(
int n = 0; n < value_count; ++n) {
4591 KOKKOS_INLINE_FUNCTION
void init (value_type dst)
const {
4592 for(
int n = 0; n < value_count; ++n) {
4614template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4615 typename mj_part_t,
typename mj_node_t>
4616void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4617mj_create_new_partitions(
4618 mj_part_t num_parts,
4619 mj_part_t current_concurrent_work_part,
4620 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4621 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4622 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4623 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4626 auto local_thread_part_weight_work = this->thread_part_weight_work;
4627 auto local_point_counts = this->thread_point_counts;
4628 auto local_distribute_points_on_cut_lines =
4629 this->distribute_points_on_cut_lines;
4630 auto local_thread_cut_line_weight_to_put_left =
4631 this->thread_cut_line_weight_to_put_left;
4632 auto local_sEpsilon = this->sEpsilon;
4633 auto local_coordinate_permutations = this->coordinate_permutations;
4634 auto local_mj_weights = this->mj_weights;
4635 auto local_assigned_part_ids = this->assigned_part_ids;
4636 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4638 mj_part_t num_cuts = num_parts - 1;
4640 Kokkos::parallel_for(
4641 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4642 KOKKOS_LAMBDA(
int dummy) {
4644 if(local_distribute_points_on_cut_lines) {
4645 for(
int i = 0; i < num_cuts; ++i) {
4646 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4647 if(left_weight > local_sEpsilon) {
4649 mj_scalar_t thread_ii_weight_on_cut =
4650 local_thread_part_weight_work(i * 2 + 1) -
4651 local_thread_part_weight_work(i * 2);
4653 if(thread_ii_weight_on_cut < left_weight) {
4655 local_thread_cut_line_weight_to_put_left(i) =
4656 thread_ii_weight_on_cut;
4660 local_thread_cut_line_weight_to_put_left(i) = left_weight;
4662 left_weight -= thread_ii_weight_on_cut;
4665 local_thread_cut_line_weight_to_put_left(i) = 0;
4671 for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4672 if(std::abs(current_concurrent_cut_coordinate(i) -
4673 current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4674 local_thread_cut_line_weight_to_put_left(i) -=
4675 local_thread_cut_line_weight_to_put_left(i - 1);
4677 local_thread_cut_line_weight_to_put_left(i) =
4678 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4679 least_signifiance) * significance_mul) /
4680 static_cast<mj_scalar_t
>(significance_mul);
4684 for(mj_part_t i = 0; i < num_parts; ++i) {
4685 local_point_counts(i) = 0;
4689 mj_lno_t coordinate_begin_index =
4690 current_concurrent_work_part == 0 ? 0 :
4691 host_part_xadj(current_concurrent_work_part - 1);
4692 mj_lno_t coordinate_end_index =
4693 host_part_xadj(current_concurrent_work_part);
4695 mj_lno_t total_on_cut;
4696 Kokkos::parallel_reduce(
"Get total_on_cut",
4697 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4698 coordinate_begin_index, coordinate_end_index),
4699 KOKKOS_LAMBDA(
int ii, mj_lno_t & val) {
4700 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4701 mj_part_t coordinate_assigned_place =
4702 local_assigned_part_ids(coordinate_index);
4703 if(coordinate_assigned_place % 2 == 1) {
4708 Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4709 if(total_on_cut > 0) {
4710 track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4721 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4724 int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4726 auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4727 typedef int array_t;
4731#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4732 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array(
"reduce_array", num_parts);
4735 ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4736 typename mj_node_t::device_type, array_t>teamFunctor(
4737 current_concurrent_work_part,
4739 coordinate_permutations,
4740 mj_current_dim_coords,
4744#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4749#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4750 Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4752 Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4756#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4757 for(mj_part_t part = 0; part < num_parts; ++part) {
4758 local_point_counts(part) = reduce_array[part];
4764 if(track_on_cuts.size() > 0) {
4765 auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4766 std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1));
4767 Kokkos::sort(track_on_cuts_sort);
4770 bool uniform_weights0 = this->mj_uniform_weights(0);
4771 Kokkos::parallel_for(
4772 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4773 KOKKOS_LAMBDA (
int dummy) {
4775 for(
int j = 0; j < total_on_cut; ++j) {
4776 int ii = track_on_cuts(j);
4777 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4778 mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4779 local_mj_weights(coordinate_index,0);
4780 mj_part_t coordinate_assigned_place =
4781 local_assigned_part_ids(coordinate_index);
4782 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4784 if(local_distribute_points_on_cut_lines &&
4785 local_thread_cut_line_weight_to_put_left(
4786 coordinate_assigned_part) > local_sEpsilon) {
4790 local_thread_cut_line_weight_to_put_left(
4791 coordinate_assigned_part) -= coordinate_weight;
4796 if(local_thread_cut_line_weight_to_put_left(
4797 coordinate_assigned_part) < 0 && coordinate_assigned_part <
4799 std::abs(current_concurrent_cut_coordinate(
4800 coordinate_assigned_part+1) -
4801 current_concurrent_cut_coordinate(
4802 coordinate_assigned_part)) < local_sEpsilon)
4804 local_thread_cut_line_weight_to_put_left(
4805 coordinate_assigned_part + 1) +=
4806 local_thread_cut_line_weight_to_put_left(
4807 coordinate_assigned_part);
4809 ++local_point_counts(coordinate_assigned_part);
4810 local_assigned_part_ids(coordinate_index) =
4811 coordinate_assigned_part;
4816 ++coordinate_assigned_part;
4819 while(local_distribute_points_on_cut_lines &&
4820 coordinate_assigned_part < num_cuts)
4823 if(std::abs(current_concurrent_cut_coordinate(
4824 coordinate_assigned_part) -
4825 current_concurrent_cut_coordinate(
4826 coordinate_assigned_part - 1)) < local_sEpsilon)
4829 if(local_thread_cut_line_weight_to_put_left(
4830 coordinate_assigned_part) > local_sEpsilon &&
4831 local_thread_cut_line_weight_to_put_left(
4832 coordinate_assigned_part) >=
4833 std::abs(local_thread_cut_line_weight_to_put_left(
4834 coordinate_assigned_part) - coordinate_weight))
4836 local_thread_cut_line_weight_to_put_left(
4837 coordinate_assigned_part) -= coordinate_weight;
4841 if(local_thread_cut_line_weight_to_put_left(
4842 coordinate_assigned_part) < 0 &&
4843 coordinate_assigned_part < num_cuts - 1 &&
4844 std::abs(current_concurrent_cut_coordinate(
4845 coordinate_assigned_part+1) -
4846 current_concurrent_cut_coordinate(
4847 coordinate_assigned_part)) < local_sEpsilon)
4849 local_thread_cut_line_weight_to_put_left(
4850 coordinate_assigned_part + 1) +=
4851 local_thread_cut_line_weight_to_put_left(
4852 coordinate_assigned_part);
4860 ++coordinate_assigned_part;
4862 local_point_counts(coordinate_assigned_part) += 1;
4863 local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4867 for(
int j = 0; j < num_parts; ++j) {
4868 out_part_xadj(j) = local_point_counts(j);
4869 local_point_counts(j) = 0;
4872 out_part_xadj(j) += out_part_xadj(j - 1);
4873 local_point_counts(j) += out_part_xadj(j - 1);
4881#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4886 Kokkos::parallel_for(
4887 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4888 coordinate_begin_index, coordinate_end_index),
4889 KOKKOS_LAMBDA (mj_lno_t ii) {
4890 mj_lno_t i = local_coordinate_permutations(ii);
4891 mj_part_t p = local_assigned_part_ids(i);
4892 mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4893 local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4898#ifdef KOKKOS_ENABLE_OPENMP
4900 const int num_threads = 1;
4902 const int num_threads = 1;
4905 const int num_teams = 1;
4908 Kokkos::View<mj_lno_t*, device_t>
4909 point_counter(
"insert indices", num_teams * num_threads * num_parts);
4913 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4914 block_policy(num_teams, num_threads);
4915 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
4916 member_type member_type;
4917 mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4918 mj_lno_t block_size = range / num_teams + 1;
4919 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4920 int team = team_member.league_rank();
4921 int team_offset = team * num_threads * num_parts;
4922 mj_lno_t begin = coordinate_begin_index + team * block_size;
4923 mj_lno_t end = begin + block_size;
4924 if(end > coordinate_end_index) {
4925 end = coordinate_end_index;
4928 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4930 int thread = team_member.team_rank();
4931 mj_lno_t i = local_coordinate_permutations(ii);
4932 mj_part_t p = local_assigned_part_ids(i);
4933 int index = team_offset + thread * num_parts + p;
4934 ++point_counter(index);
4942 Kokkos::parallel_for(
4943 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4944 KOKKOS_LAMBDA (
int dummy) {
4945 int num_sets = point_counter.size() / num_parts;
4946 for(
int set = num_sets - 1; set >= 1; set -=1) {
4947 int base = set * num_parts;
4948 for(
int part = 0; part < num_parts; ++part) {
4949 point_counter(base + part) = point_counter(base + part - num_parts);
4953 for(
int part = 0; part < num_parts; ++part) {
4954 point_counter(part) = 0;
4957 for(
int set = 1; set < num_sets; ++set) {
4958 int base = set * num_parts;
4959 for(
int part = 0; part < num_parts; ++part) {
4960 point_counter(base + part) += point_counter(base + part - num_parts);
4966 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4967 int team = team_member.league_rank();
4968 int team_offset = team * num_threads * num_parts;
4969 mj_lno_t begin = coordinate_begin_index + team * block_size;
4970 mj_lno_t end = begin + block_size;
4971 if(end > coordinate_end_index) {
4972 end = coordinate_end_index;
4974 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4976 int thread = team_member.team_rank();
4977 mj_lno_t i = local_coordinate_permutations(ii);
4978 mj_part_t p = local_assigned_part_ids(i);
4979 int index = team_offset + thread * num_parts + p;
4980 int set_counter = (point_counter(index)++) + local_point_counts(p);
4981 local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5030template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5031 typename mj_part_t,
typename mj_node_t>
5032void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5033 mj_node_t>::mj_get_new_cut_coordinates(
5034 mj_part_t current_concurrent_num_parts,
5036 const mj_part_t &num_cuts,
5037 const double &used_imbalance_tolerance,
5038 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5039 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5040 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5041 Kokkos::View<bool *, device_t> & current_cut_line_determined,
5042 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5043 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5044 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5045 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5046 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5047 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5048 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5049 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5050 Kokkos::View<mj_scalar_t *, device_t> &
5051 current_part_cut_line_weight_to_put_left,
5052 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5054 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5056 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5057 auto local_sEpsilon = sEpsilon;
5058 auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5059 auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5060 auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5061 auto local_global_min_max_coord_total_weight =
5062 global_min_max_coord_total_weight;
5064 const auto _sEpsilon = this->sEpsilon;
5072 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5073 policy_one_team(1, Kokkos::AUTO());
5074 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
5075 member_type member_type;
5076 Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {
5078 mj_scalar_t min_coordinate =
5079 local_global_min_max_coord_total_weight(kk);
5080 mj_scalar_t max_coordinate =
5081 local_global_min_max_coord_total_weight(
5082 kk + current_concurrent_num_parts);
5083 mj_scalar_t global_total_weight =
5084 local_global_min_max_coord_total_weight(
5085 kk + current_concurrent_num_parts * 2);
5087 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5092 current_global_left_closest_points(i) > local_sEpsilon) {
5093 current_global_left_closest_points(i) =
5094 current_cut_coordinates(i);
5096 if(current_global_right_closest_points(i) -
5097 max_coordinate > local_sEpsilon) {
5098 current_global_right_closest_points(i) =
5099 current_cut_coordinates(i);
5102 team_member.team_barrier();
5104 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5106 using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5109 mj_scalar_t seen_weight_in_part = 0;
5111 mj_scalar_t expected_weight_in_part = 0;
5113 double imbalance_on_left = 0, imbalance_on_right = 0;
5114 if(local_distribute_points_on_cut_lines) {
5116 local_global_rectilinear_cut_weight(i) = 0;
5117 local_process_rectilinear_cut_weight(i) = 0;
5119 bool bContinue =
false;
5122 if(current_cut_line_determined(i)) {
5123 new_current_cut_coordinates(i) =
5124 current_cut_coordinates(i);
5129 seen_weight_in_part = current_global_part_weights(i * 2);
5132 expected_weight_in_part = current_part_target_weights(i);
5135 imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5136 expected_weight_in_part);
5139 imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5140 seen_weight_in_part, global_total_weight - expected_weight_in_part);
5141 bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5142 used_imbalance_tolerance < local_sEpsilon ;
5143 bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5144 used_imbalance_tolerance < local_sEpsilon;
5146 if(is_left_imbalance_valid && is_right_imbalance_valid) {
5147 current_cut_line_determined(i) =
true;
5148 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5149 new_current_cut_coordinates(i) = current_cut_coordinates(i);
5151 else if(imbalance_on_left < 0) {
5153 if(local_distribute_points_on_cut_lines) {
5158 if(current_global_part_weights(i * 2 + 1) ==
5159 expected_weight_in_part) {
5161 current_cut_line_determined(i) =
true;
5162 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5165 new_current_cut_coordinates(i) =
5166 current_cut_coordinates(i);
5168 current_part_cut_line_weight_to_put_left(i) =
5169 current_local_part_weights(i * 2 + 1) -
5170 current_local_part_weights(i * 2);
5173 else if(current_global_part_weights(i * 2 + 1) >
5174 expected_weight_in_part) {
5177 current_cut_line_determined(i) =
true;
5178 Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5182 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5183 new_current_cut_coordinates(i) =
5184 current_cut_coordinates(i);
5185 local_process_rectilinear_cut_weight[i] =
5186 current_local_part_weights(i * 2 + 1) -
5187 current_local_part_weights(i * 2);
5196 current_cut_lower_bounds(i) =
5197 current_global_right_closest_points(i);
5200 current_cut_lower_bound_weights(i) = seen_weight_in_part;
5205 for(mj_part_t ii = i + 1; ii < num_cuts ; ++ii) {
5206 mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5207 mj_scalar_t line_weight =
5208 current_global_part_weights(ii * 2 + 1);
5209 if(p_weight >= expected_weight_in_part) {
5214 if(p_weight == expected_weight_in_part) {
5215 current_cut_upper_bounds(i) =
5216 current_cut_coordinates(ii);
5217 current_cut_upper_weights(i) = p_weight;
5218 current_cut_lower_bounds(i) =
5219 current_cut_coordinates(ii);
5220 current_cut_lower_bound_weights(i) = p_weight;
5221 }
else if(p_weight < current_cut_upper_weights(i)) {
5224 current_cut_upper_bounds(i) =
5225 current_global_left_closest_points(ii);
5226 current_cut_upper_weights(i) = p_weight;
5232 if(line_weight >= expected_weight_in_part) {
5236 current_cut_upper_bounds(i) =
5237 current_cut_coordinates(ii);
5238 current_cut_upper_weights(i) = line_weight;
5239 current_cut_lower_bounds(i) =
5240 current_cut_coordinates(ii);
5241 current_cut_lower_bound_weights(i) = p_weight;
5246 if(p_weight <= expected_weight_in_part && p_weight >=
5247 current_cut_lower_bound_weights(i)) {
5248 current_cut_lower_bounds(i) =
5249 current_global_right_closest_points(ii);
5250 current_cut_lower_bound_weights(i) = p_weight;
5254 mj_scalar_t new_cut_position = 0;
5255 algMJ_t::mj_calculate_new_cut_position(
5256 current_cut_upper_bounds(i),
5257 current_cut_lower_bounds(i),
5258 current_cut_upper_weights(i),
5259 current_cut_lower_bound_weights(i),
5260 expected_weight_in_part, new_cut_position,
5265 if(std::abs(current_cut_coordinates(i) -
5266 new_cut_position) < local_sEpsilon) {
5267 current_cut_line_determined(i) =
true;
5268 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5271 new_current_cut_coordinates(i) =
5272 current_cut_coordinates(i);
5274 new_current_cut_coordinates(i) = new_cut_position;
5280 current_cut_upper_bounds(i) =
5281 current_global_left_closest_points(i);
5282 current_cut_upper_weights(i) =
5283 seen_weight_in_part;
5286 for(
int ii = i - 1; ii >= 0; --ii) {
5287 mj_scalar_t p_weight =
5288 current_global_part_weights(ii * 2);
5289 mj_scalar_t line_weight =
5290 current_global_part_weights(ii * 2 + 1);
5291 if(p_weight <= expected_weight_in_part) {
5292 if(p_weight == expected_weight_in_part) {
5295 current_cut_upper_bounds(i) =
5296 current_cut_coordinates(ii);
5297 current_cut_upper_weights(i) = p_weight;
5298 current_cut_lower_bounds(i) =
5299 current_cut_coordinates(ii);
5300 current_cut_lower_bound_weights(i) = p_weight;
5302 else if(p_weight > current_cut_lower_bound_weights(i)) {
5305 current_cut_lower_bounds(i) =
5306 current_global_right_closest_points(ii);
5307 current_cut_lower_bound_weights(i) = p_weight;
5313 if(line_weight > expected_weight_in_part) {
5314 current_cut_upper_bounds(i) =
5315 current_global_right_closest_points(ii);
5316 current_cut_upper_weights(i) = line_weight;
5326 if(p_weight >= expected_weight_in_part &&
5327 (p_weight < current_cut_upper_weights(i) ||
5328 (p_weight == current_cut_upper_weights(i) &&
5329 current_cut_upper_bounds(i) >
5330 current_global_left_closest_points(ii)))) {
5331 current_cut_upper_bounds(i) =
5332 current_global_left_closest_points(ii);
5333 current_cut_upper_weights(i) = p_weight;
5336 mj_scalar_t new_cut_position = 0;
5337 algMJ_t::mj_calculate_new_cut_position(
5338 current_cut_upper_bounds(i),
5339 current_cut_lower_bounds(i),
5340 current_cut_upper_weights(i),
5341 current_cut_lower_bound_weights(i),
5342 expected_weight_in_part,
5347 if(std::abs(current_cut_coordinates(i) -
5348 new_cut_position) < local_sEpsilon) {
5349 current_cut_line_determined(i) =
true;
5350 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5352 new_current_cut_coordinates(i) =
5353 current_cut_coordinates(i);
5355 new_current_cut_coordinates(i) =
5362 team_member.team_barrier();
5366 mj_part_t rectilinear_cut_count;
5367 Kokkos::parallel_reduce(
"Read bDoingWork",
5368 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5369 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
5370 set_single = view_rectilinear_cut_count(0);
5371 }, rectilinear_cut_count);
5373 if(rectilinear_cut_count > 0) {
5374 auto host_local_process_rectilinear_cut_weight =
5375 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5376 local_process_rectilinear_cut_weight);
5377 auto host_local_global_rectilinear_cut_weight =
5378 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5379 local_global_rectilinear_cut_weight);
5380 Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5381 local_process_rectilinear_cut_weight);
5382 Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5383 local_global_rectilinear_cut_weight);
5384 Teuchos::scan<int,mj_scalar_t>(
5385 *comm, Teuchos::REDUCE_SUM,
5387 host_local_process_rectilinear_cut_weight.data(),
5388 host_local_global_rectilinear_cut_weight.data());
5389 Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5390 host_local_process_rectilinear_cut_weight);
5391 Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5392 host_local_global_rectilinear_cut_weight);
5394 Kokkos::parallel_for(
"finish up mj_get_new_cut_coordinates",
5395 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5396 KOKKOS_LAMBDA(
int dummy) {
5397 for(mj_part_t i = 0; i < num_cuts; ++i) {
5399 if(local_global_rectilinear_cut_weight(i) > 0) {
5401 mj_scalar_t expected_part_weight = current_part_target_weights(i);
5403 mj_scalar_t necessary_weight_on_line_for_left =
5404 expected_part_weight - current_global_part_weights(i * 2);
5407 mj_scalar_t my_weight_on_line =
5408 local_process_rectilinear_cut_weight(i);
5412 mj_scalar_t weight_on_line_upto_process_inclusive =
5413 local_global_rectilinear_cut_weight(i);
5417 mj_scalar_t space_to_put_left =
5418 necessary_weight_on_line_for_left -
5419 weight_on_line_upto_process_inclusive;
5422 mj_scalar_t space_left_to_me =
5423 space_to_put_left + my_weight_on_line;
5436 if(space_left_to_me < 0) {
5439 current_part_cut_line_weight_to_put_left(i) = 0;
5441 else if(space_left_to_me >= my_weight_on_line) {
5445 current_part_cut_line_weight_to_put_left(i) =
5452 current_part_cut_line_weight_to_put_left(i) =
5459 view_rectilinear_cut_count(0) = 0;
5463 Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5475template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5476 typename mj_part_t,
typename mj_node_t>
5477void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5478 get_processor_num_points_in_parts(
5479 mj_part_t num_procs,
5480 mj_part_t num_parts,
5481 mj_gno_t *&num_points_in_all_processor_parts)
5484 size_t allocation_size = num_parts * (num_procs + 1);
5491 mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5492 new mj_gno_t[allocation_size];
5496 mj_gno_t *my_local_points_to_reduce_sum =
5497 num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5501 mj_gno_t *my_local_point_counts_in_each_part =
5502 num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5505 memset(num_local_points_in_each_part_to_reduce_sum, 0,
5506 sizeof(mj_gno_t)*allocation_size);
5508 auto local_new_part_xadj = this->new_part_xadj;
5509 Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5510 Kokkos::ViewAllocateWithoutInitializing(
"points per part"), num_parts);
5511 Kokkos::parallel_for(
"get vals on device",
5512 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5513 (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5514 points_per_part(i) =
5515 local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5517 auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5518 Kokkos::deep_copy(host_points_per_part, points_per_part);
5519 for(
int i = 0; i < num_parts; ++i) {
5520 my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5525 memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5526 sizeof(mj_gno_t) * (num_parts) );
5533 reduceAll<int, mj_gno_t>(
5535 Teuchos::REDUCE_SUM,
5537 num_local_points_in_each_part_to_reduce_sum,
5538 num_points_in_all_processor_parts);
5542 delete [] num_local_points_in_each_part_to_reduce_sum;
5560template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5561 typename mj_part_t,
typename mj_node_t>
5562bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5563 mj_check_to_migrate(
5564 size_t migration_reduce_all_population,
5565 mj_lno_t num_coords_for_last_dim_part,
5566 mj_part_t num_procs,
5567 mj_part_t num_parts,
5568 mj_gno_t *num_points_in_all_processor_parts)
5571 if(migration_reduce_all_population > future_reduceall_cutoff) {
5576 if(num_coords_for_last_dim_part < min_work_last_dim) {
5581 if(this->check_migrate_avoid_migration_option == 0) {
5582 double global_imbalance = 0;
5584 size_t global_shift = num_procs * num_parts;
5586 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5587 for(mj_part_t i = 0; i < num_parts; ++i) {
5588 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5589 / double(num_procs);
5591 global_imbalance += std::abs(ideal_num -
5592 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5595 global_imbalance /= num_parts;
5596 global_imbalance /= num_procs;
5598 if(global_imbalance <= this->minimum_migration_imbalance) {
5624template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5625 typename mj_part_t,
typename mj_node_t>
5626void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5627 assign_send_destinations(
5628 mj_part_t num_parts,
5629 mj_part_t *part_assignment_proc_begin_indices,
5630 mj_part_t *processor_chains_in_parts,
5631 mj_lno_t *send_count_to_each_proc,
5632 int *coordinate_destinations) {
5634 auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5635 deep_copy(host_new_part_xadj, this->new_part_xadj);
5637 auto host_new_coordinate_permutations =
5638 Kokkos::create_mirror_view(this->new_coordinate_permutations);
5639 deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5641 for(mj_part_t p = 0; p < num_parts; ++p) {
5642 mj_lno_t part_begin = 0;
5643 if(p > 0) part_begin = host_new_part_xadj(p - 1);
5644 mj_lno_t part_end = host_new_part_xadj(p);
5646 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5648 mj_lno_t num_total_send = 0;
5649 for(mj_lno_t j=part_begin; j < part_end; j++) {
5650 mj_lno_t local_ind = host_new_coordinate_permutations(j);
5651 while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5655 part_assignment_proc_begin_indices[p] =
5656 processor_chains_in_parts[proc_to_sent];
5658 processor_chains_in_parts[proc_to_sent] = -1;
5660 proc_to_sent = part_assignment_proc_begin_indices[p];
5663 coordinate_destinations[local_ind] = proc_to_sent;
5689template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5690 typename mj_part_t,
typename mj_node_t>
5691void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5692 mj_assign_proc_to_parts(
5693 mj_gno_t * num_points_in_all_processor_parts,
5694 mj_part_t num_parts,
5695 mj_part_t num_procs,
5696 mj_lno_t *send_count_to_each_proc,
5697 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5698 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5699 mj_part_t &out_part_index,
5700 mj_part_t &output_part_numbering_begin_index,
5701 int * coordinate_destinations) {
5702 mj_gno_t *global_num_points_in_parts =
5703 num_points_in_all_processor_parts + num_procs * num_parts;
5704 mj_part_t *num_procs_assigned_to_each_part =
new mj_part_t[num_parts];
5707 bool did_i_find_my_group =
false;
5709 mj_part_t num_free_procs = num_procs;
5710 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5712 double max_imbalance_difference = 0;
5713 mj_part_t max_differing_part = 0;
5716 for(mj_part_t i = 0; i < num_parts; i++) {
5719 double scalar_required_proc = num_procs *
5720 (double (global_num_points_in_parts[i]) /
5721 double (this->num_global_coords));
5724 mj_part_t required_proc =
5725 static_cast<mj_part_t
> (0.5 + scalar_required_proc);
5726 if(required_proc == 0) required_proc = 1;
5732 required_proc < minimum_num_procs_required_for_rest_of_parts) {
5733 required_proc = num_free_procs -
5734 (minimum_num_procs_required_for_rest_of_parts);
5738 num_free_procs -= required_proc;
5742 --minimum_num_procs_required_for_rest_of_parts;
5745 num_procs_assigned_to_each_part[i] = required_proc;
5750 double imbalance_wrt_ideal =
5751 (scalar_required_proc - required_proc) / required_proc;
5752 if(imbalance_wrt_ideal > max_imbalance_difference) {
5753 max_imbalance_difference = imbalance_wrt_ideal;
5754 max_differing_part = i;
5760 if(num_free_procs > 0) {
5761 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5768 mj_part_t *part_assignment_proc_begin_indices =
new mj_part_t[num_parts];
5772 mj_part_t *processor_chains_in_parts =
new mj_part_t [num_procs];
5773 mj_part_t *processor_part_assignments =
new mj_part_t[num_procs];
5782 for(
int i = 0; i < num_procs; ++i ) {
5783 processor_part_assignments[i] = -1;
5784 processor_chains_in_parts[i] = -1;
5786 for(
int i = 0; i < num_parts; ++i ) {
5787 part_assignment_proc_begin_indices[i] = -1;
5793 uSignedSortItem<mj_part_t, mj_gno_t, char> *
5794 sort_item_num_part_points_in_procs =
5795 new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5797 for(mj_part_t i = 0; i < num_parts; ++i) {
5802 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5803 sort_item_num_part_points_in_procs[ii].id = ii;
5806 if(processor_part_assignments[ii] == -1) {
5807 sort_item_num_part_points_in_procs[ii].val =
5808 num_points_in_all_processor_parts[ii * num_parts + i];
5810 sort_item_num_part_points_in_procs[ii].signbit = 1;
5823 sort_item_num_part_points_in_procs[ii].val =
5824 num_points_in_all_processor_parts[ii * num_parts + i];
5825 sort_item_num_part_points_in_procs[ii].signbit = 0;
5830 uqSignsort<mj_part_t, mj_gno_t,char>
5831 (num_procs, sort_item_num_part_points_in_procs);
5843 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5844 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5845 mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5846 ceil(total_num_points_in_part /
double (required_proc_count)));
5849 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5850 mj_part_t next_proc_to_send_id =
5851 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5852 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5853 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5857 for(mj_part_t ii = num_procs - 1;
5858 ii >= num_procs - required_proc_count; --ii) {
5859 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5861 processor_part_assignments[proc_id] = i;
5864 bool did_change_sign =
false;
5866 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5869 if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5870 did_change_sign =
true;
5871 sort_item_num_part_points_in_procs[ii].signbit = 1;
5878 if(did_change_sign) {
5881 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5882 sort_item_num_part_points_in_procs);
5897 if(!did_i_find_my_group) {
5898 for(mj_part_t ii = num_procs - 1; ii >=
5899 num_procs - required_proc_count; --ii) {
5901 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5904 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5906 if(proc_id_to_assign == this->myRank) {
5908 did_i_find_my_group =
true;
5911 part_assignment_proc_begin_indices[i] = this->myRank;
5912 processor_chains_in_parts[this->myRank] = -1;
5916 send_count_to_each_proc[this->myRank] =
5917 sort_item_num_part_points_in_procs[ii].val;
5921 for(mj_part_t in = 0; in < i; ++in) {
5922 output_part_numbering_begin_index +=
5923 (*next_future_num_parts_in_parts)[in];
5931 if(!did_i_find_my_group) {
5932 processor_ranks_for_subcomm.clear();
5940 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5941 mj_part_t nonassigned_proc_id =
5942 sort_item_num_part_points_in_procs[ii].id;
5943 mj_lno_t num_points_to_sent =
5944 sort_item_num_part_points_in_procs[ii].val;
5950 if(num_points_to_sent < 0) {
5951 cout <<
"Migration - processor assignments - for part:" << i
5952 <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:"
5953 << num_points_to_sent << std::endl;
5958 switch (migration_type) {
5962 while (num_points_to_sent > 0) {
5964 if(num_points_to_sent <= space_left_in_sent_proc) {
5966 space_left_in_sent_proc -= num_points_to_sent;
5968 if(this->myRank == nonassigned_proc_id) {
5970 send_count_to_each_proc[next_proc_to_send_id] =
5975 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
5976 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
5977 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
5979 num_points_to_sent = 0;
5983 if(space_left_in_sent_proc > 0) {
5984 num_points_to_sent -= space_left_in_sent_proc;
5987 if(this->myRank == nonassigned_proc_id) {
5989 send_count_to_each_proc[next_proc_to_send_id] =
5990 space_left_in_sent_proc;
5991 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
5992 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
5993 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
5997 ++next_proc_to_send_index;
6000 if(next_part_to_send_index < nprocs - required_proc_count ) {
6001 cout <<
"Migration - processor assignments - for part:"
6003 <<
" next_part_to_send :" << next_part_to_send_index
6004 <<
" nprocs:" << nprocs
6005 <<
" required_proc_count:" << required_proc_count
6006 <<
" Error: next_part_to_send_index <" <<
6007 <<
" nprocs - required_proc_count" << std::endl;
6012 next_proc_to_send_id =
6013 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6015 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6016 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6027 if(this->myRank == nonassigned_proc_id) {
6029 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6033 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6034 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6035 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6037 num_points_to_sent = 0;
6038 ++next_proc_to_send_index;
6042 if(next_proc_to_send_index == num_procs) {
6043 next_proc_to_send_index = num_procs - required_proc_count;
6046 next_proc_to_send_id =
6047 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6049 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6050 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6063 this->assign_send_destinations(
6065 part_assignment_proc_begin_indices,
6066 processor_chains_in_parts,
6067 send_count_to_each_proc,
6068 coordinate_destinations);
6069 delete [] part_assignment_proc_begin_indices;
6070 delete [] processor_chains_in_parts;
6071 delete [] processor_part_assignments;
6072 delete [] sort_item_num_part_points_in_procs;
6073 delete [] num_procs_assigned_to_each_part;
6091template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6092 typename mj_part_t,
typename mj_node_t>
6093void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6094 assign_send_destinations2(
6095 mj_part_t num_parts,
6096 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6097 int *coordinate_destinations,
6098 mj_part_t &output_part_numbering_begin_index,
6099 std::vector<mj_part_t> *next_future_num_parts_in_parts)
6101 mj_part_t part_shift_amount = output_part_numbering_begin_index;
6102 mj_part_t previous_processor = -1;
6104 auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6105 Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6107 auto local_new_coordinate_permutations =
6108 Kokkos::create_mirror_view(this->new_coordinate_permutations);
6109 Kokkos::deep_copy(local_new_coordinate_permutations,
6110 this->new_coordinate_permutations);
6112 for(mj_part_t i = 0; i < num_parts; ++i) {
6113 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6116 mj_lno_t part_begin_index = 0;
6119 part_begin_index = local_new_part_xadj(p - 1);
6122 mj_lno_t part_end_index = local_new_part_xadj(p);
6124 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6125 if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6126 output_part_numbering_begin_index = part_shift_amount;
6128 previous_processor = assigned_proc;
6129 part_shift_amount += (*next_future_num_parts_in_parts)[p];
6131 for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6132 mj_lno_t localInd = local_new_coordinate_permutations(j);
6133 coordinate_destinations[localInd] = assigned_proc;
6159template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6160 typename mj_part_t,
typename mj_node_t>
6161void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6162 mj_assign_parts_to_procs(
6163 mj_gno_t * num_points_in_all_processor_parts,
6164 mj_part_t num_parts,
6165 mj_part_t num_procs,
6166 mj_lno_t *send_count_to_each_proc,
6167 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6168 mj_part_t &out_num_part,
6169 std::vector<mj_part_t> &out_part_indices,
6170 mj_part_t &output_part_numbering_begin_index,
6171 int *coordinate_destinations) {
6174 mj_gno_t *global_num_points_in_parts =
6175 num_points_in_all_processor_parts + num_procs * num_parts;
6176 out_part_indices.clear();
6180 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
6181 new uSortItem<mj_part_t, mj_part_t>[num_parts];
6182 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
6183 new uSortItem<mj_part_t, mj_gno_t>[num_procs];
6187 mj_lno_t work_each =
6188 mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
6192 mj_lno_t *space_in_each_processor =
new mj_lno_t[num_procs];
6195 for(mj_part_t i = 0; i < num_procs; ++i) {
6196 space_in_each_processor[i] = work_each;
6203 mj_part_t *num_parts_proc_assigned =
new mj_part_t[num_procs];
6204 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
6205 int empty_proc_count = num_procs;
6209 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
6210 new uSortItem<mj_part_t, mj_gno_t>[num_parts];
6215 for(mj_part_t i = 0; i < num_parts; ++i) {
6216 sort_item_point_counts_in_parts[i].id = i;
6217 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
6221 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
6226 for(mj_part_t j = 0; j < num_parts; ++j) {
6228 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
6231 mj_gno_t load = global_num_points_in_parts[i];
6234 mj_part_t assigned_proc = -1;
6237 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
6238 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
6243 if(empty_proc_count < num_parts - j ||
6244 num_parts_proc_assigned[ii] == 0) {
6246 sort_item_num_points_of_proc_in_part_i[ii].val =
6247 num_points_in_all_processor_parts[ii * num_parts + i];
6250 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
6254 uqsort<mj_part_t, mj_gno_t>(num_procs,
6255 sort_item_num_points_of_proc_in_part_i);
6258 for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
6259 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
6260 if(assigned_proc == -1 ||
6261 (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
6264 else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
6265 if(ii < assigned_proc) {
6281 if(num_parts_proc_assigned[assigned_proc]++ == 0) {
6285 space_in_each_processor[assigned_proc] -= load;
6287 sort_item_part_to_proc_assignment[j].id = i;
6290 sort_item_part_to_proc_assignment[j].val = assigned_proc;
6293 if(assigned_proc == this->myRank) {
6295 out_part_indices.push_back(i);
6301 send_count_to_each_proc[assigned_proc] +=
6302 num_points_in_all_processor_parts[this->myRank * num_parts + i];
6305 delete [] num_parts_proc_assigned;
6306 delete [] sort_item_num_points_of_proc_in_part_i;
6307 delete [] sort_item_point_counts_in_parts;
6308 delete [] space_in_each_processor;
6311 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
6314 this->assign_send_destinations2(
6316 sort_item_part_to_proc_assignment,
6317 coordinate_destinations,
6318 output_part_numbering_begin_index,
6319 next_future_num_parts_in_parts);
6321 delete [] sort_item_part_to_proc_assignment;
6348template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6349 typename mj_part_t,
typename mj_node_t>
6350void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6351 mj_migration_part_proc_assignment(
6352 mj_gno_t * num_points_in_all_processor_parts,
6353 mj_part_t num_parts,
6354 mj_part_t num_procs,
6355 mj_lno_t *send_count_to_each_proc,
6356 std::vector<mj_part_t> &processor_ranks_for_subcomm,
6357 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6358 mj_part_t &out_num_part,
6359 std::vector<mj_part_t> &out_part_indices,
6360 mj_part_t &output_part_numbering_begin_index,
6361 int *coordinate_destinations)
6363 processor_ranks_for_subcomm.clear();
6365 if(num_procs > num_parts) {
6370 mj_part_t out_part_index = 0;
6372 this->mj_assign_proc_to_parts(
6373 num_points_in_all_processor_parts,
6376 send_count_to_each_proc,
6377 processor_ranks_for_subcomm,
6378 next_future_num_parts_in_parts,
6380 output_part_numbering_begin_index,
6381 coordinate_destinations
6385 out_part_indices.clear();
6386 out_part_indices.push_back(out_part_index);
6393 processor_ranks_for_subcomm.push_back(this->myRank);
6398 this->mj_assign_parts_to_procs(
6399 num_points_in_all_processor_parts,
6402 send_count_to_each_proc,
6403 next_future_num_parts_in_parts,
6406 output_part_numbering_begin_index,
6407 coordinate_destinations);
6424template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6425 typename mj_part_t,
typename mj_node_t>
6426void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6428 mj_part_t num_procs,
6429 mj_lno_t &num_new_local_points,
6430 std::string iteration,
6431 int *coordinate_destinations,
6432 mj_part_t num_parts)
6435#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6436 if(
sizeof(mj_lno_t) <=
sizeof(
int)) {
6440 ZOLTAN_COMM_OBJ *plan = NULL;
6441 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6442 int num_incoming_gnos = 0;
6443 int message_tag = 7859;
6445 this->mj_env->timerStart(MACRO_TIMERS,
6446 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6447 int ierr = Zoltan_Comm_Create(
6449 int(this->num_local_coords),
6450 coordinate_destinations,
6453 &num_incoming_gnos);
6456 this->mj_env->timerStop(MACRO_TIMERS,
6457 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6459 this->mj_env->timerStart(MACRO_TIMERS,
6460 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6468 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6469 Kokkos::HostSpace(), this->current_mj_gnos);
6470 Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6471 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6472 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), num_incoming_gnos);
6473 auto host_dst_gnos = Kokkos::create_mirror_view(
6474 Kokkos::HostSpace(), dst_gnos);
6476 ierr = Zoltan_Comm_Do(
6479 (
char *) host_current_mj_gnos.data(),
6481 (
char *) host_dst_gnos.data());
6483 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6484 this->current_mj_gnos = dst_gnos;
6490 auto host_src_coordinates = Kokkos::create_mirror_view(
6491 Kokkos::HostSpace(), this->mj_coordinates);
6492 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6493 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6494 dst_coordinates(Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
6495 num_incoming_gnos, this->coord_dim);
6496 auto host_dst_coordinates = Kokkos::create_mirror_view(
6497 Kokkos::HostSpace(), dst_coordinates);
6498 for(
int i = 0; i < this->coord_dim; ++i) {
6499 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6500 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6501 Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6502 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6505 ierr = Zoltan_Comm_Do(
6508 (
char *) sub_host_src_coordinates.data(),
6509 sizeof(mj_scalar_t),
6510 (
char *) sub_host_dst_coordinates.data());
6513 deep_copy(dst_coordinates, host_dst_coordinates);
6514 this->mj_coordinates = dst_coordinates;
6519 auto host_src_weights = Kokkos::create_mirror_view(
6520 Kokkos::HostSpace(), this->mj_weights);
6521 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6522 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6523 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
6524 num_incoming_gnos, this->num_weights_per_coord);
6525 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6526 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6527 auto sub_host_src_weights
6528 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6529 auto sub_host_dst_weights
6530 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6531 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6533 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6534 sent_weight[n] = sub_host_src_weights(n);
6536 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6538 ierr = Zoltan_Comm_Do(
6541 (
char *) sent_weight.getRawPtr(),
6542 sizeof(mj_scalar_t),
6543 (
char *) received_weight.getRawPtr());
6546 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6547 sub_host_dst_weights(n) = received_weight[n];
6550 deep_copy(dst_weights, host_dst_weights);
6551 this->mj_weights = dst_weights;
6557 Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6558 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
6561 ierr = Zoltan_Comm_Do(
6564 (
char *) owner_of_coordinate.data(),
6566 (
char *) dst_owners_of_coordinate.data());
6568 this->owner_of_coordinate = dst_owners_of_coordinate;
6575 auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6576 Kokkos::HostSpace(), this->assigned_part_ids);
6577 Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6578 Kokkos::View<int *, device_t> dst_assigned_part_ids(
6579 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
6581 auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6582 Kokkos::HostSpace(), dst_assigned_part_ids);
6583 mj_part_t *new_parts =
new mj_part_t[num_incoming_gnos];
6584 if(num_procs < num_parts) {
6586 ierr = Zoltan_Comm_Do(
6589 (
char *) host_src_assigned_part_ids.data(),
6591 (
char *) host_dst_assigned_part_ids.data());
6593 Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6597 this->assigned_part_ids = dst_assigned_part_ids;
6600 ierr = Zoltan_Comm_Destroy(&plan);
6602 num_new_local_points = num_incoming_gnos;
6603 this->mj_env->timerStop(MACRO_TIMERS,
6604 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6609 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6610 "Migration DistributorPlanCreating-" + iteration);
6612 Tpetra::Distributor distributor(this->comm);
6613 ArrayView<const mj_part_t> destinations( coordinate_destinations,
6614 this->num_local_coords);
6615 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6616 this->mj_env->timerStop(MACRO_TIMERS, mj_timer_base_string +
6617 "Migration DistributorPlanCreating-" + iteration);
6618 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6619 "Migration DistributorMigration-" + iteration);
6627 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
6628 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
6631 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
6632 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
6633 this->current_mj_gnos.extent(0));
6634 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
6636 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
6638 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6639 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_incoming_gnos);
6641 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
6646 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6647 dst_coordinates(
"mj_coordinates", num_incoming_gnos, this->coord_dim);
6649 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
6650 host_src_coordinates(
6651 Kokkos::ViewAllocateWithoutInitializing(
"host_coords"),
6652 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
6653 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6655 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
6656 Kokkos::ViewAllocateWithoutInitializing(
"received_coord"),
6659 for(
int i = 0; i < this->coord_dim; ++i) {
6663 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_coord
6664 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6666 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
6668 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
6674 this->mj_coordinates = dst_coordinates;
6677 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6678 "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6679 auto host_dst_weights = Kokkos::create_mirror_view(Kokkos::HostSpace(),
6682 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
6683 Kokkos::HostSpace(), this->mj_weights);
6686 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
6687 Kokkos::ViewAllocateWithoutInitializing(
"send_weight_buffer"),
6688 this->num_local_coords);
6690 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
6691 Kokkos::ViewAllocateWithoutInitializing(
"received_weight_buffer"),
6694 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6696 auto sub_host_src_weights
6697 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6699 auto sub_host_dst_weights
6700 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6707 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6708 sent_weight[n] = sub_host_src_weights(n);
6711 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
6714 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6715 sub_host_dst_weights(n) = received_weight[n];
6718 Kokkos::deep_copy(dst_weights, host_dst_weights);
6719 this->mj_weights = dst_weights;
6724 Kokkos::View<int *, Kokkos::HostSpace> received_owners(
6725 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
6728 distributor.doPostsAndWaits(owner_of_coordinate, 1, received_owners);
6730 this->owner_of_coordinate = received_owners;
6736 if(num_procs < num_parts) {
6737 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partids(
6738 Kokkos::ViewAllocateWithoutInitializing(
"host_parts"),
6739 this->assigned_part_ids.extent(0));
6740 Kokkos::deep_copy(sent_partids, assigned_part_ids);
6742 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
6743 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
6746 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
6748 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6749 (
"assigned_part_ids", num_incoming_gnos);
6750 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
6753 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6754 (
"assigned_part_ids", num_incoming_gnos);
6756 this->mj_env->timerStop(MACRO_TIMERS,
"" + mj_timer_base_string +
6757 "Migration DistributorMigration-" + iteration);
6759 num_new_local_points = num_incoming_gnos;
6768template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6769 typename mj_part_t,
typename mj_node_t>
6770void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6771 create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6773 mj_part_t group_size = processor_ranks_for_subcomm.size();
6774 mj_part_t *ids =
new mj_part_t[group_size];
6775 for(mj_part_t i = 0; i < group_size; ++i) {
6776 ids[i] = processor_ranks_for_subcomm[i];
6778 ArrayView<const mj_part_t> idView(ids, group_size);
6779 this->comm = this->comm->createSubcommunicator(idView);
6788template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6789 typename mj_part_t,
typename mj_node_t>
6790void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6791 fill_permutation_array(
6792 mj_part_t output_num_parts,
6793 mj_part_t num_parts)
6796 if(output_num_parts == 1) {
6797 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6798 Kokkos::parallel_for(
6799 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
6800 (0, this->num_local_coords),
6801 KOKKOS_LAMBDA(mj_lno_t i) {
6802 local_new_coordinate_permutations(i) = i;
6804 auto local_new_part_xadj = this->new_part_xadj;
6805 auto local_num_local_coords = this->num_local_coords;
6806 Kokkos::parallel_for(
6807 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6808 KOKKOS_LAMBDA(
int dummy) {
6809 local_new_part_xadj(0) = local_num_local_coords;
6813 auto local_num_local_coords = this->num_local_coords;
6814 auto local_assigned_part_ids = this->assigned_part_ids;
6815 auto local_new_part_xadj = this->new_part_xadj;
6816 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6819 Kokkos::View<mj_part_t*, device_t> part_shifts(
"part_shifts", num_parts);
6824 Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
6825 "num_points_in_parts", num_parts);
6827 Kokkos::parallel_for(
6828 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6829 KOKKOS_LAMBDA(
int dummy) {
6831 for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
6832 mj_part_t ii = local_assigned_part_ids(i);
6833 ++num_points_in_parts(ii);
6838 mj_lno_t prev_index = 0;
6839 for(mj_part_t i = 0; i < num_parts; ++i) {
6840 if(num_points_in_parts(i) > 0) {
6841 local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
6842 prev_index += num_points_in_parts(i);
6843 part_shifts(i) = p++;
6848 mj_part_t assigned_num_parts = p - 1;
6849 for(;p < num_parts; ++p) {
6850 local_new_part_xadj(p) =
6851 local_new_part_xadj(assigned_num_parts);
6853 for(mj_part_t i = 0; i < output_num_parts; ++i) {
6854 num_points_in_parts(i) = local_new_part_xadj(i);
6860 for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
6862 part_shifts[mj_part_t(local_assigned_part_ids(i))];
6863 local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
6893template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6894 typename mj_part_t,
typename mj_node_t>
6895bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6896 mj_perform_migration(
6897 mj_part_t input_num_parts,
6898 mj_part_t &output_num_parts,
6899 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6900 mj_part_t &output_part_begin_index,
6901 size_t migration_reduce_all_population,
6902 mj_lno_t num_coords_for_last_dim_part,
6903 std::string iteration,
6904 RCP<mj_partBoxVector_t> &input_part_boxes,
6905 RCP<mj_partBoxVector_t> &output_part_boxes)
6907 mj_part_t num_procs = this->comm->getSize();
6908 this->myRank = this->comm->getRank();
6913 mj_gno_t *num_points_in_all_processor_parts =
6914 new mj_gno_t[input_num_parts * (num_procs + 1)];
6917 this->get_processor_num_points_in_parts(
6920 num_points_in_all_processor_parts);
6923 if(!this->mj_check_to_migrate(
6924 migration_reduce_all_population,
6925 num_coords_for_last_dim_part,
6928 num_points_in_all_processor_parts)) {
6929 delete [] num_points_in_all_processor_parts;
6933 mj_lno_t *send_count_to_each_proc = NULL;
6934 int *coordinate_destinations =
new int[this->num_local_coords];
6935 send_count_to_each_proc =
new mj_lno_t[num_procs];
6937 for(
int i = 0; i < num_procs; ++i) {
6938 send_count_to_each_proc[i] = 0;
6941 std::vector<mj_part_t> processor_ranks_for_subcomm;
6942 std::vector<mj_part_t> out_part_indices;
6945 this->mj_migration_part_proc_assignment(
6946 num_points_in_all_processor_parts,
6949 send_count_to_each_proc,
6950 processor_ranks_for_subcomm,
6951 next_future_num_parts_in_parts,
6954 output_part_begin_index,
6955 coordinate_destinations);
6957 delete [] send_count_to_each_proc;
6958 std::vector <mj_part_t> tmpv;
6960 std::sort (out_part_indices.begin(), out_part_indices.end());
6961 mj_part_t outP = out_part_indices.size();
6962 mj_gno_t new_global_num_points = 0;
6963 mj_gno_t *global_num_points_in_parts =
6964 num_points_in_all_processor_parts + num_procs * input_num_parts;
6966 if(this->mj_keep_part_boxes) {
6967 input_part_boxes->clear();
6972 for(mj_part_t i = 0; i < outP; ++i) {
6973 mj_part_t ind = out_part_indices[i];
6974 new_global_num_points += global_num_points_in_parts[ind];
6975 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
6976 if(this->mj_keep_part_boxes) {
6977 input_part_boxes->push_back((*output_part_boxes)[ind]);
6982 if(this->mj_keep_part_boxes) {
6983 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6984 input_part_boxes = output_part_boxes;
6985 output_part_boxes = tmpPartBoxes;
6987 next_future_num_parts_in_parts->clear();
6988 for(mj_part_t i = 0; i < outP; ++i) {
6989 mj_part_t p = tmpv[i];
6990 next_future_num_parts_in_parts->push_back(p);
6993 delete [] num_points_in_all_processor_parts;
6995 mj_lno_t num_new_local_points = 0;
6997 this->mj_migrate_coords(
6999 num_new_local_points,
7001 coordinate_destinations,
7004 delete [] coordinate_destinations;
7005 if(this->num_local_coords != num_new_local_points) {
7006 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7007 (Kokkos::ViewAllocateWithoutInitializing(
"new_coordinate_permutations"),
7008 num_new_local_points);
7009 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7010 (Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
7011 num_new_local_points);
7013 this->num_local_coords = num_new_local_points;
7014 this->num_global_coords = new_global_num_points;
7017 this->create_sub_communicator(processor_ranks_for_subcomm);
7019 processor_ranks_for_subcomm.clear();
7022 this->fill_permutation_array(output_num_parts, input_num_parts);
7045template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7046 typename mj_part_t,
typename mj_node_t>
7047void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7048 create_consistent_chunks(
7049 mj_part_t num_parts,
7050 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7051 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7052 mj_lno_t coordinate_begin,
7053 mj_lno_t coordinate_end,
7054 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7055 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7057 bool longest_dim_part,
7058 uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7068 mj_part_t no_cuts = num_parts - 1;
7072 if(this->distribute_points_on_cut_lines) {
7073 auto local_thread_cut_line_weight_to_put_left =
7074 this->thread_cut_line_weight_to_put_left;
7075 auto local_thread_part_weight_work =
7076 this->thread_part_weight_work;
7077 auto local_sEpsilon = this->sEpsilon;
7079 Kokkos::parallel_for(
7080 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
7081 mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7083 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7084 if(left_weight > local_sEpsilon) {
7086 mj_scalar_t thread_ii_weight_on_cut =
7087 local_thread_part_weight_work(i * 2 + 1) -
7088 local_thread_part_weight_work(i * 2);
7089 if(thread_ii_weight_on_cut < left_weight) {
7090 local_thread_cut_line_weight_to_put_left(i) =
7091 thread_ii_weight_on_cut;
7094 local_thread_cut_line_weight_to_put_left(i) = left_weight;
7098 local_thread_cut_line_weight_to_put_left(i) = 0;
7103 auto local_least_signifiance = least_signifiance;
7104 auto local_significance_mul = significance_mul;
7105 Kokkos::parallel_for(
7106 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7107 (0, 1), KOKKOS_LAMBDA (
int dummy) {
7111 for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7112 mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7113 mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7114 mj_scalar_t delta = cut2 - cut1;
7115 mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7116 if(abs_delta < local_sEpsilon) {
7117 local_thread_cut_line_weight_to_put_left(i) -=
7118 local_thread_cut_line_weight_to_put_left(i - 1);
7120 local_thread_cut_line_weight_to_put_left(i) =
7121 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7122 local_least_signifiance) * local_significance_mul) /
7123 static_cast<mj_scalar_t
>(local_significance_mul);
7129 auto local_thread_point_counts = this->thread_point_counts;
7130 Kokkos::parallel_for(
7131 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7132 (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7133 local_thread_point_counts(i) = 0;
7144 mj_part_t *cut_map =
new mj_part_t[no_cuts];
7146 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7147 typedef std::vector< multiSItem > multiSVector;
7148 typedef std::vector<multiSVector> multiS2Vector;
7151 std::vector<mj_scalar_t *>allocated_memory;
7154 multiS2Vector sort_vector_points_on_cut;
7157 mj_part_t different_cut_count = 1;
7163 multiSVector tmpMultiSVector;
7164 sort_vector_points_on_cut.push_back(tmpMultiSVector);
7166 auto local_current_concurrent_cut_coordinate =
7167 current_concurrent_cut_coordinate;
7168 auto host_current_concurrent_cut_coordinate =
7169 Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7170 Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7171 local_current_concurrent_cut_coordinate);
7173 for(mj_part_t i = 1; i < no_cuts ; ++i) {
7176 if(std::abs(host_current_concurrent_cut_coordinate(i) -
7177 host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7178 cut_map[i] = cut_map[i-1];
7181 cut_map[i] = different_cut_count++;
7182 multiSVector tmp2MultiSVector;
7183 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7186 Kokkos::deep_copy(current_concurrent_cut_coordinate,
7187 host_current_concurrent_cut_coordinate);
7190 auto host_coordinate_permutations =
7191 Kokkos::create_mirror_view(coordinate_permutations);
7192 Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7194 auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7195 Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7197 auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7198 Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7200 auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7201 Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7203 auto local_coord_dim = this->coord_dim;
7205 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7206 mj_lno_t i = host_coordinate_permutations(ii);
7207 mj_part_t pp = host_assigned_part_ids(i);
7208 mj_part_t p = pp / 2;
7211 mj_scalar_t *vals =
new mj_scalar_t[local_coord_dim -1];
7212 allocated_memory.push_back(vals);
7217 if(longest_dim_part) {
7219 for(
int dim = local_coord_dim - 2; dim >= 0; --dim) {
7222 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7227 host_mj_coordinates(i,next_largest_coord_dim);
7231 for(
int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7232 vals[val_ind++] = host_mj_coordinates(i,dim);
7234 for(
int dim = 0; dim < coordInd; ++dim) {
7235 vals[val_ind++] = host_mj_coordinates(i,dim);
7239 multiSItem tempSortItem(i, local_coord_dim -1, vals);
7241 mj_part_t cmap = cut_map[p];
7242 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7246 ++host_thread_point_counts(p);
7247 host_assigned_part_ids(i) = p;
7252 for(mj_part_t i = 0; i < different_cut_count; ++i) {
7253 std::sort (sort_vector_points_on_cut[i].begin(),
7254 sort_vector_points_on_cut[i].end());
7257 mj_part_t previous_cut_map = cut_map[0];
7259 auto host_thread_cut_line_weight_to_put_left =
7260 Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7261 Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7262 thread_cut_line_weight_to_put_left);
7264 auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7265 Kokkos::deep_copy(host_mj_weights, mj_weights);
7275 mj_scalar_t weight_stolen_from_previous_part = 0;
7276 for(mj_part_t p = 0; p < no_cuts; ++p) {
7277 mj_part_t mapped_cut = cut_map[p];
7281 if(previous_cut_map != mapped_cut) {
7282 mj_lno_t sort_vector_end = (mj_lno_t)
7283 sort_vector_points_on_cut[previous_cut_map].size() - 1;
7284 for(; sort_vector_end >= 0; --sort_vector_end) {
7286 sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7287 mj_lno_t i = t.index;
7288 ++host_thread_point_counts(p);
7289 host_assigned_part_ids(i) = p;
7291 sort_vector_points_on_cut[previous_cut_map].clear();
7295 mj_lno_t sort_vector_end = (mj_lno_t)
7296 sort_vector_points_on_cut[mapped_cut].size() - 1;
7302 for(; sort_vector_end >= 0; --sort_vector_end) {
7305 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7307 mj_lno_t i = t.index;
7308 mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7309 this->mj_weights(i,0);
7311 if(host_thread_cut_line_weight_to_put_left(p) +
7312 weight_stolen_from_previous_part> this->sEpsilon &&
7313 host_thread_cut_line_weight_to_put_left(p) +
7314 weight_stolen_from_previous_part -
7315 std::abs(host_thread_cut_line_weight_to_put_left(p) +
7316 weight_stolen_from_previous_part - w)> this->sEpsilon)
7318 host_thread_cut_line_weight_to_put_left(p) -= w;
7320 sort_vector_points_on_cut[mapped_cut].pop_back();
7322 ++host_thread_point_counts(p);
7323 host_assigned_part_ids(i) = p;
7327 if(p < no_cuts - 1 &&
7328 host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7329 if(mapped_cut == cut_map[p + 1] ) {
7332 if(previous_cut_map != mapped_cut) {
7333 weight_stolen_from_previous_part =
7334 host_thread_cut_line_weight_to_put_left(p);
7339 weight_stolen_from_previous_part +=
7340 host_thread_cut_line_weight_to_put_left(p);
7344 weight_stolen_from_previous_part =
7345 -host_thread_cut_line_weight_to_put_left(p);
7354 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7355 if(previous_cut_map != mapped_cut) {
7356 weight_stolen_from_previous_part =
7357 host_thread_cut_line_weight_to_put_left(p);
7360 weight_stolen_from_previous_part +=
7361 host_thread_cut_line_weight_to_put_left(p);
7365 weight_stolen_from_previous_part =
7366 -host_thread_cut_line_weight_to_put_left(p);
7372 previous_cut_map = mapped_cut;
7377 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7378 previous_cut_map].size() - 1;
7384 for(; sort_vector_end >= 0; --sort_vector_end) {
7386 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7389 mj_lno_t i = t.index;
7390 ++host_thread_point_counts(no_cuts);
7391 host_assigned_part_ids(i) = no_cuts;
7394 sort_vector_points_on_cut[previous_cut_map].clear();
7398 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7399 for(mj_lno_t i = 0; i < vSize; ++i) {
7400 delete [] allocated_memory[i];
7403 auto local_out_part_xadj = out_part_xadj;
7404 auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7405 Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7408 for(mj_part_t j = 0; j < num_parts; ++j) {
7409 host_out_part_xadj(j) = host_thread_point_counts(j);
7410 host_thread_point_counts(j) = 0;
7414 for(mj_part_t j = 1; j < num_parts; ++j) {
7415 host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7420 for(mj_part_t j = 1; j < num_parts; ++j) {
7421 host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7424 auto host_new_coordinate_permutations =
7425 Kokkos::create_mirror_view(new_coordinate_permutations);
7426 Kokkos::deep_copy(host_new_coordinate_permutations,
7427 new_coordinate_permutations);
7431 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7432 mj_lno_t i = host_coordinate_permutations(ii);
7433 mj_part_t p = host_assigned_part_ids(i);
7434 host_new_coordinate_permutations(coordinate_begin +
7435 host_thread_point_counts(p)++) = i;
7438 Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7439 Kokkos::deep_copy(new_coordinate_permutations,
7440 host_new_coordinate_permutations);
7441 Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7453template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7454 typename mj_part_t,
typename mj_node_t>
7455void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7457 mj_part_t current_num_parts,
7458 mj_part_t output_part_begin_index,
7459 RCP<mj_partBoxVector_t> &output_part_boxes,
7460 bool is_data_ever_migrated)
7462 this->mj_env->timerStart(MACRO_TIMERS,
7463 mj_timer_base_string +
"Part_Assignment");
7465 auto local_part_xadj = part_xadj;
7466 auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7467 auto local_coordinate_permutations = coordinate_permutations;
7468 auto local_assigned_part_ids = assigned_part_ids;
7470 if(local_mj_keep_part_boxes) {
7471 for(
int i = 0; i < current_num_parts; ++i) {
7472 (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7476 Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7477 current_num_parts, Kokkos::AUTO());
7478 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7479 member_type member_type;
7480 Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7481 int i = team_member.league_rank();
7482 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7483 local_part_xadj(i-1) : 0, local_part_xadj(i)),
7485 mj_lno_t k = local_coordinate_permutations(ii);
7486 local_assigned_part_ids(k) = i + output_part_begin_index;
7490 if(is_data_ever_migrated) {
7491#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7492 if(
sizeof(mj_lno_t) <=
sizeof(int)) {
7499 ZOLTAN_COMM_OBJ *plan = NULL;
7500 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7503 int message_tag = 7856;
7506 mj_timer_base_string +
"Final Z1PlanCreating");
7509 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
7510 this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7514 mj_timer_base_string +
"Final Z1PlanCreating" );
7517 mj_timer_base_string +
"Final Z1PlanComm");
7524 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7525 Kokkos::HostSpace(), this->current_mj_gnos);
7526 deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7527 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7528 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), incoming);
7529 auto host_dst_gnos = Kokkos::create_mirror_view(
7530 Kokkos::HostSpace(), dst_gnos);
7532 ierr = Zoltan_Comm_Do( plan, message_tag,
7533 (
char *) host_current_mj_gnos.data(),
7534 sizeof(mj_gno_t), (
char *) host_dst_gnos.data());
7536 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7537 this->current_mj_gnos = dst_gnos;
7540 auto host_src_part_ids = Kokkos::create_mirror_view(
7541 Kokkos::HostSpace(), this->assigned_part_ids);
7542 deep_copy(host_src_part_ids, this->assigned_part_ids);
7543 Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7544 Kokkos::ViewAllocateWithoutInitializing(
"dst_part_ids"), incoming);
7545 auto host_dst_part_ids = Kokkos::create_mirror_view(
7546 Kokkos::HostSpace(), dst_part_ids);
7548 ierr = Zoltan_Comm_Do( plan, message_tag,
7549 (
char *) host_src_part_ids.data(),
7550 sizeof(mj_part_t), (
char *) host_dst_part_ids.data());
7552 Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7553 this->assigned_part_ids = dst_part_ids;
7555 ierr = Zoltan_Comm_Destroy(&plan);
7558 this->num_local_coords = incoming;
7561 mj_timer_base_string +
"Final Z1PlanComm");
7568 mj_timer_base_string +
"Final DistributorPlanCreating");
7569 Tpetra::Distributor distributor(this->mj_problemComm);
7570 ArrayView<const mj_part_t> owners_of_coords(
7571 this->owner_of_coordinate.data(), this->num_local_coords);
7572 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7574 mj_timer_base_string +
"Final DistributorPlanCreating" );
7577 mj_timer_base_string +
"Final DistributorPlanComm");
7583 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
7584 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
7585 this->current_mj_gnos.extent(0));
7586 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
7588 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
7589 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
7592 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
7594 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7595 Kokkos::ViewAllocateWithoutInitializing(
"current_mj_gnos"), incoming);
7597 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
7600 Kokkos::View<mj_part_t *, Kokkos::HostSpace> sent_partids(
7601 Kokkos::ViewAllocateWithoutInitializing(
"sent_partids"),
7602 this->assigned_part_ids.extent(0));
7603 Kokkos::deep_copy(sent_partids, this->assigned_part_ids);
7605 Kokkos::View<mj_part_t *, Kokkos::HostSpace> received_partids(
7606 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
7609 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
7611 this->assigned_part_ids =
7612 Kokkos::View<mj_part_t*, device_t>(
7613 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
7616 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
7617 this->num_local_coords = incoming;
7620 mj_timer_base_string +
"Final DistributorPlanComm");
7625 mj_timer_base_string +
"Part_Assignment");
7628 mj_timer_base_string +
"Solution_Part_Assignment");
7633 if(this->mj_keep_part_boxes) {
7634 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7638 mj_timer_base_string +
"Solution_Part_Assignment");
7653template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7654 typename mj_part_t,
typename mj_node_t>
7657 bool distribute_points_on_cut_lines_,
7658 int max_concurrent_part_calculation_,
7659 int check_migrate_avoid_migration_option_,
7660 double minimum_migration_imbalance_,
7661 int migration_type_)
7663 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7664 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7665 this->check_migrate_avoid_migration_option =
7666 check_migrate_avoid_migration_option_;
7667 this->minimum_migration_imbalance = minimum_migration_imbalance_;
7668 this->migration_type = migration_type_;
7698template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7699 typename mj_part_t,
typename mj_node_t>
7702 const RCP<const Environment> &env,
7703 RCP<
const Comm<int> > &problemComm,
7704 double imbalance_tolerance_,
7706 size_t num_global_parts_,
7707 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7708 int recursion_depth_,
7710 mj_lno_t num_local_coords_,
7711 mj_gno_t num_global_coords_,
7712 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7714 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7715 int num_weights_per_coord_,
7716 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7717 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7718 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7719 Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7720 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7725 this->mj_timer_base_string =
"MJ(" + std::to_string(execute_counter) +
") - ";
7728 this->mj_problemComm = problemComm;
7729 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7731 mj_timer_base_string +
"Total");
7732 this->mj_env->debug(3,
"In MultiJagged Jagged");
7733 this->imbalance_tolerance = imbalance_tolerance_;
7734 this->mj_num_teams = num_teams_;
7735 this->num_global_parts = num_global_parts_;
7736 this->part_no_array = part_no_array_;
7737 this->recursion_depth = recursion_depth_;
7738 this->coord_dim = coord_dim_;
7739 this->num_local_coords = num_local_coords_;
7740 this->num_global_coords = num_global_coords_;
7741 this->mj_coordinates = mj_coordinates_;
7742 this->initial_mj_gnos = initial_mj_gnos_;
7743 this->num_weights_per_coord = num_weights_per_coord_;
7744 this->mj_uniform_weights = mj_uniform_weights_;
7745 this->mj_weights = mj_weights_;
7746 this->mj_uniform_parts = mj_uniform_parts_;
7750 this->set_part_specifications();
7753 mj_timer_base_string +
"Allocate Views");
7754 this->allocate_set_work_memory();
7756 mj_timer_base_string +
"Allocate Views");
7760 this->comm = this->mj_problemComm->duplicate();
7763 if(comm->getRank() == 0) {
7764 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
7765 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
7766 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
7771 mj_part_t current_num_parts = 1;
7772 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7773 this->all_cut_coordinates;
7775 mj_timer_base_string +
"Problem_Partitioning");
7776 mj_part_t output_part_begin_index = 0;
7777 mj_part_t future_num_parts = this->total_num_part;
7778 bool is_data_ever_migrated =
false;
7780 std::vector<mj_part_t> *future_num_part_in_parts =
7781 new std::vector<mj_part_t> ();
7782 std::vector<mj_part_t> *next_future_num_parts_in_parts =
7783 new std::vector<mj_part_t> ();
7785 next_future_num_parts_in_parts->push_back(this->num_global_parts);
7787 RCP<mj_partBoxVector_t> input_part_boxes;
7788 RCP<mj_partBoxVector_t> output_part_boxes;
7790 if(this->mj_keep_part_boxes) {
7791 input_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7792 output_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7793 compute_global_box();
7794 this->init_part_boxes(output_part_boxes);
7797 auto local_part_xadj = this->part_xadj;
7801 Kokkos::View<mj_part_t*, device_t>
7802 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
7803 Kokkos::View<size_t*, device_t>
7804 view_total_reduction_size(
"view_total_reduction_size", 1);
7806 for(
int i = 0; i < this->recursion_depth; ++i) {
7809 std::string istring = std::to_string(i);
7815 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7816 future_num_part_in_parts = next_future_num_parts_in_parts;
7817 next_future_num_parts_in_parts = tmpPartVect;
7821 next_future_num_parts_in_parts->clear();
7822 if(this->mj_keep_part_boxes) {
7823 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7824 input_part_boxes = output_part_boxes;
7825 output_part_boxes = tmpPartBoxes;
7826 output_part_boxes->clear();
7830 mj_part_t output_part_count_in_dimension =
7831 this->update_part_num_arrays(
7832 future_num_part_in_parts,
7833 next_future_num_parts_in_parts,
7838 output_part_boxes, 1);
7843 if(output_part_count_in_dimension == current_num_parts) {
7845 tmpPartVect= future_num_part_in_parts;
7846 future_num_part_in_parts = next_future_num_parts_in_parts;
7847 next_future_num_parts_in_parts = tmpPartVect;
7849 if(this->mj_keep_part_boxes) {
7850 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7851 input_part_boxes = output_part_boxes;
7852 output_part_boxes = tmpPartBoxes;
7858 int coordInd = i % this->coord_dim;
7860 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7861 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7864 mj_timer_base_string +
"Problem_Partitioning_" + istring);
7868 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7869 "new part xadj", output_part_count_in_dimension);
7872 mj_part_t output_part_index = 0;
7876 mj_part_t output_coordinate_end_index = 0;
7878 mj_part_t current_work_part = 0;
7879 mj_part_t current_concurrent_num_parts =
7880 std::min(current_num_parts - current_work_part,
7881 this->max_concurrent_part_calculation);
7883 mj_part_t obtained_part_index = 0;
7885 auto host_process_local_min_max_coord_total_weight =
7886 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7887 auto host_global_min_max_coord_total_weight =
7888 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7891 for(; current_work_part < current_num_parts;
7892 current_work_part += current_concurrent_num_parts) {
7894 current_concurrent_num_parts =
7895 std::min(current_num_parts - current_work_part,
7896 this->max_concurrent_part_calculation);
7899 auto local_device_num_partitioning_in_current_dim =
7900 device_num_partitioning_in_current_dim;
7901 Kokkos::parallel_reduce(
"Read bDoingWork",
7902 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7903 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
7905 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7906 if(local_device_num_partitioning_in_current_dim(
7907 current_work_part + kk) != 1) {
7913 bool bDoingWork = (bDoingWork_int != 0) ?
true :
false;
7915 this->mj_get_local_min_max_coord_totW(
7917 current_concurrent_num_parts,
7918 mj_current_dim_coords);
7923 this->mj_get_global_min_max_coord_totW(
7924 current_concurrent_num_parts,
7925 this->process_local_min_max_coord_total_weight,
7926 this->global_min_max_coord_total_weight);
7930 mj_part_t total_incomplete_cut_count = 0;
7935 mj_part_t concurrent_part_cut_shift = 0;
7936 mj_part_t concurrent_part_part_shift = 0;
7938 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7940 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7941 global_min_max_coord_total_weight);
7943 mj_scalar_t min_coordinate =
7944 host_global_min_max_coord_total_weight(kk);
7945 mj_scalar_t max_coordinate =
7946 host_global_min_max_coord_total_weight(
7947 kk + current_concurrent_num_parts);
7949 mj_scalar_t global_total_weight =
7950 host_global_min_max_coord_total_weight(
7951 kk + 2 * current_concurrent_num_parts);
7953 mj_part_t concurrent_current_part_index = current_work_part + kk;
7955 mj_part_t partition_count = host_num_partitioning_in_current_dim(
7956 concurrent_current_part_index);
7958 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
7959 Kokkos::subview(current_cut_coordinates,
7960 std::pair<mj_lno_t, mj_lno_t>(
7961 concurrent_part_cut_shift, current_cut_coordinates.size()));
7962 Kokkos::View<mj_scalar_t *, device_t>
7963 current_target_part_weights =
7964 Kokkos::subview(target_part_weights,
7965 std::pair<mj_lno_t, mj_lno_t>(
7966 concurrent_part_part_shift, target_part_weights.size()));
7969 concurrent_part_cut_shift += partition_count - 1;
7971 concurrent_part_part_shift += partition_count;
7975 if(partition_count > 1 && min_coordinate <= max_coordinate) {
7979 total_incomplete_cut_count += partition_count - 1;
7981 this->incomplete_cut_count(kk) = partition_count - 1;
7984 this->mj_get_initial_cut_coords_target_weights(
7987 partition_count - 1,
7988 global_total_weight,
7990 current_target_part_weights,
7991 future_num_part_in_parts,
7992 next_future_num_parts_in_parts,
7993 concurrent_current_part_index,
7994 obtained_part_index);
7996 mj_lno_t coordinate_end_index =
7997 host_part_xadj(concurrent_current_part_index);
7998 mj_lno_t coordinate_begin_index =
7999 concurrent_current_part_index==0 ? 0 :
8000 host_part_xadj(concurrent_current_part_index - 1);
8002 this->set_initial_coordinate_parts(
8005 coordinate_begin_index, coordinate_end_index,
8006 this->coordinate_permutations,
8007 mj_current_dim_coords,
8008 this->assigned_part_ids,
8014 this->incomplete_cut_count(kk) = 0;
8017 obtained_part_index += partition_count;
8022 double used_imbalance = 0;
8025 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8028 mj_current_dim_coords,
8031 current_concurrent_num_parts,
8032 current_cut_coordinates,
8033 total_incomplete_cut_count,
8034 view_rectilinear_cut_count,
8035 view_total_reduction_size);
8038 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8043 mj_part_t output_array_shift = 0;
8044 mj_part_t cut_shift = 0;
8045 size_t tlr_shift = 0;
8046 size_t partweight_array_shift = 0;
8047 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
8049 mj_part_t current_concurrent_work_part = current_work_part + kk;
8051 mj_part_t num_parts = host_num_partitioning_in_current_dim(
8052 current_concurrent_work_part);
8055 int coordinateA_bigger_than_coordinateB =
8056 host_global_min_max_coord_total_weight(kk) >
8057 host_global_min_max_coord_total_weight(
8058 kk + current_concurrent_num_parts);
8060 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8063 auto local_new_part_xadj = this->new_part_xadj;
8064 Kokkos::parallel_for(
8065 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8066 (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8067 local_new_part_xadj(
8068 output_part_index + output_array_shift + jj) = 0;
8071 cut_shift += num_parts - 1;
8072 tlr_shift += (4 *(num_parts - 1) + 1);
8073 output_array_shift += num_parts;
8074 partweight_array_shift += (2 * (num_parts - 1) + 1);
8078 Kokkos::View<mj_scalar_t *, device_t>
8079 current_concurrent_cut_coordinate =
8080 Kokkos::subview(current_cut_coordinates,
8081 std::pair<mj_lno_t, mj_lno_t>(
8083 current_cut_coordinates.size()));
8084 Kokkos::View<mj_scalar_t *, device_t>
8085 used_local_cut_line_weight_to_left =
8086 Kokkos::subview(process_cut_line_weight_to_put_left,
8087 std::pair<mj_lno_t, mj_lno_t>(
8089 process_cut_line_weight_to_put_left.size()));
8091 this->thread_part_weight_work =
8093 this->thread_part_weights,
8094 std::pair<mj_lno_t, mj_lno_t>(
8095 partweight_array_shift,
8096 this->thread_part_weights.extent(0)));
8099 if(this->mj_keep_part_boxes) {
8101 for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8102 mj_scalar_t temp_get_val;
8103 Kokkos::parallel_reduce(
"Read single",
8104 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8105 KOKKOS_LAMBDA(
int dummy, mj_scalar_t & set_single) {
8106 set_single = current_concurrent_cut_coordinate(j);
8108 (*output_part_boxes)
8109 [output_array_shift + output_part_index + j].
8110 updateMinMax(temp_get_val, 1 , coordInd);
8111 (*output_part_boxes)
8112 [output_array_shift + output_part_index + j + 1].
8113 updateMinMax(temp_get_val, 0 , coordInd);
8118 Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8119 Kokkos::subview(this->new_part_xadj,
8120 std::pair<mj_lno_t, mj_lno_t>(
8121 output_part_index + output_array_shift,
8122 this->new_part_xadj.size()));
8124 this->mj_create_new_partitions(
8126 current_concurrent_work_part,
8127 mj_current_dim_coords,
8128 current_concurrent_cut_coordinate,
8129 used_local_cut_line_weight_to_left,
8134 mj_lno_t coordinate_end = host_part_xadj(
8135 current_concurrent_work_part);
8136 mj_lno_t coordinate_begin =
8137 current_concurrent_work_part==0 ? 0 : host_part_xadj(
8138 current_concurrent_work_part - 1);
8142 mj_lno_t part_size = coordinate_end - coordinate_begin;
8146 auto local_new_part_xadj = this->new_part_xadj;
8147 Kokkos::parallel_for(
8148 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8149 (0, 1), KOKKOS_LAMBDA (
int dummy) {
8150 local_new_part_xadj(
8151 output_part_index + output_array_shift) = part_size;
8154 auto subview_new_coordinate_permutations =
8155 Kokkos::subview(this->new_coordinate_permutations,
8156 std::pair<mj_lno_t, mj_lno_t>(
8158 coordinate_begin + part_size));
8159 auto subview_coordinate_permutations =
8160 Kokkos::subview(this->coordinate_permutations,
8161 std::pair<mj_lno_t, mj_lno_t>(
8163 coordinate_begin + part_size));
8164 Kokkos::deep_copy(subview_new_coordinate_permutations,
8165 subview_coordinate_permutations);
8167 cut_shift += num_parts - 1;
8168 output_array_shift += num_parts;
8169 partweight_array_shift += (2 * (num_parts - 1) + 1);
8178 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
8179 mj_part_t num_parts =
8180 host_num_partitioning_in_current_dim(current_work_part + kk);
8184 auto local_new_part_xadj = this->new_part_xadj;
8185 Kokkos::parallel_for(
8186 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8187 (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8188 local_new_part_xadj(output_part_index+ii) +=
8189 output_coordinate_end_index;
8194 Kokkos::parallel_reduce(
"Read single",
8195 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8196 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
8198 local_new_part_xadj(output_part_index + num_parts - 1);
8200 output_coordinate_end_index = temp_get;
8202 output_part_index += num_parts;
8208 int current_world_size = this->comm->getSize();
8209 long migration_reduce_all_population =
8210 this->total_dim_num_reduce_all * current_world_size;
8211 bool is_migrated_in_current_dimension =
false;
8216 if(future_num_parts > 1 &&
8217 this->check_migrate_avoid_migration_option >= 0 &&
8218 current_world_size > 1) {
8220 mj_timer_base_string +
"Problem_Migration-" + istring);
8221 mj_part_t num_parts = output_part_count_in_dimension;
8223 if(this->mj_perform_migration(
8226 next_future_num_parts_in_parts,
8227 output_part_begin_index,
8228 migration_reduce_all_population,
8229 this->num_global_coords / (future_num_parts * current_num_parts),
8231 input_part_boxes, output_part_boxes) )
8233 is_migrated_in_current_dimension =
true;
8234 is_data_ever_migrated =
true;
8236 mj_timer_base_string +
"Problem_Migration-" + istring);
8239 this->total_dim_num_reduce_all /= num_parts;
8242 is_migrated_in_current_dimension =
false;
8244 mj_timer_base_string +
"Problem_Migration-" + istring);
8249 Kokkos::View<mj_lno_t*, device_t> tmp =
8250 this->coordinate_permutations;
8251 this->coordinate_permutations =
8252 this->new_coordinate_permutations;
8254 this->new_coordinate_permutations = tmp;
8255 if(!is_migrated_in_current_dimension) {
8256 this->total_dim_num_reduce_all -= current_num_parts;
8257 current_num_parts = output_part_count_in_dimension;
8261 this->part_xadj = this->new_part_xadj;
8262 local_part_xadj = this->new_part_xadj;
8263 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
8264 Kokkos::deep_copy(host_part_xadj, part_xadj);
8266 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
8268 mj_timer_base_string +
"Problem_Partitioning_" + istring);
8273 delete future_num_part_in_parts;
8274 delete next_future_num_parts_in_parts;
8276 mj_timer_base_string +
"Problem_Partitioning");
8282 this->set_final_parts(
8284 output_part_begin_index,
8286 is_data_ever_migrated);
8288 result_assigned_part_ids_ = this->assigned_part_ids;
8289 result_mj_gnos_ = this->current_mj_gnos;
8291 mj_timer_base_string +
"Total");
8292 this->mj_env->debug(3,
"Out of MultiJagged");
8295template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8296 typename mj_part_t,
typename mj_node_t>
8297RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8302 if(this->mj_keep_part_boxes) {
8303 return this->kept_boxes;
8306 throw std::logic_error(
"Error: part boxes are not stored.");
8310template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8311 typename mj_part_t,
typename mj_node_t>
8312RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8318 mj_part_t ntasks = this->num_global_parts;
8319 int dim = (*localPartBoxes)[0].getDim();
8320 coord_t *localPartBoundaries =
new coord_t[ntasks * 2 *dim];
8322 memset(localPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8324 coord_t *globalPartBoundaries =
new coord_t[ntasks * 2 *dim];
8325 memset(globalPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8327 coord_t *localPartMins = localPartBoundaries;
8328 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8330 coord_t *globalPartMins = globalPartBoundaries;
8331 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8333 mj_part_t boxCount = localPartBoxes->size();
8334 for(mj_part_t i = 0; i < boxCount; ++i) {
8335 mj_part_t pId = (*localPartBoxes)[i].getpId();
8339 coord_t *lmins = (*localPartBoxes)[i].getlmins();
8340 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8342 for(
int j = 0; j < dim; ++j) {
8343 localPartMins[dim * pId + j] = lmins[j];
8344 localPartMaxs[dim * pId + j] = lmaxs[j];
8357 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8358 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8360 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
8361 for(mj_part_t i = 0; i < ntasks; ++i) {
8363 globalPartMins + dim * i,
8364 globalPartMaxs + dim * i);
8377 delete []localPartBoundaries;
8378 delete []globalPartBoundaries;
8385template <
typename Adapter>
8391#ifndef DOXYGEN_SHOULD_SKIP_THIS
8395 typedef typename Adapter::scalar_t adapter_scalar_t;
8398 typedef float default_mj_scalar_t;
8404 (std::is_same<adapter_scalar_t, float>::value ||
8405 std::is_same<adapter_scalar_t, double>::value),
8406 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8408 typedef typename Adapter::gno_t mj_gno_t;
8409 typedef typename Adapter::lno_t mj_lno_t;
8410 typedef typename Adapter::part_t mj_part_t;
8411 typedef typename Adapter::node_t mj_node_t;
8413 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8414 typedef typename mj_node_t::device_type
device_t;
8419 RCP<const Environment> mj_env;
8420 RCP<const Comm<int> > mj_problemComm;
8421 RCP<const typename Adapter::base_adapter_t> mj_adapter;
8424 double imbalance_tolerance;
8428 size_t num_global_parts;
8431 Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8434 int recursion_depth;
8437 mj_lno_t num_local_coords;
8438 mj_gno_t num_global_coords;
8441 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8445 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8448 int num_weights_per_coord;
8451 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8454 Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8457 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8467 mj_part_t num_first_level_parts;
8471 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8475 bool distribute_points_on_cut_lines;
8478 mj_part_t max_concurrent_part_calculation;
8481 int check_migrate_avoid_migration_option;
8489 double minimum_migration_imbalance;
8490 bool mj_keep_part_boxes;
8494 int mj_premigration_option;
8495 int min_coord_per_rank_for_premigration;
8498 ArrayRCP<mj_part_t> comXAdj_;
8501 ArrayRCP<mj_part_t> comAdj_;
8506 void set_input_parameters(
const Teuchos::ParameterList &p);
8508 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
8510 bool mj_premigrate_to_subset(
8512 int migration_selection_option,
8513 RCP<const Environment> mj_env_,
8514 RCP<
const Comm<int> > mj_problemComm_,
8516 mj_lno_t num_local_coords_,
8517 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8518 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8520 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8522 int num_weights_per_coord_,
8523 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8525 RCP<
const Comm<int> > &result_problemComm_,
8526 mj_lno_t & result_num_local_coords_,
8527 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8529 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8530 result_mj_coordinates_,
8531 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8532 int * &result_actual_owner_rank_);
8537 RCP<
const Comm<int> > &problemComm,
8538 const RCP<const typename Adapter::base_adapter_t> &adapter) :
8541 mj_problemComm(problemComm),
8542 mj_adapter(adapter),
8543 imbalance_tolerance(0),
8545 num_global_parts(1),
8548 num_local_coords(0),
8549 num_global_coords(0),
8550 num_weights_per_coord(0),
8551 num_first_level_parts(1),
8552 distribute_points_on_cut_lines(true),
8553 max_concurrent_part_calculation(1),
8554 check_migrate_avoid_migration_option(0),
8556 minimum_migration_imbalance(0.30),
8557 mj_keep_part_boxes(false),
8558 mj_run_as_rcb(false),
8559 mj_premigration_option(0),
8560 min_coord_per_rank_for_premigration(32000),
8574 const bool bUnsorted =
true;
8575 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8577 pl.set(
"mj_parts",
"0",
"list of parts for multiJagged partitioning "
8578 "algorithm. As many as the dimension count.", mj_parts_Validator);
8580 pl.set(
"mj_concurrent_part_count", 1,
"The number of parts whose cut "
8581 "coordinates will be calculated concurently.",
8584 pl.set(
"mj_minimum_migration_imbalance", 1.1,
8585 "mj_minimum_migration_imbalance, the minimum imbalance of the "
8586 "processors to avoid migration",
8589 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8590 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8591 pl.set(
"mj_migration_option", 1,
"Migration option, 0 for decision "
8592 "depending on the imbalance, 1 for forcing migration, 2 for "
8593 "avoiding migration", mj_migration_option_validator);
8595 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8596 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8597 pl.set(
"mj_migration_type", 0,
8598 "Migration type, 0 for migration to minimize the imbalance "
8599 "1 for migration to minimize messages exchanged the migration.",
8600 mj_migration_option_validator);
8603 pl.set(
"mj_keep_part_boxes",
false,
"Keep the part boundaries of the "
8607 pl.set(
"mj_enable_rcb",
false,
"Use MJ as RCB.",
8610 pl.set(
"mj_recursion_depth", -1,
"Recursion depth for MJ: Must be "
8613 RCP<Teuchos::EnhancedNumberValidator<int>>
8614 mj_num_teams_validator =
8615 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(
8616 0, Teuchos::EnhancedNumberTraits<int>::max()) );
8617 pl.set(
"mj_num_teams", 0,
8618 "How many teams for the main kernel loop"
8619 , mj_num_teams_validator);
8621 RCP<Teuchos::EnhancedNumberValidator<int>>
8622 mj_premigration_option_validator =
8623 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8625 pl.set(
"mj_premigration_option", 0,
8626 "Whether to do premigration or not. 0 for no migration "
8627 "x > 0 for migration to consecutive processors, "
8628 "the subset will be 0,x,2x,3x,...subset ranks."
8629 , mj_premigration_option_validator);
8631 pl.set(
"mj_premigration_coordinate_count", 32000,
"How many coordinate to "
8632 "assign each rank in multijagged after premigration"
8645 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8649 mj_part_t
pointAssign(
int dim, adapter_scalar_t *point)
const;
8651 void boxAssign(
int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8652 size_t &nPartsFound, mj_part_t **partsFound)
const;
8658 ArrayRCP<mj_part_t> &comXAdj,
8659 ArrayRCP<mj_part_t> &comAdj);
8665 std::string timer_base_string;
8673 template<
class dst_t,
class src_t>
8674 typename std::enable_if<std::is_same<
typename dst_t::value_type,
8675 typename src_t::value_type>::value>::type
8676 assign_if_same(dst_t & dst,
const src_t & src) {
8679 template<
class dst_t,
class src_t>
8680 typename std::enable_if<!std::is_same<
typename dst_t::value_type,
8681 typename src_t::value_type>::value>::type
8682 assign_if_same(dst_t & dst,
const src_t & src) {
8687template <
typename Adapter>
8688bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8690 int migration_selection_option,
8691 RCP<const Environment> mj_env_,
8692 RCP<
const Comm<int> > mj_problemComm_,
8694 mj_lno_t num_local_coords_,
8695 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8696 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8698 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8699 int num_weights_per_coord_,
8700 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8702 RCP<
const Comm<int> > & result_problemComm_,
8703 mj_lno_t &result_num_local_coords_,
8704 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8706 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8707 result_mj_coordinates_,
8708 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8709 int * &result_actual_owner_rank_)
8712 timer_base_string +
"PreMigration DistributorPlanCreating");
8714 int myRank = mj_problemComm_->getRank();
8715 int worldSize = mj_problemComm_->getSize();
8717 mj_part_t groupsize = worldSize / used_num_ranks;
8719 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8721 mj_part_t i_am_sending_to = 0;
8722 bool am_i_a_receiver =
false;
8724 for(
int i = 0; i < used_num_ranks; ++i) {
8725 group_begins[i+ 1] = group_begins[i] + groupsize;
8726 if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
8727 if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
8728 if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8729 i_am_sending_to = group_begins[i];
8731 if(myRank == group_begins[i]) {
8732 am_i_a_receiver =
true;
8736 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8737 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8739 Tpetra::Distributor distributor(mj_problemComm_);
8741 std::vector<mj_part_t>
8742 coordinate_destinations(num_local_coords_, i_am_sending_to);
8744 ArrayView<const mj_part_t>
8745 destinations(&(coordinate_destinations[0]), num_local_coords_);
8746 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8747 result_num_local_coords_ = num_incoming_gnos;
8749 timer_base_string +
"PreMigration DistributorPlanCreating");
8752 timer_base_string +
"PreMigration DistributorMigration");
8760 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
8761 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
8762 initial_mj_gnos_.size());
8763 Kokkos::deep_copy(sent_gnos, initial_mj_gnos_);
8765 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos (
8766 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
8769 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
8771 result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8772 Kokkos::ViewAllocateWithoutInitializing(
"result_initial_mj_gnos_"),
8774 Kokkos::deep_copy(result_initial_mj_gnos_, received_gnos);
8780 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
8781 host_src_coordinates(
8782 Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
8783 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
8785 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8787 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8788 Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
8789 num_incoming_gnos, this->coord_dim);
8791 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
8792 Kokkos::ViewAllocateWithoutInitializing(
"received_coord"),
8795 for(
int i = 0; i < this->coord_dim; ++i) {
8797 auto sent_coord = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8799 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
8801 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
8805 result_mj_coordinates_ = dst_coordinates;
8809 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8810 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
8811 num_incoming_gnos, this->num_weights_per_coord);
8812 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8814 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
8815 Kokkos::HostSpace(), this->mj_weights);
8818 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
8819 Kokkos::ViewAllocateWithoutInitializing(
"send_weight_buffer"),
8820 this->num_local_coords);
8822 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
8823 Kokkos::ViewAllocateWithoutInitializing(
"received_weight_buffer"),
8826 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
8828 auto sub_host_src_weights
8829 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8830 auto sub_host_dst_weights
8831 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8839 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8840 sent_weight[n] = sub_host_src_weights(n);
8843 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
8846 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8847 sub_host_dst_weights(n) = received_weight[n];
8850 Kokkos::deep_copy(dst_weights, host_dst_weights);
8851 result_mj_weights_ = dst_weights;
8855 Kokkos::View<int*, Kokkos::HostSpace> sent_owners(
8856 Kokkos::ViewAllocateWithoutInitializing(
"sent_owners"),
8858 Kokkos::deep_copy(sent_owners, myRank);
8860 Kokkos::View<int*, Kokkos::HostSpace> received_owners(
8861 Kokkos::ViewAllocateWithoutInitializing(
"received_owners"),
8864 distributor.doPostsAndWaits(sent_owners, 1, received_owners);
8866 result_actual_owner_rank_ =
new int[num_incoming_gnos];
8868 result_actual_owner_rank_,
8869 received_owners.data(),
8870 num_incoming_gnos *
sizeof(
int));
8874 timer_base_string +
"PreMigration DistributorMigration");
8875 return am_i_a_receiver;
8885template <
typename Adapter>
8894 int execute_counter =
8896 timer_base_string =
"partition(" + std::to_string(execute_counter) +
") - ";
8898 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"all");
8900 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"setup");
8902 this->set_up_partitioning_data(solution);
8904 this->set_input_parameters(this->mj_env->getParameters());
8905 if(this->mj_keep_part_boxes) {
8906 this->mj_partitioner.set_to_keep_part_boxes();
8909 this->mj_partitioner.set_partitioning_parameters(
8910 this->distribute_points_on_cut_lines,
8911 this->max_concurrent_part_calculation,
8912 this->check_migrate_avoid_migration_option,
8913 this->minimum_migration_imbalance, this->migration_type);
8915 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8916 mj_lno_t result_num_local_coords = this->num_local_coords;
8917 Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8919 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8920 result_mj_coordinates = this->mj_coordinates;
8921 Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8923 int *result_actual_owner_rank = NULL;
8925 Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8926 this->initial_mj_gnos;
8944 int current_world_size = this->mj_problemComm->getSize();
8945 mj_lno_t threshold_num_local_coords =
8946 this->min_coord_per_rank_for_premigration;
8947 bool is_pre_migrated =
false;
8948 bool am_i_in_subset =
true;
8954 if(mj_premigration_option > 0 &&
8955 size_t (current_world_size) > this->num_global_parts &&
8956 this->num_global_coords < mj_gno_t (
8957 current_world_size * threshold_num_local_coords))
8959 if(this->mj_keep_part_boxes) {
8960 throw std::logic_error(
"Multijagged: mj_keep_part_boxes and "
8961 "mj_premigration_option are not supported together yet.");
8964 is_pre_migrated =
true;
8965 int migration_selection_option = mj_premigration_option;
8966 if(migration_selection_option * this->num_global_parts >
8967 (
size_t) (current_world_size)) {
8968 migration_selection_option =
8969 current_world_size / this->num_global_parts;
8972 int used_num_ranks = int (this->num_global_coords /
8973 float (threshold_num_local_coords) + 0.5);
8975 if(used_num_ranks == 0) {
8979 am_i_in_subset = this->mj_premigrate_to_subset(
8981 migration_selection_option,
8983 this->mj_problemComm,
8985 this->num_local_coords,
8986 this->num_global_coords,
8987 this->num_global_parts,
8988 this->initial_mj_gnos,
8989 this->mj_coordinates,
8990 this->num_weights_per_coord,
8994 result_num_local_coords,
8995 result_initial_mj_gnos,
8996 result_mj_coordinates,
8998 result_actual_owner_rank);
9000 result_initial_mj_gnos_ = result_initial_mj_gnos;
9003 Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9004 Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9006 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"setup");
9008 if(am_i_in_subset) {
9009 this->mj_partitioner.multi_jagged_part(
9012 this->imbalance_tolerance,
9014 this->num_global_parts,
9015 this->part_no_array,
9016 this->recursion_depth,
9018 result_num_local_coords,
9019 this->num_global_coords,
9020 result_initial_mj_gnos_,
9021 result_mj_coordinates,
9022 this->num_weights_per_coord,
9023 this->mj_uniform_weights,
9025 this->mj_uniform_parts,
9026 result_assigned_part_ids,
9031 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"cleanup");
9034 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9035 localGidToLid.reserve(result_num_local_coords);
9036 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9037 Kokkos::ViewAllocateWithoutInitializing(
"host_result_initial_mj_gnos"),
9038 result_initial_mj_gnos_.size());
9039 Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9040 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9041 localGidToLid[host_result_initial_mj_gnos(i)] = i;
9044 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
9045 0, result_num_local_coords,
true);
9046 auto host_result_assigned_part_ids =
9047 Kokkos::create_mirror_view(result_assigned_part_ids);
9048 Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9049 auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9050 Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9051 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9052 mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9053 partId[origLID] = host_result_assigned_part_ids(i);
9058 if(is_pre_migrated) {
9059 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
9060 "PostMigration DistributorPlanCreating");
9061 Tpetra::Distributor distributor(this->mj_problemComm);
9063 ArrayView<const mj_part_t> actual_owner_destinations(
9064 result_actual_owner_rank , result_num_local_coords);
9066 mj_lno_t num_incoming_gnos = distributor.createFromSends(
9067 actual_owner_destinations);
9069 if(num_incoming_gnos != this->num_local_coords) {
9070 throw std::logic_error(
"Zoltan2 - Multijagged Post Migration - "
9071 "num incoming is not equal to num local coords");
9075 "PostMigration DistributorPlanCreating");
9077 "PostMigration DistributorMigration");
9079 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
9080 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
9082 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
9083 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
9086 distributor.doPostsAndWaits(host_result_initial_mj_gnos, 1,
9089 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partnos;
9090 if (partId.size() > 0) {
9091 sent_partnos = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9092 partId.getRawPtr(), partId.size());
9094 distributor.doPostsAndWaits(sent_partnos, 1, received_partids);
9097 partId = arcp(
new mj_part_t[this->num_local_coords],
9098 0, this->num_local_coords,
true);
9101 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9102 localGidToLid2.reserve(this->num_local_coords);
9103 auto host_initial_mj_gnos =
9104 Kokkos::create_mirror_view(this->initial_mj_gnos);
9105 Kokkos::deep_copy(host_initial_mj_gnos,
9106 this->initial_mj_gnos);
9107 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9108 localGidToLid2[host_initial_mj_gnos(i)] = i;
9111 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9112 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9113 partId[origLID] = received_partids[i];
9118 delete [] result_actual_owner_rank;
9121 timer_base_string +
"PostMigration DistributorMigration");
9123 solution->setParts(partId);
9124 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"cleanup");
9127 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"all");
9130 this->mj_coordinates = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>();
9135template <
typename Adapter>
9148 int criteria_dim = (this->num_weights_per_coord ?
9149 this->num_weights_per_coord : 1);
9153 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9156 this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9157 "uniform parts", criteria_dim);
9158 this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9159 "uniform weights", criteria_dim);
9161 Kokkos::View<const mj_gno_t *, device_t> gnos;
9162 Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9164 Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9167 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9168 Kokkos::View<mj_scalar_t **, device_t> wgts;
9172 if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9175 assign_if_same(xyz, xyz_adapter);
9176 assign_if_same(wgts, wgts_adapter);
9181 xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9182 (Kokkos::ViewAllocateWithoutInitializing(
9183 "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9184 wgts = Kokkos::View<mj_scalar_t **, device_t>(
9185 Kokkos::ViewAllocateWithoutInitializing(
"wgts"),
9186 wgts_adapter.extent(0), wgts_adapter.extent(1));
9188 typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9189 Kokkos::parallel_for(
9190 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9191 (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (
int i) {
9192 for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9193 xyz(i, n) =
static_cast<mj_scalar_t
>(xyz_adapter(i, n));
9196 Kokkos::parallel_for(
9197 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9198 (0, wgts.extent(0)), KOKKOS_LAMBDA (
int i) {
9199 for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9200 wgts(i, n) =
static_cast<mj_scalar_t
>(wgts_adapter(i, n));
9206 this->initial_mj_gnos = gnos;
9208 this->mj_coordinates = xyz;
9211 if(this->num_weights_per_coord == 0) {
9212 this->mj_uniform_weights(0) =
true;
9213 Kokkos::resize(this->mj_weights, 0, 0);
9216 this->mj_weights = wgts;
9217 for(
int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9218 this->mj_uniform_weights(wdim) =
false;
9222 for(
int wdim = 0; wdim < criteria_dim; ++wdim) {
9223 if(solution->criteriaHasUniformPartSizes(wdim)) {
9224 this->mj_uniform_parts(wdim) =
true;
9227 printf(
"Error: MJ does not support non uniform target part weights\n");
9236template <
typename Adapter>
9238 const Teuchos::ParameterList &pl)
9240 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
9243 tol = pe->getValue(&tol);
9244 this->imbalance_tolerance = tol - 1.0;
9248 if(this->imbalance_tolerance <= 0) {
9249 this->imbalance_tolerance= 10e-4;
9253 Kokkos::resize(this->part_no_array, 0);
9256 this->recursion_depth = 0;
9258 if(pl.getPtr<
int>(
"mj_num_teams")) {
9259 this->num_teams = pl.get<
int>(
"mj_num_teams");
9262 if(pl.getPtr<Array <mj_part_t> >(
"mj_parts")) {
9263 auto mj_parts = pl.get<Array <mj_part_t> >(
"mj_parts");
9264 int mj_parts_size =
static_cast<int>(mj_parts.size());
9267 this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9268 "part_no_array", mj_parts_size);
9269 for(
int i = 0; i < mj_parts_size; ++i) {
9270 this->part_no_array(i) = mj_parts.getRawPtr()[i];
9273 this->recursion_depth = mj_parts_size - 1;
9274 this->mj_env->debug(2,
"mj_parts provided by user");
9278 this->distribute_points_on_cut_lines =
true;
9279 this->max_concurrent_part_calculation = 1;
9281 this->mj_run_as_rcb =
false;
9282 this->mj_premigration_option = 0;
9283 this->min_coord_per_rank_for_premigration = 32000;
9285 int mj_user_recursion_depth = -1;
9286 this->mj_keep_part_boxes =
false;
9287 this->check_migrate_avoid_migration_option = 0;
9288 this->migration_type = 0;
9289 this->minimum_migration_imbalance = 0.35;
9291 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
9294 imb = pe->getValue(&imb);
9295 this->minimum_migration_imbalance = imb - 1.0;
9298 pe = pl.getEntryPtr(
"mj_migration_option");
9300 this->check_migrate_avoid_migration_option =
9301 pe->getValue(&this->check_migrate_avoid_migration_option);
9303 this->check_migrate_avoid_migration_option = 0;
9305 if(this->check_migrate_avoid_migration_option > 1) {
9306 this->check_migrate_avoid_migration_option = -1;
9310 pe = pl.getEntryPtr(
"mj_migration_type");
9312 this->migration_type = pe->getValue(&this->migration_type);
9314 this->migration_type = 0;
9320 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
9322 this->max_concurrent_part_calculation =
9323 pe->getValue(&this->max_concurrent_part_calculation);
9325 this->max_concurrent_part_calculation = 1;
9328 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
9330 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9332 this->mj_keep_part_boxes =
false;
9343 if(this->mj_keep_part_boxes ==
false) {
9344 pe = pl.getEntryPtr(
"mapping_type");
9346 int mapping_type = -1;
9347 mapping_type = pe->getValue(&mapping_type);
9348 if(mapping_type == 0) {
9349 mj_keep_part_boxes =
true;
9355 pe = pl.getEntryPtr(
"mj_enable_rcb");
9357 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9359 this->mj_run_as_rcb =
false;
9362 pe = pl.getEntryPtr(
"mj_premigration_option");
9364 mj_premigration_option = pe->getValue(&mj_premigration_option);
9366 mj_premigration_option = 0;
9369 pe = pl.getEntryPtr(
"mj_premigration_coordinate_count");
9371 min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9373 min_coord_per_rank_for_premigration = 32000;
9376 pe = pl.getEntryPtr(
"mj_recursion_depth");
9378 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9380 mj_user_recursion_depth = -1;
9384 pe = pl.getEntryPtr(
"rectilinear");
9386 val = pe->getValue(&val);
9389 this->distribute_points_on_cut_lines =
false;
9391 this->distribute_points_on_cut_lines =
true;
9394 if(this->mj_run_as_rcb) {
9395 mj_user_recursion_depth =
9396 (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9398 if(this->recursion_depth < 1) {
9399 if(mj_user_recursion_depth > 0) {
9400 this->recursion_depth = mj_user_recursion_depth;
9403 this->recursion_depth = this->coord_dim;
9409template <
typename Adapter>
9412 adapter_scalar_t *lower,
9413 adapter_scalar_t *upper,
9414 size_t &nPartsFound,
9415 typename Adapter::part_t **partsFound)
const
9424 if(this->mj_keep_part_boxes) {
9427 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9429 size_t nBoxes = (*partBoxes).size();
9431 throw std::logic_error(
"no part boxes exist");
9435 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9437 if(globalBox->boxesOverlap(dim, lower, upper)) {
9439 std::vector<typename Adapter::part_t> partlist;
9442 for(
size_t i = 0; i < nBoxes; i++) {
9444 if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9446 partlist.push_back((*partBoxes)[i].getpId());
9468 *partsFound =
new mj_part_t[nPartsFound];
9469 for(
size_t i = 0; i < nPartsFound; i++)
9470 (*partsFound)[i] = partlist[i];
9482 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
9487template <
typename Adapter>
9490 adapter_scalar_t *point)
const
9496 if(this->mj_keep_part_boxes) {
9497 typename Adapter::part_t foundPart = -1;
9500 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9502 size_t nBoxes = (*partBoxes).size();
9504 throw std::logic_error(
"no part boxes exist");
9508 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9510 if(globalBox->pointInBox(dim, point)) {
9514 for(i = 0; i < nBoxes; i++) {
9516 if((*partBoxes)[i].pointInBox(dim, point)) {
9517 foundPart = (*partBoxes)[i].getpId();
9531 std::ostringstream oss;
9533 for(
int j = 0; j < dim; j++) oss << point[j] <<
" ";
9534 oss <<
") not found in domain";
9535 throw std::logic_error(oss.str());
9545 size_t closestBox = 0;
9546 coord_t minDistance = std::numeric_limits<coord_t>::max();
9547 coord_t *centroid =
new coord_t[dim];
9548 for(
size_t i = 0; i < nBoxes; i++) {
9549 (*partBoxes)[i].computeCentroid(centroid);
9552 for(
int j = 0; j < dim; j++) {
9553 diff = centroid[j] - point[j];
9556 if(sum < minDistance) {
9561 foundPart = (*partBoxes)[closestBox].getpId();
9568 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
9572template <
typename Adapter>
9578 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9579 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9580 mj_part_t ntasks = (*pBoxes).size();
9581 int dim = (*pBoxes)[0].getDim();
9582 GridHash grid(pBoxes, ntasks, dim);
9589template <
typename Adapter>
9590RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9593 return this->mj_partitioner.get_kept_boxes();
Defines the CoordinateModel classes.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
Define IntegerRangeList validator.
Contains Teuchos redcution operators for the Multi-jagged algorthm.
Defines Parameter related enumerators, declares functions.
A gathering of useful namespace methods.
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Zoltan2_BoxBoundaries()
Default Constructor.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Multi Jagged coordinate partitioning algorithm.
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
Algorithm defines the base class for all algorithms.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
global_size_t getGlobalNumCoordinates() const
Returns the global number coordinates.
size_t getCoordinatesKokkos(Kokkos::View< const gno_t *, typename node_t::device_type > &Ids, Kokkos::View< scalar_t **, Kokkos::LayoutLeft, typename node_t::device_type > &xyz, Kokkos::View< scalar_t **, typename node_t::device_type > &wgts) const
Returns the coordinate ids, values and optional weights.
int getCoordinateDim() const
Returns the dimension of the coordinates.
size_t getLocalNumCoordinates() const
Returns the number of coordinates on this process.
int getNumWeightsPerCoordinate() const
Returns the number (0 or greater) of weights per coordinate.
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
GridHash Class, Hashing Class for part boxes.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
A ParameterList validator for integer range lists.
A PartitioningSolution is a solution to a partitioning problem.
Multi Jagged coordinate partitioning algorithm.
void set_up_partitioning_data(const RCP< PartitioningSolution< Adapter > > &solution)
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const typename Adapter::base_adapter_t > &adapter)
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
Class for sorting items with multiple values. First sorting with respect to val[0],...
void set(IT index_, CT count_, WT *vals_)
bool operator<(const uMultiSortItem< IT, CT, WT > &other) const
uMultiSortItem(IT index_, CT count_, WT *vals_)
Created by mbenlioglu on Aug 31, 2020.
Tpetra::global_size_t global_size_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
@ MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals....
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals.
SparseMatrixAdapter_t::part_t part_t
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
int value_count_rightleft
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
ArrayCombinationReducer reducer
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< part_t *, device_t > parts
Kokkos::View< scalar_t * > scalar_view_t
policy_t::member_type member_type
Kokkos::View< index_t *, device_t > part_xadj
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
Kokkos::View< index_t *, device_t > track_on_cuts
Kokkos::View< scalar_t *, device_t > coordinates
part_t concurrent_current_part
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > cut_coordinates
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
part_t concurrent_current_part
Kokkos::View< scalar_t **, device_t > weights
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
part_t current_concurrent_num_parts
Kokkos::View< scalar_t *, device_t > coordinates
Kokkos::View< part_t *, device_t > parts
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > part_xadj
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< scalar_t * > scalar_view_t
policy_t::member_type member_type
int value_count_rightleft
static int get_counter_Zoltan2_AlgMJ()
static int get_counter_AlgMJ()
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
bool operator<=(const uSignedSortItem< IT, WT, SIGN > &rhs)
bool operator<(const uSignedSortItem< IT, WT, SIGN > &rhs) const
Sort items for quick sort function.