// Zoltan2_TaskMapping.hpp
// @HEADER
// *****************************************************************************
// Zoltan2: A package of combinatorial algorithms for scientific computing
//
// Copyright 2012 NTESS and the Zoltan2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
#define _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_

#include <fstream>
#include <ctime>
#include <vector>
#include <set>
#include <tuple>

#include "Zoltan2_Standards.hpp"
#include "Zoltan2_AlgMultiJagged.hpp"
#include "Teuchos_ArrayViewDecl.hpp"
#include "Zoltan2_PartitionMapping.hpp"
#include "Zoltan2_MachineRepresentation.hpp"
#include "Teuchos_ReductionOp.hpp"

#include "Zoltan2_MappingSolution.hpp"

#include "Zoltan2_GraphModel.hpp"

#include <Zoltan2_TPLTraits.hpp>

#include "Teuchos_Comm.hpp"
#ifdef HAVE_ZOLTAN2_MPI
#include "Teuchos_DefaultMpiComm.hpp"
#endif // HAVE_ZOLTAN2_MPI
#include <Teuchos_DefaultSerialComm.hpp>

//#define gnuPlot
#include "Zoltan2_Directory_Impl.hpp"

namespace Teuchos{

/*! \brief Zoltan2_ReduceBestMapping class: reduces to the minimum cost
 *  mapping, breaking ties with the minimum proc id.
 */
template <typename Ordinal, typename T>
class Zoltan2_ReduceBestMapping : public ValueTypeReductionOp<Ordinal,T>
{

private:
  T epsilon;

public:

  /*! \brief Default Constructor
   */
  Zoltan2_ReduceBestMapping() : epsilon(std::numeric_limits<T>::epsilon()) {}

  /*! \brief Implement Teuchos::ValueTypeReductionOp interface
   */
  void reduce(const Ordinal count,
              const T inBuffer[],
              T inoutBuffer[]) const {

    for (Ordinal i = 0; i < count; i++) {
      if (inBuffer[0] - inoutBuffer[0] < -epsilon) {
        inoutBuffer[0] = inBuffer[0];
        inoutBuffer[1] = inBuffer[1];
      } else if (inBuffer[0] - inoutBuffer[0] < epsilon &&
                 inBuffer[1] - inoutBuffer[1] < epsilon) {
        inoutBuffer[0] = inBuffer[0];
        inoutBuffer[1] = inBuffer[1];
      }
    }
  }
};
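
// A minimal usage sketch (illustrative, not part of the original header):
// each rank contributes a {cost, rank} pair, and the reduction keeps the
// pair with the smallest cost, breaking (near-)ties by the smaller rank.
// This is how getBestMapping() below selects the winning mapping.
//
//   double localBest[2]  = {myCost, double(comm->getRank())};
//   double globalBest[2] = {std::numeric_limits<double>::max(),
//                           std::numeric_limits<double>::max()};
//   Teuchos::Zoltan2_ReduceBestMapping<int, double> reduceBest;
//   Teuchos::reduceAll<int, double>(*comm, reduceBest, 2,
//                                   localBest, globalBest);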

} // namespace Teuchos


namespace Zoltan2{

template <typename it>
inline it z2Fact(it x) {
  return (x == 1 ? x : x * z2Fact<it>(x - 1));
}
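
// For example (illustrative): z2Fact<int>(4) == 24. The recursion bottoms
// out at x == 1, so the argument is expected to be >= 1; the callers below
// pass a coordinate dimension that is clamped to at most 8 to avoid
// integer overflow of the factorial.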

template <typename gno_t, typename part_t>
class GNO_LNO_PAIR {
public:
  gno_t gno;
  part_t part;
};

//returns the ith permutation indices.
template <typename IT>
void ithPermutation(const IT n, IT i, IT *perm)
{
  IT j, k = 0;
  IT *fact = new IT[n];


  // compute factorial numbers
  fact[k] = 1;
  while (++k < n)
    fact[k] = fact[k - 1] * k;

  // compute factorial code
  for (k = 0; k < n; ++k)
  {
    perm[k] = i / fact[n - 1 - k];
    i = i % fact[n - 1 - k];
  }

  // readjust values to obtain the permutation
  // start from the end and check if preceding values are lower
  for (k = n - 1; k > 0; --k)
    for (j = k - 1; j >= 0; --j)
      if (perm[j] <= perm[k])
        perm[k]++;

  delete [] fact;
}
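
// Worked example (illustrative): the permutations of {0,1,2,3} in
// lexicographic order are 0123, 0132, 0213, 0231, ... so
//
//   int perm[4];
//   ithPermutation<int>(4, 3, perm);  // perm becomes {0, 2, 3, 1}
//
// i.e. i = 3 is first decomposed in the factorial number system
// (3 = 0*3! + 1*2! + 1*1! + 0*0!), and the digits are then adjusted
// upward to skip already-used indices.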

template <typename part_t>
void getGridCommunicationGraph(part_t taskCount,
                               part_t *&task_comm_xadj,
                               part_t *&task_comm_adj,
                               std::vector<int> grid_dims) {
  int dim = grid_dims.size();
  int neighborCount = 2 * dim;
  task_comm_xadj = new part_t[taskCount + 1];
  task_comm_adj = new part_t[taskCount * neighborCount];

  part_t neighBorIndex = 0;
  task_comm_xadj[0] = 0;
  for (part_t i = 0; i < taskCount; ++i) {
    part_t prevDimMul = 1;
    for (int j = 0; j < dim; ++j) {
      part_t lNeighbor = i - prevDimMul;
      part_t rNeighbor = i + prevDimMul;
      prevDimMul *= grid_dims[j];
      if (lNeighbor >= 0 &&
          lNeighbor / prevDimMul == i / prevDimMul &&
          lNeighbor < taskCount) {
        task_comm_adj[neighBorIndex++] = lNeighbor;
      }
      if (rNeighbor >= 0 &&
          rNeighbor / prevDimMul == i / prevDimMul &&
          rNeighbor < taskCount) {
        task_comm_adj[neighBorIndex++] = rNeighbor;
      }
    }
    task_comm_xadj[i + 1] = neighBorIndex;
  }
}
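
// For a 2 x 2 grid (grid_dims = {2, 2}, taskCount = 4) this produces the
// CSR arrays (illustrative example):
//   task_comm_xadj = {0, 2, 4, 6, 8}
//   task_comm_adj  = {1, 2,  0, 3,  3, 0,  2, 1}
// i.e. each task is connected to its left/right and up/down grid neighbors.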

//returns the center of the parts.
template <typename Adapter, typename scalar_t, typename part_t>
void getSolutionCenterCoordinates(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::CoordinateModel<typename Adapter::base_adapter_t> *coords,
    //const Zoltan2::PartitioningSolution<Adapter> *soln_,
    const part_t *parts,
    int coordDim,
    part_t ntasks,
    scalar_t **partCenters) {

  typedef typename Adapter::lno_t lno_t;
  typedef typename Adapter::gno_t gno_t;

  typedef StridedData<lno_t, scalar_t> input_t;
  ArrayView<const gno_t> gnos;
  ArrayView<input_t> xyz;
  ArrayView<input_t> wgts;
  coords->getCoordinates(gnos, xyz, wgts);

  //local and global num coordinates.
  lno_t numLocalCoords = coords->getLocalNumCoordinates();
  //gno_t numGlobalCoords = coords->getGlobalNumCoordinates();

  //local number of points in each part.
  gno_t *point_counts = new gno_t[ntasks];
  memset(point_counts, 0, sizeof(gno_t) * ntasks);

  //global number of points in each part.
  gno_t *global_point_counts = new gno_t[ntasks];

  scalar_t **multiJagged_coordinates = new scalar_t*[coordDim];

  ArrayRCP<ArrayRCP<const scalar_t>> ar =
    arcp(new ArrayRCP<const scalar_t>[coordDim], 0, coordDim);
  for (int dim = 0; dim < coordDim; dim++){
    xyz[dim].getInputArray(ar[dim]);
    //multiJagged coordinate values assignment
    multiJagged_coordinates[dim] = (scalar_t *)ar[dim].getRawPtr();
    memset(partCenters[dim], 0, sizeof(scalar_t) * ntasks);
  }

  //get parts with parallel gnos.
  //const part_t *parts = soln_->getPartListView();

  envConst->timerStart(MACRO_TIMERS, "Mapping - Center Calculation");

  for (lno_t i = 0; i < numLocalCoords; i++) {
    part_t p = parts[i];
    //add up all coordinates in each part.
    for (int j = 0; j < coordDim; ++j) {
      scalar_t c = multiJagged_coordinates[j][i];
      partCenters[j][p] += c;
    }
    ++point_counts[p];
  }

  //get global number of points in each part.
  reduceAll<int, gno_t>(*comm, Teuchos::REDUCE_SUM,
                        ntasks, point_counts, global_point_counts);

  for (int j = 0; j < coordDim; ++j) {
    for (part_t i = 0; i < ntasks; ++i) {
      if (global_point_counts[i] > 0)
        partCenters[j][i] /= global_point_counts[i];
    }
  }

  scalar_t *tmpCoords = new scalar_t[ntasks];
  for (int j = 0; j < coordDim; ++j) {
    reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM,
                             ntasks, partCenters[j], tmpCoords);

    scalar_t *tmp = partCenters[j];
    partCenters[j] = tmpCoords;
    tmpCoords = tmp;
  }

  envConst->timerStop(MACRO_TIMERS, "Mapping - Center Calculation");

  delete [] point_counts;
  delete [] global_point_counts;

  delete [] tmpCoords;
  delete [] multiJagged_coordinates;
}

//returns the coarsened part graph.
template <typename Adapter, typename scalar_t, typename part_t>
void coarsenPartGraph(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::GraphModel<typename Adapter::base_adapter_t> *graph,
    //const Zoltan2::PartitioningSolution<Adapter> *soln_,
    part_t np,
    const part_t *parts,
    ArrayRCP<part_t> &g_part_xadj,
    ArrayRCP<part_t> &g_part_adj,
    ArrayRCP<scalar_t> &g_part_ew) {

  typedef typename Adapter::lno_t t_lno_t;
  typedef typename Adapter::gno_t t_gno_t;
  typedef typename Adapter::scalar_t t_scalar_t;
  typedef typename Adapter::offset_t t_offset_t;
  typedef typename Zoltan2::GraphModel<
    typename Adapter::base_adapter_t>::input_t t_input_t;

  //int numRanks = comm->getSize();
  //int myRank = comm->getRank();

  //get parts with parallel gnos.
  /*
  const part_t *parts = soln_->getPartListView();

  part_t np = soln_->getActualGlobalNumberOfParts();
  if (part_t(soln_->getTargetGlobalNumberOfParts()) > np) {
    np = soln_->getTargetGlobalNumberOfParts();
  }
  */


  t_lno_t localNumVertices = graph->getLocalNumVertices();
  t_lno_t localNumEdges = graph->getLocalNumEdges();

  //get the vertex global ids, and weights
  ArrayView<const t_gno_t> Ids;
  ArrayView<t_input_t> v_wghts;
  graph->getVertexList(Ids, v_wghts);

  //get the edge ids, and weights
  ArrayView<const t_gno_t> edgeIds;
  ArrayView<const t_offset_t> offsets;
  ArrayView<t_input_t> e_wgts;
  graph->getEdgeList(edgeIds, offsets, e_wgts);

  std::vector<t_scalar_t> edge_weights;
  int numWeightPerEdge = graph->getNumWeightsPerEdge();

  if (numWeightPerEdge > 0) {
    edge_weights = std::vector<t_scalar_t>(localNumEdges);
    for (t_lno_t i = 0; i < localNumEdges; ++i) {
      edge_weights[i] = e_wgts[0][i];
    }
  }

  //create a zoltan dictionary to get the parts of the vertices
  //at the other end of edges
  std::vector<part_t> e_parts(localNumEdges);
#ifdef HAVE_ZOLTAN2_MPI
  if (comm->getSize() > 1) {
    const bool bUseLocalIDs = false;  // Local IDs not needed
    typedef Zoltan2_Directory_Simple<t_gno_t, t_lno_t, part_t> directory_t;
    int debug_level = 0;
    const RCP<const Comm<int> > rcp_comm(comm, false);
    directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
    directory.update(localNumVertices, &Ids[0], NULL, &parts[0],
                     NULL, directory_t::Update_Mode::Replace);
    directory.find(localNumEdges, &edgeIds[0], NULL, &e_parts[0],
                   NULL, NULL, false);
  } else
#endif
  {

    /*
    std::cout << "localNumVertices:" << localNumVertices
      << " np:" << np
      << " globalNumVertices:" << graph->getGlobalNumVertices()
      << " localNumEdges:" << localNumEdges << std::endl;
    */

    for (t_lno_t i = 0; i < localNumEdges; ++i) {
      t_gno_t ei = edgeIds[i];
      part_t p = parts[ei];
      e_parts[i] = p;
    }

    //get the vertices in each part in my part.
    std::vector<t_lno_t> part_begins(np, -1);
    std::vector<t_lno_t> part_nexts(localNumVertices, -1);

    //cluster vertices according to their parts.
    //create local part graph.
    for (t_lno_t i = 0; i < localNumVertices; ++i) {
      part_t ap = parts[i];
      part_nexts[i] = part_begins[ap];
      part_begins[ap] = i;
    }


    g_part_xadj = ArrayRCP<part_t>(np + 1);
    g_part_adj = ArrayRCP<part_t>(localNumEdges);
    g_part_ew = ArrayRCP<t_scalar_t>(localNumEdges);
    part_t nindex = 0;
    g_part_xadj[0] = 0;
    std::vector<part_t> part_neighbors(np);
    std::vector<t_scalar_t> part_neighbor_weights(np, 0);
    std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

    //coarsen for all vertices in my part in order with parts.
    for (t_lno_t i = 0; i < np; ++i) {
      part_t num_neighbor_parts = 0;
      t_lno_t v = part_begins[i];
      //get part i, and first vertex in this part v.
      while (v != -1) {
        //now get the neighbors of v.
        for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j) {
          //get the part of the second vertex.
          part_t ep = e_parts[j];

          t_scalar_t ew = 1;
          if (numWeightPerEdge > 0) {
            ew = edge_weights[j];
          }

//          std::cout << "part:" << i << " v:" << v
//            << " part2:" << ep << " v2:" << edgeIds[j]
//            << " w:" << ew << std::endl;

          if (part_neighbor_weights[ep] < 0.00001) {
            part_neighbors[num_neighbor_parts++] = ep;
          }

          part_neighbor_weights[ep] += ew;
        }

        v = part_nexts[v];
      }


      //now get the part list.
      for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
        part_t neighbor_part = part_neighbors[j];
        g_part_adj[nindex] = neighbor_part;
        g_part_ew[nindex++] = part_neighbor_weights[neighbor_part];
        part_neighbor_weights[neighbor_part] = 0;
      }
      g_part_xadj[i + 1] = nindex;
    }
  }

#ifdef HAVE_ZOLTAN2_MPI
  if (comm->getSize() > 1) { // Otherwise it's already handled above and done

    // struct for directory data - note more extensive comments in
    // Zoltan2_GraphMetricsUtility.hpp which I didn't want to duplicate
    // because this is in progress. Similar concept there.
    struct part_info {
      part_info() : weight(0) {
      }

      const part_info & operator+=(const part_info & src) {
        this->weight += src.weight;
        return *this;
      }

      bool operator>(const part_info & src) const {
        return (destination_part > src.destination_part);
      }

      bool operator==(const part_info & src) const {
        return (destination_part == src.destination_part);
      }

      part_t destination_part;
      scalar_t weight;
    };

    bool bUseLocalIDs = false;
    const int debug_level = 0;
    typedef Zoltan2_Directory_Vector<part_t, int, std::vector<part_info>>
      directory_t;
    const RCP<const Comm<int> > rcp_comm(comm, false);
    directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
    std::vector<part_t> part_data;
    std::vector<std::vector<part_info>> user_data;

    envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE Coarsen");
    {
      //get the vertices in each part in my part.
      std::vector<t_lno_t> part_begins(np, -1);
      std::vector<t_lno_t> part_nexts(localNumVertices, -1);

      //cluster vertices according to their parts.
      //create local part graph.
      for (t_lno_t i = 0; i < localNumVertices; ++i) {
        part_t ap = parts[i];
        part_nexts[i] = part_begins[ap];
        part_begins[ap] = i;
      }

      std::vector<part_t> part_neighbors(np);
      std::vector<t_scalar_t> part_neighbor_weights(np, 0);
      std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

      //coarsen for all vertices in my part in order with parts.
      for (t_lno_t i = 0; i < np; ++i) {
        part_t num_neighbor_parts = 0;
        t_lno_t v = part_begins[i];

        //get part i, and first vertex in this part v.
        while (v != -1) {
          //now get the neighbors of v.
          for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j) {
            //get the part of the second vertex.
            part_t ep = e_parts[j];

            t_scalar_t ew = 1;
            if (numWeightPerEdge > 0) {
              ew = edge_weights[j];
            }

            //add it to my local part neighbors for part i.
            if (part_neighbor_weights[ep] < 0.00001) {
              part_neighbors[num_neighbor_parts++] = ep;
            }

            part_neighbor_weights[ep] += ew;
          }

          v = part_nexts[v];
        }

        //now get the part list.
        for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
          part_t neighbor_part = part_neighbors[j];
          part_neighbor_weights_ordered[j] =
            part_neighbor_weights[neighbor_part];
          part_neighbor_weights[neighbor_part] = 0;
        }

        //insert it to tpetra crsmatrix.
        if (num_neighbor_parts > 0) {
          part_data.push_back(i); // TODO: optimize to avoid push_back
          std::vector<part_info> new_user_data(num_neighbor_parts);
          for (int q = 0; q < num_neighbor_parts; ++q) {
            part_info & info = new_user_data[q];
            info.weight = part_neighbor_weights_ordered[q];
            info.destination_part = part_neighbors[q];
          }
          // TODO: optimize to avoid push_back
          user_data.push_back(new_user_data);
        }
      }
    }
    envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE Coarsen");

    std::vector<part_t> part_indices(np);

    for (part_t i = 0; i < np; ++i) part_indices[i] = i;

    envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE directory update");
    directory.update(part_data.size(), &part_data[0], NULL, &user_data[0],
                     NULL, directory_t::Update_Mode::AggregateAdd);
    envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE directory update");

    envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE directory find");
    std::vector<std::vector<part_info>> find_user_data(part_indices.size());
    directory.find(find_user_data.size(), &part_indices[0], NULL,
                   &find_user_data[0], NULL, NULL, false);
    envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE directory find");

    // Now reconstruct the output data from the directory find data.
    // This code was designed to reproduce the exact format of the original
    // setup for g_part_xadj, g_part_adj, and g_part_ew but before making any
    // further changes I wanted to verify if this formatting should be
    // preserved or potentially changed further.

    // first thing is get the total number of elements
    int get_total_length = 0;
    for (size_t n = 0; n < find_user_data.size(); ++n) {
      get_total_length += find_user_data[n].size();
    }

    // setup data space
    g_part_xadj = ArrayRCP<part_t>(np + 1);
    g_part_adj = ArrayRCP<part_t>(get_total_length);
    g_part_ew = ArrayRCP<t_scalar_t>(get_total_length);

    // loop through again and fill to match the original formatting
    int track_insert_index = 0;
    for (size_t n = 0; n < find_user_data.size(); ++n) {
      g_part_xadj[n] = track_insert_index;
      const std::vector<part_info> & user_data_vector = find_user_data[n];
      for (size_t q = 0; q < user_data_vector.size(); ++q) {
        const part_info & info = user_data_vector[q];
        g_part_adj[track_insert_index] = info.destination_part;
        g_part_ew[track_insert_index] = info.weight;
        ++track_insert_index;
      }
    }
    g_part_xadj[np] = get_total_length; // complete the series
  }
#endif // HAVE_ZOLTAN2_MPI
}
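
// The coarse part graph comes back in CSR form. For example (illustrative),
// with np = 3 parts where part 0 talks to parts 1 and 2, and parts 1 and 2
// each talk only to part 0:
//   g_part_xadj = {0, 2, 3, 4}
//   g_part_adj  = {1, 2,  0,  0}
//   g_part_ew   = the summed edge weights crossing each part pair, in the
//                 same order as g_part_adj.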


/*! \brief KmeansHeap class: a max heap, but it holds the minimum values.
 */
template <class IT, class WT>
class KmeansHeap{

  IT heapSize;
  IT *indices;
  WT *values;
  WT epsilon;


public:
  ~KmeansHeap(){
    delete [] this->indices;
    delete [] this->values;
  }

  void setHeapsize(IT heapsize_){
    this->heapSize = heapsize_;
    this->indices = new IT[heapsize_];
    this->values = new WT[heapsize_];
    this->epsilon = std::numeric_limits<WT>::epsilon();
  }

  void addPoint(IT index, WT distance) {
    WT maxVal = this->values[0];
    //add only if the distance is smaller than the maximum distance.
//    std::cout << "indeX:" << index
//      << " distance:" << distance
//      << " maxVal:" << maxVal << endl;
    if (distance >= maxVal)
      return;
    else {
      this->values[0] = distance;
      this->indices[0] = index;
      this->push_down(0);
    }
  }

  //heap push down operation
  void push_down(IT index_on_heap) {
    IT child_index1 = 2 * index_on_heap + 1;
    IT child_index2 = 2 * index_on_heap + 2;

    IT biggerIndex = -1;
    if (child_index1 < this->heapSize && child_index2 < this->heapSize) {

      if (this->values[child_index1] < this->values[child_index2]) {
        biggerIndex = child_index2;
      }
      else {
        biggerIndex = child_index1;
      }
    }
    else if (child_index1 < this->heapSize) {
      biggerIndex = child_index1;

    }
    else if (child_index2 < this->heapSize) {
      biggerIndex = child_index2;
    }
    if (biggerIndex >= 0 &&
        this->values[biggerIndex] > this->values[index_on_heap]) {
      WT tmpVal = this->values[biggerIndex];
      this->values[biggerIndex] = this->values[index_on_heap];
      this->values[index_on_heap] = tmpVal;

      IT tmpIndex = this->indices[biggerIndex];
      this->indices[biggerIndex] = this->indices[index_on_heap];
      this->indices[index_on_heap] = tmpIndex;
      this->push_down(biggerIndex);
    }
  }

  void initValues() {
    WT MAXVAL = std::numeric_limits<WT>::max();

    for (IT i = 0; i < this->heapSize; ++i) {
      this->values[i] = MAXVAL;
      this->indices[i] = -1;
    }
  }

  //returns the total distance to center in the cluster.
  WT getTotalDistance(){

    WT nc = 0;
    for (IT j = 0; j < this->heapSize; ++j) {
      nc += this->values[j];

//      std::cout << "index:" << this->indices[j]
//        << " distance:" << this->values[j] << endl;
    }
    return nc;
  }

  //returns the new center of the cluster.
  bool getNewCenters(WT *center, WT **coords, int dimension) {
    bool moved = false;

    for (int i = 0; i < dimension; ++i) {
      WT nc = 0;

      for (IT j = 0; j < this->heapSize; ++j) {
        IT k = this->indices[j];
//        std::cout << "i:" << i
//          << " dim:" << dimension
//          << " k:" << k
//          << " heapSize:" << heapSize << endl;
        nc += coords[i][k];
      }

      nc /= this->heapSize;
      moved = (std::abs(center[i] - nc) > this->epsilon || moved);
      center[i] = nc;
    }
    return moved;
  }

  void copyCoordinates(IT *permutation) {
    for (IT i = 0; i < this->heapSize; ++i) {
      permutation[i] = this->indices[i];
    }
  }
};
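
// Typical use (illustrative sketch): keep the `heapsize` closest points to a
// cluster center, then read the chosen indices back out.
//
//   KmeansHeap<int, double> heap;
//   heap.setHeapsize(k);
//   heap.initValues();              // fill with max-value sentinels
//   for (int i = 0; i < n; ++i)
//     heap.addPoint(i, dist[i]);    // keeps the k smallest distances
//   heap.copyCoordinates(perm);     // perm[0..k-1] = selected indices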

/*! \brief KMeansCluster class.
 */
template <class IT, class WT>
class KMeansCluster{

  int dimension;
  KmeansHeap<IT,WT> closestPoints;

public:

  WT *center;

  ~KMeansCluster(){
    delete [] center;
  }

  void setParams(int dimension_, int heapsize) {
    this->dimension = dimension_;
    this->center = new WT[dimension_];
    this->closestPoints.setHeapsize(heapsize);
  }

  void clearHeap(){
    this->closestPoints.initValues();
  }

  bool getNewCenters(WT **coords) {
    return this->closestPoints.getNewCenters(center, coords, dimension);
  }

  //returns the distance of the coordinate to the center.
  //also adds it to the heap.
  WT getDistance(IT index, WT **elementCoords) {
    WT distance = 0;

    for (int i = 0; i < this->dimension; ++i) {
      WT d = (center[i] - elementCoords[i][index]);
      distance += d * d;
    }
    distance = pow(distance, WT(1.0 / this->dimension));
    closestPoints.addPoint(index, distance);

    return distance;
  }

  WT getDistanceToCenter(){
    return closestPoints.getTotalDistance();
  }

  void copyCoordinates(IT *permutation) {
    closestPoints.copyCoordinates(permutation);
  }
};

/*! \brief KMeansAlgorithm class: performs a k-means clustering of the
 *  coordinates and returns the closest set of coordinates.
 */
template <class IT, class WT>
class KMeansAlgorithm{

  int dim;
  IT numElements;
  WT **elementCoords;
  IT numClusters;
  IT required_elements;
  KMeansCluster<IT,WT> *clusters;
  WT *maxCoordinates;
  WT *minCoordinates;
public:

  ~KMeansAlgorithm(){
    delete [] clusters;
    delete [] maxCoordinates;
    delete [] minCoordinates;
  }

  /*! \brief KMeansAlgorithm Constructor
   */
  KMeansAlgorithm(
      int dim_ ,
      IT numElements_,
      WT **elementCoords_,
      IT required_elements_):
    dim(dim_),
    numElements(numElements_),
    elementCoords(elementCoords_),
    numClusters((1 << dim_) + 1),
    required_elements(required_elements_) {
    this->clusters = new KMeansCluster<IT,WT>[this->numClusters];

    //set dimension and the number of required elements for all clusters.
    for (int i = 0; i < numClusters; ++i) {
      this->clusters[i].setParams(this->dim, this->required_elements);
    }

    this->maxCoordinates = new WT[this->dim];
    this->minCoordinates = new WT[this->dim];

    //obtain the min and max coordinates for each dimension.
    for (int j = 0; j < dim; ++j) {
      this->minCoordinates[j] = this->elementCoords[j][0];
      this->maxCoordinates[j] = this->elementCoords[j][0];

      for (IT i = 1; i < numElements; ++i) {
        WT t = this->elementCoords[j][i];
        if (t > this->maxCoordinates[j]){
          this->maxCoordinates[j] = t;
        }

        if (t < minCoordinates[j]) {
          this->minCoordinates[j] = t;
        }
      }
    }


    //assign initial cluster centers.
    for (int j = 0; j < dim; ++j) {
      int mod = (1 << (j+1));
      for (int i = 0; i < numClusters - 1; ++i) {
        WT c = 0;

        if ((i % mod) < mod / 2) {
          c = this->maxCoordinates[j];
//          std::cout << "i:" << i << " j:" << j
//            << " setting max:" << c << endl;
        }
        else {
          c = this->minCoordinates[j];
        }

        this->clusters[i].center[j] = c;
      }
    }

    //last cluster center is placed to middle.
    for (int j = 0; j < dim; ++j) {
      this->clusters[numClusters - 1].center[j] =
        (this->maxCoordinates[j] + this->minCoordinates[j]) / 2;
    }


/*
    for (int i = 0; i < numClusters; ++i) {
//      std::cout << endl << "cluster:" << i << endl << "\t";

      for (int j = 0; j < dim; ++j) {
        std::cout << this->clusters[i].center[j] << " ";
      }
    }
*/
  }

  // Performs kmeans clustering of coordinates.
  void kmeans() {
    for (int it = 0; it < 10; ++it) {
//      std::cout << "it:" << it << endl;

      for (IT j = 0; j < this->numClusters; ++j) {
        this->clusters[j].clearHeap();
      }

      for (IT i = 0; i < this->numElements; ++i) {
//        std::cout << "i:" << i << " numEl:" << this->numElements << endl;

        for (IT j = 0; j < this->numClusters; ++j) {
//          std::cout << "j:" << j
//            << " numClusters:" << this->numClusters << endl;

          this->clusters[j].getDistance(i, this->elementCoords);
        }
      }

      bool moved = false;

      for (IT j = 0; j < this->numClusters; ++j) {
        moved =
          (this->clusters[j].getNewCenters(this->elementCoords) || moved);
      }
      if (!moved) {
        break;
      }
    }
  }

  // Finds the cluster in which the coordinates are the closest to each
  // other.
  void getMinDistanceCluster(IT *procPermutation) {

    WT minDistance = this->clusters[0].getDistanceToCenter();
    IT minCluster = 0;

//    std::cout << "j:" << 0 << " minDistance:" << minDistance
//      << " minTmpDistance:" << minDistance
//      << " minCluster:" << minCluster << endl;

    for (IT j = 1; j < this->numClusters; ++j) {
      WT minTmpDistance = this->clusters[j].getDistanceToCenter();

//      std::cout << "j:" << j << " minDistance:" << minDistance
//        << " minTmpDistance:" << minTmpDistance
//        << " minCluster:" << minCluster << endl;

      if (minTmpDistance < minDistance) {
        minDistance = minTmpDistance;
        minCluster = j;
      }
    }

//    std::cout << "minCluster:" << minCluster << endl;
    this->clusters[minCluster].copyCoordinates(procPermutation);
  }
};
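
// Usage sketch (illustrative): this is how getClosestSubset below uses the
// algorithm to pick the `ntasks` processors that lie closest together.
//
//   KMeansAlgorithm<part_t, pcoord_t> kma(minCoordDim, nprocs,
//                                         proc_coords, ntasks);
//   kma.kmeans();                            // at most 10 refinement sweeps
//   kma.getMinDistanceCluster(permutation);  // tightest cluster wins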


#define MINOF(a,b) (((a)<(b))?(a):(b))

/*! \brief fillContinousArray function: fills the given array with values.
 *  If val is NULL, the array is filled with 0, 1, 2, ...; otherwise every
 *  entry is set to *val.
 */
template <typename T>
void fillContinousArray(T *arr, size_t arrSize, T *val) {
  if (val == NULL) {

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i) {
      arr[i] = i;
    }

  }
  else {
    T v = *val;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i) {
//      std::cout << "writing to i:" << i << " arr:" << arrSize << endl;
      arr[i] = v;
    }
  }
}
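
// For example (illustrative):
//   part_t buf[4];
//   fillContinousArray<part_t>(buf, 4, NULL);  // buf = {0, 1, 2, 3}
//   part_t zero = 0;
//   fillContinousArray<part_t>(buf, 4, &zero); // buf = {0, 0, 0, 0}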

/*! \brief CommunicationModel base class that performs the mapping between
 *  the coordinate partitioning result and the processors.
 */
template <typename part_t, typename pcoord_t, typename node_t>
class CommunicationModel{

protected:
  double commCost;

public:

  // Number of processors and number of tasks
  part_t no_procs;
  part_t no_tasks;

  CommunicationModel(): commCost(0), no_procs(0), no_tasks(0) {}
  CommunicationModel(part_t no_procs_, part_t no_tasks_):
    commCost(),
    no_procs(no_procs_),
    no_tasks(no_tasks_) {}

  virtual ~CommunicationModel() {}

  part_t getNProcs() const {
    return this->no_procs;
  }

  part_t getNTasks() const {
    return this->no_tasks;
  }

  void calculateCommunicationCost(
      part_t *task_to_proc,
      part_t *task_communication_xadj,
      part_t *task_communication_adj,
      pcoord_t *task_communication_edge_weight) {

    double totalCost = 0;

    part_t commCount = 0;
    for (part_t task = 0; task < this->no_tasks; ++task) {
      int assigned_proc = task_to_proc[task];

      part_t task_adj_begin = task_communication_xadj[task];
      part_t task_adj_end = task_communication_xadj[task + 1];

      commCount += task_adj_end - task_adj_begin;

      for (part_t task2 = task_adj_begin; task2 < task_adj_end; ++task2) {

        part_t neighborTask = task_communication_adj[task2];
        int neighborProc = task_to_proc[neighborTask];
        double distance = getProcDistance(assigned_proc, neighborProc);

        if (task_communication_edge_weight == NULL) {
          totalCost += distance;
        }
        else {
          totalCost += distance * task_communication_edge_weight[task2];
        }
      }
    }

    // commCount
    this->commCost = totalCost;
  }

  double getCommunicationCostMetric() {
    return this->commCost;
  }

  virtual double getProcDistance(int procId1, int procId2) const = 0;

  /*! \brief Computes the mapping of processors to tasks.
   *
   *  \param proc_to_task_xadj holds the beginning index of the tasks
   *         on proc_to_task_adj that are assigned to each processor.
   *  \param proc_to_task_adj holds the task ids, indexed by
   *         proc_to_task_xadj.
   *  \param task_to_proc holds the processor assigned to each task.
   */
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP <part_t> &proc_to_task_xadj,
      ArrayRCP <part_t> &proc_to_task_adj,
      ArrayRCP <part_t> &task_to_proc,
      const Teuchos::RCP <const Teuchos::Comm<int> > comm_
      ) const = 0;
};
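
// Cost model (illustrative): for every directed task edge (t, u) with
// weight w, the cost adds distance(proc(t), proc(u)) * w. For example, two
// neighboring tasks with w = 2 placed on processors at hop distance 3
// contribute 2 * 3 = 6 to commCost.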


/*! \brief CoordinateCommunicationModel: models the communication cost
 *  using the coordinates of the processors and the tasks.
 */
template <typename pcoord_t, typename tcoord_t, typename part_t, typename node_t>
class CoordinateCommunicationModel :
  public CommunicationModel<part_t, pcoord_t, node_t> {
public:
  //private:

  // Dimension of the processors
  int proc_coord_dim;
  // Processor coordinates (allocated outside of the class)
  pcoord_t **proc_coords;
  // Dimension of the tasks coordinates.
  int task_coord_dim;
  // Task coordinates (allocated outside of the class)
  tcoord_t **task_coords;

  // TODO: Perhaps delete this and just reference the view size?
  // Need to check the handling of size -1 versus size 0
  int partArraySize;

  Kokkos::View<part_t *, Kokkos::HostSpace> kokkos_partNoArray;

  int *machine_extent;
  bool *machine_extent_wrap_around;
  const MachineRepresentation<pcoord_t,part_t> *machine;

  part_t num_ranks_per_node;
  bool divide_to_prime_first;

  //public:
  CoordinateCommunicationModel():
    CommunicationModel<part_t, pcoord_t, node_t>(),
    proc_coord_dim(0),
    proc_coords(0),
    task_coord_dim(0),
    task_coords(0),
    partArraySize(-1),
    machine_extent(NULL),
    machine_extent_wrap_around(NULL),
    machine(NULL),
    num_ranks_per_node(1),
    divide_to_prime_first(false){}

  virtual ~CoordinateCommunicationModel() {}

  /*! \brief Class Constructor:
   *  \param pcoord_dim_ the dimension of the processors
   *  \param pcoords_ the processor coordinates (allocated outside of
   *         the class)
   *  \param tcoord_dim_ the dimension of the tasks
   *  \param tcoords_ the task coordinates (allocated outside of the
   *         class)
   *  \param no_procs_ the number of processors
   *  \param no_tasks_ the number of tasks
   */
  CoordinateCommunicationModel(
      int pcoord_dim_,
      pcoord_t **pcoords_,
      int tcoord_dim_,
      tcoord_t **tcoords_,
      part_t no_procs_,
      part_t no_tasks_,
      int *machine_extent_,
      bool *machine_extent_wrap_around_,
      const MachineRepresentation<pcoord_t,part_t> *machine_ = NULL
      ):
    CommunicationModel<part_t, pcoord_t, node_t>(no_procs_, no_tasks_),
    proc_coord_dim(pcoord_dim_), proc_coords(pcoords_),
    task_coord_dim(tcoord_dim_), task_coords(tcoords_),
    partArraySize(-1),
    machine_extent(machine_extent_),
    machine_extent_wrap_around(machine_extent_wrap_around_),
    machine(machine_),
    num_ranks_per_node(1),
    divide_to_prime_first(false) {
  }


  void setPartArraySize(int psize) {
    this->partArraySize = psize;
  }

  void setPartArray(Kokkos::View<part_t *, Kokkos::HostSpace> pNo) {
    this->kokkos_partNoArray = pNo;
  }

  /*! \brief Function is called whenever nprocs > no_task.
   *  Returns only the subset of processors that are closest to each
   *  other.
   *
   *  \param proc_permutation holds the indices of the processors that
   *         are chosen.
   *  \param nprocs the number of processors.
   *  \param ntasks the number of tasks.
   */
  void getClosestSubset(part_t *proc_permutation, part_t nprocs,
                        part_t ntasks) const{
    //returns the subset of processors closest to each other (kmeans).

    part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);
    KMeansAlgorithm<part_t, pcoord_t> kma(
        minCoordDim, nprocs,
        this->proc_coords, ntasks);

    kma.kmeans();
    kma.getMinDistanceCluster(proc_permutation);

    for (int i = ntasks; i < nprocs; ++i) {
      proc_permutation[i] = -1;
    }
    /*
    //fill array.
    fillContinousArray<part_t>(proc_permutation, nprocs, NULL);

    int _u_umpa_seed = 847449649;
    srand (time(NULL));

    int a = rand() % 1000 + 1;
    _u_umpa_seed -= a;

    //permute array randomly.
    update_visit_order(proc_permutation, nprocs, _u_umpa_seed, 1);
    */
  }

  // Temporary, necessary for random permutation.
  static part_t umpa_uRandom(part_t l, int &_u_umpa_seed)
  {
    int a = 16807;
    int m = 2147483647;
    int q = 127773;
    int r = 2836;
    int lo, hi, test;
    double d;

    lo = _u_umpa_seed % q;
    hi = _u_umpa_seed / q;
    test = (a * lo) - (r * hi);
    if (test > 0)
      _u_umpa_seed = test;
    else
      _u_umpa_seed = test + m;

    d = (double) ((double) _u_umpa_seed / (double) m);

    return (part_t) (d * (double)l);
  }
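
  // This is the Park-Miller minimal standard generator (a = 16807,
  // m = 2^31 - 1) using Schrage's factorization to avoid overflow; it
  // returns a pseudo-random value in [0, l). For example (illustrative),
  // starting from seed = 1 the first call updates the seed to 16807.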

  virtual double getProcDistance(int procId1, int procId2) const {
    pcoord_t distance = 0;
    if (machine == NULL) {
      for (int i = 0; i < this->proc_coord_dim; ++i) {
        double d =
          std::abs(proc_coords[i][procId1] - proc_coords[i][procId2]);
        if (machine_extent_wrap_around && machine_extent_wrap_around[i]) {
          if (machine_extent[i] - d < d) {
            d = machine_extent[i] - d;
          }
        }
        distance += d;
      }
    }
    else {
      this->machine->getHopCount(procId1, procId2, distance);
    }

    return distance;
  }


  // Temporary, does random permutation.
  void update_visit_order(part_t* visitOrder, part_t n,
                          int &_u_umpa_seed, part_t rndm) {
    part_t *a = visitOrder;

    if (rndm) {
      part_t i, u, v, tmp;

      if (n <= 4)
        return;

//      srand ( time(NULL) );
//      _u_umpa_seed = _u_umpa_seed1 - (rand()%100);

      for (i = 0; i < n; i += 16) {
        u = umpa_uRandom(n-4, _u_umpa_seed);
        v = umpa_uRandom(n-4, _u_umpa_seed);

        // FIXME (mfh 30 Sep 2015) This requires including
        // Zoltan2_AlgMultiJagged.hpp.

        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v], a[u], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 1], a[u + 1], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 2], a[u + 2], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 3], a[u + 3], tmp);
      }
    }
    else {
      part_t i, end = n / 4;

      for (i = 1; i < end; i++) {
        part_t j = umpa_uRandom(n - i, _u_umpa_seed);
        part_t t = a[j];
        a[j] = a[n - i];
        a[n - i] = t;
      }
    }
//    PermuteInPlace(visitOrder, n);
  }



  /*! \brief Computes the processor-to-task mapping.
   *
   *  \param myRank the rank of the calling processor.
   *  \param rcp_proc_to_task_xadj holds the beginning index of the tasks
   *         on rcp_proc_to_task_adj that are assigned to each processor.
   *  \param rcp_proc_to_task_adj holds the task ids assigned to the
   *         processors.
   *  \param rcp_task_to_proc holds the processor assigned to each task.
   */
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP <part_t> &rcp_proc_to_task_xadj,
      ArrayRCP <part_t> &rcp_proc_to_task_adj,
      ArrayRCP <part_t> &rcp_task_to_proc,
      const Teuchos::RCP <const Teuchos::Comm<int> > comm_
      ) const {

    rcp_proc_to_task_xadj = ArrayRCP <part_t>(this->no_procs + 1);
    rcp_proc_to_task_adj = ArrayRCP <part_t>(this->no_tasks);
    rcp_task_to_proc = ArrayRCP <part_t>(this->no_tasks);

    // Holds the pointer to the task array
    part_t *proc_to_task_xadj = rcp_proc_to_task_xadj.getRawPtr();

    // Holds the indices of tasks wrt to proc_to_task_xadj array.
    part_t *proc_to_task_adj = rcp_proc_to_task_adj.getRawPtr();

    // Holds the processors mapped to tasks.
    part_t *task_to_proc = rcp_task_to_proc.getRawPtr();

    part_t invalid = 0;
    fillContinousArray<part_t> (proc_to_task_xadj, this->no_procs + 1,
                                &invalid);

    // Obtain the number of parts that should be divided.
    part_t num_parts = MINOF(this->no_procs, this->no_tasks);

    // Obtain the min coordinate dim.
    // No more want to do min coord dim. If machine dimension > task_dim,
    // we end up with a long line.
//    part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);

    int recursion_depth = partArraySize;

//    if (partArraySize < minCoordDim)
//      recursion_depth = minCoordDim;
    if (partArraySize == -1) {

      if (divide_to_prime_first) {
        // It is difficult to estimate the number of steps in this case
        // as each branch will have different depth.
        // The worst case happens when all prime factors are 3s.
        // P = 3^n, n recursion depth will divide parts to 2x and x
        // and n recursion depth with divide 2x into x and x.
        // Set it to upperbound here.
        // We could calculate the exact value here as well, but the
        // partitioning algorithm skips further ones anyways.
        recursion_depth = log(float(this->no_procs)) / log(2.0) * 2 + 1;
      }
      else {
        recursion_depth = log(float(this->no_procs)) / log(2.0) + 1;
      }
    }

    // Number of permutations for tasks and processors
    int taskPerm = 1;
    int procPerm = 1;

    // Get number of different permutations for task dimension ordering
    if (this->task_coord_dim <= 8)
      taskPerm = z2Fact<int>(this->task_coord_dim);
    // Prevent overflow
    else
      taskPerm = z2Fact<int>(8);

    // Get number of different permutations for proc dimension ordering
    if (this->proc_coord_dim <= 8)
      procPerm = z2Fact<int>(this->proc_coord_dim);
    // Prevent overflow
    else
      procPerm = z2Fact<int>(8);


    // Total number of permutations (both task and proc permuted)
    int permutations = taskPerm * procPerm;

    // Add permutations where we divide the processors with longest
    // dimension but permute tasks.
    permutations += taskPerm;

    // Add permutations where we divide the tasks with longest
    // dimension but permute procs.
    permutations += procPerm;

    // Add the permutation with both tasks and procs divided by longest
    // dimension.
    permutations += 1;
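
    // For example (illustrative): with 3-D task and processor grids,
    // taskPerm = procPerm = 3! = 6, so permutations = 6*6 + 6 + 6 + 1 = 49
    // distinct mapping attempts that different ranks can try concurrently.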

    // Holds the pointers to proc_adjList
    part_t *proc_xadj = new part_t[num_parts + 1];

    // Holds the processors in parts according to the result of
    // partitioning algorithm.
    // The processors assigned to part x is at
    // proc_adjList[ proc_xadj[x] : proc_xadj[x + 1] ]
    part_t *proc_adjList = new part_t[this->no_procs];


    part_t used_num_procs = this->no_procs;
    if (this->no_procs > this->no_tasks) {
      // Obtain the subset of the processors that are closest to each
      // other.
      this->getClosestSubset(proc_adjList, this->no_procs,
                             this->no_tasks);
      used_num_procs = this->no_tasks;
    }
    else {
      fillContinousArray<part_t>(proc_adjList, this->no_procs, NULL);
    }

    // Index of the permutation
    int myPermutation = myRank % permutations;
    bool task_partition_along_longest_dim = false;
    bool proc_partition_along_longest_dim = false;


    int myProcPerm = 0;
    int myTaskPerm = 0;

    if (myPermutation == 0) {
      task_partition_along_longest_dim = true;
      proc_partition_along_longest_dim = true;
    }
    else {
      --myPermutation;
      if (myPermutation < taskPerm) {
        proc_partition_along_longest_dim = true;
        // Index of the task permutation
        myTaskPerm = myPermutation;
      }
      else {
        myPermutation -= taskPerm;
        if (myPermutation < procPerm) {
          task_partition_along_longest_dim = true;
          // Index of the proc permutation
          myProcPerm = myPermutation;
        }
        else {
          myPermutation -= procPerm;
          // Index of the proc permutation
          myProcPerm = myPermutation % procPerm;
          // Index of the task permutation
          myTaskPerm = myPermutation / procPerm;
        }
      }
    }

/*
    if (task_partition_along_longest_dim &&
        proc_partition_along_longest_dim) {
      std::cout << "me:" << myRank << " task:longest proc:longest"
        << " numPerms:" << permutations << std::endl;
    }
    else if (proc_partition_along_longest_dim) {
      std::cout << "me:" << myRank << " task:" << myTaskPerm
        << " proc:longest" << " numPerms:" << permutations << std::endl;
    }
    else if (task_partition_along_longest_dim) {
      std::cout << "me:" << myRank << " task: longest" << " proc:"
        << myProcPerm << " numPerms:" << permutations << std::endl;
    }
    else {
      std::cout << "me:" << myRank << " task:" << myTaskPerm << " proc:"
        << myProcPerm << " numPerms:" << permutations << std::endl;
    }
*/

    int *permutation = new int[(this->proc_coord_dim > this->task_coord_dim)
                               ? this->proc_coord_dim
                               : this->task_coord_dim];

    // Get the permutation order from the proc permutation index.
    if (this->proc_coord_dim <= 8)
      ithPermutation<int>(this->proc_coord_dim, myProcPerm, permutation);
    else
      ithPermutation<int>(8, myProcPerm, permutation);

/*
    // Reorder the coordinate dimensions.
    pcoord_t **pcoords = allocMemory<pcoord_t *>(this->proc_coord_dim);
    for (int i = 0; i < this->proc_coord_dim; ++i) {
      pcoords[i] = this->proc_coords[permutation[i]];
//      std::cout << permutation[i] << " ";
    }
*/

    int procdim = this->proc_coord_dim;
    pcoord_t **pcoords = this->proc_coords;

/*
    int procdim = this->proc_coord_dim;
    procdim = 6;
    //reorder the coordinate dimensions.
    pcoord_t **pcoords = allocMemory<pcoord_t *>(procdim);
    for (int i = 0; i < procdim; ++i) {
      pcoords[i] = new pcoord_t[used_num_procs];
//      this->proc_coords[permutation[i]];
    }

    for (int k = 0; k < used_num_procs; k++) {
      pcoords[0][k] = (int (this->proc_coords[0][k]) / 2) * 64;
      pcoords[3][k] = (int (this->proc_coords[0][k]) % 2) * 8;

      pcoords[1][k] = (int (this->proc_coords[1][k]) / 2) * 8 * 2400;
      pcoords[4][k] = (int (this->proc_coords[1][k]) % 2) * 8;
      pcoords[2][k] = ((int (this->proc_coords[2][k])) / 8) * 160;
      pcoords[5][k] = ((int (this->proc_coords[2][k])) % 8) * 5;

      //if (this->proc_coords[0][k] == 40 &&
      //    this->proc_coords[1][k] == 8 &&
      //    this->proc_coords[2][k] == 48) {
      if (this->proc_coords[0][k] == 5 &&
          this->proc_coords[1][k] == 0 &&
          this->proc_coords[2][k] == 10) {
        std::cout << "pcoords[0][k]:" << pcoords[0][k]
          << "pcoords[1][k]:" << pcoords[1][k]
          << "pcoords[2][k]:" << pcoords[2][k]
          << "pcoords[3][k]:" << pcoords[3][k]
          << "pcoords[4][k]:" << pcoords[4][k]
          << "pcoords[5][k]:" << pcoords[5][k] << std::endl;
      }
      else if (pcoords[0][k] == 64 &&
               pcoords[1][k] == 0 &&
               pcoords[2][k] == 160 &&
               pcoords[3][k] == 16 &&
               pcoords[4][k] == 0 &&
               pcoords[5][k] == 10) {
        std::cout << "this->proc_coords[0][k]:" << this->proc_coords[0][k]
          << "this->proc_coords[1][k]:" << this->proc_coords[1][k]
          << "this->proc_coords[2][k]:" << this->proc_coords[2][k]
          << std::endl;
      }
    }
*/

//    if (partNoArray == NULL)
//      std::cout << "partNoArray is null" << std::endl;
//    std::cout << "recursion_depth:" << recursion_depth
//      << " partArraySize:" << partArraySize << std::endl;

    // Optimization for Dragonfly Networks, First Level of partitioning
    // is imbalanced to ensure procs are divided by first RCA
    // coord (a.k.a. group).
    part_t num_group_count = 1;
    part_t *group_count = NULL;

    if (machine != NULL)
      num_group_count = machine->getNumUniqueGroups();

    if (num_group_count > 1) {
      group_count = new part_t[num_group_count];
      memset(group_count, 0, sizeof(part_t) * num_group_count);

      machine->getGroupCount(group_count);
    }

    // Do the partitioning and renumber the parts.
    env->timerStart(MACRO_TIMERS, "Mapping - Proc Partitioning");
    // Partitioning of Processors
    AlgMJ<pcoord_t, part_t, part_t, part_t, node_t> mj_partitioner;

    typedef typename node_t::device_type device_t;
    // coordinates in MJ are LayoutLeft since Tpetra Multivector gives
    // LayoutLeft
    Kokkos::View<pcoord_t**, Kokkos::LayoutLeft, device_t>
      kokkos_pcoords("pcoords", this->no_procs, procdim);
    auto host_kokkos_pcoords = Kokkos::create_mirror_view(kokkos_pcoords);
    for (int i = 0; i < procdim; ++i) {
      for (int j = 0; j < this->no_procs; ++j) {
        host_kokkos_pcoords(j,i) = pcoords[i][j];
      }
    }
    Kokkos::deep_copy(kokkos_pcoords, host_kokkos_pcoords);

    Kokkos::View<part_t*, device_t>
      initial_selected_coords_output_permutation_pcoords(
        "initial_selected_coords_output_permutation_pcoords",
        this->no_procs);
    typename Kokkos::View<part_t*, device_t>::host_mirror_type
      host_initial_selected_coords_output_permutation_pcoords =
        Kokkos::create_mirror_view(
          initial_selected_coords_output_permutation_pcoords);
    for (int n = 0; n < this->no_procs; ++n) {
      host_initial_selected_coords_output_permutation_pcoords(n) =
        proc_adjList[n];
    }
    Kokkos::deep_copy(initial_selected_coords_output_permutation_pcoords,
      host_initial_selected_coords_output_permutation_pcoords);

    // Note num_group_count = 1 when group_count = NULL - perhaps could
    // change
    Kokkos::View<part_t *, Kokkos::HostSpace> kokkos_group_count(
      "kokkos_group_count", group_count ? num_group_count : 0);
    if (group_count) {
      for (int n = 0; n < num_group_count; ++n) {
        kokkos_group_count(n) = group_count[n];
      }
    }

    mj_partitioner.sequential_task_partitioning(
      env,
      this->no_procs,
      used_num_procs,
      num_parts,
      procdim,
      //minCoordDim,
      kokkos_pcoords,
      initial_selected_coords_output_permutation_pcoords,
      proc_xadj,
      recursion_depth,
      kokkos_partNoArray,
      proc_partition_along_longest_dim, // false
      num_ranks_per_node,
      divide_to_prime_first,
      num_group_count,
      kokkos_group_count);
    env->timerStop(MACRO_TIMERS, "Mapping - Proc Partitioning");
//    comm_->barrier();
//    std::cout << "mj_partitioner.for procs over" << std::endl;
//    freeArray<pcoord_t *>(pcoords);

    Kokkos::deep_copy(host_initial_selected_coords_output_permutation_pcoords,
      initial_selected_coords_output_permutation_pcoords);
    for (int n = 0; n < this->no_procs; ++n) {
      proc_adjList[n] =
        host_initial_selected_coords_output_permutation_pcoords(n);
    }

    part_t *task_xadj = new part_t[num_parts + 1];
    part_t *task_adjList = new part_t[this->no_tasks];

    // Fill task_adjList st: task_adjList[i] <- i.
    fillContinousArray<part_t>(task_adjList, this->no_tasks, NULL);

    // Get the permutation order from the task permutation index.
    if (this->task_coord_dim <= 8)
      ithPermutation<int>(this->task_coord_dim, myTaskPerm, permutation);
    else
      ithPermutation<int>(8, myTaskPerm, permutation);

    // Reorder task coordinate dimensions.
    tcoord_t **tcoords = new tcoord_t*[this->task_coord_dim];
    for (int i = 0; i < this->task_coord_dim; ++i) {
      tcoords[i] = this->task_coords[permutation[i]];
    }

    // coordinates in MJ are LayoutLeft since Tpetra Multivector gives
    // LayoutLeft
    Kokkos::View<tcoord_t**, Kokkos::LayoutLeft, device_t>
      kokkos_tcoords("tcoords", this->no_tasks, this->task_coord_dim);
    auto host_kokkos_tcoords = Kokkos::create_mirror_view(kokkos_tcoords);
    for (int i = 0; i < this->task_coord_dim; ++i) {
      for (int j = 0; j < this->no_tasks; ++j) {
        host_kokkos_tcoords(j,i) = tcoords[i][j];
      }
    }
    Kokkos::deep_copy(kokkos_tcoords, host_kokkos_tcoords);

    env->timerStart(MACRO_TIMERS, "Mapping - Task Partitioning");

    Kokkos::View<part_t*, device_t>
      initial_selected_coords_output_permutation_tcoords(
        "initial_selected_coords_output_permutation_tcoords",
        this->no_tasks);
    typename Kokkos::View<part_t*, device_t>::host_mirror_type
      host_initial_selected_coords_output_permutation_tcoords =
        Kokkos::create_mirror_view(
          initial_selected_coords_output_permutation_tcoords);
    for (int n = 0; n < this->no_tasks; ++n) {
      host_initial_selected_coords_output_permutation_tcoords(n) =
        task_adjList[n];
    }
    Kokkos::deep_copy(initial_selected_coords_output_permutation_tcoords,
      host_initial_selected_coords_output_permutation_tcoords);

    //partitioning of tasks
    mj_partitioner.sequential_task_partitioning(
      env,
      this->no_tasks,
      this->no_tasks,
      num_parts,
      this->task_coord_dim,
      //minCoordDim,
      kokkos_tcoords,
      initial_selected_coords_output_permutation_tcoords,
      task_xadj,
      recursion_depth,
      kokkos_partNoArray,
      task_partition_along_longest_dim,
      num_ranks_per_node,
      divide_to_prime_first,
      num_group_count,
      kokkos_group_count);
    env->timerStop(MACRO_TIMERS, "Mapping - Task Partitioning");

    Kokkos::deep_copy(host_initial_selected_coords_output_permutation_tcoords,
      initial_selected_coords_output_permutation_tcoords);
    for (int n = 0; n < this->no_tasks; ++n) {
      task_adjList[n] =
        host_initial_selected_coords_output_permutation_tcoords(n);
    }

//    std::cout << "myrank:" << myRank << std::endl;
//    comm_->barrier();
//    std::cout << "mj_partitioner.sequential_task_partitioning over"
//      << std::endl;

    delete [] tcoords;
    delete [] permutation;


    //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
    for (part_t i = 0; i < num_parts; ++i) {

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t proc_index_end = proc_xadj[i + 1];
      part_t task_end_index = task_xadj[i + 1];


      if (proc_index_end - proc_index_begin != 1) {
        std::cerr << "Error at partitioning of processors" << std::endl;
        std::cerr << "PART:" << i << " is assigned to "
                  << proc_index_end - proc_index_begin << " processors."
                  << std::endl;
        std::terminate();
      }
      part_t assigned_proc = proc_adjList[proc_index_begin];
      proc_to_task_xadj[assigned_proc] = task_end_index - task_begin_index;
    }

    //holds the pointer to the task array
    //convert proc_to_task_xadj to CSR index array
    part_t *proc_to_task_xadj_work = new part_t[this->no_procs];
    part_t sum = 0;
    for (part_t i = 0; i < this->no_procs; ++i) {
      part_t tmp = proc_to_task_xadj[i];
      proc_to_task_xadj[i] = sum;
      sum += tmp;
      proc_to_task_xadj_work[i] = sum;
    }
    proc_to_task_xadj[this->no_procs] = sum;
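
    // Prefix-sum example (illustrative): if the per-processor task counts
    // are {2, 0, 1}, the loop above turns proc_to_task_xadj into
    // {0, 2, 2, 3} while proc_to_task_xadj_work holds the end offsets
    // {2, 2, 3}, which the next loop decrements as it scatters task ids.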

    for (part_t i = 0; i < num_parts; ++i){

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t task_end_index = task_xadj[i + 1];

      part_t assigned_proc = proc_adjList[proc_index_begin];

      for (part_t j = task_begin_index; j < task_end_index; ++j) {
        part_t taskId = task_adjList[j];

        task_to_proc[taskId] = assigned_proc;

        proc_to_task_adj[--proc_to_task_xadj_work[assigned_proc]] = taskId;
      }
    }

/*
    if (myPermutation == 0) {
      std::ofstream gnuPlotCode("mymapping.out", std::ofstream::out);

      for (part_t i = 0; i < num_parts; ++i) {

        part_t proc_index_begin = proc_xadj[i];
        part_t proc_index_end = proc_xadj[i + 1];

        if (proc_index_end - proc_index_begin != 1) {
          std::cerr << "Error at partitioning of processors" << std::endl;
          std::cerr << "PART:" << i << " is assigned to "
            << proc_index_end - proc_index_begin << " processors."
            << std::endl;
          exit(1);
        }

        part_t assigned_proc = proc_adjList[proc_index_begin];
        gnuPlotCode << "Rank:" << i << " "
          << this->proc_coords[0][assigned_proc] << " "
          << this->proc_coords[1][assigned_proc] << " "
          << this->proc_coords[2][assigned_proc] << " "
          << pcoords[0][assigned_proc] << " "
          << pcoords[1][assigned_proc] << " "
          << pcoords[2][assigned_proc] << " "
          << pcoords[3][assigned_proc] << std::endl;
      }

      gnuPlotCode << "Machine Extent:" << std::endl;
      //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
      for (part_t i = 0; i < num_parts; ++i) {

        part_t proc_index_begin = proc_xadj[i];
        part_t proc_index_end = proc_xadj[i + 1];

        if (proc_index_end - proc_index_begin != 1) {
          std::cerr << "Error at partitioning of processors" << std::endl;
          std::cerr << "PART:" << i << " is assigned to "
            << proc_index_end - proc_index_begin << " processors."
            << std::endl;
          exit(1);
        }

        part_t assigned_proc = proc_adjList[proc_index_begin];
        gnuPlotCode << "Rank:" << i << " "
          << this->proc_coords[0][assigned_proc] << " "
          << this->proc_coords[1][assigned_proc] << " "
          << this->proc_coords[2][assigned_proc] << std::endl;
      }
      gnuPlotCode.close();
    }
*/

    delete [] proc_to_task_xadj_work;
    delete [] task_xadj;
    delete [] task_adjList;
    delete [] proc_xadj;
    delete [] proc_adjList;
  }

};

template <typename Adapter, typename part_t>
class CoordinateTaskMapper: public PartitionMapping<Adapter>{
protected:

#ifndef DOXYGEN_SHOULD_SKIP_THIS

  typedef typename Adapter::scalar_t pcoord_t;
  typedef typename Adapter::scalar_t tcoord_t;
  typedef typename Adapter::scalar_t scalar_t;
  typedef typename Adapter::lno_t lno_t;

#if defined(KOKKOS_ENABLE_CUDA)
  using node_t = Tpetra::KokkosCompat::KokkosDeviceWrapperNode<
    Kokkos::Cuda, Kokkos::CudaSpace>;
#elif defined(KOKKOS_ENABLE_HIP)
  using node_t = Tpetra::KokkosCompat::KokkosDeviceWrapperNode<
    Kokkos::HIP, Kokkos::HIPSpace>;
#else
  using node_t = typename Adapter::node_t;
#endif

#endif

//  RCP<const Environment> env;

  // Holds the pointer to the task array
  ArrayRCP<part_t> proc_to_task_xadj;
//    = allocMemory<part_t> (this->no_procs + 1);

  // Holds the indices of tasks wrt to proc_to_task_xadj array.
  ArrayRCP<part_t> proc_to_task_adj;
//    = allocMemory<part_t>(this->no_tasks);

  // Holds the processors mapped to tasks.
  ArrayRCP<part_t> task_to_proc;
//    = allocMemory<part_t>(this->no_procs);

  // Holds the processors mapped to tasks.
  ArrayRCP<part_t> local_task_to_rank;
//    = allocMemory<part_t>(this->no_procs);

  bool isOwnerofModel;
  CoordinateCommunicationModel<pcoord_t,tcoord_t,part_t,node_t> *proc_task_comm;
  part_t nprocs;
  part_t ntasks;
  ArrayRCP<part_t> task_communication_xadj;
  ArrayRCP<part_t> task_communication_adj;
  ArrayRCP<scalar_t> task_communication_edge_weight;


  /*! \brief doMapping function, calls getMapping function of
   *  communicationModel object.
   */
  void doMapping(int myRank,
                 const Teuchos::RCP<const Teuchos::Comm<int> > comm_) {

    if (this->proc_task_comm) {
      this->proc_task_comm->getMapping(
        myRank,
        this->env,
        // Holds the pointer to the task array
        this->proc_to_task_xadj,
        // Holds the indices of task wrt to proc_to_task_xadj array
        this->proc_to_task_adj,
        // Holds the processors mapped to tasks
        this->task_to_proc,
        comm_
        );
    }
    else {
      std::cerr << "communicationModel is not specified in the Mapper"
                << std::endl;
      exit(1);
    }
  }


  /*! \brief Creates and returns the subcommunicator for the processor
   *  group.
   */
  RCP<Comm<int> > create_subCommunicator() {
    int procDim = this->proc_task_comm->proc_coord_dim;
    int taskDim = this->proc_task_comm->task_coord_dim;

    // Number of permutations for tasks and processors
    int taskPerm = 1;
    int procPerm = 1;

    // Get number of different permutations for task dimension ordering
    if (taskDim <= 8)
      taskPerm = z2Fact<int>(taskDim);
    // Prevent overflow
    else
      taskPerm = z2Fact<int>(8);

    // Get number of different permutations for proc dimension ordering
    if (procDim <= 8)
      procPerm = z2Fact<int>(procDim);
    // Prevent overflow
    else
      procPerm = z2Fact<int>(8);

    // Total number of permutations
    int idealGroupSize = taskPerm * procPerm;

    // For the one that does longest dimension partitioning.
    idealGroupSize += taskPerm + procPerm + 1;

    int myRank = this->comm->getRank();
    int commSize = this->comm->getSize();

    int myGroupIndex = myRank / idealGroupSize;

    int prevGroupBegin = (myGroupIndex - 1) * idealGroupSize;
    if (prevGroupBegin < 0) prevGroupBegin = 0;
    int myGroupBegin = myGroupIndex * idealGroupSize;
    int myGroupEnd = (myGroupIndex + 1) * idealGroupSize;
    int nextGroupEnd = (myGroupIndex + 2) * idealGroupSize;

    if (myGroupEnd > commSize) {
      myGroupBegin = prevGroupBegin;
      myGroupEnd = commSize;
    }
    if (nextGroupEnd > commSize) {
      myGroupEnd = commSize;
    }
    int myGroupSize = myGroupEnd - myGroupBegin;

    part_t *myGroup = new part_t[myGroupSize];
    for (int i = 0; i < myGroupSize; ++i) {
      myGroup[i] = myGroupBegin + i;
    }
//    std::cout << "me:" << myRank << " myGroupBegin:" << myGroupBegin
//      << " myGroupEnd:" << myGroupEnd << endl;

    ArrayView<const part_t> myGroupView(myGroup, myGroupSize);

    RCP<Comm<int> > subComm =
      this->comm->createSubcommunicator(myGroupView);
    delete [] myGroup;
    return subComm;
  }
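
  // Grouping example (illustrative): with 3-D coordinates on both sides,
  // idealGroupSize = 6*6 + 6 + 6 + 1 = 49. On 100 ranks this yields groups
  // [0, 49) and [49, 98); the 2-rank remainder [98, 100) is merged into the
  // second group, so ranks 49..99 search permutations together.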


  /*! \brief Finds the lowest cost mapping and broadcasts it to everyone
   *  in the subcommunicator.
   */
  void getBestMapping() {
    //create the sub group.
    RCP<Comm<int> > subComm = this->create_subCommunicator();
    //calculate cost.
    double myCost = this->proc_task_comm->getCommunicationCostMetric();
//    std::cout << "me:" << this->comm->getRank() << " myCost:"
//      << myCost << std::endl;
    double localCost[2], globalCost[2];

    localCost[0] = myCost;
    localCost[1] = double(subComm->getRank());

    globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
    Teuchos::Zoltan2_ReduceBestMapping<int,double> reduceBest;
    reduceAll<int, double>(*subComm, reduceBest,
                           2, localCost, globalCost);

    int sender = int(globalCost[1]);

/*
    if (this->comm->getRank() == 0) {
      std::cout << "me:" << localCost[1] <<
        " localcost:" << localCost[0] <<
        " bestcost:" << globalCost[0] <<
        " Sender:" << sender <<
        " procDim" << proc_task_comm->proc_coord_dim <<
        " taskDim:" << proc_task_comm->task_coord_dim << std::endl;
    }
*/

//    std::cout << "me:" << localCost[1] << " localcost:" << localCost[0]
//      << " bestcost:" << globalCost[0] << endl;
//    std::cout << "me:" << localCost[1] << " proc:" << globalCost[1]
//      << endl;
    broadcast(*subComm, sender, this->ntasks,
              this->task_to_proc.getRawPtr());
    broadcast(*subComm, sender, this->nprocs,
              this->proc_to_task_xadj.getRawPtr());
    broadcast(*subComm, sender, this->ntasks,
              this->proc_to_task_adj.getRawPtr());
  }

1946 //write mapping to gnuPlot code to visualize.
1948 std::ofstream gnuPlotCode("gnuPlot.plot", std::ofstream::out);
1949
1950 int mindim = MINOF(proc_task_comm->proc_coord_dim,
1951 proc_task_comm->task_coord_dim);
1952 std::string ss = "";
1953 for (part_t i = 0; i < this->nprocs; ++i) {
1954
1955 std::string procFile = Teuchos::toString<int>(i) + "_mapping.txt";
1956 if (i == 0) {
1957 gnuPlotCode << "plot \"" << procFile << "\"\n";
1958 }
1959 else {
1960 gnuPlotCode << "replot \"" << procFile << "\"\n";
1961 }
1962
1963 std::ofstream inpFile(procFile.c_str(), std::ofstream::out);
1964
1965 std::string gnuPlotArrow = "set arrow from ";
1966 for (int j = 0; j < mindim; ++j) {
1967 if (j == mindim - 1) {
1968 inpFile << proc_task_comm->proc_coords[j][i];
1969 gnuPlotArrow +=
1970 Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]);
1971
1972 }
1973 else {
1974 inpFile << proc_task_comm->proc_coords[j][i] << " ";
1975 gnuPlotArrow +=
1976 Teuchos::toString<float>(proc_task_comm->
1977 proc_coords[j][i]) + ",";
1978 }
1979 }
1980 gnuPlotArrow += " to ";
1981
1982 inpFile << std::endl;
1983 ArrayView<part_t> a = this->getAssignedTasksForProc(i);
1984
1985 for (int k = 0; k < a.size(); ++k) {
1986 int j = a[k];
1987// std::cout << "i:" << i << " j:"
1988 std::string gnuPlotArrow2 = gnuPlotArrow;
1989 for (int z = 0; z < mindim; ++z) {
1990 if (z == mindim - 1) {
1991
1992// std::cout << "z:" << z << " j:" << j << " "
1993// << proc_task_comm->task_coords[z][j] << endl;
1994 inpFile << proc_task_comm->task_coords[z][j];
1995 gnuPlotArrow2 +=
1996 Teuchos::toString<float>(proc_task_comm->task_coords[z][j]);
1997 }
1998 else {
1999 inpFile << proc_task_comm->task_coords[z][j] << " ";
2000 gnuPlotArrow2 +=
2001 Teuchos::toString<float>(proc_task_comm->
2002 task_coords[z][j]) + ",";
2003 }
2004 }
2005 ss += gnuPlotArrow2 + "\n";
2006 inpFile << std::endl;
2007 }
2008 inpFile.close();
2009 }
2010 gnuPlotCode << ss;
2011 gnuPlotCode << "\nreplot\n pause -1 \n";
2012 gnuPlotCode.close();
2013 }
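  // The emitted gnuPlot.plot then contains one plot/replot line per rank
  // file plus the collected arrows, e.g. (illustrative, 2 ranks, 2-D):
  //
  //   plot "0_mapping.txt"
  //   replot "1_mapping.txt"
  //   set arrow from 0,0 to 0.5,0.5
  //   replot
  //   pause -1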
2014
2015  // Write the mapping as gnuplot commands for visualization (per rank).
2016  void writeMapping2(int myRank) {
2017 std::string rankStr = Teuchos::toString<int>(myRank);
2018 std::string gnuPlots = "gnuPlot", extentionS = ".plot";
2019    std::string outF = gnuPlots + rankStr + extentionS;
2020 std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);
2021
2022    CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t, node_t>
2023      *tmpproc_task_comm =
2024        static_cast <CoordinateCommunicationModel<
2025          pcoord_t, tcoord_t, part_t, node_t> * > (
2026            this->proc_task_comm);
2027
2028// int mindim = MINOF(tmpproc_task_comm->proc_coord_dim,
2029// tmpproc_task_comm->task_coord_dim);
2030 int mindim = tmpproc_task_comm->proc_coord_dim;
2031 if (mindim != 3) {
2032      std::cerr << "Mapping write supports only 3 dimensions" << std::endl;
2033 return;
2034 }
2035 std::string ss = "";
2036 std::string procs = "";
2037
2038 std::set < std::tuple<int,int,int,int,int,int> > my_arrows;
2039 for (part_t origin_rank = 0; origin_rank < this->nprocs; ++origin_rank) {
2040 ArrayView<part_t> a = this->getAssignedTasksForProc(origin_rank);
2041 if (a.size() == 0) {
2042 continue;
2043 }
2044
2045 std::string gnuPlotArrow = "set arrow from ";
2046 for (int j = 0; j < mindim; ++j) {
2047 if (j == mindim - 1) {
2048 gnuPlotArrow +=
2049 Teuchos::toString<float>(tmpproc_task_comm->
2050 proc_coords[j][origin_rank]);
2051 procs +=
2052 Teuchos::toString<float>(tmpproc_task_comm->
2053 proc_coords[j][origin_rank]);
2054
2055 }
2056 else {
2057 gnuPlotArrow +=
2058 Teuchos::toString<float>(tmpproc_task_comm->
2059 proc_coords[j][origin_rank]) + ",";
2060 procs +=
2061 Teuchos::toString<float>(tmpproc_task_comm->
2062 proc_coords[j][origin_rank])+ " ";
2063 }
2064 }
2065 procs += "\n";
2066
2067 gnuPlotArrow += " to ";
2068
2069
2070 for (int k = 0; k < a.size(); ++k) {
2071 int origin_task = a[k];
2072
2073 for (int nind = task_communication_xadj[origin_task];
2074 nind < task_communication_xadj[origin_task + 1]; ++nind) {
2075 int neighbor_task = task_communication_adj[nind];
2076
2077 bool differentnode = false;
2078
2079 int neighbor_rank = this->getAssignedProcForTask(neighbor_task);
2080
2081 for (int j = 0; j < mindim; ++j) {
2082 if (int(tmpproc_task_comm->proc_coords[j][origin_rank]) !=
2083 int(tmpproc_task_comm->proc_coords[j][neighbor_rank])) {
2084 differentnode = true; break;
2085 }
2086 }
2087 std::tuple<int,int,int, int, int, int> foo(
2088 int(tmpproc_task_comm->proc_coords[0][origin_rank]),
2089 int(tmpproc_task_comm->proc_coords[1][origin_rank]),
2090 int(tmpproc_task_comm->proc_coords[2][origin_rank]),
2091 int(tmpproc_task_comm->proc_coords[0][neighbor_rank]),
2092 int(tmpproc_task_comm->proc_coords[1][neighbor_rank]),
2093 int(tmpproc_task_comm->proc_coords[2][neighbor_rank]));
2094
2095
2096 if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
2097 my_arrows.insert(foo);
2098
2099 std::string gnuPlotArrow2 = "";
2100 for (int j = 0; j < mindim; ++j) {
2101 if (j == mindim - 1) {
2102 gnuPlotArrow2 +=
2103 Teuchos::toString<float>(tmpproc_task_comm->
2104 proc_coords[j][neighbor_rank]);
2105 }
2106 else {
2107 gnuPlotArrow2 +=
2108 Teuchos::toString<float>(tmpproc_task_comm->
2109 proc_coords[j][neighbor_rank]) + ",";
2110 }
2111 }
2112 ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
2113 }
2114 }
2115 }
2116 }
2117
2118 std::ofstream procFile("procPlot.plot", std::ofstream::out);
2119 procFile << procs << "\n";
2120 procFile.close();
2121
2122 //gnuPlotCode << ss;
2123 if (mindim == 2) {
2124 gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
2125 } else {
2126 gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
2127 }
2128
2129 gnuPlotCode << ss << "\nreplot\n pause -1 \n";
2130 gnuPlotCode.close();
2131 }
2132
2133
2134// KDD Need to provide access to algorithm for getPartBoxes
2135#ifdef gnuPlot
2136 void writeGnuPlot(
2137 const Teuchos::Comm<int> *comm_,
2138      const Zoltan2::PartitioningSolution<Adapter> *soln_,
2139      int coordDim,
2140 tcoord_t **partCenters) {
2141
2142 std::string file = "gggnuPlot";
2143 std::string exten = ".plot";
2144 std::ofstream mm("2d.txt");
2145 file += Teuchos::toString<int>(comm_->getRank()) + exten;
2146 std::ofstream ff(file.c_str());
2147// ff.seekg(0, ff.end);
2148 std::vector<Zoltan2::coordinateModelPartBox <tcoord_t, part_t> >
2149 outPartBoxes =
2150        soln_->getPartBoxesView();
2151
2153 for (part_t i = 0; i < this->ntasks;++i) {
2154 outPartBoxes[i].writeGnuPlot(ff, mm);
2155 }
2156 if (coordDim == 2) {
2157 ff << "plot \"2d.txt\"" << std::endl;
2158// ff << "\n pause -1" << endl;
2159 }
2160 else {
2161 ff << "splot \"2d.txt\"" << std::endl;
2162// ff << "\n pause -1" << endl;
2163 }
2164 mm.close();
2165
2166 ff << "set style arrow 5 nohead size screen 0.03,15,135 ls 1"
2167 << std::endl;
2168
2169 for (part_t i = 0; i < this->ntasks; ++i) {
2170      part_t pb = task_communication_xadj[i];
2171      part_t pe = task_communication_xadj[i + 1];
2172
2173 for (part_t p = pb; p < pe; ++p) {
2174        part_t n = task_communication_adj[p];
2175
2176// std::cout << "i:" << i << " n:" << n << endl;
2177 std::string arrowline = "set arrow from ";
2178 for (int j = 0; j < coordDim - 1; ++j) {
2179 arrowline +=
2180 Teuchos::toString<tcoord_t>(partCenters[j][n]) + ",";
2181 }
2182 arrowline +=
2183 Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][n]) +
2184 " to ";
2185
2186 for (int j = 0; j < coordDim - 1; ++j) {
2187 arrowline +=
2188 Teuchos::toString<tcoord_t>(partCenters[j][i]) + ",";
2189 }
2190 arrowline +=
2191 Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][i]) +
2192 " as 5\n";
2193
2194// std::cout << "arrow:" << arrowline << endl;
2195 ff << arrowline;
2196 }
2197 }
2198
2199 ff << "replot\n pause -1" << std::endl;
2200 ff.close();
2201 }
2202#endif // gnuPlot
2203
2204public:
2205
2206 void getProcTask(part_t* &proc_to_task_xadj_,
2207 part_t* &proc_to_task_adj_) {
2208 proc_to_task_xadj_ = this->proc_to_task_xadj.getRawPtr();
2209 proc_to_task_adj_ = this->proc_to_task_adj.getRawPtr();
2210 }
2211
2212 virtual void map(const RCP<MappingSolution<Adapter> > &mappingsoln) {
2213
2214 // Mapping was already computed in the constructor; we need to store it
2215 // in the solution.
2216 mappingsoln->setMap_RankForLocalElements(local_task_to_rank);
2217
2218 // KDDKDD TODO: Algorithm is also creating task_to_proc, which maybe
2219 // KDDKDD is not needed once we use MappingSolution to answer queries
2220 // KDDKDD instead of this algorithm.
2221 // KDDKDD Ask Mehmet: what is the most efficient way to get the answer
2222 // KDDKDD out of CoordinateTaskMapper and into the MappingSolution?
2223 }
2224
2225
2226  virtual ~CoordinateTaskMapper() {
2227    //freeArray<part_t>(proc_to_task_xadj);
2228 //freeArray<part_t>(proc_to_task_adj);
2229 //freeArray<part_t>(task_to_proc);
2230 if(this->isOwnerofModel) {
2231 delete this->proc_task_comm;
2232 }
2233 }
2234
2235  void create_local_task_to_rank(
2236      const lno_t num_local_coords,
2237 const part_t *local_coord_parts,
2238 const ArrayRCP<part_t> task_to_proc_) {
2239 local_task_to_rank = ArrayRCP <part_t>(num_local_coords);
2240
2241 for (lno_t i = 0; i < num_local_coords; ++i) {
2242 part_t local_coord_part = local_coord_parts[i];
2243 part_t rank_index = task_to_proc_[local_coord_part];
2244 local_task_to_rank[i] = rank_index;
2245 }
2246 }
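  // Illustrative example: with local_coord_parts = {2, 0, 2} and
  // task_to_proc_ = {5, 7, 4}, local_task_to_rank becomes {4, 5, 4};
  // each local coordinate inherits the rank assigned to its part.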
2247
2248
2249
2263  CoordinateTaskMapper(
2264      const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2265 const Teuchos::RCP <const MachineRepresentation<pcoord_t, part_t> >
2266 machine_,
2267 const Teuchos::RCP <const Adapter> input_adapter_,
2268 const Teuchos::RCP <const Zoltan2::PartitioningSolution<Adapter> >
2269 soln_,
2270 const Teuchos::RCP <const Environment> envConst,
2271 bool is_input_adapter_distributed = true,
2272 int num_ranks_per_node = 1,
2273 bool divide_to_prime_first = false,
2274 bool reduce_best_mapping = true):
2275 PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
2276 soln_, envConst),
2277    proc_to_task_xadj(0),
2278    proc_to_task_adj(0),
2279    task_to_proc(0),
2280    isOwnerofModel(true),
2281    proc_task_comm(0),
2282    task_communication_xadj(0),
2283    task_communication_adj(0),
2284    task_communication_edge_weight(0) {
2285
2286 using namespace Teuchos;
2287 typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2288
2289 RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2290    RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_;
2291
2292 RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2293 RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2294 if (!is_input_adapter_distributed) {
2295 ia_comm = Teuchos::createSerialComm<int>();
2296 }
2297
2298 RCP<const Environment> envConst_ = envConst;
2299
2300 RCP<const ctm_base_adapter_t> baseInputAdapter_(
2301 rcp(dynamic_cast<const ctm_base_adapter_t *>(
2302 input_adapter_.getRawPtr()), false));
2303
2304 modelFlag_t coordFlags_, graphFlags_;
2305
2306 //create coordinate model
2307 //since this is coordinate task mapper,
2308 //the adapter has to have the coordinates
2309 coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2310 baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2311
2312 //if the adapter has also graph model, we will use graph model
2313 //to calculate the cost mapping.
2314 BaseAdapterType inputType_ = input_adapter_->adapterType();
2315 if (inputType_ == MatrixAdapterType ||
2316 inputType_ == GraphAdapterType ||
2317 inputType_ == MeshAdapterType)
2318 {
2319 graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2320 baseInputAdapter_, envConst_, ia_comm,
2321 graphFlags_));
2322 }
2323
2324 if (!machine_->hasMachineCoordinates()) {
2325 throw std::runtime_error("Existing machine does not provide "
2326 "coordinates for coordinate task mapping");
2327 }
2328
2329 //if mapping type is 0 then it is coordinate mapping
2330 int procDim = machine_->getMachineDim();
2331 this->nprocs = machine_->getNumRanks();
2332
2333 //get processor coordinates.
2334 pcoord_t **procCoordinates = NULL;
2335 if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2336 throw std::runtime_error("Existing machine does not implement "
2337 "getAllMachineCoordinatesView");
2338 }
2339
2340    //get the machine extent.
2341    //If we have the machine extent and the machine has wrap-around
2342    //links, we shift the coordinates so that the largest gap
2343    //becomes the wrap-around link.
2344 std::vector<int> machine_extent_vec(procDim);
2345 //std::vector<bool> machine_extent_wrap_around_vec(procDim, 0);
2346 int *machine_extent = &(machine_extent_vec[0]);
2347 bool *machine_extent_wrap_around = new bool[procDim];
2348 for (int i = 0; i < procDim; ++i)
2349 machine_extent_wrap_around[i] = false;
2350
2351 bool haveWrapArounds = machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2352
2353 // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2354 // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2355 // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2356 // MD: Yes, I ADDED BELOW:
2357 if (machine_->getMachineExtent(machine_extent) &&
2358 haveWrapArounds) {
2359
2360      procCoordinates =
2361        this->shiftMachineCoordinates(
2362            procDim,
2363 machine_extent,
2364 machine_extent_wrap_around,
2365 this->nprocs,
2366 procCoordinates);
2367 }
2368
2369 //get the tasks information, such as coordinate dimension,
2370 //number of parts.
2371 int coordDim = coordinateModel_->getCoordinateDim();
2372
2373// int coordDim = machine_->getMachineDim();
2374
2375 this->ntasks = soln_->getActualGlobalNumberOfParts();
2376 if (part_t(soln_->getTargetGlobalNumberOfParts()) > this->ntasks) {
2377 this->ntasks = soln_->getTargetGlobalNumberOfParts();
2378 }
2379 this->solution_parts = soln_->getPartListView();
2380
2381 //we need to calculate the center of parts.
2382 tcoord_t **partCenters = new tcoord_t*[coordDim];
2383 for (int i = 0; i < coordDim; ++i) {
2384 partCenters[i] = new tcoord_t[this->ntasks];
2385 }
2386
2387 typedef typename Adapter::scalar_t t_scalar_t;
2388
2389
2390 envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2391
2392 //get centers for the parts.
2393 getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2394 envConst.getRawPtr(),
2395 ia_comm.getRawPtr(),
2396 coordinateModel_.getRawPtr(),
2397 this->solution_parts,
2398// soln_->getPartListView();
2399// this->soln.getRawPtr(),
2400 coordDim,
2401 ntasks,
2402 partCenters);
2403
2404 envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2405
2406 //create the part graph
2407 if (graph_model_.getRawPtr() != NULL) {
2408 getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2409 envConst.getRawPtr(),
2410 ia_comm.getRawPtr(),
2411 graph_model_.getRawPtr(),
2412 this->ntasks,
2413 this->solution_parts,
2414// soln_->getPartListView(),
2415// this->soln.getRawPtr(),
2416        task_communication_xadj,
2417        task_communication_adj,
2418        task_communication_edge_weight
2419      );
2420 }
2421
2422 //create coordinate communication model.
2423 this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<
2424 pcoord_t, tcoord_t, part_t, node_t>(
2425 procDim,
2426 procCoordinates,
2427 coordDim,
2428 partCenters,
2429 this->nprocs,
2430 this->ntasks,
2431 machine_extent,
2432 machine_extent_wrap_around,
2433 machine_.getRawPtr());
2434
2435 int myRank = comm_->getRank();
2436    this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2437 this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2438
2439 envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2440 this->doMapping(myRank, comm_);
2441 envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2442
2443 envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2444
2445/*
2446 soln_->getCommunicationGraph(task_communication_xadj,
2447 task_communication_adj);
2448*/
2449
2450 envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2451 #ifdef gnuPlot1
2452 if (comm_->getRank() == 0) {
2453
2454 part_t taskCommCount = task_communication_xadj.size();
2455 std::cout << " TotalComm:"
2456 << task_communication_xadj[taskCommCount] << std::endl;
2457    part_t maxN = task_communication_xadj[0];
2458    for (part_t i = 1; i <= taskCommCount; ++i) {
2459      part_t nc =
2460        task_communication_xadj[i] - task_communication_xadj[i - 1];
2461      if (maxN < nc)
2462 maxN = nc;
2463 }
2464 std::cout << " maxNeighbor:" << maxN << std::endl;
2465 }
2466
2467 this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2468 #endif
2469
2470 envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2471
2472 if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
2473 task_communication_adj.getRawPtr()) {
2474 this->proc_task_comm->calculateCommunicationCost(
2475 task_to_proc.getRawPtr(),
2476 task_communication_xadj.getRawPtr(),
2477          task_communication_adj.getRawPtr(),
2478          task_communication_edge_weight.getRawPtr()
2479      );
2480 }
2481
2482// std::cout << "me: " << comm_->getRank() << " cost:"
2483// << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2484
2485 envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2486
2487    //Processors are divided into groups of size procDim! * coordDim!.
2488    //Each processor in a group obtains a mapping with a different
2489    //rotation, and the best one is broadcast to all processors.
2490 this->getBestMapping();
2491    this->create_local_task_to_rank(
2492        coordinateModel_->getLocalNumCoordinates(),
2493 this->solution_parts,
2494 this->task_to_proc);
2495/*
2496 {
2497 if (task_communication_xadj.getRawPtr() &&
2498 task_communication_adj.getRawPtr())
2499 this->proc_task_comm->calculateCommunicationCost(
2500 task_to_proc.getRawPtr(),
2501 task_communication_xadj.getRawPtr(),
2502 task_communication_adj.getRawPtr(),
2503 task_communication_edge_weight.getRawPtr()
2504 );
2505 std::cout << "me: " << comm_->getRank() << " cost:"
2506 << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2507 }
2508*/
2509
2510
2511 #ifdef gnuPlot
2512 this->writeMapping2(comm_->getRank());
2513 #endif
2514
2515 delete [] machine_extent_wrap_around;
2516
2517 if (machine_->getMachineExtent(machine_extent) &&
2518 haveWrapArounds) {
2519 for (int i = 0; i < procDim; ++i) {
2520 delete [] procCoordinates[i];
2521 }
2522 delete [] procCoordinates;
2523 }
2524
2525 for (int i = 0; i < coordDim; ++i) {
2526 delete [] partCenters[i];
2527 }
2528 delete [] partCenters;
2529
2530
2531 }
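  // A hedged usage sketch for this constructor; machine, inputAdapter,
  // solution, and env stand for objects the caller already owns.
  //
  //   typedef Zoltan2::CoordinateTaskMapper<adapter_t, part_t> mapper_t;
  //   Teuchos::RCP<mapper_t> mapper(new mapper_t(
  //       comm, machine, inputAdapter, solution, env,
  //       true,    // is_input_adapter_distributed
  //       1,       // num_ranks_per_node
  //       false,   // divide_to_prime_first
  //       true));  // reduce_best_mapping
  //   part_t owner = mapper->getAssignedProcForTask(0);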
2532
2533
2550  CoordinateTaskMapper(
2551      const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2552 const Teuchos::RCP <const MachineRepresentation<pcoord_t,part_t> >
2553 machine_,
2554 const Teuchos::RCP <const Adapter> input_adapter_,
2555 const part_t num_parts_,
2556 const part_t *result_parts,
2557 const Teuchos::RCP <const Environment> envConst,
2558 bool is_input_adapter_distributed = true,
2559 int num_ranks_per_node = 1,
2560 bool divide_to_prime_first = false,
2561 bool reduce_best_mapping = true):
2562 PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
2563 num_parts_, result_parts, envConst),
2564    proc_to_task_xadj(0),
2565    proc_to_task_adj(0),
2566    task_to_proc(0),
2567    isOwnerofModel(true),
2568    proc_task_comm(0),
2569    task_communication_xadj(0),
2570    task_communication_adj(0),
2571    task_communication_edge_weight(0) {
2572
2573 using namespace Teuchos;
2574 typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2575
2576 RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2577    RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_;
2578
2579 RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2580 RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2581 if (!is_input_adapter_distributed) {
2582 ia_comm = Teuchos::createSerialComm<int>();
2583 }
2584 RCP<const Environment> envConst_ = envConst;
2585
2586 RCP<const ctm_base_adapter_t> baseInputAdapter_(
2587 rcp(dynamic_cast<const ctm_base_adapter_t *>(
2588 input_adapter_.getRawPtr()), false));
2589
2590 modelFlag_t coordFlags_, graphFlags_;
2591
2592 //create coordinate model
2593 //since this is coordinate task mapper,
2594 //the adapter has to have the coordinates
2595 coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2596 baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2597
2598 //if the adapter has also graph model, we will use graph model
2599 //to calculate the cost mapping.
2600 BaseAdapterType inputType_ = input_adapter_->adapterType();
2601 if (inputType_ == MatrixAdapterType ||
2602 inputType_ == GraphAdapterType ||
2603 inputType_ == MeshAdapterType)
2604 {
2605 graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2606 baseInputAdapter_, envConst_, ia_comm,
2607 graphFlags_));
2608 }
2609
2610 if (!machine_->hasMachineCoordinates()) {
2611 throw std::runtime_error("Existing machine does not provide "
2612 "coordinates for coordinate task mapping.");
2613 }
2614
2615 //if mapping type is 0 then it is coordinate mapping
2616 int procDim = machine_->getMachineDim();
2617 this->nprocs = machine_->getNumRanks();
2618
2619 //get processor coordinates.
2620 pcoord_t **procCoordinates = NULL;
2621 if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2622 throw std::runtime_error("Existing machine does not implement "
2623 "getAllMachineCoordinatesView");
2624 }
2625
2626    //get the machine extent.
2627    //If we have the machine extent and the machine has wrap-around
2628    //links, we shift the coordinates so that the largest gap
2629    //becomes the wrap-around link.
2631 std::vector<int> machine_extent_vec(procDim);
2632// std::vector<bool> machine_extent_wrap_around_vec(procDim, 0);
2633 int *machine_extent = &(machine_extent_vec[0]);
2634 bool *machine_extent_wrap_around = new bool[procDim];
2635 bool haveWrapArounds = machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2636
2637 // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2638 // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2639 // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2640 // MD: Yes, I ADDED BELOW:
2641 if (machine_->getMachineExtent(machine_extent) &&
2642 haveWrapArounds) {
2643      procCoordinates =
2644        this->shiftMachineCoordinates(
2645            procDim,
2646 machine_extent,
2647 machine_extent_wrap_around,
2648 this->nprocs,
2649 procCoordinates);
2650 }
2651
2652 //get the tasks information, such as coordinate dimension,
2653 //number of parts.
2654 int coordDim = coordinateModel_->getCoordinateDim();
2655
2656// int coordDim = machine_->getMachineDim();
2657
2658
2659 this->ntasks = num_parts_;
2660 this->solution_parts = result_parts;
2661
2662 //we need to calculate the center of parts.
2663 tcoord_t **partCenters = new tcoord_t*[coordDim];
2664 for (int i = 0; i < coordDim; ++i) {
2665 partCenters[i] = new tcoord_t[this->ntasks];
2666 }
2667
2668 typedef typename Adapter::scalar_t t_scalar_t;
2669
2670
2671 envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2672
2673 //get centers for the parts.
2674 getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2675 envConst.getRawPtr(),
2676 ia_comm.getRawPtr(),
2677 coordinateModel_.getRawPtr(),
2678 this->solution_parts,
2679// soln_->getPartListView();
2680// this->soln.getRawPtr(),
2681 coordDim,
2682 ntasks,
2683 partCenters);
2684
2685 envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2686
2687 envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE");
2688 //create the part graph
2689 if (graph_model_.getRawPtr() != NULL) {
2690 getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2691 envConst.getRawPtr(),
2692 ia_comm.getRawPtr(),
2693 graph_model_.getRawPtr(),
2694 this->ntasks,
2695 this->solution_parts,
2696// soln_->getPartListView(),
2697// this->soln.getRawPtr(),
2698        task_communication_xadj,
2699        task_communication_adj,
2700        task_communication_edge_weight
2701      );
2702 }
2703 envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE");
2704
2705 envConst->timerStart(MACRO_TIMERS,
2706 "CoordinateCommunicationModel Create");
2707 //create coordinate communication model.
2708 this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<
2709 pcoord_t, tcoord_t, part_t, node_t>(
2710 procDim,
2711 procCoordinates,
2712 coordDim,
2713 partCenters,
2714 this->nprocs,
2715 this->ntasks,
2716 machine_extent,
2717 machine_extent_wrap_around,
2718 machine_.getRawPtr());
2719
2720 envConst->timerStop(MACRO_TIMERS,
2721 "CoordinateCommunicationModel Create");
2722
2723
2724 this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2725 this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2726
2727 int myRank = comm_->getRank();
2728
2729
2730 envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2731 this->doMapping(myRank, comm_);
2732 envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2733
2734
2735 envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2736
2737/*
2738 soln_->getCommunicationGraph(task_communication_xadj,
2739 task_communication_adj);
2740*/
2741
2742 envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2743 #ifdef gnuPlot1
2744 if (comm_->getRank() == 0) {
2745
2746 part_t taskCommCount = task_communication_xadj.size();
2747 std::cout << " TotalComm:"
2748 << task_communication_xadj[taskCommCount] << std::endl;
2749    part_t maxN = task_communication_xadj[0];
2750    for (part_t i = 1; i <= taskCommCount; ++i) {
2751      part_t nc =
2752        task_communication_xadj[i] - task_communication_xadj[i - 1];
2753      if (maxN < nc)
2754 maxN = nc;
2755 }
2756 std::cout << " maxNeighbor:" << maxN << std::endl;
2757 }
2758
2759 this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2760 #endif
2761
2762 envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2763
2764 if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
2765 task_communication_adj.getRawPtr()) {
2766 this->proc_task_comm->calculateCommunicationCost(
2767 task_to_proc.getRawPtr(),
2768 task_communication_xadj.getRawPtr(),
2769 task_communication_adj.getRawPtr(),
2770          task_communication_edge_weight.getRawPtr()
2771      );
2772 }
2773
2774// std::cout << "me: " << comm_->getRank() << " cost:"
2775// << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2776
2777 envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2778
2779    //Processors are divided into groups of size procDim! * coordDim!.
2780    //Each processor in a group obtains a mapping with a different
2781    //rotation, and the best one is broadcast to all processors.
2782 this->getBestMapping();
2783
2784    this->create_local_task_to_rank(
2785        coordinateModel_->getLocalNumCoordinates(),
2786 this->solution_parts,
2787 this->task_to_proc);
2788/*
2789 {
2790 if (task_communication_xadj.getRawPtr() &&
2791 task_communication_adj.getRawPtr())
2792 this->proc_task_comm->calculateCommunicationCost(
2793 task_to_proc.getRawPtr(),
2794 task_communication_xadj.getRawPtr(),
2795 task_communication_adj.getRawPtr(),
2796 task_communication_edge_weight.getRawPtr()
2797 );
2798 std::cout << "me: " << comm_->getRank() << " cost:"
2799 << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2800 }
2801*/
2802
2803
2804
2805 #ifdef gnuPlot
2806 this->writeMapping2(comm_->getRank());
2807 #endif
2808
2809 delete [] machine_extent_wrap_around;
2810
2811 if (machine_->getMachineExtent(machine_extent) &&
2812 haveWrapArounds) {
2813 for (int i = 0; i < procDim; ++i) {
2814 delete [] procCoordinates[i];
2815 }
2816 delete [] procCoordinates;
2817 }
2818
2819 for (int i = 0; i < coordDim; ++i) {
2820 delete [] partCenters[i];
2821 }
2822 delete [] partCenters;
2823 }
2824
2890  CoordinateTaskMapper(
2891      const Environment *env_const_,
2892 const Teuchos::Comm<int> *problemComm,
2893 int proc_dim,
2894 int num_processors,
2895 pcoord_t **machine_coords,
2896 int task_dim,
2897 part_t num_tasks,
2898 tcoord_t **task_coords,
2899 ArrayRCP<part_t>task_comm_xadj,
2900 ArrayRCP<part_t>task_comm_adj,
2901 pcoord_t *task_communication_edge_weight_,
2902 int recursion_depth,
2903 Kokkos::View<part_t *, Kokkos::HostSpace> part_no_array,
2904 const part_t *machine_dimensions,
2905 int num_ranks_per_node = 1,
2906 bool divide_to_prime_first = false,
2907 bool reduce_best_mapping = true):
2908 PartitionMapping<Adapter>(
2909 Teuchos::rcpFromRef<const Teuchos::Comm<int> >(*problemComm),
2910 Teuchos::rcpFromRef<const Environment>(*env_const_)),
2911    proc_to_task_xadj(0),
2912    proc_to_task_adj(0),
2913    task_to_proc(0),
2914 isOwnerofModel(true),
2915 proc_task_comm(0),
2916 task_communication_xadj(task_comm_xadj),
2917 task_communication_adj(task_comm_adj) {
2918
2919 //if mapping type is 0 then it is coordinate mapping
2920 pcoord_t ** virtual_machine_coordinates = machine_coords;
2921 bool *wrap_arounds = new bool [proc_dim];
2922 for (int i = 0; i < proc_dim; ++i) wrap_arounds[i] = true;
2923
2924 if (machine_dimensions) {
2925      virtual_machine_coordinates =
2926        this->shiftMachineCoordinates(
2927            proc_dim,
2928 machine_dimensions,
2929 wrap_arounds,
2930 num_processors,
2931 machine_coords);
2932 }
2933
2934 this->nprocs = num_processors;
2935
2936 int coordDim = task_dim;
2937 this->ntasks = num_tasks;
2938
2939 //alloc memory for part centers.
2940 tcoord_t **partCenters = task_coords;
2941
2942 //create coordinate communication model.
2943    this->proc_task_comm =
2944      new Zoltan2::CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t, node_t>(
2945        proc_dim,
2946 virtual_machine_coordinates,
2947 coordDim,
2948 partCenters,
2949 this->nprocs,
2950 this->ntasks, NULL, NULL
2951 );
2952
2953 this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2954 this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2955
2956 this->proc_task_comm->setPartArraySize(recursion_depth);
2957 this->proc_task_comm->setPartArray(part_no_array);
2958
2959 int myRank = problemComm->getRank();
2960
2961 this->doMapping(myRank, this->comm);
2962#ifdef gnuPlot
2963 this->writeMapping2(myRank);
2964#endif
2965
2966    // MDM added this guard for the edge case of an empty communication
2967    // graph (e.g. NX = NY = NZ = 1), so that case passes as on develop.
2968 if (reduce_best_mapping && task_communication_xadj.size() &&
2969 task_communication_adj.size()) {
2970 this->proc_task_comm->calculateCommunicationCost(
2971 task_to_proc.getRawPtr(),
2972 task_communication_xadj.getRawPtr(),
2973 task_communication_adj.getRawPtr(),
2974 task_communication_edge_weight_
2975 );
2976
2977
2978 this->getBestMapping();
2979
2980/*
2981 if (myRank == 0) {
2982 this->proc_task_comm->calculateCommunicationCost(
2983 task_to_proc.getRawPtr(),
2984 task_communication_xadj.getRawPtr(),
2985 task_communication_adj.getRawPtr(),
2986 task_communication_edge_weight_
2987 );
2988 cout << "me: " << problemComm->getRank() << " cost:"
2989 << this->proc_task_comm->getCommunicationCostMetric() << endl;
2990 }
2991*/
2992
2993 }
2994
2995 delete [] wrap_arounds;
2996
2997 if (machine_dimensions) {
2998 for (int i = 0; i < proc_dim; ++i) {
2999 delete [] virtual_machine_coordinates[i];
3000 }
3001 delete [] virtual_machine_coordinates;
3002 }
3003#ifdef gnuPlot
3004 if (problemComm->getRank() == 0)
3005 this->writeMapping2(-1);
3006#endif
3007 }
3008
3009
3010 /*
3011 double getCommunicationCostMetric() {
3012 return this->proc_task_comm->getCommCost();
3013 }
3014 */
3015
3018  virtual size_t getLocalNumberOfParts() const {
3019 return 0;
3020 }
3021
3022  /*! \brief Using the machine dimensions provided, create virtual
3023   *  machine coordinates by assigning the largest gap along each
3024   *  dimension as the wrap-around link.
3025   */
3032  pcoord_t **shiftMachineCoordinates(
3033      int machine_dim,
3034 const part_t *machine_dimensions,
3035 bool *machine_extent_wrap_around,
3036 part_t numProcs,
3037 pcoord_t **mCoords) {
3038
3039 pcoord_t **result_machine_coords = NULL;
3040 result_machine_coords = new pcoord_t*[machine_dim];
3041
3042 for (int i = 0; i < machine_dim; ++i) {
3043 result_machine_coords[i] = new pcoord_t [numProcs];
3044 }
3045
3046 for (int i = 0; i < machine_dim; ++i) {
3047 part_t numMachinesAlongDim = machine_dimensions[i];
3048
3049 part_t *machineCounts = new part_t[numMachinesAlongDim];
3050 memset(machineCounts, 0, sizeof(part_t) * numMachinesAlongDim);
3051
3052 int *filledCoordinates = new int[numMachinesAlongDim];
3053
3054 pcoord_t *coords = mCoords[i];
3055
3056 for (part_t j = 0; j < numProcs; ++j) {
3057 part_t mc = (part_t) coords[j];
3058 ++machineCounts[mc];
3059 }
3060
3061 part_t filledCoordinateCount = 0;
3062 for (part_t j = 0; j < numMachinesAlongDim; ++j) {
3063 if (machineCounts[j] > 0) {
3064 filledCoordinates[filledCoordinateCount++] = j;
3065 }
3066 }
3067
3068 part_t firstProcCoord = filledCoordinates[0];
3069 part_t firstProcCount = machineCounts[firstProcCoord];
3070
3071 part_t lastProcCoord = filledCoordinates[filledCoordinateCount - 1];
3072 part_t lastProcCount = machineCounts[lastProcCoord];
3073
3074 part_t firstLastGap =
3075 numMachinesAlongDim - lastProcCoord + firstProcCoord;
3076 part_t firstLastGapProc = lastProcCount + firstProcCount;
3077
3078 part_t leftSideProcCoord = firstProcCoord;
3079 part_t leftSideProcCount = firstProcCount;
3080 part_t biggestGap = 0;
3081 part_t biggestGapProc = numProcs;
3082
3083 part_t shiftBorderCoordinate = -1;
3084 for (part_t j = 1; j < filledCoordinateCount; ++j) {
3085        part_t rightSideProcCoord = filledCoordinates[j];
3086 part_t rightSideProcCount = machineCounts[rightSideProcCoord];
3087
3088 part_t gap = rightSideProcCoord - leftSideProcCoord;
3089 part_t gapProc = rightSideProcCount + leftSideProcCount;
3090
3091        // Pick the largest gap in this dimension. Use the smaller number
3092        // of processes on either side of the largest gap to break ties.
3093        // An easy extension would be to weight the gap by the number of
3094        // processes.
3095 if (gap > biggestGap ||
3096 (gap == biggestGap && biggestGapProc > gapProc)) {
3097 shiftBorderCoordinate = rightSideProcCoord;
3098 biggestGapProc = gapProc;
3099 biggestGap = gap;
3100 }
3101 leftSideProcCoord = rightSideProcCoord;
3102 leftSideProcCount = rightSideProcCount;
3103 }
3104
3105
3106 if (!(biggestGap > firstLastGap ||
3107 (biggestGap == firstLastGap &&
3108 biggestGapProc < firstLastGapProc))) {
3109 shiftBorderCoordinate = -1;
3110 }
3111
3112 for (part_t j = 0; j < numProcs; ++j) {
3113
3114 if (machine_extent_wrap_around[i] &&
3115 coords[j] < shiftBorderCoordinate) {
3116 result_machine_coords[i][j] = coords[j] + numMachinesAlongDim;
3117
3118 }
3119 else {
3120 result_machine_coords[i][j] = coords[j];
3121 }
3122 }
3123 delete [] machineCounts;
3124 delete [] filledCoordinates;
3125 }
3126
3127 return result_machine_coords;
3128
3129 }
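  // Worked example (illustrative): along a wrap-around dimension of
  // extent 8 with occupied coordinates {0, 1, 6, 7}, the largest interior
  // gap lies between 1 and 6, so shiftBorderCoordinate = 6. Coordinates
  // below 6 are shifted up by the extent, giving {8, 9, 6, 7}: the group
  // is now contiguous and the old interior gap is the wrap-around.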
3130
3138 virtual void getProcsForPart(part_t taskId, part_t &numProcs,
3139 part_t *&procs) const {
3140 numProcs = 1;
3141 procs = this->task_to_proc.getRawPtr() + taskId;
3142 }
3143
3144  /*! \brief getAssignedProcForTask function, returns the assigned
3145   *  processor id for the given task.
3146   */
3147  part_t getAssignedProcForTask(part_t taskId) {
3148
3149    return this->task_to_proc[taskId];
3150 }
3151
3159 virtual void getPartsForProc(int procId, part_t &numParts,
3160 part_t *&parts) const {
3161
3162 part_t task_begin = this->proc_to_task_xadj[procId];
3163 part_t taskend = this->proc_to_task_xadj[procId + 1];
3164
3165 parts = this->proc_to_task_adj.getRawPtr() + task_begin;
3166 numParts = taskend - task_begin;
3167 }
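  // Illustrative example: with proc_to_task_xadj = {0, 2, 3} and
  // proc_to_task_adj = {4, 1, 0}, proc 0 is assigned tasks {4, 1} and
  // proc 1 is assigned task {0}.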
3168
3169 ArrayView<part_t> getAssignedTasksForProc(part_t procId) {
3170 part_t task_begin = this->proc_to_task_xadj[procId];
3171 part_t taskend = this->proc_to_task_xadj[procId + 1];
3172
3173/*
3174 std::cout << "part_t:" << procId << " taskCount:"
3175 << taskend - task_begin << std::endl;
3176
3177 for (part_t i = task_begin; i < taskend; ++i) {
3178 std::cout << "part_t:" << procId << " task:"
3179 << proc_to_task_adj[i] << endl;
3180 }
3181*/
3182 if (taskend - task_begin > 0) {
3183 ArrayView <part_t> assignedParts(
3184 this->proc_to_task_adj.getRawPtr() + task_begin,
3185 taskend - task_begin);
3186
3187 return assignedParts;
3188 }
3189 else {
3190 ArrayView <part_t> assignedParts;
3191
3192 return assignedParts;
3193 }
3194 }
3195
3196};
3197
3276template <typename part_t, typename pcoord_t, typename tcoord_t>
3277inline void coordinateTaskMapperInterface(
3278    RCP<const Teuchos::Comm<int> > problemComm,
3279 int proc_dim,
3280 int num_processors,
3281 pcoord_t **machine_coords,
3282 int task_dim,
3283 part_t num_tasks,
3284 tcoord_t **task_coords,
3285 part_t *task_comm_xadj,
3286 part_t *task_comm_adj,
3287    // float-like, same size as task_communication_adj_; weight of the
3288    // corresponding edge.
3289 pcoord_t *task_communication_edge_weight_,
3290 part_t *proc_to_task_xadj, /*output*/
3291 part_t *proc_to_task_adj, /*output*/
3292 int recursion_depth,
3293 Kokkos::View<part_t *, Kokkos::HostSpace> part_no_array,
3294 const part_t *machine_dimensions,
3295 int num_ranks_per_node = 1,
3296 bool divide_to_prime_first = false) {
3297
3298 const Environment *envConst_ = new Environment(problemComm);
3299
3300 // mfh 03 Mar 2015: It's OK to omit the Node template
3301 // parameter in Tpetra, if you're just going to use the
3302 // default Node.
3303 typedef Tpetra::MultiVector<tcoord_t, part_t, part_t> tMVector_t;
3304
3305 Teuchos::ArrayRCP<part_t> task_communication_xadj(
3306 task_comm_xadj, 0, num_tasks + 1, false);
3307
3308 Teuchos::ArrayRCP<part_t> task_communication_adj;
3309 if (task_comm_xadj) {
3310 Teuchos::ArrayRCP<part_t> tmp_task_communication_adj(
3311 task_comm_adj, 0, task_comm_xadj[num_tasks], false);
3312 task_communication_adj = tmp_task_communication_adj;
3313 }
3314
3315
3316  CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t> *ctm =
3317    new CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t>(
3318      envConst_,
3319 problemComm.getRawPtr(),
3320 proc_dim,
3321 num_processors,
3322 machine_coords,
3323// machine_coords_,
3324
3325 task_dim,
3326 num_tasks,
3327 task_coords,
3328
3329 task_communication_xadj,
3330 task_communication_adj,
3331 task_communication_edge_weight_,
3332 recursion_depth,
3333 part_no_array,
3334 machine_dimensions,
3335 num_ranks_per_node,
3336 divide_to_prime_first);
3337
3338
3339 part_t* proc_to_task_xadj_;
3340 part_t* proc_to_task_adj_;
3341
3342 ctm->getProcTask(proc_to_task_xadj_, proc_to_task_adj_);
3343
3344 for (part_t i = 0; i <= num_processors; ++i) {
3345 proc_to_task_xadj[i] = proc_to_task_xadj_[i];
3346 }
3347
3348 for (part_t i = 0; i < num_tasks; ++i) {
3349 proc_to_task_adj[i] = proc_to_task_adj_[i];
3350 }
3351
3352 delete ctm;
3353 delete envConst_;
3354}
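// A hedged call sketch (all names illustrative): map 4 tasks connected in
// a ring onto 4 ranks placed on a line; out_xadj / out_adj receive the
// resulting processor-to-task assignment.
//
//   int cxadj[] = {0, 2, 4, 6, 8};
//   int cadj[] = {1, 3, 0, 2, 1, 3, 0, 2};
//   int out_xadj[5], out_adj[4];
//   double xs[] = {0, 1, 2, 3};         double *mcoords[] = {xs};
//   double ts[] = {0.5, 1.5, 2.5, 3.5}; double *tcoords[] = {ts};
//   Zoltan2::coordinateTaskMapperInterface<int, double, double>(
//       problemComm, 1, 4, mcoords, 1, 4, tcoords,
//       cxadj, cadj, NULL,                        // no edge weights
//       out_xadj, out_adj,
//       2,                                        // recursion_depth
//       Kokkos::View<int *, Kokkos::HostSpace>(), // empty part_no_array
//       NULL);                                    // no machine_dimensions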
3355
3356template <typename proc_coord_t, typename v_lno_t>
3357inline void visualize_mapping(int myRank,
3358 const int machine_coord_dim,
3359 const int num_ranks,
3360 proc_coord_t **machine_coords,
3361 const v_lno_t num_tasks,
3362 const v_lno_t *task_communication_xadj,
3363 const v_lno_t *task_communication_adj,
3364 const int *task_to_rank) {
3365
3366 std::string rankStr = Teuchos::toString<int>(myRank);
3367 std::string gnuPlots = "gnuPlot", extentionS = ".plot";
3368  std::string outF = gnuPlots + rankStr + extentionS;
3369 std::ofstream gnuPlotCode( outF.c_str(), std::ofstream::out);
3370
3371 if (machine_coord_dim != 3) {
3372    std::cerr << "Mapping write supports only 3 dimensions" << std::endl;
3373 return;
3374 }
3375 std::string ss = "";
3376 std::string procs = "";
3377
3378 std::set<std::tuple<int, int, int, int, int, int> > my_arrows;
3379
3380 for (v_lno_t origin_task = 0; origin_task < num_tasks; ++origin_task) {
3381 int origin_rank = task_to_rank[origin_task];
3382 std::string gnuPlotArrow = "set arrow from ";
3383
3384 for (int j = 0; j < machine_coord_dim; ++j) {
3385 if (j == machine_coord_dim - 1) {
3386 gnuPlotArrow +=
3387 Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
3388 procs +=
3389 Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
3390
3391 }
3392 else {
3393 gnuPlotArrow +=
3394 Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
3395 + ",";
3396 procs +=
3397 Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
3398 + " ";
3399 }
3400 }
3401 procs += "\n";
3402
3403 gnuPlotArrow += " to ";
3404
3405
3406 for (int nind = task_communication_xadj[origin_task];
3407 nind < task_communication_xadj[origin_task + 1]; ++nind) {
3408
3409 int neighbor_task = task_communication_adj[nind];
3410
3411 bool differentnode = false;
3412 int neighbor_rank = task_to_rank[neighbor_task];
3413
3414 for (int j = 0; j < machine_coord_dim; ++j) {
3415 if (int(machine_coords[j][origin_rank]) !=
3416 int(machine_coords[j][neighbor_rank])) {
3417 differentnode = true; break;
3418 }
3419 }
3420
3421 std::tuple<int,int,int, int, int, int> foo(
3422 (int)(machine_coords[0][origin_rank]),
3423 (int)(machine_coords[1][origin_rank]),
3424 (int)(machine_coords[2][origin_rank]),
3425 (int)(machine_coords[0][neighbor_rank]),
3426 (int)(machine_coords[1][neighbor_rank]),
3427 (int)(machine_coords[2][neighbor_rank]));
3428
3429 if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
3430 my_arrows.insert(foo);
3431
3432 std::string gnuPlotArrow2 = "";
3433 for (int j = 0; j < machine_coord_dim; ++j) {
3434 if (j == machine_coord_dim - 1) {
3435 gnuPlotArrow2 +=
3436 Teuchos::toString<float>(machine_coords[j][neighbor_rank]);
3437 }
3438 else {
3439 gnuPlotArrow2 +=
3440 Teuchos::toString<float>(machine_coords[j][neighbor_rank])
3441 + ",";
3442 }
3443 }
3444 ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
3445 }
3446 }
3447 }
3448
3449 std::ofstream procFile("procPlot.plot", std::ofstream::out);
3450 procFile << procs << "\n";
3451 procFile.close();
3452
3453 //gnuPlotCode << ss;
3454 if (machine_coord_dim == 2) {
3455 gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
3456 }
3457 else {
3458 gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
3459 }
3460
3461 gnuPlotCode << ss << "\nreplot\n pause -1\npause -1";
3462 gnuPlotCode.close();
3463}
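// A hedged call sketch (names illustrative): each rank writes its own
// gnuPlot<rank>.plot for a 3-D machine, given the task graph in CRS form
// and the final task-to-rank assignment.
//
//   Zoltan2::visualize_mapping<float, int>(
//       comm->getRank(), 3, num_ranks, machine_coords,
//       num_tasks, task_comm_xadj, task_comm_adj, task_to_rank);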
3464
3465} // namespace Zoltan2
3466
3467#endif