734 ArrayView<const gno_t> vtxIDs;
735 ArrayView<StridedData<lno_t, scalar_t> > vwgts;
741 size_t nVtx = model->getVertexList(vtxIDs, vwgts);
745 ArrayView<const gno_t> adjs;
747 ArrayView<const offset_t> offsets;
748 ArrayView<StridedData<lno_t, scalar_t> > ewgts;
749 model->getEdgeList(adjs, offsets, ewgts);
752 std::unordered_map<gno_t,lno_t> globalToLocal;
764 std::vector<gno_t> ownedPlusGhosts;
772 std::vector<int> owners;
775 for(
int i = 0; i < vtxIDs.size(); i++){
776 globalToLocal[vtxIDs[i]] = i;
777 ownedPlusGhosts.push_back(vtxIDs[i]);
778 owners.push_back(
comm->getRank());
786 std::vector<lno_t> local_adjs;
787 for(
int i = 0; i < adjs.size(); i++){
788 if(globalToLocal.count(adjs[i])==0){
789 ownedPlusGhosts.push_back(adjs[i]);
790 globalToLocal[adjs[i]] = vtxIDs.size()+nGhosts;
793 local_adjs.push_back(globalToLocal[adjs[i]]);
797 Tpetra::global_size_t dummy = Teuchos::OrdinalTraits
798 <Tpetra::global_size_t>::invalid();
799 RCP<const map_t> mapOwned = rcp(
new map_t(dummy, vtxIDs, 0,
comm));
802 std::vector<gno_t> ghosts;
803 std::vector<int> ghostowners;
804 for(
size_t i = nVtx; i < nVtx+nGhosts; i++){
805 ghosts.push_back(ownedPlusGhosts[i]);
806 ghostowners.push_back(-1);
810 ArrayView<int> owningProcs = Teuchos::arrayViewFromVector(ghostowners);
811 ArrayView<const gno_t> gids = Teuchos::arrayViewFromVector(ghosts);
812 mapOwned->getRemoteIndexList(gids, owningProcs);
815 for(
size_t i = 0; i < ghostowners.size(); i++){
816 owners.push_back(ghostowners[i]);
837 std::vector< gno_t> first_layer_ghost_adjs;
838 std::vector< offset_t> first_layer_ghost_offsets;
839 constructSecondGhostLayer(ownedPlusGhosts,owners, adjs, offsets, mapOwned,
840 first_layer_ghost_adjs, first_layer_ghost_offsets);
844 globalToLocal.clear();
845 for(
size_t i = 0; i < ownedPlusGhosts.size(); i++){
846 globalToLocal[ownedPlusGhosts[i]] = i;
851 for(
int i = 0 ; i < adjs.size(); i++){
852 local_adjs[i] = globalToLocal[adjs[i]];
862 std::vector<lno_t> local_ghost_adjs;
863 for(
size_t i = 0; i< first_layer_ghost_adjs.size(); i++ ){
864 if(globalToLocal.count(first_layer_ghost_adjs[i]) == 0){
865 ownedPlusGhosts.push_back(first_layer_ghost_adjs[i]);
866 globalToLocal[first_layer_ghost_adjs[i]] = vtxIDs.size() + nGhosts + n2Ghosts;
869 local_ghost_adjs.push_back(globalToLocal[first_layer_ghost_adjs[i]]);
873 if(
verbose) std::cout<<
comm->getRank()<<
": constructing Tpetra map with copies\n";
874 dummy = Teuchos::OrdinalTraits <Tpetra::global_size_t>::invalid();
875 RCP<const map_t> mapWithCopies = rcp(
new map_t(dummy,
876 Teuchos::arrayViewFromVector(ownedPlusGhosts),
878 if(
verbose) std::cout<<
comm->getRank()<<
": done constructing map with copies\n";
880 using import_t = Tpetra::Import<lno_t, gno_t>;
881 Teuchos::RCP<import_t> importer = rcp(
new import_t(mapOwned,
883 if(
verbose) std::cout<<
comm->getRank()<<
": done constructing importer\n";
884 Teuchos::RCP<femv_t> femv = rcp(
new femv_t(mapOwned,
886 if(
verbose) std::cout<<
comm->getRank()<<
": done constructing femv\n";
890 std::vector<int> rand(ownedPlusGhosts.size());
891 for(
size_t i = 0; i < rand.size(); i++){
892 std::srand(ownedPlusGhosts[i]);
893 rand[i] = std::rand();
897 std::vector<int> ghostOwners2(ownedPlusGhosts.size() -nVtx);
898 std::vector<gno_t> ghosts2(ownedPlusGhosts.size() - nVtx);
899 for(
size_t i = nVtx; i < ownedPlusGhosts.size(); i++) ghosts2[i-nVtx] = ownedPlusGhosts[i];
900 Teuchos::ArrayView<int> owners2 = Teuchos::arrayViewFromVector(ghostOwners2);
901 Teuchos::ArrayView<const gno_t> ghostGIDs = Teuchos::arrayViewFromVector(ghosts2);
902 mapOwned->getRemoteIndexList(ghostGIDs,owners2);
903 if(
verbose) std::cout<<
comm->getRank()<<
": done getting ghost owners\n";
908 std::cout<<
comm->getRank()<<
": calculating 2GL stats...\n";
910 std::vector<int> sendcounts(
comm->getSize(),0);
911 std::vector<gno_t> sdispls(
comm->getSize()+1,0);
913 for(
int i = nGhosts; i < ghostGIDs.size(); i++){
914 if(owners2[i] !=
comm->getRank()&& owners2[i] !=-1) sendcounts[owners2[i]]++;
918 for(
int i = 1; i <
comm->getSize()+1; i++){
919 sdispls[i] = sdispls[i-1] + sendcounts[i-1];
920 sendcount += sendcounts[i-1];
922 std::vector<gno_t> idx(
comm->getSize(),0);
923 for(
int i = 0; i <
comm->getSize(); i++){
927 std::vector<gno_t> sendbuf(sendcount,0);
928 for(
size_t i = nGhosts; i < (size_t)ghostGIDs.size(); i++){
929 if(owners2[i] !=
comm->getRank() && owners2[i] != -1){
930 sendbuf[idx[owners2[i]]++] = ghostGIDs[i];
934 Teuchos::ArrayView<int> sendcounts_view = Teuchos::arrayViewFromVector(sendcounts);
935 Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
936 Teuchos::ArrayRCP<gno_t> recvbuf;
937 std::vector<int> recvcounts(
comm->getSize(),0);
938 Teuchos::ArrayView<int> recvcounts_view = Teuchos::arrayViewFromVector(recvcounts);
939 Zoltan2::AlltoAllv<gno_t>(*
comm, *
env, sendbuf_view, sendcounts_view, recvbuf, recvcounts_view);
940 std::vector<int> is_bndry_send(recvbuf.size(),0);
943 for(
int i = 0; i < recvbuf.size(); i++){
944 size_t lid = mapWithCopies->getLocalElement(recvbuf[i]);
945 is_bndry_send[i] = 0;
947 for(
offset_t j = offsets[lid]; j < offsets[lid+1]; j++){
948 if((
size_t)local_adjs[j] >= nVtx) is_bndry_send[i] = 1;
951 for(
offset_t j = first_layer_ghost_offsets[lid]; j < first_layer_ghost_offsets[lid+1]; j++){
952 if((
size_t)local_ghost_adjs[j] >= nVtx) is_bndry_send[i] = 1;
958 Teuchos::ArrayView<int> is_bndry_send_view = Teuchos::arrayViewFromVector(is_bndry_send);
959 Teuchos::ArrayRCP<int> is_bndry_recv;
960 std::vector<int> bndry_recvcounts(
comm->getSize(),0);
961 Teuchos::ArrayView<int> bndry_recvcounts_view = Teuchos::arrayViewFromVector(bndry_recvcounts);
962 Zoltan2::AlltoAllv<int> (*
comm, *
env, is_bndry_send_view, recvcounts_view, is_bndry_recv, bndry_recvcounts_view);
965 int boundaryverts = 0;
966 for(
int i = 0; i < is_bndry_recv.size(); i++){
967 boundaryverts += is_bndry_recv[i];
970 std::cout<<
comm->getRank()<<
": "<<boundaryverts<<
" boundary verts out of "<<n2Ghosts<<
" verts in 2GL\n";
975 Teuchos::ArrayView<const lno_t> local_adjs_view = Teuchos::arrayViewFromVector(local_adjs);
979 Teuchos::ArrayView<const offset_t> ghost_offsets = Teuchos::arrayViewFromVector(first_layer_ghost_offsets);
980 Teuchos::ArrayView<const lno_t> ghost_adjacencies = Teuchos::arrayViewFromVector(local_ghost_adjs);
981 Teuchos::ArrayView<const int> rand_view = Teuchos::arrayViewFromVector(rand);
982 Teuchos::ArrayView<const gno_t> gid_view = Teuchos::arrayViewFromVector(ownedPlusGhosts);
986 Teuchos::ArrayView<const lno_t> exportLIDs = importer->getExportLIDs();
987 Teuchos::ArrayView<const int> exportPIDs = importer->getExportPIDs();
991 std::unordered_map<lno_t, std::vector<int>> procs_to_send;
992 for(
int i = 0; i < exportLIDs.size(); i++){
993 procs_to_send[exportLIDs[i]].push_back(exportPIDs[i]);
997 twoGhostLayer(nVtx, nVtx+nGhosts, local_adjs_view, offsets, ghost_adjacencies, ghost_offsets,
998 femv, gid_view, rand_view, owners2, mapWithCopies, procs_to_send);
1001 ArrayRCP<int> colors = solution->getColorsRCP();
1002 auto femvdata = femv->getData(0);
1003 for(
size_t i=0; i<nVtx; i++){
1004 colors[i] = femvdata[i];
1064 const Teuchos::ArrayView<const lno_t>& adjs,
1065 const Teuchos::ArrayView<const offset_t>& offsets,
1066 const Teuchos::ArrayView<const lno_t>& ghost_adjs,
1067 const Teuchos::ArrayView<const offset_t>& ghost_offsets,
1068 const Teuchos::RCP<femv_t>& femv,
1069 const Teuchos::ArrayView<const gno_t>& gids,
1070 const Teuchos::ArrayView<const int>& rand,
1071 const Teuchos::ArrayView<const int>& ghost_owners,
1072 RCP<const map_t> mapOwnedPlusGhosts,
1073 const std::unordered_map<
lno_t, std::vector<int>>& procs_to_send){
1075 double total_time = 0.0;
1076 double interior_time = 0.0;
1077 double comm_time = 0.0;
1078 double comp_time = 0.0;
1079 double recoloring_time=0.0;
1080 double conflict_detection = 0.0;
1084 const int numStatisticRecordingRounds = 100;
1087 const size_t n_ghosts = rand.size() - n_local;
1093 std::vector<int> deg_send_cnts(
comm->getSize(),0);
1094 std::vector<gno_t> deg_sdispls(
comm->getSize()+1,0);
1095 for(
int i = 0; i < ghost_owners.size(); i++){
1096 deg_send_cnts[ghost_owners[i]]++;
1099 gno_t deg_sendsize = 0;
1100 std::vector<int> deg_sentcount(
comm->getSize(),0);
1101 for(
int i = 1; i <
comm->getSize()+1; i++){
1102 deg_sdispls[i] = deg_sdispls[i-1] + deg_send_cnts[i-1];
1103 deg_sendsize += deg_send_cnts[i-1];
1105 std::vector<gno_t> deg_sendbuf(deg_sendsize,0);
1106 for(
int i = 0; i < ghost_owners.size(); i++){
1107 size_t idx = deg_sdispls[ghost_owners[i]] + deg_sentcount[ghost_owners[i]];
1108 deg_sentcount[ghost_owners[i]]++;
1109 deg_sendbuf[idx] = mapOwnedPlusGhosts->getGlobalElement(i+n_local);
1111 Teuchos::ArrayView<int> deg_send_cnts_view = Teuchos::arrayViewFromVector(deg_send_cnts);
1112 Teuchos::ArrayView<gno_t> deg_sendbuf_view = Teuchos::arrayViewFromVector(deg_sendbuf);
1113 Teuchos::ArrayRCP<gno_t> deg_recvbuf;
1114 std::vector<int> deg_recvcnts(
comm->getSize(),0);
1115 Teuchos::ArrayView<int> deg_recvcnts_view = Teuchos::arrayViewFromVector(deg_recvcnts);
1116 Zoltan2::AlltoAllv<gno_t>(*
comm, *
env, deg_sendbuf_view, deg_send_cnts_view, deg_recvbuf, deg_recvcnts_view);
1121 for(
int i = 0; i < deg_recvbuf.size(); i++){
1122 lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_recvbuf[i]);
1123 deg_recvbuf[i] = offsets[lid+1] - offsets[lid];
1126 ArrayRCP<gno_t> ghost_degrees;
1127 Zoltan2::AlltoAllv<gno_t>(*
comm, *
env, deg_recvbuf(), deg_recvcnts_view, ghost_degrees, deg_send_cnts_view);
1130 Kokkos::View<gno_t*, device_type> ghost_degrees_dev(
"ghost degree view",ghost_degrees.size());
1131 typename Kokkos::View<gno_t*, device_type>::host_mirror_type ghost_degrees_host = Kokkos::create_mirror(ghost_degrees_dev);
1132 for(
int i = 0; i < ghost_degrees.size(); i++){
1133 lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_sendbuf[i]);
1134 ghost_degrees_host(lid-n_local) = ghost_degrees[i];
1136 Kokkos::deep_copy(ghost_degrees_dev, ghost_degrees_host);
1139 gno_t recvPerRound[numStatisticRecordingRounds];
1140 gno_t sentPerRound[numStatisticRecordingRounds];
1147 for(
size_t i = 0; i < n_local; i++){
1148 offset_t curr_degree = offsets[i+1] - offsets[i];
1149 if(curr_degree > local_max_degree){
1150 local_max_degree = curr_degree;
1153 Teuchos::reduceAll<int, offset_t>(*
comm, Teuchos::REDUCE_MAX,1, &local_max_degree, &global_max_degree);
1154 if(
comm->getRank() == 0 &&
verbose) std::cout<<
"Input has max degree "<<global_max_degree<<
"\n";
1156 if(
verbose) std::cout<<
comm->getRank()<<
": constructing Kokkos Views for initial coloring\n";
1160 Kokkos::View<offset_t*, device_type> offsets_dev(
"Host Offset View", offsets.size());
1161 typename Kokkos::View<offset_t*, device_type>::host_mirror_type offsets_host = Kokkos::create_mirror(offsets_dev);
1162 Kokkos::View<lno_t*, device_type> adjs_dev(
"Host Adjacencies View", adjs.size());
1163 typename Kokkos::View<lno_t*, device_type>::host_mirror_type adjs_host = Kokkos::create_mirror(adjs_dev);
1164 for(Teuchos_Ordinal i = 0; i < offsets.size(); i++) offsets_host(i) = offsets[i];
1165 for(Teuchos_Ordinal i = 0; i < adjs.size(); i++) adjs_host(i) = adjs[i];
1166 Kokkos::deep_copy(offsets_dev,offsets_host);
1167 Kokkos::deep_copy(adjs_dev, adjs_host);
1171 if(
verbose) std::cout<<
comm->getRank()<<
": constructing Kokkos Views for recoloring\n";
1175 Kokkos::View<offset_t*, device_type> dist_degrees_dev(
"Owned+Ghost degree view",rand.size());
1176 typename Kokkos::View<offset_t*, device_type>::host_mirror_type dist_degrees_host = Kokkos::create_mirror(dist_degrees_dev);
1181 for(Teuchos_Ordinal i = 0; i < offsets.size()-1; i++) dist_degrees_host(i) = offsets[i+1] - offsets[i];
1183 for(Teuchos_Ordinal i = 0; i < ghost_offsets.size()-1; i++) dist_degrees_host(i+n_local) = ghost_offsets[i+1] - ghost_offsets[i];
1185 for(Teuchos_Ordinal i = 0; i < ghost_adjs.size(); i++){
1187 if((
size_t)ghost_adjs[i] >= n_total ){
1188 dist_degrees_host(ghost_adjs[i])++;
1216 Kokkos::View<offset_t*, device_type> dist_offsets_dev(
"Owned+Ghost Offset view", rand.size()+1);
1217 typename Kokkos::View<offset_t*, device_type>::host_mirror_type dist_offsets_host = Kokkos::create_mirror(dist_offsets_dev);
1220 dist_offsets_host(0) = 0;
1222 for(Teuchos_Ordinal i = 1; i < rand.size()+1; i++){
1223 dist_offsets_host(i) = dist_degrees_host(i-1) + dist_offsets_host(i-1);
1224 total_adjs += dist_degrees_host(i-1);
1226 Kokkos::View<lno_t*, device_type> dist_adjs_dev(
"Owned+Ghost adjacency view", total_adjs);
1227 typename Kokkos::View<lno_t*, device_type>::host_mirror_type dist_adjs_host = Kokkos::create_mirror(dist_adjs_dev);
1232 for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
1233 dist_degrees_host(i) = 0;
1236 for(Teuchos_Ordinal i = 0; i < adjs.size(); i++) dist_adjs_host(i) = adjs[i];
1237 for(Teuchos_Ordinal i = adjs.size(); i < adjs.size() + ghost_adjs.size(); i++) dist_adjs_host(i) = ghost_adjs[i-adjs.size()];
1241 for(Teuchos_Ordinal i = 0; i < ghost_offsets.size()-1; i++){
1243 for(
offset_t j = ghost_offsets[i]; j < ghost_offsets[i+1]; j++){
1245 if((
size_t)ghost_adjs[j] >= n_total){
1247 dist_adjs_host(dist_offsets_host(ghost_adjs[j]) + dist_degrees_host(ghost_adjs[j])) = i + n_local;
1250 dist_degrees_host(ghost_adjs[j])++;
1255 Kokkos::deep_copy(dist_degrees_dev,dist_degrees_host);
1256 Kokkos::deep_copy(dist_offsets_dev,dist_offsets_host);
1257 Kokkos::deep_copy(dist_adjs_dev, dist_adjs_host);
1260 Kokkos::View<size_t*, device_type> recoloringSize(
"Recoloring Queue Size",1);
1261 typename Kokkos::View<size_t*, device_type>::host_mirror_type recoloringSize_host = Kokkos::create_mirror(recoloringSize);
1262 recoloringSize_host(0) = 0;
1263 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1266 if(
verbose) std::cout<<
comm->getRank()<<
": constructing rand and GIDs views\n";
1267 Kokkos::View<int*, device_type> rand_dev(
"Random View", rand.size());
1268 typename Kokkos::View<int*, device_type>::host_mirror_type rand_host = Kokkos::create_mirror(rand_dev);
1269 for(Teuchos_Ordinal i = 0; i < rand.size(); i ++) rand_host(i) = rand[i];
1270 Kokkos::deep_copy(rand_dev,rand_host);
1273 Kokkos::View<gno_t*, device_type> gid_dev(
"GIDs", gids.size());
1274 typename Kokkos::View<gno_t*, device_type>::host_mirror_type gid_host = Kokkos::create_mirror(gid_dev);
1275 for(Teuchos_Ordinal i = 0; i < gids.size(); i++) gid_host(i) = gids[i];
1276 Kokkos::deep_copy(gid_dev,gid_host);
1282 Kokkos::View<lno_t*, device_type> boundary_verts_dev;
1284 if(
verbose) std::cout<<
comm->getRank()<<
": constructing communication and recoloring lists\n";
1290 Kokkos::View<lno_t*, device_type> verts_to_recolor_view(
"verts to recolor", rand.size());
1291 typename Kokkos::View<lno_t*, device_type>::host_mirror_type verts_to_recolor_host = create_mirror(verts_to_recolor_view);
1294 Kokkos::View<int*, device_type> verts_to_recolor_size(
"verts to recolor size",1);
1295 Kokkos::View<int*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_recolor_size_atomic = verts_to_recolor_size;
1296 typename Kokkos::View<int*, device_type>::host_mirror_type verts_to_recolor_size_host = create_mirror(verts_to_recolor_size);
1299 verts_to_recolor_size_host(0) = 0;
1301 Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1308 Kokkos::View<lno_t*, device_type> verts_to_send_view(
"verts to send", n_local);
1309 typename Kokkos::View<lno_t*, device_type>::host_mirror_type verts_to_send_host = create_mirror(verts_to_send_view);
1312 Kokkos::View<size_t*, device_type> verts_to_send_size(
"verts to send size",1);
1313 Kokkos::View<size_t*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic = verts_to_send_size;
1314 typename Kokkos::View<size_t*, device_type>::host_mirror_type verts_to_send_size_host = create_mirror(verts_to_send_size);
1316 verts_to_send_size_host(0) = 0;
1317 Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1319 if(
verbose) std::cout<<
comm->getRank()<<
": Constructing the boundary\n";
1324 constructBoundary(n_local, dist_offsets_dev, dist_adjs_dev, dist_offsets_host, dist_adjs_host, boundary_verts_dev,
1325 verts_to_send_view, verts_to_send_size_atomic);
1330 bool use_vbbit = (global_max_degree < 6000);
1335 interior_time =
timer();
1336 total_time =
timer();
1338 this->colorInterior(n_local, adjs_dev, offsets_dev, femv,adjs_dev,0,use_vbbit);
1339 interior_time =
timer() - interior_time;
1340 comp_time = interior_time;
1346 Kokkos::View<int*,device_type> ghost_colors(
"ghost color backups", n_ghosts);
1349 if(
verbose) std::cout<<
comm->getRank()<<
": communicating\n";
1354 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1355 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1357 comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,n_local,verts_to_send_host,verts_to_send_size_host,femv,procs_to_send,sent,recv);
1358 sentPerRound[0] = sent;
1359 recvPerRound[0] = recv;
1365 auto femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
1366 auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
1367 Kokkos::parallel_for(
"get femv colors",
1368 Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1369 KOKKOS_LAMBDA(
const int& i){
1370 ghost_colors(i) = femv_colors(i+n_local);
1374 double temp =
timer();
1376 bool recolor_degrees = this->pl->template get<bool>(
"recolor_degrees",
false);
1377 if(
verbose) std::cout<<
comm->getRank()<<
": detecting conflicts\n";
1380 verts_to_send_size_host(0) = 0;
1381 verts_to_recolor_size_host(0) = 0;
1382 recoloringSize_host(0) = 0;
1383 Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1384 Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1385 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1387 detectConflicts(n_local, dist_offsets_dev, dist_adjs_dev, femv_colors, boundary_verts_dev,
1388 verts_to_recolor_view, verts_to_recolor_size_atomic, verts_to_send_view, verts_to_send_size_atomic,
1389 recoloringSize, rand_dev, gid_dev, ghost_degrees_dev, recolor_degrees);
1393 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1394 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1395 Kokkos::deep_copy(recoloringSize_host, recoloringSize);
1396 Kokkos::deep_copy(verts_to_recolor_size_host, verts_to_recolor_size);
1398 if(
comm->getSize() > 1){
1399 conflict_detection =
timer() - temp;
1400 comp_time += conflict_detection;
1403 if(
verbose) std::cout<<
comm->getRank()<<
": starting to recolor\n";
1405 double totalPerRound[numStatisticRecordingRounds];
1406 double commPerRound[numStatisticRecordingRounds];
1407 double compPerRound[numStatisticRecordingRounds];
1408 double recoloringPerRound[numStatisticRecordingRounds];
1409 double conflictDetectionPerRound[numStatisticRecordingRounds];
1410 uint64_t vertsPerRound[numStatisticRecordingRounds];
1411 uint64_t incorrectGhostsPerRound[numStatisticRecordingRounds];
1412 int distributedRounds = 1;
1413 totalPerRound[0] = interior_time + comm_time + conflict_detection;
1414 recoloringPerRound[0] = 0;
1415 commPerRound[0] = comm_time;
1416 compPerRound[0] = interior_time + conflict_detection;
1417 conflictDetectionPerRound[0] = conflict_detection;
1418 recoloringPerRound[0] = 0;
1419 vertsPerRound[0] = 0;
1420 incorrectGhostsPerRound[0]=0;
1421 typename Kokkos::View<int*, device_type>::host_mirror_type ghost_colors_host;
1422 typename Kokkos::View<lno_t*, device_type>::host_mirror_type boundary_verts_host;
1423 size_t serial_threshold = this->pl->template get<int>(
"serial_threshold",0);
1425 size_t totalConflicts = 0;
1426 size_t localConflicts = recoloringSize_host(0);
1427 Teuchos::reduceAll<int,size_t>(*
comm, Teuchos::REDUCE_SUM, 1, &localConflicts, &totalConflicts);
1428 bool done = !totalConflicts;
1429 if(
comm->getSize()==1) done =
true;
1436 if(recoloringSize_host(0) < serial_threshold)
break;
1437 if(distributedRounds < numStatisticRecordingRounds) {
1438 vertsPerRound[distributedRounds] = verts_to_recolor_size_host(0);
1442 double recolor_temp =
timer();
1444 if(verts_to_recolor_size_host(0) > 0){
1445 this->colorInterior(femv_colors.size(), dist_adjs_dev, dist_offsets_dev,femv,verts_to_recolor_view,verts_to_recolor_size_host(0),use_vbbit);
1448 if(distributedRounds < numStatisticRecordingRounds){
1449 recoloringPerRound[distributedRounds] =
timer() - recolor_temp;
1450 recoloring_time += recoloringPerRound[distributedRounds];
1451 comp_time += recoloringPerRound[distributedRounds];
1452 compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1453 totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1455 double recoloring_round_time =
timer() - recolor_temp;
1456 recoloring_time += recoloring_round_time;
1457 comp_time += recoloring_round_time;
1464 Kokkos::parallel_for(
"set femv colors",
1465 Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1466 KOKKOS_LAMBDA(
const int& i){
1467 femv_colors(i+n_local) = ghost_colors(i);
1475 femvColors =
decltype(femvColors)();
1476 femv_colors =
decltype(femv_colors)();
1477 double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,n_local,verts_to_send_host,verts_to_send_size_host,femv,procs_to_send,sent,recv);
1478 comm_time += curr_comm_time;
1480 if(distributedRounds < numStatisticRecordingRounds){
1481 commPerRound[distributedRounds] = curr_comm_time;
1482 recvPerRound[distributedRounds] = recv;
1483 sentPerRound[distributedRounds] = sent;
1484 totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1491 femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
1492 femv_colors = subview(femvColors, Kokkos::ALL, 0);
1493 Kokkos::parallel_for(
"get femv colors 2",
1494 Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1495 KOKKOS_LAMBDA(
const int& i){
1496 ghost_colors(i) = femv_colors(i+n_local);
1503 verts_to_send_size_host(0) = 0;
1504 verts_to_recolor_size_host(0) = 0;
1505 recoloringSize_host(0) = 0;
1506 Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1507 Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1508 Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1511 double detection_temp =
timer();
1513 detectConflicts(n_local, dist_offsets_dev, dist_adjs_dev,femv_colors, boundary_verts_dev,
1514 verts_to_recolor_view, verts_to_recolor_size_atomic, verts_to_send_view, verts_to_send_size_atomic,
1515 recoloringSize, rand_dev, gid_dev, ghost_degrees_dev, recolor_degrees);
1518 Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1519 Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1521 Kokkos::deep_copy(verts_to_recolor_size_host, verts_to_recolor_size);
1522 Kokkos::deep_copy(recoloringSize_host, recoloringSize);
1524 if(distributedRounds < numStatisticRecordingRounds){
1525 conflictDetectionPerRound[distributedRounds] =
timer() - detection_temp;
1526 conflict_detection += conflictDetectionPerRound[distributedRounds];
1527 compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1528 totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1529 comp_time += conflictDetectionPerRound[distributedRounds];
1531 double conflict_detection_round_time =
timer() - detection_temp;
1532 conflict_detection += conflict_detection_round_time;
1533 comp_time += conflict_detection_round_time;
1536 distributedRounds++;
1537 size_t localDone = recoloringSize_host(0);
1538 size_t globalDone = 0;
1539 Teuchos::reduceAll<int,size_t>(*
comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
1549 if(recoloringSize_host(0) > 0 || !done){
1550 ghost_colors_host = Kokkos::create_mirror_view_and_copy(
host_mem(),ghost_colors,
"ghost_colors host mirror");
1551 boundary_verts_host = Kokkos::create_mirror_view_and_copy(
host_mem(),boundary_verts_dev,
"boundary_verts host mirror");
1557 femvColors =
decltype(femvColors)();
1558 femv_colors =
decltype(femv_colors)();
1559 while(recoloringSize_host(0) > 0 || !done){
1560 auto femvColors_host = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
1561 auto colors_host = subview(femvColors_host, Kokkos::ALL, 0);
1562 if(distributedRounds < numStatisticRecordingRounds){
1563 vertsPerRound[distributedRounds] = recoloringSize_host(0);
1565 if(
verbose) std::cout<<
comm->getRank()<<
": starting to recolor, serial\n";
1568 double recolor_temp =
timer();
1569 if(verts_to_recolor_size_host(0) > 0){
1570 this->colorInterior_serial(colors_host.size(), dist_adjs_host, dist_offsets_host, femv,
1571 verts_to_recolor_host, verts_to_recolor_size_host(0),
true);
1573 if(distributedRounds < numStatisticRecordingRounds){
1574 recoloringPerRound[distributedRounds] =
timer() - recolor_temp;
1575 recoloring_time += recoloringPerRound[distributedRounds];
1576 comp_time += recoloringPerRound[distributedRounds];
1577 compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1578 totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1580 double recoloring_serial_round_time =
timer() - recolor_temp;
1581 recoloring_time += recoloring_serial_round_time;
1582 comp_time += recoloring_serial_round_time;
1587 for(
size_t i = 0; i < n_ghosts; i++){
1588 colors_host(i+n_local) = ghost_colors_host(i);
1591 double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts, n_local,verts_to_send_host, verts_to_send_size_host, femv, procs_to_send, sent,recv);
1592 comm_time += curr_comm_time;
1594 if(distributedRounds < numStatisticRecordingRounds){
1595 commPerRound[distributedRounds] = curr_comm_time;
1596 recvPerRound[distributedRounds] = recv;
1597 sentPerRound[distributedRounds] = sent;
1598 totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1603 for(
size_t i = 0; i < n_ghosts; i++){
1604 ghost_colors_host(i) = colors_host(i+n_local);
1608 double detection_temp =
timer();
1611 verts_to_recolor_size_host(0) = 0;
1612 verts_to_send_size_host(0) = 0;
1613 recoloringSize_host(0) = 0;
1616 verts_to_recolor_host, verts_to_recolor_size_host, verts_to_send_host, verts_to_send_size_host,
1617 recoloringSize_host, rand_host, gid_host, ghost_degrees_host, recolor_degrees);
1621 if(distributedRounds < numStatisticRecordingRounds){
1622 conflictDetectionPerRound[distributedRounds] =
timer() - detection_temp;
1623 conflict_detection += conflictDetectionPerRound[distributedRounds];
1624 compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1625 totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1626 comp_time += conflictDetectionPerRound[distributedRounds];
1628 double conflict_detection_serial_round_time =
timer() - detection_temp;
1629 conflict_detection += conflict_detection_serial_round_time;
1630 comp_time += conflict_detection_serial_round_time;
1633 size_t globalDone = 0;
1634 size_t localDone = recoloringSize_host(0);
1635 Teuchos::reduceAll<int,size_t>(*
comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
1636 distributedRounds++;
1640 total_time =
timer() - total_time;
1643 uint64_t localBoundaryVertices = 0;
1644 for(
size_t i = 0; i < n_local; i++){
1645 for(
offset_t j = offsets[i]; j < offsets[i+1]; j++){
1646 if((
size_t)adjs[j] >= n_local){
1647 localBoundaryVertices++;
1654 uint64_t totalVertsPerRound[numStatisticRecordingRounds];
1655 uint64_t totalBoundarySize = 0;
1656 uint64_t totalIncorrectGhostsPerRound[numStatisticRecordingRounds];
1657 double finalTotalPerRound[numStatisticRecordingRounds];
1658 double maxRecoloringPerRound[numStatisticRecordingRounds];
1659 double minRecoloringPerRound[numStatisticRecordingRounds];
1660 double finalCommPerRound[numStatisticRecordingRounds];
1661 double finalCompPerRound[numStatisticRecordingRounds];
1662 double finalConflictDetectionPerRound[numStatisticRecordingRounds];
1663 gno_t finalRecvPerRound[numStatisticRecordingRounds];
1664 gno_t finalSentPerRound[numStatisticRecordingRounds];
1665 for(
int i = 0; i < numStatisticRecordingRounds; i++){
1666 totalVertsPerRound[i] = 0;
1667 finalTotalPerRound[i] = 0.0;
1668 maxRecoloringPerRound[i] = 0.0;
1669 minRecoloringPerRound[i] = 0.0;
1670 finalCommPerRound[i] = 0.0;
1671 finalCompPerRound[i] = 0.0;
1672 finalConflictDetectionPerRound[i] = 0.0;
1673 finalRecvPerRound[i] = 0;
1674 finalSentPerRound[i] = 0;
1676 Teuchos::reduceAll<int,uint64_t>(*
comm, Teuchos::REDUCE_SUM,1,&localBoundaryVertices, &totalBoundarySize);
1677 Teuchos::reduceAll<int,uint64_t>(*
comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,vertsPerRound,totalVertsPerRound);
1678 Teuchos::reduceAll<int,uint64_t>(*
comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,incorrectGhostsPerRound,totalIncorrectGhostsPerRound);
1679 Teuchos::reduceAll<int,double>(*
comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,totalPerRound, finalTotalPerRound);
1680 Teuchos::reduceAll<int,double>(*
comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,recoloringPerRound,maxRecoloringPerRound);
1681 Teuchos::reduceAll<int,double>(*
comm, Teuchos::REDUCE_MIN,numStatisticRecordingRounds,recoloringPerRound,minRecoloringPerRound);
1682 Teuchos::reduceAll<int,double>(*
comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,commPerRound,finalCommPerRound);
1683 Teuchos::reduceAll<int,double>(*
comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,compPerRound,finalCompPerRound);
1684 Teuchos::reduceAll<int,double>(*
comm,
1685 Teuchos::REDUCE_MAX,numStatisticRecordingRounds,conflictDetectionPerRound,finalConflictDetectionPerRound);
1686 Teuchos::reduceAll<int,gno_t> (*
comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound,finalRecvPerRound);
1687 Teuchos::reduceAll<int,gno_t> (*
comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound,finalSentPerRound);
1688 std::cout <<
"Rank " <<
comm->getRank()
1689 <<
": boundary size: " << localBoundaryVertices << std::endl;
1690 if(
comm->getRank() == 0)
1691 std::cout <<
"Total boundary size: " << totalBoundarySize << std::endl;
1692 for(
int i = 0; i < std::min((
int)distributedRounds,numStatisticRecordingRounds); i++){
1693 std::cout <<
"Rank " <<
comm->getRank()
1694 <<
": recolor " << vertsPerRound[i]
1695 <<
" vertices in round " << i << std::endl;
1696 std::cout <<
"Rank " <<
comm->getRank()
1697 <<
" sentbuf had " << sentPerRound[i]
1698 <<
" entries in round " << i << std::endl;
1699 if(
comm->getRank()==0){
1700 std::cout <<
"recolored " << totalVertsPerRound[i]
1701 <<
" vertices in round " << i << std::endl;
1702 std::cout << totalIncorrectGhostsPerRound[i]
1703 <<
" inconsistent ghosts in round " << i << std::endl;
1704 std::cout <<
"total time in round " << i
1705 <<
": " << finalTotalPerRound[i] << std::endl;
1706 std::cout <<
"recoloring time in round " << i
1707 <<
": " << maxRecoloringPerRound[i] << std::endl;
1708 std::cout <<
"min recoloring time in round " << i
1709 <<
": " << minRecoloringPerRound[i] << std::endl;
1710 std::cout <<
"conflict detection time in round " << i
1711 <<
": " << finalConflictDetectionPerRound[i] << std::endl;
1712 std::cout <<
"comm time in round " << i
1713 <<
": " << finalCommPerRound[i] << std::endl;
1714 std::cout <<
"recvbuf size in round " << i
1715 <<
": " << finalRecvPerRound[i] << std::endl;
1716 std::cout <<
"sendbuf size in round " << i
1717 <<
": " << finalSentPerRound[i] << std::endl;
1718 std::cout <<
"comp time in round " << i
1719 <<
": " << finalCompPerRound[i] << std::endl;
1723 double global_total_time = 0.0;
1724 double global_recoloring_time = 0.0;
1725 double global_min_recoloring_time = 0.0;
1726 double global_conflict_detection=0.0;
1727 double global_comm_time=0.0;
1728 double global_comp_time=0.0;
1729 double global_interior_time=0.0;
1730 Teuchos::reduceAll<int,double>(*
comm, Teuchos::REDUCE_MAX,1,&total_time,&global_total_time);
1731 Teuchos::reduceAll<int,double>(*
comm, Teuchos::REDUCE_MAX,1,&recoloring_time,&global_recoloring_time);
1732 Teuchos::reduceAll<int,double>(*
comm, Teuchos::REDUCE_MIN,1,&recoloring_time,&global_min_recoloring_time);
1733 Teuchos::reduceAll<int,double>(*
comm, Teuchos::REDUCE_MAX,1,&conflict_detection,&global_conflict_detection);
1734 Teuchos::reduceAll<int,double>(*
comm, Teuchos::REDUCE_MAX,1,&comm_time,&global_comm_time);
1735 Teuchos::reduceAll<int,double>(*
comm, Teuchos::REDUCE_MAX,1,&comp_time,&global_comp_time);
1736 Teuchos::reduceAll<int,double>(*
comm, Teuchos::REDUCE_MAX,1,&interior_time,&global_interior_time);
1739 if(
comm->getRank()==0){
1740 std::cout <<
"Total Time: " << global_total_time << std::endl;
1741 std::cout <<
"Interior Time: " << global_interior_time << std::endl;
1742 std::cout <<
"Recoloring Time: " << global_recoloring_time << std::endl;
1743 std::cout <<
"Min Recoloring Time: " << global_min_recoloring_time << std::endl;
1744 std::cout <<
"Conflict Detection Time: " << global_conflict_detection << std::endl;
1745 std::cout <<
"Comm Time: " << global_comm_time << std::endl;
1746 std::cout <<
"Comp Time: " << global_comp_time << std::endl;