Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_BlockCrsMatrix_Helpers_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
11#define TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
12
14
15#include "Tpetra_BlockCrsMatrix.hpp"
16#include "Tpetra_CrsMatrix.hpp"
17#include "Tpetra_HashTable.hpp"
18#include "Tpetra_Import.hpp"
19#include "Tpetra_Map.hpp"
20#include "Tpetra_MultiVector.hpp"
21#include "Teuchos_ParameterList.hpp"
22#include "Teuchos_ScalarTraits.hpp"
23#include <ctime>
24#include <fstream>
25
26namespace Tpetra {
27
28template <class Scalar, class LO, class GO, class Node>
30 Teuchos::ParameterList pl;
31 std::ofstream out;
32 out.open(fileName.c_str());
34}
35
36template <class Scalar, class LO, class GO, class Node>
37void blockCrsMatrixWriter(BlockCrsMatrix<Scalar, LO, GO, Node> const &A, std::string const &fileName, Teuchos::ParameterList const &params) {
38 std::ofstream out;
39 out.open(fileName.c_str());
41}
42
43template <class Scalar, class LO, class GO, class Node>
45 Teuchos::ParameterList pl;
47}
48
49template <class Scalar, class LO, class GO, class Node>
50void blockCrsMatrixWriter(BlockCrsMatrix<Scalar, LO, GO, Node> const &A, std::ostream &os, Teuchos::ParameterList const &params) {
51 using Teuchos::RCP;
52 using Teuchos::rcp;
53
54 typedef Teuchos::OrdinalTraits<GO> TOT;
55 typedef BlockCrsMatrix<Scalar, LO, GO, Node> block_crs_matrix_type;
56 typedef Tpetra::Import<LO, GO, Node> import_type;
57 typedef Tpetra::Map<LO, GO, Node> map_type;
59 typedef Tpetra::CrsGraph<LO, GO, Node> crs_graph_type;
60
61 RCP<const map_type> const rowMap = A.getRowMap(); //"mesh" map
62 RCP<const Teuchos::Comm<int>> comm = rowMap->getComm();
63 const int myRank = comm->getRank();
64 const size_t numProcs = comm->getSize();
65
66 // If true, force use of the import strip-mining infrastructure. This is useful for debugging on one process.
67 bool alwaysUseParallelAlgorithm = false;
68 if (params.isParameter("always use parallel algorithm"))
69 alwaysUseParallelAlgorithm = params.get<bool>("always use parallel algorithm");
70 bool printMatrixMarketHeader = true;
71 if (params.isParameter("print MatrixMarket header"))
72 printMatrixMarketHeader = params.get<bool>("print MatrixMarket header");
73
74 if (printMatrixMarketHeader && myRank == 0) {
75 std::time_t now = std::time(NULL);
76
77 const std::string dataTypeStr =
78 Teuchos::ScalarTraits<Scalar>::isComplex ? "complex" : "real";
79
80 // Explanation of first line of file:
81 // - "%%MatrixMarket" is the tag for Matrix Market format.
82 // - "matrix" is what we're printing.
83 // - "coordinate" means sparse (triplet format), rather than dense.
84 // - "real" / "complex" is the type (in an output format sense,
85 // not in a C++ sense) of each value of the matrix. (The
86 // other two possibilities are "integer" (not really necessary
87 // here) and "pattern" (no values, just graph).)
88 os << "%%MatrixMarket matrix coordinate " << dataTypeStr << " general" << std::endl;
89 os << "% time stamp: " << ctime(&now);
90 os << "% written from " << numProcs << " processes" << std::endl;
91 os << "% point representation of Tpetra::BlockCrsMatrix" << std::endl;
92 size_t numRows = A.getGlobalNumRows();
93 size_t numCols = A.getGlobalNumCols();
94 os << "% " << numRows << " block rows, " << numCols << " block columns" << std::endl;
95 const LO blockSize = A.getBlockSize();
96 os << "% block size " << blockSize << std::endl;
97 os << numRows * blockSize << " " << numCols * blockSize << " " << A.getGlobalNumEntries() * blockSize * blockSize << std::endl;
98 }
99
102 } else {
103 size_t numRows = rowMap->getLocalNumElements();
104
105 // Create source map
106 RCP<const map_type> allMeshGidsMap = rcp(new map_type(TOT::invalid(), numRows, A.getIndexBase(), comm));
107 // Create and populate vector of mesh GIDs corresponding to this pid's rows.
108 // This vector will be imported one pid's worth of information at a time to pid 0.
109 mv_type allMeshGids(allMeshGidsMap, 1);
110 Teuchos::ArrayRCP<GO> allMeshGidsData = allMeshGids.getDataNonConst(0);
111
112 for (size_t i = 0; i < numRows; i++)
113 allMeshGidsData[i] = rowMap->getGlobalElement(i);
114 allMeshGidsData = Teuchos::null;
115
116 // Now construct a RowMatrix on PE 0 by strip-mining the rows of the input matrix A.
117 size_t stripSize = allMeshGids.getGlobalLength() / numProcs;
118 size_t remainder = allMeshGids.getGlobalLength() % numProcs;
119 size_t curStart = 0;
120 size_t curStripSize = 0;
121 Teuchos::Array<GO> importMeshGidList;
122 for (size_t i = 0; i < numProcs; i++) {
123 if (myRank == 0) { // Only PE 0 does this part
125 if (i < remainder) curStripSize++; // handle leftovers
126 importMeshGidList.resize(curStripSize); // Set size of vector to max needed
127 for (size_t j = 0; j < curStripSize; j++) importMeshGidList[j] = j + curStart + A.getIndexBase();
129 }
130 // The following import map should be non-trivial only on PE 0.
132 std::runtime_error, "Tpetra::blockCrsMatrixWriter: (pid " << myRank << ") map size should be zero, but is " << curStripSize);
133 RCP<map_type> importMeshGidMap = rcp(new map_type(TOT::invalid(), importMeshGidList(), A.getIndexBase(), comm));
137
138 // importMeshGids now has a list of GIDs for the current strip of matrix rows.
139 // Use these values to build another importer that will get rows of the matrix.
140
141 // The following import map will be non-trivial only on PE 0.
142 Teuchos::ArrayRCP<const GO> importMeshGidsData = importMeshGids.getData(0);
143 Teuchos::Array<GO> importMeshGidsGO;
144 importMeshGidsGO.reserve(importMeshGidsData.size());
145 for (typename Teuchos::ArrayRCP<const GO>::size_type j = 0; j < importMeshGidsData.size(); ++j)
147 RCP<const map_type> importMap = rcp(new map_type(TOT::invalid(), importMeshGidsGO(), rowMap->getIndexBase(), comm));
148
149 import_type importer(rowMap, importMap);
150 size_t numEntriesPerRow = A.getCrsGraph().getGlobalMaxNumRowEntries();
152 RCP<const map_type> domainMap = A.getCrsGraph().getDomainMap();
153 graph->doImport(A.getCrsGraph(), importer, INSERT);
154 graph->fillComplete(domainMap, importMap);
155
156 block_crs_matrix_type importA(*graph, A.getBlockSize());
157 importA.doImport(A, importer, INSERT);
158
159 // Finally we are ready to write this strip of the matrix
161 }
162 }
163}
164
165template <class Scalar, class LO, class GO, class Node>
166void writeMatrixStrip(BlockCrsMatrix<Scalar, LO, GO, Node> const &A, std::ostream &os, Teuchos::ParameterList const &params) {
167 using Teuchos::RCP;
168 using map_type = Tpetra::Map<LO, GO, Node>;
170 using bcrs_local_inds_host_view_type = typename bcrs_type::local_inds_host_view_type;
171 using bcrs_values_host_view_type = typename bcrs_type::values_host_view_type;
172 using impl_scalar_type = typename bcrs_type::impl_scalar_type;
173
174 size_t numRows = A.getGlobalNumRows();
175 RCP<const map_type> rowMap = A.getRowMap();
176 RCP<const map_type> colMap = A.getColMap();
177 RCP<const Teuchos::Comm<int>> comm = rowMap->getComm();
178 const int myRank = comm->getRank();
179
180 const size_t meshRowOffset = rowMap->getIndexBase();
181 const size_t meshColOffset = colMap->getIndexBase();
183 std::runtime_error,
184 "Tpetra::writeMatrixStrip: "
185 "mesh row index base != mesh column index base");
186
187 if (myRank != 0) {
188 TEUCHOS_TEST_FOR_EXCEPTION(A.getLocalNumRows() != 0,
189 std::runtime_error, "Tpetra::writeMatrixStrip: pid " << myRank << " should have 0 rows but has " << A.getLocalNumRows());
190 TEUCHOS_TEST_FOR_EXCEPTION(A.getLocalNumCols() != 0,
191 std::runtime_error, "Tpetra::writeMatrixStrip: pid " << myRank << " should have 0 columns but has " << A.getLocalNumCols());
192
193 } else {
194 TEUCHOS_TEST_FOR_EXCEPTION(numRows != A.getLocalNumRows(),
195 std::runtime_error,
196 "Tpetra::writeMatrixStrip: "
197 "number of rows on pid 0 does not match global number of rows");
198
199 int err = 0;
200 const LO blockSize = A.getBlockSize();
201 const size_t numLocalRows = A.getLocalNumRows();
202 bool precisionChanged = false;
203 int oldPrecision = 0; // avoid "unused variable" warning
204 if (params.isParameter("precision")) {
205 oldPrecision = os.precision(params.get<int>("precision"));
206 precisionChanged = true;
207 }
208 int pointOffset = 1;
209 if (params.isParameter("zero-based indexing")) {
210 if (params.get<bool>("zero-based indexing") == true)
211 pointOffset = 0;
212 }
213
214 size_t localRowInd;
216 // Get a view of the current row.
219 LO numEntries;
220 A.getLocalRowView(localRowInd, localColInds, vals);
221 numEntries = localColInds.extent(0);
222 GO globalMeshRowID = rowMap->getGlobalElement(localRowInd) - meshRowOffset;
223
224 for (LO k = 0; k < numEntries; ++k) {
225 GO globalMeshColID = colMap->getGlobalElement(localColInds[k]) - meshColOffset;
226 const impl_scalar_type *curBlock = vals.data() + blockSize * blockSize * k;
227 // Blocks are stored in row-major format.
228 for (LO j = 0; j < blockSize; ++j) {
230 for (LO i = 0; i < blockSize; ++i) {
232 const impl_scalar_type curVal = curBlock[i + j * blockSize];
233
234 os << globalPointRowID << " " << globalPointColID << " ";
235 if (Teuchos::ScalarTraits<impl_scalar_type>::isComplex) {
236 // Matrix Market format wants complex values to be
237 // written as space-delimited pairs. See Bug 6469.
239 << Teuchos::ScalarTraits<impl_scalar_type>::imag(curVal);
240 } else {
241 os << curVal;
242 }
243 os << std::endl;
244 }
245 }
246 }
247 }
249 os.precision(oldPrecision);
251 std::runtime_error,
252 "Tpetra::writeMatrixStrip: "
253 "error getting view of local row "
254 << localRowInd);
255 }
256}
257
258template <class Scalar, class LO, class GO, class Node>
259Teuchos::RCP<Tpetra::CrsGraph<LO, GO, Node>>
261 TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::getBlockCrsGraph", getBlockCrsGraph0);
262 /*
263 ASSUMPTIONS:
264
265 1) In point matrix, all entries associated with a little block are present (even if they are zero).
266 2) For given mesh DOF, point DOFs appear consecutively and in ascending order in row & column maps.
267 3) Point column map and block column map are ordered consistently.
268 */
269
270 using Teuchos::Array;
271 using Teuchos::ArrayView;
272 using Teuchos::RCP;
273
274 typedef Tpetra::Map<LO, GO, Node> map_type;
275 typedef Tpetra::CrsGraph<LO, GO, Node> crs_graph_type;
276 typedef Tpetra::CrsMatrix<Scalar, LO, GO, Node> crs_matrix_type;
277
278 using local_graph_device_type = typename crs_matrix_type::local_graph_device_type;
279 using row_map_type = typename local_graph_device_type::row_map_type::non_const_type;
280 using entries_type = typename local_graph_device_type::entries_type::non_const_type;
281
282 using offset_type = typename row_map_type::non_const_value_type;
283
284 using execution_space = typename Node::execution_space;
285 using range_type = Kokkos::RangePolicy<execution_space, LO>;
286
287 const map_type &pointRowMap = *(pointMatrix.getRowMap());
289
290 const map_type &pointColMap = *(pointMatrix.getColMap());
291 const map_type &pointDomainMap = *(pointMatrix.getDomainMap());
292 const map_type &pointRangeMap = *(pointMatrix.getRangeMap());
293
294 {
295 TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::getBlockCrsGraph::createMeshMaps", getBlockCrsGraph1);
300 Kokkos::DefaultExecutionSpace().fence();
301 }
302
303 if (meshColMap.is_null()) throw std::runtime_error("ERROR: Cannot create mesh colmap");
304
305 auto localMeshColMap = meshColMap->getLocalMap();
306 auto localPointColMap = pointColMap.getLocalMap();
307 auto localPointRowMap = pointRowMap.getLocalMap();
308
310
311 const offset_type bs2 = blockSize * blockSize;
312
313 if (use_LID) {
314 TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::getBlockCrsGraph::LID", getBlockCrsGraph2);
315 auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
316 auto pointRowptr = pointLocalGraph.row_map;
317 auto pointColind = pointLocalGraph.entries;
318
320 std::runtime_error,
321 "Tpetra::getBlockCrsGraph: "
322 "local number of non zero entries is not a multiple of blockSize^2 ");
323
324 LO block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0) - 1) / blockSize;
325 row_map_type blockRowptr("blockRowptr", block_rows + 1);
326 entries_type blockColind("blockColind", pointColind.extent(0) / (bs2));
327
328 Kokkos::parallel_for(
329 "fillMesh", range_type(0, block_rows), KOKKOS_LAMBDA(const LO i) {
330 const LO offset_b = pointRowptr(i * blockSize) / bs2;
331 const LO offset_b_max = pointRowptr((i + 1) * blockSize) / bs2;
332
333 if (i == block_rows - 1)
336
337 const LO offset_p = pointRowptr(i * blockSize);
338
339 for (LO k = 0; k < offset_b_max - offset_b; ++k) {
341 }
342 });
343
346 Kokkos::DefaultExecutionSpace().fence();
347 } else {
348 TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::getBlockCrsGraph::GID", getBlockCrsGraph3);
349 auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
350 auto pointRowptr = pointLocalGraph.row_map;
351 auto pointColind = pointLocalGraph.entries;
352
354 std::runtime_error,
355 "Tpetra::getBlockCrsGraph: "
356 "local number of non zero entries is not a multiple of blockSize^2 ");
357
358 LO block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0) - 1) / blockSize;
359 row_map_type blockRowptr("blockRowptr", block_rows + 1);
360 entries_type blockColind("blockColind", pointColind.extent(0) / (bs2));
361
362 Kokkos::parallel_for(
363 "fillMesh", range_type(0, block_rows), KOKKOS_LAMBDA(const LO i) {
364 const LO offset_b = pointRowptr(i * blockSize) / bs2;
365 const LO offset_b_max = pointRowptr((i + 1) * blockSize) / bs2;
366
367 if (i == block_rows - 1)
370
371 const LO offset_p = pointRowptr(i * blockSize);
372 const LO offset_p_max = pointRowptr((i + 1) * blockSize);
373
374 LO filled_block = 0;
375 for (LO p_i = 0; p_i < offset_p_max - offset_p; ++p_i) {
376 auto bcol_GID = localPointColMap.getGlobalElement(pointColind(offset_p + p_i)) / blockSize;
377 auto bcol_LID = localMeshColMap.getLocalElement(bcol_GID);
378
379 bool visited = false;
380 for (LO k = 0; k < filled_block; ++k) {
381 if (blockColind(offset_b + k) == bcol_LID)
382 visited = true;
383 }
384 if (!visited) {
386 ++filled_block;
387 }
388 }
389 });
390
393 Kokkos::DefaultExecutionSpace().fence();
394 }
395
396 return meshCrsGraph;
397}
398
399template <class Scalar, class LO, class GO, class Node>
400Teuchos::RCP<BlockCrsMatrix<Scalar, LO, GO, Node>>
402 TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::convertToBlockCrsMatrix", convertToBlockCrsMatrix0);
403 /*
404 ASSUMPTIONS:
405
406 1) In point matrix, all entries associated with a little block are present (even if they are zero).
407 2) For given mesh DOF, point DOFs appear consecutively and in ascending order in row & column maps.
408 3) Point column map and block column map are ordered consistently.
409 */
410
411 using Teuchos::Array;
412 using Teuchos::ArrayView;
413 using Teuchos::RCP;
414
415 typedef Tpetra::BlockCrsMatrix<Scalar, LO, GO, Node> block_crs_matrix_type;
416 typedef Tpetra::CrsMatrix<Scalar, LO, GO, Node> crs_matrix_type;
417
418 using local_graph_device_type = typename crs_matrix_type::local_graph_device_type;
419 using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
420 using row_map_type = typename local_graph_device_type::row_map_type::non_const_type;
421 using values_type = typename local_matrix_device_type::values_type::non_const_type;
422
423 using offset_type = typename row_map_type::non_const_value_type;
424
425 using execution_space = typename Node::execution_space;
426 using range_type = Kokkos::RangePolicy<execution_space, LO>;
427
429
430 const offset_type bs2 = blockSize * blockSize;
431
433
434 if (use_LID) {
435 TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::convertToBlockCrsMatrix::LID", convertToBlockCrsMatrix1);
436 auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
437 auto pointRowptr = pointLocalGraph.row_map;
438 auto pointColind = pointLocalGraph.entries;
439
440 offset_type block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0) - 1) / blockSize;
441 values_type blockValues("values", meshCrsGraph->getLocalNumEntries() * bs2);
442 auto pointValues = pointMatrix.getLocalValuesDevice(Access::ReadOnly);
443 auto blockRowptr = meshCrsGraph->getLocalGraphDevice().row_map;
444
445 Kokkos::parallel_for(
446 "copyblockValues", range_type(0, block_rows), KOKKOS_LAMBDA(const LO i) {
447 const offset_type blkBeg = blockRowptr[i];
448 const offset_type numBlocks = blockRowptr[i + 1] - blkBeg;
449
450 // For each block in the row...
451 for (offset_type block = 0; block < numBlocks; block++) {
452 // For each entry in the block...
453 for (LO little_row = 0; little_row < blockSize; little_row++) {
455 for (LO little_col = 0; little_col < blockSize; little_col++) {
458 }
459 }
460 }
461 });
462 blockMatrix = rcp(new block_crs_matrix_type(*meshCrsGraph, blockValues, blockSize));
463 Kokkos::DefaultExecutionSpace().fence();
464 } else {
465 TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::convertToBlockCrsMatrix::GID", convertToBlockCrsMatrix2);
466 auto localMeshColMap = meshCrsGraph->getColMap()->getLocalMap();
467 auto localPointColMap = pointMatrix.getColMap()->getLocalMap();
468
469 values_type blockValues("values", meshCrsGraph->getLocalNumEntries() * bs2);
470 {
471 auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
472 auto pointRowptr = pointLocalGraph.row_map;
473 auto pointColind = pointLocalGraph.entries;
474
475 offset_type block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0) - 1) / blockSize;
476 auto pointValues = pointMatrix.getLocalValuesDevice(Access::ReadOnly);
477 auto blockRowptr = meshCrsGraph->getLocalGraphDevice().row_map;
478 auto blockColind = meshCrsGraph->getLocalGraphDevice().entries;
479
480 row_map_type pointGColind("pointGColind", pointColind.extent(0));
481
482 Kokkos::parallel_for(
483 "computePointGColind", range_type(0, pointColind.extent(0)), KOKKOS_LAMBDA(const LO i) {
484 pointGColind(i) = localPointColMap.getGlobalElement(pointColind(i));
485 });
486
487 row_map_type blockGColind("blockGColind", blockColind.extent(0));
488
489 Kokkos::parallel_for(
490 "computeBlockGColind", range_type(0, blockGColind.extent(0)), KOKKOS_LAMBDA(const LO i) {
491 blockGColind(i) = localMeshColMap.getGlobalElement(blockColind(i));
492 });
493
494 Kokkos::parallel_for(
495 "copyblockValues", range_type(0, block_rows), KOKKOS_LAMBDA(const LO i) {
496 const offset_type blkBeg = blockRowptr[i];
497 const offset_type numBlocks = blockRowptr[i + 1] - blkBeg;
498
499 for (offset_type point_i = 0; point_i < pointRowptr[i * blockSize + 1] - pointRowptr[i * blockSize]; point_i++) {
500 offset_type block_inv = static_cast<offset_type>(-1);
501 offset_type little_col_inv = static_cast<offset_type>(-1);
502 for (offset_type block_2 = 0; block_2 < numBlocks; block_2++) {
507 break;
508 }
509 }
510 if (block_inv != static_cast<offset_type>(-1))
511 break;
512 }
513
514 for (LO little_row = 0; little_row < blockSize; little_row++) {
516 }
517 }
518 });
519 }
520 blockMatrix = rcp(new block_crs_matrix_type(*meshCrsGraph, blockValues, blockSize));
521 Kokkos::DefaultExecutionSpace().fence();
522 }
523
524 return blockMatrix;
525}
526
527template <class Scalar, class LO, class GO, class Node>
528Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LO, GO, Node>>
531 using execution_space = typename Node::execution_space;
532 using dev_row_view_t = typename crs_t::local_graph_device_type::row_map_type::non_const_type;
533 using dev_col_view_t = typename crs_t::local_graph_device_type::entries_type::non_const_type;
534 using dev_val_view_t = typename crs_t::local_matrix_device_type::values_type::non_const_type;
535 using range_type = Kokkos::RangePolicy<execution_space, size_t>;
536 using team_policy = Kokkos::TeamPolicy<execution_space>;
537 using member_type = typename team_policy::member_type;
538 using scratch_view = Kokkos::View<bool *, typename execution_space::scratch_memory_space>;
539 using Ordinal = typename dev_row_view_t::non_const_value_type;
540
541 // Get structure / values
542 auto local = pointMatrix.getLocalMatrixDevice();
543 auto row_ptrs = local.graph.row_map;
544 auto col_inds = local.graph.entries;
545 auto values = local.values;
546 const auto nrows = pointMatrix.getLocalNumRows();
547
548 //
549 // Populate all active blocks, they must be fully populated with entries so fill with zeroes
550 //
551
552 // Make row workspace views
553 dev_row_view_t new_rowmap("new_rowmap", nrows + 1);
554 const auto blocks_per_row = nrows / blockSize; // assumes square matrix
555 dev_row_view_t active_block_row_map("active_block_row_map", blocks_per_row + 1);
556 const int max_threads = execution_space().concurrency();
557 assert(blockSize > 1);
558 assert(nrows % blockSize == 0);
559 const int mem_level = 1;
560 const int bytes = scratch_view::shmem_size(blocks_per_row);
561
562 if (max_threads >= blockSize) {
563 // Prefer 1 team per block since this will require a lot less scratch memory
564 team_policy tp(blocks_per_row, blockSize);
565
566 // Count active blocks
567 Kokkos::parallel_for(
568 "countActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type &team) {
569 Ordinal block_row = team.league_rank();
570
572 Kokkos::single(
573 Kokkos::PerTeam(team), [&]() {
576 }
577 });
578 team.team_barrier();
579
580 // All threads in a team scan a blocks-worth of rows to see which
581 // blocks are active
582 Kokkos::parallel_for(
583 Kokkos::TeamThreadRange(team, blockSize), [&](Ordinal block_offset) {
585 Ordinal row_itr = row_ptrs(row);
586 Ordinal row_end = row_ptrs(row + 1);
587
593 // This block has at least one nnz in this row
595 ++row_itr;
596 if (row_itr == row_end) break;
598 }
599 }
600 });
601
602 team.team_barrier();
603
604 Kokkos::single(
605 Kokkos::PerTeam(team), [&]() {
606 Ordinal count = 0;
609 ++count;
610 }
611 }
613 });
614 });
615 } else {
616 // We don't have enough parallelism to make a thread team handle a block, so just
617 // have 1 thread per block
618 team_policy tp(blocks_per_row, 1);
619
620 // Count active blocks
621 Kokkos::parallel_for(
622 "countActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type &team) {
623 Ordinal block_row = team.league_rank();
624
628 }
629
630 // One thread scans a blocks-worth of rows to see which blocks are active
631 for (int block_offset = 0; block_offset < blockSize; ++block_offset) {
633 Ordinal row_itr = row_ptrs(row);
634 Ordinal row_end = row_ptrs(row + 1);
635
641 // This block has at least one nnz in this row
643 ++row_itr;
644 if (row_itr == row_end) break;
646 }
647 }
648 }
649
650 Ordinal count = 0;
653 ++count;
654 }
655 }
657 });
658 }
659
661 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
662 execution_space>(active_block_row_map.extent(0), active_block_row_map, nnz_block_count);
664
665 // Find active blocks
666 if (max_threads >= blockSize) {
667 // Prefer 1 team per block since this will require a lot less scratch memory
668 team_policy tp(blocks_per_row, blockSize);
669
670 Kokkos::parallel_for(
671 "findActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type &team) {
672 Ordinal block_row = team.league_rank();
673
675 Kokkos::single(
676 Kokkos::PerTeam(team), [&]() {
679 }
680 });
681 team.team_barrier();
682
683 // All threads in a team scan a blocks-worth of rows to see which
684 // blocks are active
685 Kokkos::parallel_for(
686 Kokkos::TeamThreadRange(team, blockSize), [&](Ordinal block_offset) {
688 Ordinal row_itr = row_ptrs(row);
689 Ordinal row_end = row_ptrs(row + 1);
690
696 // This block has at least one nnz in this row
698 ++row_itr;
699 if (row_itr == row_end) break;
701 }
702 }
703 });
704
705 team.team_barrier();
706
707 Kokkos::single(
708 Kokkos::PerTeam(team), [&]() {
713 ++offset;
714 }
715 }
716 });
717 });
718 } else {
719 team_policy tp(blocks_per_row, 1);
720
721 Kokkos::parallel_for(
722 "findActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type &team) {
723 Ordinal block_row = team.league_rank();
724
728 }
729
730 // One thread scans a blocks-worth of rows to see which blocks are active
731 for (int block_offset = 0; block_offset < blockSize; ++block_offset) {
733 Ordinal row_itr = row_ptrs(row);
734 Ordinal row_end = row_ptrs(row + 1);
735
741 // This block has at least one nnz in this row
743 ++row_itr;
744 if (row_itr == row_end) break;
746 }
747 }
748 }
749
754 ++offset;
755 }
756 }
757 });
758 }
759
760 // Sizing
761 Kokkos::parallel_for(
762 "sizing", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
763 const auto block_row = row / blockSize;
767 new_rowmap(row) = row_nnz;
768 });
769
771 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
772 execution_space>(new_rowmap.extent(0), new_rowmap, new_nnz_count);
773 // Now populate cols and vals
776 Kokkos::parallel_for(
777 "entries", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
778 Ordinal row_itr = row_ptrs(row);
779 Ordinal row_end = row_ptrs(row + 1);
781
785
794 // Already a non-zero entry
796 ++row_itr;
797 }
798 }
799 }
800 });
801
802 // Create new, filled CRS
803 auto crs_row_map = pointMatrix.getRowMap();
804 auto crs_col_map = pointMatrix.getColMap();
805 Teuchos::RCP<crs_t> result = Teuchos::rcp(new crs_t(crs_row_map, crs_col_map, new_rowmap, new_col_ids, new_vals));
806 result->fillComplete();
807 return result;
808}
809
810template <class Scalar, class LO, class GO, class Node>
811Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LO, GO, Node>>
814 using dev_row_view_t = typename crs_t::local_graph_device_type::row_map_type::non_const_type;
815 using dev_col_view_t = typename crs_t::local_graph_device_type::entries_type::non_const_type;
816 using dev_val_view_t = typename crs_t::local_matrix_device_type::values_type::non_const_type;
817#if KOKKOS_VERSION >= 40799
818 using impl_scalar_t = typename KokkosKernels::ArithTraits<Scalar>::val_type;
819#else
820 using impl_scalar_t = typename Kokkos::ArithTraits<Scalar>::val_type;
821#endif
822#if KOKKOS_VERSION >= 40799
823 using STS = KokkosKernels::ArithTraits<impl_scalar_t>;
824#else
825 using STS = Kokkos::ArithTraits<impl_scalar_t>;
826#endif
827 using Ordinal = typename dev_row_view_t::non_const_value_type;
828 using execution_space = typename Node::execution_space;
829 using range_type = Kokkos::RangePolicy<execution_space, size_t>;
830
831 // Get structure / values
832 auto local = pointMatrix.getLocalMatrixHost();
833 auto row_ptrs = local.graph.row_map;
834 auto col_inds = local.graph.entries;
835 auto values = local.values;
836 const auto nrows = pointMatrix.getLocalNumRows();
837
838 // Sizing and rows
839 dev_row_view_t new_rowmap("new_rowmap", nrows + 1);
840 const impl_scalar_t zero = STS::zero();
841 Kokkos::parallel_for(
842 "sizing", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
843 const Ordinal row_nnz_begin = row_ptrs(row);
844 Ordinal row_nnzs = 0;
845 for (Ordinal row_nnz = row_nnz_begin; row_nnz < row_ptrs(row + 1); ++row_nnz) {
846 const impl_scalar_t value = values(row_nnz);
847 if (value != zero) {
848 ++row_nnzs;
849 }
850 }
851 new_rowmap(row) = row_nnzs;
852 });
853
854 Ordinal real_nnzs = 0;
855 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
856 execution_space>(new_rowmap.extent(0), new_rowmap, real_nnzs);
857 // Now populate cols and vals
858 dev_col_view_t new_col_ids("new_col_ids", real_nnzs);
859 dev_val_view_t new_vals("new_vals", real_nnzs);
860 Kokkos::parallel_for(
861 "entries", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
862 Ordinal row_nnzs = 0;
866 const impl_scalar_t value = values(old_row_nnz);
867 if (value != zero) {
870 ++row_nnzs;
871 }
872 }
873 });
874
875 // Create new, unfilled CRS
876 auto crs_row_map = pointMatrix.getRowMap();
877 auto crs_col_map = pointMatrix.getColMap();
878 Teuchos::RCP<crs_t> result = Teuchos::rcp(new crs_t(crs_row_map, crs_col_map, new_rowmap, new_col_ids, new_vals));
879 result->fillComplete();
880 return result;
881}
882
883template <class LO, class GO, class Node>
884Teuchos::RCP<const Tpetra::Map<LO, GO, Node>>
886 typedef Teuchos::OrdinalTraits<Tpetra::global_size_t> TOT;
887 typedef Tpetra::Map<LO, GO, Node> map_type;
888
889 if (use_LID) {
890 using execution_space = typename Node::execution_space;
891 using range_type = Kokkos::RangePolicy<execution_space, size_t>;
892
893 auto pointGlobalID = pointMap.getMyGlobalIndicesDevice();
894 LO block_rows = pointGlobalID.extent(0) / blockSize;
895 Kokkos::View<GO *, typename map_type::device_type> meshGlobalID("meshGlobalID", block_rows);
896
897 Kokkos::parallel_for(
898 "fillMeshMap", range_type(0, block_rows), KOKKOS_LAMBDA(const LO i) {
900 });
901
902 Teuchos::RCP<const map_type> meshMap = Teuchos::rcp(new map_type(TOT::invalid(), meshGlobalID, 0, pointMap.getComm()));
903 return meshMap;
904 } else {
905 // calculate mesh GIDs
906 Teuchos::ArrayView<const GO> pointGids = pointMap.getLocalElementList();
907 Teuchos::Array<GO> meshGids;
908 GO indexBase = pointMap.getIndexBase();
909
910 // Use hash table to track whether we've encountered this GID previously. This will happen
911 // when striding through the point DOFs in a block. It should not happen otherwise.
912 // I don't use sort/make unique because I don't want to change the ordering.
913 meshGids.reserve(pointGids.size());
915 for (int i = 0; i < pointGids.size(); ++i) {
917 if (hashTable.get(meshGid) == -1) {
918 hashTable.add(meshGid, 1); //(key,value)
919 meshGids.push_back(meshGid);
920 }
921 }
922
923 Teuchos::RCP<const map_type> meshMap = Teuchos::rcp(new map_type(TOT::invalid(), meshGids(), 0, pointMap.getComm()));
924 return meshMap;
925 }
926}
927
928template <class LO, class GO, class Node>
929Teuchos::RCP<const Tpetra::Map<LO, GO, Node>>
931 typedef Teuchos::OrdinalTraits<Tpetra::global_size_t> TOT;
932 typedef Tpetra::Map<LO, GO, Node> map_type;
933
934 // calculate mesh GIDs
935 Teuchos::ArrayView<const GO> blockGids = blockMap.getLocalElementList();
936 Teuchos::Array<GO> pointGids(blockGids.size() * blockSize);
937 GO indexBase = blockMap.getIndexBase();
938
939 for (LO i = 0, ct = 0; i < (LO)blockGids.size(); i++) {
941 for (LO j = 0; j < blockSize; j++, ct++)
942 pointGids[i * blockSize + j] = base + j;
943 }
944
945 Teuchos::RCP<const map_type> pointMap = Teuchos::rcp(new map_type(TOT::invalid(), pointGids(), 0, blockMap.getComm()));
946 return pointMap;
947}
948
949template <class Scalar, class LO, class GO, class Node>
950Teuchos::RCP<CrsMatrix<Scalar, LO, GO, Node>>
952 using Teuchos::Array;
953 using Teuchos::ArrayView;
954 using Teuchos::RCP;
955
956 typedef Tpetra::BlockCrsMatrix<Scalar, LO, GO, Node> block_crs_matrix_type;
957 typedef Tpetra::Map<LO, GO, Node> map_type;
958 typedef Tpetra::CrsMatrix<Scalar, LO, GO, Node> crs_matrix_type;
959
960 using crs_graph_type = typename block_crs_matrix_type::crs_graph_type;
961 using local_graph_device_type = typename crs_matrix_type::local_graph_device_type;
962 using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
963 using row_map_type = typename local_graph_device_type::row_map_type::non_const_type;
964 using entries_type = typename local_graph_device_type::entries_type::non_const_type;
965 using values_type = typename local_matrix_device_type::values_type::non_const_type;
966
967 using row_map_type_const = typename local_graph_device_type::row_map_type;
968 using entries_type_const = typename local_graph_device_type::entries_type;
969
970 using little_block_type = typename block_crs_matrix_type::const_little_block_type;
971 using offset_type = typename row_map_type::non_const_value_type;
972 using column_type = typename entries_type::non_const_value_type;
973
974 using execution_space = typename Node::execution_space;
975 using range_type = Kokkos::RangePolicy<execution_space, LO>;
976
977 LO blocksize = blockMatrix.getBlockSize();
978 const offset_type bs2 = blocksize * blocksize;
979 size_t block_nnz = blockMatrix.getLocalNumEntries();
980 size_t point_nnz = block_nnz * bs2;
981
982 // We can get these from the blockMatrix directly
985
986 // We need to generate the row/col Map ourselves.
989
992
993 // Get the last few things
994
995 const crs_graph_type &blockGraph = blockMatrix.getCrsGraph();
996 LO point_rows = (LO)pointRowMap->getLocalNumElements();
997 LO block_rows = (LO)blockRowMap->getLocalNumElements();
998 auto blockValues = blockMatrix.getValuesDevice();
999 auto blockLocalGraph = blockGraph.getLocalGraphDevice();
1002
1003 // Generate the point matrix rowptr / colind / values
1004 row_map_type rowptr("row_map", point_rows + 1);
1005 entries_type colind("entries", point_nnz);
1006 values_type values("values", point_nnz);
1007
1008 // Pre-fill the rowptr
1009 Kokkos::parallel_for(
1010 "fillRowPtr", range_type(0, block_rows), KOKKOS_LAMBDA(const LO i) {
1011 offset_type base = blockRowptr[i];
1012 offset_type diff = blockRowptr[i + 1] - base;
1013 for (LO j = 0; j < blocksize; j++) {
1014 rowptr[i * blocksize + j] = base * bs2 + j * diff * blocksize;
1015 }
1016
1017 // Fill the last guy, if we're on the final entry
1018 if (i == block_rows - 1) {
1019 rowptr[block_rows * blocksize] = blockRowptr[i + 1] * bs2;
1020 }
1021 });
1022
1023 Kokkos::parallel_for(
1024 "copyEntriesAndValues", range_type(0, block_rows), KOKKOS_LAMBDA(const LO i) {
1025 const offset_type blkBeg = blockRowptr[i];
1026 const offset_type numBlocks = blockRowptr[i + 1] - blkBeg;
1027
1028 // For each block in the row...
1029 for (offset_type block = 0; block < numBlocks; block++) {
1031 little_block_type my_block(blockValues.data() + (blkBeg + block) * bs2, blocksize, blocksize);
1032
1033 // For each entry in the block...
1034 for (LO little_row = 0; little_row < blocksize; little_row++) {
1035 offset_type point_row_offset = rowptr[i * blocksize + little_row];
1036 for (LO little_col = 0; little_col < blocksize; little_col++) {
1039 }
1040 }
1041 }
1042 });
1043
1044 // Generate the matrix
1046 pointCrsMatrix->setAllValues(rowptr, colind, values);
1047
1048 // FUTURE OPTIMIZATION: Directly compute import/export, rather than letting ESFC do it
1049 pointCrsMatrix->expertStaticFillComplete(pointDomainMap, pointRangeMap);
1050
1051 return pointCrsMatrix;
1052}
1053
1054} // namespace Tpetra
1055
1056//
1057// Explicit instantiation macro for blockCrsMatrixWriter (various
1058// overloads), writeMatrixStrip, and convertToBlockCrsMatrix.
1059//
1060// Must be expanded from within the Tpetra namespace!
1061//
1062#define TPETRA_BLOCKCRSMATRIX_HELPERS_INSTANT(S, LO, GO, NODE) \
1063 template void blockCrsMatrixWriter(BlockCrsMatrix<S, LO, GO, NODE> const &A, std::string const &fileName); \
1064 template void blockCrsMatrixWriter(BlockCrsMatrix<S, LO, GO, NODE> const &A, std::string const &fileName, Teuchos::ParameterList const &params); \
1065 template void blockCrsMatrixWriter(BlockCrsMatrix<S, LO, GO, NODE> const &A, std::ostream &os); \
1066 template void blockCrsMatrixWriter(BlockCrsMatrix<S, LO, GO, NODE> const &A, std::ostream &os, Teuchos::ParameterList const &params); \
1067 template void writeMatrixStrip(BlockCrsMatrix<S, LO, GO, NODE> const &A, std::ostream &os, Teuchos::ParameterList const &params); \
1068 template Teuchos::RCP<BlockCrsMatrix<S, LO, GO, NODE>> convertToBlockCrsMatrix(const CrsMatrix<S, LO, GO, NODE> &pointMatrix, const LO &blockSize, bool use_LID); \
1069 template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE>> fillLogicalBlocks(const CrsMatrix<S, LO, GO, NODE> &pointMatrix, const LO &blockSize); \
1070 template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE>> unfillFormerBlockCrs(const CrsMatrix<S, LO, GO, NODE> &pointMatrix); \
1071 template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE>> convertToCrsMatrix(const BlockCrsMatrix<S, LO, GO, NODE> &blockMatrix); \
1072 template Teuchos::RCP<CrsGraph<LO, GO, NODE>> getBlockCrsGraph(const Tpetra::CrsMatrix<S, LO, GO, NODE> &pointMatrix, const LO &blockSize, bool use_local_ID);
1073
1074//
1075// Explicit instantiation macro for createMeshMap / createPointMap
1076//
1077#define TPETRA_CREATEMESHMAP_INSTANT(LO, GO, NODE) \
1078 template Teuchos::RCP<const Map<LO, GO, NODE>> createMeshMap(const LO &blockSize, const Map<LO, GO, NODE> &pointMap, bool use_local_ID); \
1079 template Teuchos::RCP<const Map<LO, GO, NODE>> createPointMap(const LO &blockSize, const Map<LO, GO, NODE> &blockMap);
1080
1081#endif // TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
Struct that holds views of the contents of a CrsMatrix.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Teuchos::RCP< Tpetra::CrsMatrix< Scalar, LO, GO, Node > > fillLogicalBlocks(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize)
Fill all point entries in a logical block of a CrsMatrix with zeroes. This should be called before co...
Teuchos::RCP< Tpetra::CrsMatrix< Scalar, LO, GO, Node > > unfillFormerBlockCrs(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix)
Unfill all point entries in a logical block of a CrsMatrix with zeroes. This can be called after conv...
void writeMatrixStrip(BlockCrsMatrix< Scalar, LO, GO, Node > const &A, std::ostream &os, Teuchos::ParameterList const &params)
Helper function called by blockCrsMatrixWriter.
Teuchos::RCP< Tpetra::CrsGraph< LO, GO, Node > > getBlockCrsGraph(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize, bool use_local_ID=true)
Non-member constructor that creates the CrsGraph of a BlockCrsMatrix from an existing point CrsMatrix...
Teuchos::RCP< const Tpetra::Map< LO, GO, Node > > createPointMap(LO const &blockSize, const Tpetra::Map< LO, GO, Node > &blockMap)
Helper function to generate a point map from a block map (with a given block size) GIDs associated wi...
Teuchos::RCP< CrsGraph< LocalOrdinal, GlobalOrdinal, Node > > createCrsGraph(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, size_t maxNumEntriesPerRow=0, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Nonmember function to create an empty CrsGraph given a row Map and the max number of entries allowed ...
Teuchos::RCP< CrsMatrix< Scalar, LO, GO, Node > > convertToCrsMatrix(const Tpetra::BlockCrsMatrix< Scalar, LO, GO, Node > &blockMatrix)
Non-member constructor that creates a point CrsMatrix from an existing BlockCrsMatrix.
Teuchos::RCP< const Tpetra::Map< LO, GO, Node > > createMeshMap(LO const &blockSize, const Tpetra::Map< LO, GO, Node > &pointMap, bool use_local_ID=false)
Helper function to generate a mesh map from a point map. Important! It's assumed that point GIDs asso...
void blockCrsMatrixWriter(BlockCrsMatrix< Scalar, LO, GO, Node > const &A, std::string const &fileName)
Helper function to write a BlockCrsMatrix. Calls the 3-argument version.
@ INSERT
Insert new values that don't currently exist.
Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > convertToBlockCrsMatrix(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize, bool use_local_ID=true)
Non-member constructor that creates a BlockCrsMatrix from an existing point CrsMatrix.