Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_MatrixMatrix_Cuda.hpp
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
11#define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
12
13#include "Tpetra_Details_IntRowPtrHelper.hpp"
14
15#ifdef HAVE_TPETRA_INST_CUDA
16namespace Tpetra {
17namespace MMdetails {
18
19/*********************************************************************************************************/
20// MMM KernelWrappers for Partial Specialization to CUDA
21template <class Scalar,
22 class LocalOrdinal,
23 class GlobalOrdinal,
24 class LocalOrdinalViewType>
25struct KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType> {
26 static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
27 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
28 const LocalOrdinalViewType& Acol2Brow,
29 const LocalOrdinalViewType& Acol2Irow,
30 const LocalOrdinalViewType& Bcol2Ccol,
31 const LocalOrdinalViewType& Icol2Ccol,
32 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
33 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
34 const std::string& label = std::string(),
35 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
36
37 static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
38 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
39 const LocalOrdinalViewType& Acol2Brow,
40 const LocalOrdinalViewType& Acol2Irow,
41 const LocalOrdinalViewType& Bcol2Ccol,
42 const LocalOrdinalViewType& Icol2Ccol,
43 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
44 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
45 const std::string& label = std::string(),
46 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
47};
48
49// Jacobi KernelWrappers for Partial Specialization to Cuda
50template <class Scalar,
51 class LocalOrdinal,
52 class GlobalOrdinal, class LocalOrdinalViewType>
53struct KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType> {
54 static inline void jacobi_A_B_newmatrix_kernel_wrapper(typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
55 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
56 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
57 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
58 const LocalOrdinalViewType& Acol2Brow,
59 const LocalOrdinalViewType& Acol2Irow,
60 const LocalOrdinalViewType& Bcol2Ccol,
61 const LocalOrdinalViewType& Icol2Ccol,
62 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
63 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
64 const std::string& label = std::string(),
65 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
66
67 static inline void jacobi_A_B_reuse_kernel_wrapper(typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
68 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
69 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
70 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
71 const LocalOrdinalViewType& Acol2Brow,
72 const LocalOrdinalViewType& Acol2Irow,
73 const LocalOrdinalViewType& Bcol2Ccol,
74 const LocalOrdinalViewType& Icol2Ccol,
75 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
76 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
77 const std::string& label = std::string(),
78 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
79
80 static inline void jacobi_A_B_newmatrix_KokkosKernels(typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
81 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
82 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
83 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
84 const LocalOrdinalViewType& Acol2Brow,
85 const LocalOrdinalViewType& Acol2Irow,
86 const LocalOrdinalViewType& Bcol2Ccol,
87 const LocalOrdinalViewType& Icol2Ccol,
88 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
89 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
90 const std::string& label = std::string(),
91 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
92};
93
94/*********************************************************************************************************/
95// AB NewMatrix Kernel wrappers (KokkosKernels/CUDA Version)
96template <class Scalar,
97 class LocalOrdinal,
98 class GlobalOrdinal,
99 class LocalOrdinalViewType>
100void KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
101 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
102 const LocalOrdinalViewType& Acol2Brow,
103 const LocalOrdinalViewType& Acol2Irow,
104 const LocalOrdinalViewType& Bcol2Ccol,
105 const LocalOrdinalViewType& Icol2Ccol,
106 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
107 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
108 const std::string& label,
109 const Teuchos::RCP<Teuchos::ParameterList>& params) {
110 using Teuchos::RCP;
111 using Teuchos::rcp;
112 RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaWrapper"));
113
114 // Node-specific code
115 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
116 std::string nodename("Cuda");
117
118 // Lots and lots of typedefs
120 typedef typename KCRS::device_type device_t;
121 typedef typename KCRS::StaticCrsGraphType graph_t;
122 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
123 using int_view_t = Kokkos::View<int*, typename lno_view_t::array_layout, typename lno_view_t::memory_space, typename lno_view_t::memory_traits>;
124 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
125 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
126 typedef typename KCRS::values_type::non_const_type scalar_view_t;
127 // typedef typename graph_t::row_map_type::const_type lno_view_t_const;
128
129 // Options
130 int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
131 std::string myalg("SPGEMM_KK_MEMORY");
132 if (!params.is_null()) {
133 if (params->isParameter("cuda: algorithm"))
134 myalg = params->get("cuda: algorithm", myalg);
135 if (params->isParameter("cuda: team work size"))
136 team_work_size = params->get("cuda: team work size", team_work_size);
137 }
138
139 // KokkosKernelsHandle
140 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
141 typename lno_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
142 typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>
143 KernelHandle;
144 using IntKernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
145 typename int_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
146 typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>;
147
148 // Grab the Kokkos::SparseCrsMatrices
149 const KCRS Amat = Aview.origMatrix->getLocalMatrixDevice();
150 const KCRS Bmat = Bview.origMatrix->getLocalMatrixDevice();
151
152 c_lno_view_t Arowptr = Amat.graph.row_map,
153 Browptr = Bmat.graph.row_map;
154 const lno_nnz_view_t Acolind = Amat.graph.entries,
155 Bcolind = Bmat.graph.entries;
156 const scalar_view_t Avals = Amat.values,
157 Bvals = Bmat.values;
158
159 // Get the algorithm mode
160 std::string alg = nodename + std::string(" algorithm");
161 // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
162 if (!params.is_null() && params->isParameter(alg)) myalg = params->get(alg, myalg);
163 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
164
165 // Merge the B and Bimport matrices
166 KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C.getColMap()->getLocalNumElements());
167
168 // if Kokkos Kernels is going to use the cuSparse TPL for SpGEMM, this matrix
169 // needs to be sorted
170 // Try to mirror the Kokkos Kernels internal SpGEMM TPL use logic
171 // inspired by https://github.com/trilinos/Trilinos/pull/11709
172 // and https://github.com/kokkos/kokkos-kernels/pull/2008
173#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && ((CUDA_VERSION < 11000) || (CUDA_VERSION >= 11040))
174 if constexpr (std::is_same_v<typename device_t::execution_space, Kokkos::Cuda>) {
175 if (!KokkosSparse::isCrsGraphSorted(Bmerged.graph.row_map, Bmerged.graph.entries)) {
176 Import_Util::sortCrsMatrix(Bmerged);
177 }
178 }
179#endif
180
181 MM = Teuchos::null;
182 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaCore"));
183
184 // Do the multiply on whatever we've got
185 typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
186 typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
187 typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
188
189 // regardless of whether integer row ptrs are used, need to ultimately produce the row pointer, entries, and values of the expected types
190 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lno_row"), AnumRows + 1);
191 lno_nnz_view_t entriesC;
192 scalar_view_t valuesC;
193
194 Tpetra::Details::IntRowPtrHelper<decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
195 const bool useIntRowptrs =
196 irph.shouldUseIntRowptrs() &&
197 Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
198
199 if (useIntRowptrs) {
200 IntKernelHandle kh;
201 kh.create_spgemm_handle(alg_enum);
202 kh.set_team_work_size(team_work_size);
203
204 int_view_t int_row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_int_row"), AnumRows + 1);
205
206 auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
207 auto Bint = irph.getIntRowptrMatrix(Bmerged);
208
209 {
210 Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: Newmatrix KokkosKernels symbolic int");
211 KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols, Aint.graph.row_map, Aint.graph.entries, false, Bint.graph.row_map, Bint.graph.entries, false, int_row_mapC);
212 }
213
214 Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: Newmatrix KokkosKernels numeric int");
215
216 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
217 if (c_nnz_size) {
218 entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
219 valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
220 }
221 KokkosSparse::spgemm_numeric(&kh, AnumRows, BnumRows, BnumCols, Aint.graph.row_map, Aint.graph.entries, Aint.values, false, Bint.graph.row_map, Bint.graph.entries, Bint.values, false, int_row_mapC, entriesC, valuesC);
222 // transfer the integer rowptrs back to the correct rowptr type
223 Kokkos::parallel_for(
224 int_row_mapC.size(), KOKKOS_LAMBDA(int i) { row_mapC(i) = int_row_mapC(i); });
225 kh.destroy_spgemm_handle();
226
227 } else {
228 KernelHandle kh;
229 kh.create_spgemm_handle(alg_enum);
230 kh.set_team_work_size(team_work_size);
231
232 {
233 Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: Newmatrix KokkosKernels symbolic non-int");
234 KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols, Amat.graph.row_map, Amat.graph.entries, false, Bmerged.graph.row_map, Bmerged.graph.entries, false, row_mapC);
235 }
236
237 Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: Newmatrix KokkosKernels numeric non-int");
238 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
239 if (c_nnz_size) {
240 entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
241 valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
242 }
243
244 KokkosSparse::spgemm_numeric(&kh, AnumRows, BnumRows, BnumCols, Amat.graph.row_map, Amat.graph.entries, Amat.values, false, Bmerged.graph.row_map, Bmerged.graph.entries, Bmerged.values, false, row_mapC, entriesC, valuesC);
245
246 kh.destroy_spgemm_handle();
247 }
248
249 MM = Teuchos::null;
250 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaSort"));
251
252 // Sort & set values
253 if (params.is_null() || params->get("sort entries", true))
254 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
255 C.setAllValues(row_mapC, entriesC, valuesC);
256
257 MM = Teuchos::null;
258 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaESFC"));
259
260 // Final Fillcomplete
261 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
262 labelList->set("Timer Label", label);
263 if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
264 RCP<const Export<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
265 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
266}
267
268/*********************************************************************************************************/
269template <class Scalar,
270 class LocalOrdinal,
271 class GlobalOrdinal,
272 class LocalOrdinalViewType>
273void KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
274 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
275 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
276 const LocalOrdinalViewType& targetMapToOrigRow_dev,
277 const LocalOrdinalViewType& targetMapToImportRow_dev,
278 const LocalOrdinalViewType& Bcol2Ccol_dev,
279 const LocalOrdinalViewType& Icol2Ccol_dev,
280 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
281 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
282 const std::string& label,
283 const Teuchos::RCP<Teuchos::ParameterList>& params) {
284 // FIXME: Right now, this is a cut-and-paste of the serial kernel
285 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
286
287 using Teuchos::RCP;
288 using Teuchos::rcp;
289
290 // By default, if A*B results in an entry that is not supported by the graph of C, we throw.
291 // This option allows to override this behavior and silently ignores such entries.
292 bool throwOnInsert = true;
293 if (!params.is_null() && params->isType<bool>("MM Throw For Non-Existent Entries"))
294 throwOnInsert = params->get<bool>("MM Throw For Non-Existent Entries");
295
296 RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Reuse SerialCore"));
297
298 // Lots and lots of typedefs
299 typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
300 typedef typename KCRS::StaticCrsGraphType graph_t;
301 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
302 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
303 typedef typename KCRS::values_type::non_const_type scalar_view_t;
304
305 typedef Scalar SC;
306 typedef LocalOrdinal LO;
307 typedef GlobalOrdinal GO;
308 typedef Node NO;
309 typedef Map<LO, GO, NO> map_type;
310 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
311 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
312 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
313
314 // Since this is being run on Cuda, we need to fence because the below code will use UVM
315 // typename graph_t::execution_space().fence();
316
317 // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
318 // KDDKDD UVM Ideally, this function would run on device and use
319 // KDDKDD UVM KokkosKernels instead of this host implementation.
320 auto targetMapToOrigRow =
321 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
322 targetMapToOrigRow_dev);
323 auto targetMapToImportRow =
324 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
325 targetMapToImportRow_dev);
326 auto Bcol2Ccol =
327 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
328 Bcol2Ccol_dev);
329 auto Icol2Ccol =
330 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
331 Icol2Ccol_dev);
332
333 // Sizes
334 RCP<const map_type> Ccolmap = C.getColMap();
335 size_t m = Aview.origMatrix->getLocalNumRows();
336 size_t n = Ccolmap->getLocalNumElements();
337
338 // Grab the Kokkos::SparseCrsMatrices & inner stuff
339 const KCRS Amat = Aview.origMatrix->getLocalMatrixHost();
340 const KCRS Bmat = Bview.origMatrix->getLocalMatrixHost();
341 const KCRS Cmat = C.getLocalMatrixHost();
342
343 c_lno_view_t Arowptr = Amat.graph.row_map,
344 Browptr = Bmat.graph.row_map,
345 Crowptr = Cmat.graph.row_map;
346 const lno_nnz_view_t Acolind = Amat.graph.entries,
347 Bcolind = Bmat.graph.entries,
348 Ccolind = Cmat.graph.entries;
349 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
350 scalar_view_t Cvals = Cmat.values;
351
352 c_lno_view_t Irowptr;
353 lno_nnz_view_t Icolind;
354 scalar_view_t Ivals;
355 if (!Bview.importMatrix.is_null()) {
356 auto lclB = Bview.importMatrix->getLocalMatrixHost();
357 Irowptr = lclB.graph.row_map;
358 Icolind = lclB.graph.entries;
359 Ivals = lclB.values;
360 }
361
362 // Classic csr assembly (low memory edition)
363 // mfh 27 Sep 2016: The c_status array is an implementation detail
364 // of the local sparse matrix-matrix multiply routine.
365
366 // The status array will contain the index into colind where this entry was last deposited.
367 // c_status[i] < CSR_ip - not in the row yet
368 // c_status[i] >= CSR_ip - this is the entry where you can find the data
369 // We start with this filled with INVALID's indicating that there are no entries yet.
370 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
371 std::vector<size_t> c_status(n, ST_INVALID);
372
373 // For each row of A/C
374 size_t CSR_ip = 0, OLD_ip = 0;
375 for (size_t i = 0; i < m; i++) {
376 // First fill the c_status array w/ locations where we're allowed to
377 // generate nonzeros for this row
378 OLD_ip = Crowptr[i];
379 CSR_ip = Crowptr[i + 1];
380 for (size_t k = OLD_ip; k < CSR_ip; k++) {
381 c_status[Ccolind[k]] = k;
382
383 // Reset values in the row of C
384 Cvals[k] = SC_ZERO;
385 }
386
387 for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
388 LO Aik = Acolind[k];
389 const SC Aval = Avals[k];
390 if (Aval == SC_ZERO)
391 continue;
392
393 if (targetMapToOrigRow[Aik] != LO_INVALID) {
394 // Local matrix
395 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
396
397 for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
398 LO Bkj = Bcolind[j];
399 LO Cij = Bcol2Ccol[Bkj];
400
401 const bool badInsert = c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip;
402 if (!badInsert)
403 Cvals[c_status[Cij]] += Aval * Bvals[j];
404 else if (throwOnInsert)
405 TEUCHOS_TEST_FOR_EXCEPTION(badInsert,
406 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
407 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
408 }
409
410 } else {
411 // Remote matrix
412 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
413 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
414 LO Ikj = Icolind[j];
415 LO Cij = Icol2Ccol[Ikj];
416
417 const bool badInsert = c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip;
418 if (!badInsert)
419 Cvals[c_status[Cij]] += Aval * Ivals[j];
420 else if (throwOnInsert)
421 TEUCHOS_TEST_FOR_EXCEPTION(badInsert,
422 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
423 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
424 }
425 }
426 }
427 }
428
429 C.fillComplete(C.getDomainMap(), C.getRangeMap());
430}
431
432/*********************************************************************************************************/
433template <class Scalar,
434 class LocalOrdinal,
435 class GlobalOrdinal,
436 class LocalOrdinalViewType>
437void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
438 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
439 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
440 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
441 const LocalOrdinalViewType& Acol2Brow,
442 const LocalOrdinalViewType& Acol2Irow,
443 const LocalOrdinalViewType& Bcol2Ccol,
444 const LocalOrdinalViewType& Icol2Ccol,
445 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
446 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
447 const std::string& label,
448 const Teuchos::RCP<Teuchos::ParameterList>& params) {
449 // Node-specific code
450 using Teuchos::RCP;
451 using Teuchos::rcp;
452 RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Jacobi CudaWrapper"));
453
454 // Options
455 // int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer // unreferenced
456 std::string myalg("KK");
457 if (!params.is_null()) {
458 if (params->isParameter("cuda: jacobi algorithm"))
459 myalg = params->get("cuda: jacobi algorithm", myalg);
460 }
461
462 if (myalg == "MSAK") {
463 ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega, Dinv, Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
464 } else if (myalg == "KK") {
465 jacobi_A_B_newmatrix_KokkosKernels(omega, Dinv, Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
466 } else {
467 throw std::runtime_error("Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
468 }
469
470 MM = Teuchos::null;
471 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix CudaESFC"));
472
473 // Final Fillcomplete
474 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
475 labelList->set("Timer Label", label);
476 if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
477
478 // NOTE: MSAK already fillCompletes, so we have to check here
479 if (!C.isFillComplete()) {
480 RCP<const Export<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
481 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
482 }
483}
484
485/*********************************************************************************************************/
486template <class Scalar,
487 class LocalOrdinal,
488 class GlobalOrdinal,
489 class LocalOrdinalViewType>
490void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
491 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
492 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
493 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
494 const LocalOrdinalViewType& targetMapToOrigRow_dev,
495 const LocalOrdinalViewType& targetMapToImportRow_dev,
496 const LocalOrdinalViewType& Bcol2Ccol_dev,
497 const LocalOrdinalViewType& Icol2Ccol_dev,
498 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
499 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
500 const std::string& label,
501 const Teuchos::RCP<Teuchos::ParameterList>& params) {
502 // FIXME: Right now, this is a cut-and-paste of the serial kernel
503 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
504
505 using Teuchos::RCP;
506 using Teuchos::rcp;
507 RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Reuse CudaCore"));
508
509 // Lots and lots of typedefs
510 typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
511 typedef typename KCRS::StaticCrsGraphType graph_t;
512 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
513 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
514 typedef typename KCRS::values_type::non_const_type scalar_view_t;
515 typedef typename scalar_view_t::memory_space scalar_memory_space;
516
517 typedef Scalar SC;
518 typedef LocalOrdinal LO;
519 typedef GlobalOrdinal GO;
520 typedef Node NO;
521 typedef Map<LO, GO, NO> map_type;
522 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
523 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
524 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
525
526 // Since this is being run on Cuda, we need to fence because the below host code will use UVM
527 // KDDKDD typename graph_t::execution_space().fence();
528
529 // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
530 // KDDKDD UVM Ideally, this function would run on device and use
531 // KDDKDD UVM KokkosKernels instead of this host implementation.
532 auto targetMapToOrigRow =
533 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
534 targetMapToOrigRow_dev);
535 auto targetMapToImportRow =
536 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
537 targetMapToImportRow_dev);
538 auto Bcol2Ccol =
539 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
540 Bcol2Ccol_dev);
541 auto Icol2Ccol =
542 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
543 Icol2Ccol_dev);
544
545 // Sizes
546 RCP<const map_type> Ccolmap = C.getColMap();
547 size_t m = Aview.origMatrix->getLocalNumRows();
548 size_t n = Ccolmap->getLocalNumElements();
549
550 // Grab the Kokkos::SparseCrsMatrices & inner stuff
551 const KCRS Amat = Aview.origMatrix->getLocalMatrixHost();
552 const KCRS Bmat = Bview.origMatrix->getLocalMatrixHost();
553 const KCRS Cmat = C.getLocalMatrixHost();
554
555 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
556 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
557 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
558 scalar_view_t Cvals = Cmat.values;
559
560 c_lno_view_t Irowptr;
561 lno_nnz_view_t Icolind;
562 scalar_view_t Ivals;
563 if (!Bview.importMatrix.is_null()) {
564 auto lclB = Bview.importMatrix->getLocalMatrixHost();
565 Irowptr = lclB.graph.row_map;
566 Icolind = lclB.graph.entries;
567 Ivals = lclB.values;
568 }
569
570 // Jacobi-specific inner stuff
571 auto Dvals =
572 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
573
574 // The status array will contain the index into colind where this entry was last deposited.
575 // c_status[i] < CSR_ip - not in the row yet
576 // c_status[i] >= CSR_ip - this is the entry where you can find the data
577 // We start with this filled with INVALID's indicating that there are no entries yet.
578 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
579 std::vector<size_t> c_status(n, ST_INVALID);
580
581 // For each row of A/C
582 size_t CSR_ip = 0, OLD_ip = 0;
583 for (size_t i = 0; i < m; i++) {
584 // First fill the c_status array w/ locations where we're allowed to
585 // generate nonzeros for this row
586 OLD_ip = Crowptr[i];
587 CSR_ip = Crowptr[i + 1];
588 for (size_t k = OLD_ip; k < CSR_ip; k++) {
589 c_status[Ccolind[k]] = k;
590
591 // Reset values in the row of C
592 Cvals[k] = SC_ZERO;
593 }
594
595 SC minusOmegaDval = -omega * Dvals(i, 0);
596
597 // Entries of B
598 for (size_t j = Browptr[i]; j < Browptr[i + 1]; j++) {
599 Scalar Bval = Bvals[j];
600 if (Bval == SC_ZERO)
601 continue;
602 LO Bij = Bcolind[j];
603 LO Cij = Bcol2Ccol[Bij];
604
605 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
606 std::runtime_error, "Trying to insert a new entry into a static graph");
607
608 Cvals[c_status[Cij]] = Bvals[j];
609 }
610
611 // Entries of -omega * Dinv * A * B
612 for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
613 LO Aik = Acolind[k];
614 const SC Aval = Avals[k];
615 if (Aval == SC_ZERO)
616 continue;
617
618 if (targetMapToOrigRow[Aik] != LO_INVALID) {
619 // Local matrix
620 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
621
622 for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
623 LO Bkj = Bcolind[j];
624 LO Cij = Bcol2Ccol[Bkj];
625
626 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
627 std::runtime_error, "Trying to insert a new entry into a static graph");
628
629 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
630 }
631
632 } else {
633 // Remote matrix
634 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
635 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
636 LO Ikj = Icolind[j];
637 LO Cij = Icol2Ccol[Ikj];
638
639 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
640 std::runtime_error, "Trying to insert a new entry into a static graph");
641
642 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
643 }
644 }
645 }
646 }
647
648 MM = Teuchos::null;
649 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Reuse ESFC"));
650
651 C.fillComplete(C.getDomainMap(), C.getRangeMap());
652}
653
654/*********************************************************************************************************/
655template <class Scalar,
656 class LocalOrdinal,
657 class GlobalOrdinal,
658 class LocalOrdinalViewType>
659void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(typename Teuchos::ScalarTraits<Scalar>::magnitudeType omega,
660 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
661 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
662 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
663 const LocalOrdinalViewType& Acol2Brow,
664 const LocalOrdinalViewType& Acol2Irow,
665 const LocalOrdinalViewType& Bcol2Ccol,
666 const LocalOrdinalViewType& Icol2Ccol,
667 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
668 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
669 const std::string& label,
670 const Teuchos::RCP<Teuchos::ParameterList>& params) {
671 // Check if the diagonal entries exist in debug mode
672 const bool debug = Tpetra::Details::Behavior::debug();
673 if (debug) {
674 auto rowMap = Aview.origMatrix->getRowMap();
676 Aview.origMatrix->getLocalDiagCopy(diags);
677 size_t diagLength = rowMap->getLocalNumElements();
678 Teuchos::Array<Scalar> diagonal(diagLength);
679 diags.get1dCopy(diagonal());
680
681 for (size_t i = 0; i < diagLength; ++i) {
682 TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
683 std::runtime_error,
684 "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl
685 << "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
686 }
687 }
688
689 using Teuchos::RCP;
690 using Teuchos::rcp;
691 RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix KokkosKernels"));
692
693 // Usings
694 using device_t = typename Tpetra::KokkosCompat::KokkosCudaWrapperNode::device_type;
696 using graph_t = typename matrix_t::StaticCrsGraphType;
697 using lno_view_t = typename graph_t::row_map_type::non_const_type;
698 using int_view_t = Kokkos::View<int*,
699 typename lno_view_t::array_layout,
700 typename lno_view_t::memory_space,
701 typename lno_view_t::memory_traits>;
702 using c_lno_view_t = typename graph_t::row_map_type::const_type;
703 using lno_nnz_view_t = typename graph_t::entries_type::non_const_type;
704 using scalar_view_t = typename matrix_t::values_type::non_const_type;
705
706 // KokkosKernels handle
707 using handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
708 typename lno_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
709 typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>;
710
711 using int_handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
712 typename int_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
713 typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>;
714
715 // Merge the B and Bimport matrices
716 const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C.getColMap()->getLocalNumElements());
717
718 // Get the properties and arrays of input matrices
719 const matrix_t Amat = Aview.origMatrix->getLocalMatrixDevice();
720 const matrix_t Bmat = Bview.origMatrix->getLocalMatrixDevice();
721
722 typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
723 typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
724 typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
725
726 // Arrays of the output matrix
727 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("row_mapC"), AnumRows + 1);
728 lno_nnz_view_t entriesC;
729 scalar_view_t valuesC;
730
731 // Options
732 int team_work_size = 16;
733 std::string myalg("SPGEMM_KK_MEMORY");
734 if (!params.is_null()) {
735 if (params->isParameter("cuda: algorithm"))
736 myalg = params->get("cuda: algorithm", myalg);
737 if (params->isParameter("cuda: team work size"))
738 team_work_size = params->get("cuda: team work size", team_work_size);
739 }
740
741 // Get the algorithm mode
742 std::string alg("Cuda algorithm");
743 if (!params.is_null() && params->isParameter(alg)) myalg = params->get(alg, myalg);
744 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
745
746 // decide whether to use integer-typed row pointers for this spgemm
747 Tpetra::Details::IntRowPtrHelper<decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
748 const bool useIntRowptrs =
749 irph.shouldUseIntRowptrs() &&
750 Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
751
752 const Scalar jacobiOmega = omega * Teuchos::ScalarTraits<Scalar>::one();
753
754 if (useIntRowptrs) {
755 int_handle_t kh;
756 kh.create_spgemm_handle(alg_enum);
757 kh.set_team_work_size(team_work_size);
758
759 int_view_t int_row_mapC(Kokkos::ViewAllocateWithoutInitializing("int_row_mapC"), AnumRows + 1);
760
761 auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
762 auto Bint = irph.getIntRowptrMatrix(Bmerged);
763
764 {
765 Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels symbolic int");
766 KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
767 Aint.graph.row_map, Aint.graph.entries, false,
768 Bint.graph.row_map, Bint.graph.entries, false,
769 int_row_mapC);
770 }
771
772 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
773 if (c_nnz_size) {
774 entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
775 valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
776 }
777 Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels numeric int");
778
779 // even though there is no TPL for this, we have to use the same handle that was used in the symbolic phase,
780 // so need to have a special int-typed call for this as well.
781 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
782 Aint.graph.row_map, Aint.graph.entries, Amat.values, false,
783 Bint.graph.row_map, Bint.graph.entries, Bint.values, false,
784 int_row_mapC, entriesC, valuesC,
785 jacobiOmega, Dinv.getLocalViewDevice(Access::ReadOnly));
786 // transfer the integer rowptrs back to the correct rowptr type
787 Kokkos::parallel_for(
788 int_row_mapC.size(), KOKKOS_LAMBDA(int i) { row_mapC(i) = int_row_mapC(i); });
789 kh.destroy_spgemm_handle();
790 } else {
791 handle_t kh;
792 kh.create_spgemm_handle(alg_enum);
793 kh.set_team_work_size(team_work_size);
794
795 {
796 Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels symbolic non-int");
797 KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
798 Amat.graph.row_map, Amat.graph.entries, false,
799 Bmerged.graph.row_map, Bmerged.graph.entries, false,
800 row_mapC);
801 }
802
803 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
804 if (c_nnz_size) {
805 entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
806 valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
807 }
808
809 Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels numeric non-int");
810 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
811 Amat.graph.row_map, Amat.graph.entries, Amat.values, false,
812 Bmerged.graph.row_map, Bmerged.graph.entries, Bmerged.values, false,
813 row_mapC, entriesC, valuesC,
814 jacobiOmega, Dinv.getLocalViewDevice(Access::ReadOnly));
815 kh.destroy_spgemm_handle();
816 }
817
818 MM = Teuchos::null;
819 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix CudaSort"));
820
821 // Sort & set values
822 if (params.is_null() || params->get("sort entries", true))
823 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
824 C.setAllValues(row_mapC, entriesC, valuesC);
825
826 MM = Teuchos::null;
827 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix CudaESFC"));
828
829 // Final Fillcomplete
830 Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
831 labelList->set("Timer Label", label);
832 if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
833 Teuchos::RCP<const Export<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
834 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
835}
836
837} // namespace MMdetails
838} // namespace Tpetra
839
840#endif // CUDA
841
842#endif
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Struct that holds views of the contents of a CrsMatrix.
Teuchos::RCP< const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > origMatrix
The original matrix.
static bool debug()
Whether Tpetra is in debug mode.
Namespace Tpetra contains the class and methods constituting the Tpetra library.