Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_MatrixMatrix_Cuda.hpp
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
11#define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
12
13#include "Tpetra_Details_IntRowPtrHelper.hpp"
14
15#ifdef HAVE_TPETRA_INST_CUDA
16namespace Tpetra {
17namespace MMdetails {
18
19/*********************************************************************************************************/
20// MMM KernelWrappers for Partial Specialization to CUDA
21template <class Scalar,
22 class LocalOrdinal,
23 class GlobalOrdinal,
24 class LocalOrdinalViewType>
25struct KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType> {
26 static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
27 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
28 const LocalOrdinalViewType& Acol2Brow,
29 const LocalOrdinalViewType& Acol2Irow,
30 const LocalOrdinalViewType& Bcol2Ccol,
31 const LocalOrdinalViewType& Icol2Ccol,
32 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
33 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
34 const std::string& label = std::string(),
35 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
36
37 static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
38 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
39 const LocalOrdinalViewType& Acol2Brow,
40 const LocalOrdinalViewType& Acol2Irow,
41 const LocalOrdinalViewType& Bcol2Ccol,
42 const LocalOrdinalViewType& Icol2Ccol,
43 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
44 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
45 const std::string& label = std::string(),
46 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
47};
48
49// Jacobi KernelWrappers for Partial Specialization to Cuda
50template <class Scalar,
51 class LocalOrdinal,
52 class GlobalOrdinal, class LocalOrdinalViewType>
53struct KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType> {
54 static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
55 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
56 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
57 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
58 const LocalOrdinalViewType& Acol2Brow,
59 const LocalOrdinalViewType& Acol2Irow,
60 const LocalOrdinalViewType& Bcol2Ccol,
61 const LocalOrdinalViewType& Icol2Ccol,
62 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
63 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
64 const std::string& label = std::string(),
65 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
66
67 static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
68 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
69 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
70 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
71 const LocalOrdinalViewType& Acol2Brow,
72 const LocalOrdinalViewType& Acol2Irow,
73 const LocalOrdinalViewType& Bcol2Ccol,
74 const LocalOrdinalViewType& Icol2Ccol,
75 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
76 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
77 const std::string& label = std::string(),
78 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
79
80 static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
81 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
82 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
83 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
84 const LocalOrdinalViewType& Acol2Brow,
85 const LocalOrdinalViewType& Acol2Irow,
86 const LocalOrdinalViewType& Bcol2Ccol,
87 const LocalOrdinalViewType& Icol2Ccol,
88 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
89 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
90 const std::string& label = std::string(),
91 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
92};
93
94/*********************************************************************************************************/
95// AB NewMatrix Kernel wrappers (KokkosKernels/CUDA Version)
96template <class Scalar,
97 class LocalOrdinal,
98 class GlobalOrdinal,
99 class LocalOrdinalViewType>
100void KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
101 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
102 const LocalOrdinalViewType& Acol2Brow,
103 const LocalOrdinalViewType& Acol2Irow,
104 const LocalOrdinalViewType& Bcol2Ccol,
105 const LocalOrdinalViewType& Icol2Ccol,
106 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
107 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
108 const std::string& label,
109 const Teuchos::RCP<Teuchos::ParameterList>& params) {
110 using Teuchos::RCP;
111 using Teuchos::rcp;
112 RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaWrapper"));
113
114 // Node-specific code
115 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
116 std::string nodename("Cuda");
117
118 // Lots and lots of typedefs
120 typedef typename KCRS::device_type device_t;
121 typedef typename KCRS::StaticCrsGraphType graph_t;
122 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
123 using int_view_t = Kokkos::View<int*, typename lno_view_t::array_layout, typename lno_view_t::memory_space, typename lno_view_t::memory_traits>;
124 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
125 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
126 typedef typename KCRS::values_type::non_const_type scalar_view_t;
127 // typedef typename graph_t::row_map_type::const_type lno_view_t_const;
128
129 // Options
130 int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
131 std::string myalg("SPGEMM_KK_MEMORY");
132 if (!params.is_null()) {
133 if (params->isParameter("cuda: algorithm"))
134 myalg = params->get("cuda: algorithm", myalg);
135 if (params->isParameter("cuda: team work size"))
136 team_work_size = params->get("cuda: team work size", team_work_size);
137 }
138
139 // KokkosKernelsHandle
140 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
141 typename lno_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
142 typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>
143 KernelHandle;
144 using IntKernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
145 typename int_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
146 typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>;
147
148 // Grab the Kokkos::SparseCrsMatrices
149 const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
150 const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
151
152 c_lno_view_t Arowptr = Amat.graph.row_map,
153 Browptr = Bmat.graph.row_map;
154 const lno_nnz_view_t Acolind = Amat.graph.entries,
155 Bcolind = Bmat.graph.entries;
156 const scalar_view_t Avals = Amat.values,
157 Bvals = Bmat.values;
158
159 // Get the algorithm mode
160 std::string alg = nodename + std::string(" algorithm");
161 // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
162 if (!params.is_null() && params->isParameter(alg)) myalg = params->get(alg, myalg);
163 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
164
165 // Merge the B and Bimport matrices
166 KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C.getColMap()->getLocalNumElements());
167
168 // if Kokkos Kernels is going to use the cuSparse TPL for SpGEMM, this matrix
169 // needs to be sorted
170 // Try to mirror the Kokkos Kernels internal SpGEMM TPL use logic
171 // inspired by https://github.com/trilinos/Trilinos/pull/11709
172 // and https://github.com/kokkos/kokkos-kernels/pull/2008
173#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && ((CUDA_VERSION < 11000) || (CUDA_VERSION >= 11040))
174 if constexpr (std::is_same_v<typename device_t::execution_space, Kokkos::Cuda>) {
175 if (!KokkosSparse::isCrsGraphSorted(Bmerged.graph.row_map, Bmerged.graph.entries)) {
176 KokkosSparse::sort_crs_matrix(Bmerged);
177 }
178 }
179#endif
180
181 MM = Teuchos::null;
182 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaCore"));
183
184 // Do the multiply on whatever we've got
185 typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
186 typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
187 typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
188
189 // regardless of whether integer row ptrs are used, need to ultimately produce the row pointer, entries, and values of the expected types
190 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lno_row"), AnumRows + 1);
191 lno_nnz_view_t entriesC;
192 scalar_view_t valuesC;
193
194 Tpetra::Details::IntRowPtrHelper<decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
195 const bool useIntRowptrs =
196 irph.shouldUseIntRowptrs() &&
197 Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
198
199 if (useIntRowptrs) {
200 IntKernelHandle kh;
201 kh.create_spgemm_handle(alg_enum);
202 kh.set_team_work_size(team_work_size);
203
204 int_view_t int_row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_int_row"), AnumRows + 1);
205
206 auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
207 auto Bint = irph.getIntRowptrMatrix(Bmerged);
208
209 {
210 Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: Newmatrix KokkosKernels symbolic int");
211 KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols, Aint.graph.row_map, Aint.graph.entries, false, Bint.graph.row_map, Bint.graph.entries, false, int_row_mapC);
212 }
213
214 Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: Newmatrix KokkosKernels numeric int");
215
216 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
217 if (c_nnz_size) {
218 entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
219 valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
220 }
221 KokkosSparse::spgemm_numeric(&kh, AnumRows, BnumRows, BnumCols, Aint.graph.row_map, Aint.graph.entries, Aint.values, false, Bint.graph.row_map, Bint.graph.entries, Bint.values, false, int_row_mapC, entriesC, valuesC);
222 // transfer the integer rowptrs back to the correct rowptr type
223 Kokkos::parallel_for(
224 int_row_mapC.size(), KOKKOS_LAMBDA(int i) { row_mapC(i) = int_row_mapC(i); });
225 kh.destroy_spgemm_handle();
226
227 } else {
228 KernelHandle kh;
229 kh.create_spgemm_handle(alg_enum);
230 kh.set_team_work_size(team_work_size);
231
232 {
233 Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: Newmatrix KokkosKernels symbolic non-int");
234 KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols, Amat.graph.row_map, Amat.graph.entries, false, Bmerged.graph.row_map, Bmerged.graph.entries, false, row_mapC);
235 }
236
237 Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: Newmatrix KokkosKernels numeric non-int");
238 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
239 if (c_nnz_size) {
240 entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
241 valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
242 }
243
244 KokkosSparse::spgemm_numeric(&kh, AnumRows, BnumRows, BnumCols, Amat.graph.row_map, Amat.graph.entries, Amat.values, false, Bmerged.graph.row_map, Bmerged.graph.entries, Bmerged.values, false, row_mapC, entriesC, valuesC);
245
246 kh.destroy_spgemm_handle();
247 }
248
249 MM = Teuchos::null;
250 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaSort"));
251
252 // Sort & set values
253 if (params.is_null() || params->get("sort entries", true))
254 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
255 C.setAllValues(row_mapC, entriesC, valuesC);
256
257 MM = Teuchos::null;
258 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaESFC"));
259
260 // Final Fillcomplete
261 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
262 labelList->set("Timer Label", label);
263 if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
264 RCP<const Export<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
265 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
266}
267
268/*********************************************************************************************************/
269template <class Scalar,
270 class LocalOrdinal,
271 class GlobalOrdinal,
272 class LocalOrdinalViewType>
273void KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
274 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
275 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
276 const LocalOrdinalViewType& targetMapToOrigRow_dev,
277 const LocalOrdinalViewType& targetMapToImportRow_dev,
278 const LocalOrdinalViewType& Bcol2Ccol_dev,
279 const LocalOrdinalViewType& Icol2Ccol_dev,
280 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
281 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
282 const std::string& label,
283 const Teuchos::RCP<Teuchos::ParameterList>& params) {
284 // FIXME: Right now, this is a cut-and-paste of the serial kernel
285 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
286
287 using Teuchos::RCP;
288 using Teuchos::rcp;
289 RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Reuse SerialCore"));
290
291 // Lots and lots of typedefs
292 typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
293 typedef typename KCRS::StaticCrsGraphType graph_t;
294 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
295 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
296 typedef typename KCRS::values_type::non_const_type scalar_view_t;
297
298 typedef Scalar SC;
299 typedef LocalOrdinal LO;
300 typedef GlobalOrdinal GO;
301 typedef Node NO;
302 typedef Map<LO, GO, NO> map_type;
303 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
304 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
305 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
306
307 // Since this is being run on Cuda, we need to fence because the below code will use UVM
308 // typename graph_t::execution_space().fence();
309
310 // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
311 // KDDKDD UVM Ideally, this function would run on device and use
312 // KDDKDD UVM KokkosKernels instead of this host implementation.
313 auto targetMapToOrigRow =
314 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
315 targetMapToOrigRow_dev);
316 auto targetMapToImportRow =
317 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
318 targetMapToImportRow_dev);
319 auto Bcol2Ccol =
320 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
321 Bcol2Ccol_dev);
322 auto Icol2Ccol =
323 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
324 Icol2Ccol_dev);
325
326 // Sizes
327 RCP<const map_type> Ccolmap = C.getColMap();
328 size_t m = Aview.origMatrix->getLocalNumRows();
329 size_t n = Ccolmap->getLocalNumElements();
330
331 // Grab the Kokkos::SparseCrsMatrices & inner stuff
332 const KCRS& Amat = Aview.origMatrix->getLocalMatrixHost();
333 const KCRS& Bmat = Bview.origMatrix->getLocalMatrixHost();
334 const KCRS& Cmat = C.getLocalMatrixHost();
335
336 c_lno_view_t Arowptr = Amat.graph.row_map,
337 Browptr = Bmat.graph.row_map,
338 Crowptr = Cmat.graph.row_map;
339 const lno_nnz_view_t Acolind = Amat.graph.entries,
340 Bcolind = Bmat.graph.entries,
341 Ccolind = Cmat.graph.entries;
342 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
343 scalar_view_t Cvals = Cmat.values;
344
345 c_lno_view_t Irowptr;
346 lno_nnz_view_t Icolind;
347 scalar_view_t Ivals;
348 if (!Bview.importMatrix.is_null()) {
349 auto lclB = Bview.importMatrix->getLocalMatrixHost();
350 Irowptr = lclB.graph.row_map;
351 Icolind = lclB.graph.entries;
352 Ivals = lclB.values;
353 }
354
355 // Classic csr assembly (low memory edition)
356 // mfh 27 Sep 2016: The c_status array is an implementation detail
357 // of the local sparse matrix-matrix multiply routine.
358
359 // The status array will contain the index into colind where this entry was last deposited.
360 // c_status[i] < CSR_ip - not in the row yet
361 // c_status[i] >= CSR_ip - this is the entry where you can find the data
362 // We start with this filled with INVALID's indicating that there are no entries yet.
363 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
364 std::vector<size_t> c_status(n, ST_INVALID);
365
366 // For each row of A/C
367 size_t CSR_ip = 0, OLD_ip = 0;
368 for (size_t i = 0; i < m; i++) {
369 // First fill the c_status array w/ locations where we're allowed to
370 // generate nonzeros for this row
371 OLD_ip = Crowptr[i];
372 CSR_ip = Crowptr[i + 1];
373 for (size_t k = OLD_ip; k < CSR_ip; k++) {
374 c_status[Ccolind[k]] = k;
375
376 // Reset values in the row of C
377 Cvals[k] = SC_ZERO;
378 }
379
380 for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
381 LO Aik = Acolind[k];
382 const SC Aval = Avals[k];
383 if (Aval == SC_ZERO)
384 continue;
385
386 if (targetMapToOrigRow[Aik] != LO_INVALID) {
387 // Local matrix
388 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
389
390 for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
391 LO Bkj = Bcolind[j];
392 LO Cij = Bcol2Ccol[Bkj];
393
394 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
395 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
396 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
397
398 Cvals[c_status[Cij]] += Aval * Bvals[j];
399 }
400
401 } else {
402 // Remote matrix
403 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
404 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
405 LO Ikj = Icolind[j];
406 LO Cij = Icol2Ccol[Ikj];
407
408 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
409 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
410 << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
411
412 Cvals[c_status[Cij]] += Aval * Ivals[j];
413 }
414 }
415 }
416 }
417
418 C.fillComplete(C.getDomainMap(), C.getRangeMap());
419}
420
421/*********************************************************************************************************/
422template <class Scalar,
423 class LocalOrdinal,
424 class GlobalOrdinal,
425 class LocalOrdinalViewType>
426void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
427 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
428 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
429 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
430 const LocalOrdinalViewType& Acol2Brow,
431 const LocalOrdinalViewType& Acol2Irow,
432 const LocalOrdinalViewType& Bcol2Ccol,
433 const LocalOrdinalViewType& Icol2Ccol,
434 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
435 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
436 const std::string& label,
437 const Teuchos::RCP<Teuchos::ParameterList>& params) {
438 // Node-specific code
439 using Teuchos::RCP;
440 using Teuchos::rcp;
441 RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Jacobi CudaWrapper"));
442
443 // Options
444 // int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer // unreferenced
445 std::string myalg("KK");
446 if (!params.is_null()) {
447 if (params->isParameter("cuda: jacobi algorithm"))
448 myalg = params->get("cuda: jacobi algorithm", myalg);
449 }
450
451 if (myalg == "MSAK") {
452 ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega, Dinv, Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
453 } else if (myalg == "KK") {
454 jacobi_A_B_newmatrix_KokkosKernels(omega, Dinv, Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
455 } else {
456 throw std::runtime_error("Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
457 }
458
459 MM = Teuchos::null;
460 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix CudaESFC"));
461
462 // Final Fillcomplete
463 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
464 labelList->set("Timer Label", label);
465 if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
466
467 // NOTE: MSAK already fillCompletes, so we have to check here
468 if (!C.isFillComplete()) {
469 RCP<const Export<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
470 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
471 }
472}
473
474/*********************************************************************************************************/
475template <class Scalar,
476 class LocalOrdinal,
477 class GlobalOrdinal,
478 class LocalOrdinalViewType>
479void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
480 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
481 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
482 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
483 const LocalOrdinalViewType& targetMapToOrigRow_dev,
484 const LocalOrdinalViewType& targetMapToImportRow_dev,
485 const LocalOrdinalViewType& Bcol2Ccol_dev,
486 const LocalOrdinalViewType& Icol2Ccol_dev,
487 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
488 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
489 const std::string& label,
490 const Teuchos::RCP<Teuchos::ParameterList>& params) {
491 // FIXME: Right now, this is a cut-and-paste of the serial kernel
492 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
493
494 using Teuchos::RCP;
495 using Teuchos::rcp;
496 RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Reuse CudaCore"));
497
498 // Lots and lots of typedefs
499 typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
500 typedef typename KCRS::StaticCrsGraphType graph_t;
501 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
502 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
503 typedef typename KCRS::values_type::non_const_type scalar_view_t;
504 typedef typename scalar_view_t::memory_space scalar_memory_space;
505
506 typedef Scalar SC;
507 typedef LocalOrdinal LO;
508 typedef GlobalOrdinal GO;
509 typedef Node NO;
510 typedef Map<LO, GO, NO> map_type;
511 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
512 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
513 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
514
515 // Since this is being run on Cuda, we need to fence because the below host code will use UVM
516 // KDDKDD typename graph_t::execution_space().fence();
517
518 // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
519 // KDDKDD UVM Ideally, this function would run on device and use
520 // KDDKDD UVM KokkosKernels instead of this host implementation.
521 auto targetMapToOrigRow =
522 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
523 targetMapToOrigRow_dev);
524 auto targetMapToImportRow =
525 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
526 targetMapToImportRow_dev);
527 auto Bcol2Ccol =
528 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
529 Bcol2Ccol_dev);
530 auto Icol2Ccol =
531 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
532 Icol2Ccol_dev);
533
534 // Sizes
535 RCP<const map_type> Ccolmap = C.getColMap();
536 size_t m = Aview.origMatrix->getLocalNumRows();
537 size_t n = Ccolmap->getLocalNumElements();
538
539 // Grab the Kokkos::SparseCrsMatrices & inner stuff
540 const KCRS& Amat = Aview.origMatrix->getLocalMatrixHost();
541 const KCRS& Bmat = Bview.origMatrix->getLocalMatrixHost();
542 const KCRS& Cmat = C.getLocalMatrixHost();
543
544 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
545 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
546 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
547 scalar_view_t Cvals = Cmat.values;
548
549 c_lno_view_t Irowptr;
550 lno_nnz_view_t Icolind;
551 scalar_view_t Ivals;
552 if (!Bview.importMatrix.is_null()) {
553 auto lclB = Bview.importMatrix->getLocalMatrixHost();
554 Irowptr = lclB.graph.row_map;
555 Icolind = lclB.graph.entries;
556 Ivals = lclB.values;
557 }
558
559 // Jacobi-specific inner stuff
560 auto Dvals =
561 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
562
563 // The status array will contain the index into colind where this entry was last deposited.
564 // c_status[i] < CSR_ip - not in the row yet
565 // c_status[i] >= CSR_ip - this is the entry where you can find the data
566 // We start with this filled with INVALID's indicating that there are no entries yet.
567 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
568 std::vector<size_t> c_status(n, ST_INVALID);
569
570 // For each row of A/C
571 size_t CSR_ip = 0, OLD_ip = 0;
572 for (size_t i = 0; i < m; i++) {
573 // First fill the c_status array w/ locations where we're allowed to
574 // generate nonzeros for this row
575 OLD_ip = Crowptr[i];
576 CSR_ip = Crowptr[i + 1];
577 for (size_t k = OLD_ip; k < CSR_ip; k++) {
578 c_status[Ccolind[k]] = k;
579
580 // Reset values in the row of C
581 Cvals[k] = SC_ZERO;
582 }
583
584 SC minusOmegaDval = -omega * Dvals(i, 0);
585
586 // Entries of B
587 for (size_t j = Browptr[i]; j < Browptr[i + 1]; j++) {
588 Scalar Bval = Bvals[j];
589 if (Bval == SC_ZERO)
590 continue;
591 LO Bij = Bcolind[j];
592 LO Cij = Bcol2Ccol[Bij];
593
594 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
595 std::runtime_error, "Trying to insert a new entry into a static graph");
596
597 Cvals[c_status[Cij]] = Bvals[j];
598 }
599
600 // Entries of -omega * Dinv * A * B
601 for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
602 LO Aik = Acolind[k];
603 const SC Aval = Avals[k];
604 if (Aval == SC_ZERO)
605 continue;
606
607 if (targetMapToOrigRow[Aik] != LO_INVALID) {
608 // Local matrix
609 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
610
611 for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
612 LO Bkj = Bcolind[j];
613 LO Cij = Bcol2Ccol[Bkj];
614
615 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
616 std::runtime_error, "Trying to insert a new entry into a static graph");
617
618 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
619 }
620
621 } else {
622 // Remote matrix
623 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
624 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
625 LO Ikj = Icolind[j];
626 LO Cij = Icol2Ccol[Ikj];
627
628 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
629 std::runtime_error, "Trying to insert a new entry into a static graph");
630
631 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
632 }
633 }
634 }
635 }
636
637 MM = Teuchos::null;
638 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Reuse ESFC"));
639
640 C.fillComplete(C.getDomainMap(), C.getRangeMap());
641}
642
643/*********************************************************************************************************/
644template <class Scalar,
645 class LocalOrdinal,
646 class GlobalOrdinal,
647 class LocalOrdinalViewType>
648void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
649 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
650 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
651 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
652 const LocalOrdinalViewType& Acol2Brow,
653 const LocalOrdinalViewType& Acol2Irow,
654 const LocalOrdinalViewType& Bcol2Ccol,
655 const LocalOrdinalViewType& Icol2Ccol,
656 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
657 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
658 const std::string& label,
659 const Teuchos::RCP<Teuchos::ParameterList>& params) {
660 // Check if the diagonal entries exist in debug mode
661 const bool debug = Tpetra::Details::Behavior::debug();
662 if (debug) {
663 auto rowMap = Aview.origMatrix->getRowMap();
665 Aview.origMatrix->getLocalDiagCopy(diags);
666 size_t diagLength = rowMap->getLocalNumElements();
667 Teuchos::Array<Scalar> diagonal(diagLength);
668 diags.get1dCopy(diagonal());
669
670 for (size_t i = 0; i < diagLength; ++i) {
671 TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
672 std::runtime_error,
673 "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl
674 << "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
675 }
676 }
677
678 using Teuchos::RCP;
679 using Teuchos::rcp;
680 RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix KokkosKernels"));
681
682 // Usings
683 using device_t = typename Tpetra::KokkosCompat::KokkosCudaWrapperNode::device_type;
685 using graph_t = typename matrix_t::StaticCrsGraphType;
686 using lno_view_t = typename graph_t::row_map_type::non_const_type;
687 using int_view_t = Kokkos::View<int*,
688 typename lno_view_t::array_layout,
689 typename lno_view_t::memory_space,
690 typename lno_view_t::memory_traits>;
691 using c_lno_view_t = typename graph_t::row_map_type::const_type;
692 using lno_nnz_view_t = typename graph_t::entries_type::non_const_type;
693 using scalar_view_t = typename matrix_t::values_type::non_const_type;
694
695 // KokkosKernels handle
696 using handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
697 typename lno_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
698 typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>;
699
700 using int_handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
701 typename int_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
702 typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>;
703
704 // Merge the B and Bimport matrices
705 const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C.getColMap()->getLocalNumElements());
706
707 // Get the properties and arrays of input matrices
708 const matrix_t& Amat = Aview.origMatrix->getLocalMatrixDevice();
709 const matrix_t& Bmat = Bview.origMatrix->getLocalMatrixDevice();
710
711 typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
712 typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
713 typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
714
715 // Arrays of the output matrix
716 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("row_mapC"), AnumRows + 1);
717 lno_nnz_view_t entriesC;
718 scalar_view_t valuesC;
719
720 // Options
721 int team_work_size = 16;
722 std::string myalg("SPGEMM_KK_MEMORY");
723 if (!params.is_null()) {
724 if (params->isParameter("cuda: algorithm"))
725 myalg = params->get("cuda: algorithm", myalg);
726 if (params->isParameter("cuda: team work size"))
727 team_work_size = params->get("cuda: team work size", team_work_size);
728 }
729
730 // Get the algorithm mode
731 std::string alg("Cuda algorithm");
732 if (!params.is_null() && params->isParameter(alg)) myalg = params->get(alg, myalg);
733 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
734
735 // decide whether to use integer-typed row pointers for this spgemm
736 Tpetra::Details::IntRowPtrHelper<decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
737 const bool useIntRowptrs =
738 irph.shouldUseIntRowptrs() &&
739 Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
740
741 if (useIntRowptrs) {
742 int_handle_t kh;
743 kh.create_spgemm_handle(alg_enum);
744 kh.set_team_work_size(team_work_size);
745
746 int_view_t int_row_mapC(Kokkos::ViewAllocateWithoutInitializing("int_row_mapC"), AnumRows + 1);
747
748 auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
749 auto Bint = irph.getIntRowptrMatrix(Bmerged);
750
751 {
752 Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels symbolic int");
753 KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
754 Aint.graph.row_map, Aint.graph.entries, false,
755 Bint.graph.row_map, Bint.graph.entries, false,
756 int_row_mapC);
757 }
758
759 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
760 if (c_nnz_size) {
761 entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
762 valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
763 }
764 Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels numeric int");
765
766 // even though there is no TPL for this, we have to use the same handle that was used in the symbolic phase,
767 // so need to have a special int-typed call for this as well.
768 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
769 Aint.graph.row_map, Aint.graph.entries, Amat.values, false,
770 Bint.graph.row_map, Bint.graph.entries, Bint.values, false,
771 int_row_mapC, entriesC, valuesC,
772 omega, Dinv.getLocalViewDevice(Access::ReadOnly));
773 // transfer the integer rowptrs back to the correct rowptr type
774 Kokkos::parallel_for(
775 int_row_mapC.size(), KOKKOS_LAMBDA(int i) { row_mapC(i) = int_row_mapC(i); });
776 kh.destroy_spgemm_handle();
777 } else {
778 handle_t kh;
779 kh.create_spgemm_handle(alg_enum);
780 kh.set_team_work_size(team_work_size);
781
782 {
783 Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels symbolic non-int");
784 KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
785 Amat.graph.row_map, Amat.graph.entries, false,
786 Bmerged.graph.row_map, Bmerged.graph.entries, false,
787 row_mapC);
788 }
789
790 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
791 if (c_nnz_size) {
792 entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
793 valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
794 }
795
796 Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels numeric non-int");
797 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
798 Amat.graph.row_map, Amat.graph.entries, Amat.values, false,
799 Bmerged.graph.row_map, Bmerged.graph.entries, Bmerged.values, false,
800 row_mapC, entriesC, valuesC,
801 omega, Dinv.getLocalViewDevice(Access::ReadOnly));
802 kh.destroy_spgemm_handle();
803 }
804
805 MM = Teuchos::null;
806 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix CudaSort"));
807
808 // Sort & set values
809 if (params.is_null() || params->get("sort entries", true))
810 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
811 C.setAllValues(row_mapC, entriesC, valuesC);
812
813 MM = Teuchos::null;
814 MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix CudaESFC"));
815
816 // Final Fillcomplete
817 Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
818 labelList->set("Timer Label", label);
819 if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
820 Teuchos::RCP<const Export<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
821 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
822}
823
824} // namespace MMdetails
825} // namespace Tpetra
826
827#endif // CUDA
828
829#endif
Struct that holds views of the contents of a CrsMatrix.
Teuchos::RCP< const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > origMatrix
The original matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static bool debug()
Whether Tpetra is in debug mode.
Namespace Tpetra contains the class and methods constituting the Tpetra library.