152Teuchos::RCP<Thyra::LinearOpBase<double> >
buildInterpolation(
const Teuchos::RCP<const panzer::ConnManager> &conn,
153 const Teuchos::RCP<panzer::DOFManager> &domain_ugi,
154 const Teuchos::RCP<panzer::DOFManager> &range_ugi,
155 const std::string& domain_basis_name,
156 const std::string& range_basis_name,
157 Intrepid2::EOperator op,
159 const bool force_vectorial,
160 const bool useTpetra,
161 const bool matrixFree)
165 using Teuchos::rcp_dynamic_cast;
167 using Scalar = double;
169 using STS = Teuchos::ScalarTraits<Scalar>;
170#if KOKKOS_VERSION >= 40799
171 using KAT = KokkosKernels::ArithTraits<Scalar>;
173 using KAT = Kokkos::ArithTraits<Scalar>;
175 using OT = Teuchos::OrdinalTraits<GlobalOrdinal>;
177 using DeviceSpace = PHX::Device;
178 using HostSpace = Kokkos::HostSpace;
179 using ots = Intrepid2::OrientationTools<DeviceSpace>;
180 using li = Intrepid2::LagrangianInterpolation<DeviceSpace>;
181 using DynRankDeviceView = Kokkos::DynRankView<double, DeviceSpace>;
183 using tp_graph = Tpetra::CrsGraph<LocalOrdinal, GlobalOrdinal>;
184 using tp_matrix = Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal>;
185 using tp_map = Tpetra::Map<LocalOrdinal, GlobalOrdinal>;
186#ifdef PANZER_HAVE_EPETRA_STACK
192 TEUCHOS_ASSERT(useTpetra);
193 TEUCHOS_ASSERT(!force_vectorial);
195 return Thyra::tpetraLinearOp<Scalar,LocalOrdinal,GlobalOrdinal,typename tp_matrix::node_type>(Thyra::createVectorSpace<Scalar,LocalOrdinal,GlobalOrdinal>(mfOp->getRangeMap()),
196 Thyra::createVectorSpace<Scalar,LocalOrdinal,GlobalOrdinal>(mfOp->getDomainMap()),
201 auto domain_fieldPattern = domain_ugi->getFieldPattern(domain_basis_name);
202 auto domain_basis = rcp_dynamic_cast<const panzer::Intrepid2FieldPattern>(domain_fieldPattern,
true)->getIntrepidBasis();
203 auto range_fieldPattern = range_ugi->getFieldPattern(range_basis_name);
204 auto range_basis = rcp_dynamic_cast<const panzer::Intrepid2FieldPattern>(range_fieldPattern,
true)->getIntrepidBasis();
207 const size_t domainCardinality = domain_basis->getCardinality();
208 const size_t rangeCardinality = range_basis->getCardinality();
210 const int dim = range_basis->getBaseCellTopology().getDimension();
212 if (op == Intrepid2::OPERATOR_VALUE) {
213 TEUCHOS_ASSERT(rangeCardinality >= domainCardinality);
214 TEUCHOS_ASSERT_EQUALITY(domain_basis->getFunctionSpace(), range_basis->getFunctionSpace());
218 RCP<const tp_map> tp_rangemap;
219 RCP<const tp_map> tp_domainmap;
220 RCP<const tp_map> tp_rowmap;
221 RCP<const tp_map> tp_colmap;
222 RCP<tp_matrix> tp_interp_matrix;
223 typename tp_matrix::local_matrix_device_type lcl_tp_interp_matrix;
224#ifdef PANZER_HAVE_EPETRA_STACK
225 RCP<const ep_map> ep_rangemap;
226 RCP<const ep_map> ep_domainmap;
227 RCP<const ep_map> ep_rowmap;
228 RCP<const ep_map> ep_colmap;
229 RCP<ep_matrix> ep_interp_matrix;
232 auto rangeElementLIDs_d = range_ugi->getLIDs();
233 auto domainElementLIDs_d = domain_ugi->getLIDs();
235 RCP<Thyra::LinearOpBase<Scalar> > thyra_interp;
236 size_t maxNumElementsPerBlock = 0;
237 LocalOrdinal minLocalIndex = 0;
238 LocalOrdinal maxLocalIndex = 0;
241 std::vector<GlobalOrdinal> gids;
242 range_ugi->getOwnedIndices(gids);
243 tp_rowmap = rcp(
new tp_map(OT::invalid(), gids.data(),
static_cast<LocalOrdinal
>(gids.size()), OT::zero(), range_ugi->getComm()));
244 tp_rangemap = tp_rowmap;
245 domain_ugi->getOwnedIndices(gids);
246 tp_domainmap = rcp(
new tp_map(OT::invalid(), gids.data(),
static_cast<LocalOrdinal
>(gids.size()), OT::zero(), domain_ugi->getComm()));
247 domain_ugi->getOwnedAndGhostedIndices(gids);
248 tp_colmap = rcp(
new tp_map(OT::invalid(), gids.data(),
static_cast<LocalOrdinal
>(gids.size()), OT::zero(), domain_ugi->getComm()));
250 minLocalIndex = tp_rowmap->getMinLocalIndex();
251 maxLocalIndex = tp_rowmap->getMaxLocalIndex();
255 using dv = Kokkos::DualView<size_t*, typename tp_graph::device_type>;
256 dv numEntriesPerRow(
"numEntriesPerRow", tp_rowmap->getLocalNumElements());
258 auto numEntriesPerRow_d = numEntriesPerRow.view_device();
261 std::vector<std::string> elementBlockIds;
262 range_ugi->getElementBlockIds(elementBlockIds);
263 for(std::size_t blockIter = 0; blockIter < elementBlockIds.size(); ++blockIter) {
266 std::vector<int> elementIds = range_ugi->getElementBlock(elementBlockIds[blockIter]);
267 Kokkos::View<int *, HostSpace> elementIds_h(elementIds.data(), elementIds.size());
268 Kokkos::View<int *, DeviceSpace> elementIds_d(
"elementIds_d", elementIds_h.extent(0));
269 Kokkos::deep_copy(elementIds_d, elementIds_h);
270 maxNumElementsPerBlock = std::max(maxNumElementsPerBlock, elementIds.size());
272 Kokkos::parallel_for(
"MiniEM_Interpolation::numEntriesPerRow",
273 Kokkos::RangePolicy<size_t, typename tp_matrix::node_type::execution_space>(0, elementIds.size()),
274 KOKKOS_LAMBDA(
const size_t elemIter) {
275 auto elemId = elementIds_d(elemIter);
278 auto rangeLIDs_d = Kokkos::subview(rangeElementLIDs_d, elemId, Kokkos::ALL());
281 for(
size_t rangeIter = 0; rangeIter < rangeCardinality; ++rangeIter) {
282 const LocalOrdinal range_row = rangeLIDs_d(rangeIter);
283 const bool isOwned = ((minLocalIndex <= range_row) && (range_row <= maxLocalIndex));
285 Kokkos::atomic_add(&numEntriesPerRow_d(range_row), domainCardinality);
289 numEntriesPerRow.template modify<typename dv::t_dev>();
290 numEntriesPerRow.template sync<typename dv::t_host>();
294 auto tp_interp_graph = rcp(
new tp_graph(tp_rowmap, tp_colmap, numEntriesPerRow));
297 Kokkos::View<LocalOrdinal**, HostSpace> rangeElementLIDs_h(
"rangeElementLIDs_h", rangeElementLIDs_d.extent(0), rangeCardinality);
298 Kokkos::View<LocalOrdinal**, HostSpace> domainElementLIDs_h(
"domainElementLIDs_h", domainElementLIDs_d.extent(0), domainCardinality);
299 Kokkos::deep_copy(rangeElementLIDs_h, rangeElementLIDs_d);
300 Kokkos::deep_copy(domainElementLIDs_h, domainElementLIDs_d);
303 std::vector<std::string> elementBlockIds;
304 range_ugi->getElementBlockIds(elementBlockIds);
305 for(std::size_t blockIter = 0; blockIter < elementBlockIds.size(); ++blockIter) {
308 std::vector<int> elementIds = range_ugi->getElementBlock(elementBlockIds[blockIter]);
309 maxNumElementsPerBlock = std::max(maxNumElementsPerBlock, elementIds.size());
310 for(std::size_t elemIter = 0; elemIter < elementIds.size(); ++elemIter) {
311 auto elemId = elementIds[elemIter];
314 auto rangeLIDs_h = Kokkos::subview(rangeElementLIDs_h, elemId, Kokkos::ALL());
315 auto domainLIDs_h = Kokkos::subview(domainElementLIDs_h, elemId, Kokkos::ALL());
318 for(
size_t rangeIter = 0; rangeIter < rangeCardinality; ++rangeIter) {
319 const LocalOrdinal range_row = rangeLIDs_h(rangeIter);
320 const bool isOwned = ((minLocalIndex <= range_row) && (range_row <= maxLocalIndex));
322 Teuchos::ArrayView<LocalOrdinal> domainLIDs_av = Teuchos::ArrayView<LocalOrdinal>(domainLIDs_h.data(), domainLIDs_h.extent_int(0));
323 tp_interp_graph->insertLocalIndices(range_row, domainLIDs_av);
330 RCP<Teuchos::ParameterList> pl = rcp(
new Teuchos::ParameterList());
331 pl->set(
"Optimize Storage",
true);
332 tp_interp_graph->fillComplete(tp_domainmap, tp_rangemap, pl);
334 tp_interp_matrix = rcp(
new tp_matrix(tp_interp_graph));
335 lcl_tp_interp_matrix = tp_interp_matrix->getLocalMatrixDevice();
337#ifdef PANZER_HAVE_EPETRA_STACK
340 const RCP<const Teuchos::MpiComm<int> > mpiComm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(range_ugi->getComm());
341 auto ep_comm = Teuchos::rcp(
new Epetra_MpiComm(*mpiComm->getRawMpiComm()));
342 std::vector<GlobalOrdinal> gids;
343 range_ugi->getOwnedIndices(gids);
344 ep_rowmap = rcp(
new ep_map(OT::invalid(),
static_cast<LocalOrdinal
>(gids.size()), gids.data(), OT::zero(), *ep_comm));
345 ep_rangemap = ep_rowmap;
346 domain_ugi->getOwnedIndices(gids);
347 ep_domainmap = rcp(
new ep_map(OT::invalid(),
static_cast<LocalOrdinal
>(gids.size()), gids.data(), OT::zero(), *ep_comm));
348 domain_ugi->getOwnedAndGhostedIndices(gids);
349 ep_colmap = rcp(
new ep_map(OT::invalid(),
static_cast<LocalOrdinal
>(gids.size()), gids.data(), OT::zero(), *ep_comm));
353 std::vector<std::string> elementBlockIds;
354 range_ugi->getElementBlockIds(elementBlockIds);
355 for(std::size_t blockIter = 0; blockIter < elementBlockIds.size(); ++blockIter) {
358 std::vector<int> elementIds = range_ugi->getElementBlock(elementBlockIds[blockIter]);
359 maxNumElementsPerBlock = std::max(maxNumElementsPerBlock, elementIds.size());
364 size_t nnzPerRowEstimate = 25*domainCardinality;
366 ep_interp_matrix = rcp(
new ep_matrix(
Copy, *ep_rowmap, *ep_colmap,
static_cast<int>(nnzPerRowEstimate),
true));
368 RCP<const Thyra::LinearOpBase<double> > th_ep_interp = Thyra::epetraLinearOp(ep_interp_matrix,
370 Thyra::EPETRA_OP_APPLY_APPLY,
371 Thyra::EPETRA_OP_ADJOINT_SUPPORTED,
372 Thyra::create_VectorSpace(ep_rangemap),
373 Thyra::create_VectorSpace(ep_domainmap));
374 thyra_interp = Teuchos::rcp_const_cast<Thyra::LinearOpBase<double> >(th_ep_interp);
380 if (maxNumElementsPerBlock > 0)
381 numCells =
static_cast<int>(std::min(maxNumElementsPerBlock, worksetSize));
383 numCells =
static_cast<int>(worksetSize);
385 DynRankDeviceView range_dofCoords_d(
"range_dofCoords_d", rangeCardinality, dim);
386 DynRankDeviceView basisCoeffsLIOriented_d(
"basisCoeffsLIOriented_d", numCells, rangeCardinality, domainCardinality);
389 DynRankDeviceView valuesAtDofCoordsNonOriented_d;
390 DynRankDeviceView valuesAtDofCoordsOriented_d;
392 if (!force_vectorial) {
394 auto temp = domain_basis->allocateOutputView(
static_cast<int>(rangeCardinality), op);
399 if (temp.rank() == 3) {
400 valuesAtDofCoordsNonOriented_d = DynRankDeviceView(
"valuesAtDofCoordsNonOriented_d", temp.extent(0), temp.extent(1), temp.extent(2));
401 valuesAtDofCoordsOriented_d = DynRankDeviceView(
"valuesAtDofCoordsOriented_d", numCells, temp.extent(0), temp.extent(1), temp.extent(2));
403 valuesAtDofCoordsNonOriented_d = DynRankDeviceView(
"valuesAtDofCoordsNonOriented_d", temp.extent(0), temp.extent(1));
404 valuesAtDofCoordsOriented_d = DynRankDeviceView(
"valuesAtDofCoordsOriented_d", numCells, temp.extent(0), temp.extent(1));
407 valuesAtDofCoordsNonOriented_d = DynRankDeviceView(
"valuesAtDofCoordsNonOriented_d", domainCardinality, rangeCardinality, dim);
408 valuesAtDofCoordsOriented_d = DynRankDeviceView(
"valuesAtDofCoordsOriented_d", numCells, domainCardinality, rangeCardinality, dim);
411 int fieldRank = Intrepid2::getFieldRank(range_basis->getFunctionSpace());
412 TEUCHOS_ASSERT((fieldRank == 0) || (fieldRank == 1));
414 auto entryFilterTol = 1e5*Teuchos::ScalarTraits<typename STS::magnitudeType>::eps();
417 range_basis->getDofCoords(range_dofCoords_d);
420 domain_basis->getValues(valuesAtDofCoordsNonOriented_d, range_dofCoords_d, op);
423 std::vector<std::string> elementBlockIds;
424 range_ugi->getElementBlockIds(elementBlockIds);
427 std::map<std::string, std::vector<Intrepid2::Orientation> > orientations;
431 for(std::size_t blockIter = 0; blockIter < elementBlockIds.size(); ++blockIter) {
433 auto rangeOffsets_d = range_ugi->getGIDFieldOffsetsKokkos(elementBlockIds[blockIter], 0);
434 auto domainOffsets_d = domain_ugi->getGIDFieldOffsetsKokkos(elementBlockIds[blockIter], 0);
435#ifdef PANZER_INTERPOLATION_DEBUG_OUTPUT
436 std::cout <<
"rangeOffsets_d" << std::endl;
437 for (
int i = 0; i < rangeCardinality; i++)
438 std::cout << rangeOffsets_d(i) <<
" ";
439 std::cout << std::endl;
440 std::cout <<
"domainOffsets_d" << std::endl;
441 for (
int i = 0; i < domainCardinality; i++)
442 std::cout << domainOffsets_d(i) <<
" ";
443 std::cout << std::endl;
447 Kokkos::View<int *, DeviceSpace> elementIds_d;
449 std::vector<int> elementIds = range_ugi->getElementBlock(elementBlockIds[blockIter]);
450 Kokkos::View<int *, HostSpace> elementIds_h(elementIds.data(), elementIds.size());
451 elementIds_d = Kokkos::View<int *, DeviceSpace>(
"elementIds_d", elementIds_h.extent(0));
452 Kokkos::deep_copy(elementIds_d, elementIds_h);
456 typename Kokkos::DynRankView<Intrepid2::Orientation,DeviceSpace> elemOrts_d (
"elemOrts_d", elementIds_d.extent(0));
459 auto blockOrientations = orientations[elementBlockIds[blockIter]];
460 Kokkos::View<Intrepid2::Orientation*, HostSpace> elemOrts_h(blockOrientations.data(), blockOrientations.size());
461 Kokkos::deep_copy(elemOrts_d, elemOrts_h);
465 for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) {
468 std::min(numCells, Teuchos::as<int>(elementIds_d.extent(0)) -
469 Teuchos::as<int>(elemIter));
472 auto ortsRange = Kokkos::make_pair(elemIter, std::min(elemIter + numCells, elemOrts_d.extent(0)));
473 auto elemOrtsWorkset_d = Kokkos::subview(elemOrts_d, ortsRange);
475 auto worksetRange = Kokkos::make_pair(0, endCellRange);
476 auto valuesAtDofCoordsOrientedWorkset_d = Kokkos::subview(valuesAtDofCoordsOriented_d, worksetRange, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
477 auto basisCoeffsLIOrientedWorkset_d = Kokkos::subview(basisCoeffsLIOriented_d, worksetRange, Kokkos::ALL(), Kokkos::ALL());
481 ots::modifyBasisByOrientation(valuesAtDofCoordsOrientedWorkset_d,
482 valuesAtDofCoordsNonOriented_d,
487 for(
size_t domainIter=0; domainIter<domainCardinality; domainIter++)
490 li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOrientedWorkset_d, Kokkos::ALL(), Kokkos::ALL(), domainIter),
491 Kokkos::subview(valuesAtDofCoordsOrientedWorkset_d, Kokkos::ALL(), domainIter, Kokkos::ALL(), Kokkos::ALL()),
495#ifdef PANZER_HAVE_EPETRA_STACK
498 Kokkos::View<LocalOrdinal*,DeviceSpace> indices_d(
"indices", domainCardinality);
499 Kokkos::View<Scalar*, DeviceSpace> values_d (
"values", domainCardinality);
502 for (
int cellNo = 0; cellNo < endCellRange; cellNo++) {
503 auto elemId = elementIds_d(elemIter+cellNo);
506 auto rangeLIDs_d = Kokkos::subview(rangeElementLIDs_d, elemId, Kokkos::ALL());
507 auto domainLIDs_d = Kokkos::subview(domainElementLIDs_d, elemId, Kokkos::ALL());
510 for(
size_t rangeIter = 0; rangeIter < rangeCardinality; ++rangeIter) {
511 const LocalOrdinal range_row = rangeLIDs_d(rangeOffsets_d(rangeIter));
512 const bool isOwned = ep_rowmap->MyLID(range_row);
515 LocalOrdinal rowNNZ = 0;
516 for(
size_t domainIter = 0; domainIter < domainCardinality; domainIter++) {
517 Scalar val = basisCoeffsLIOriented_d(cellNo, rangeIter, domainIter);
518 if (KAT::magnitude(val) > entryFilterTol) {
519 indices_d(rowNNZ) = domainLIDs_d(domainOffsets_d(domainIter));
520 values_d(rowNNZ) = val;
525 int ret = ep_interp_matrix->ReplaceMyValues(range_row, rowNNZ, values_d.data(), indices_d.data());
527 ret = ep_interp_matrix->InsertMyValues(range_row, rowNNZ, values_d.data(), indices_d.data());
528 TEUCHOS_ASSERT(ret == 0);
537 Kokkos::parallel_for(
538 "MiniEM_Interpolation::worksetLoop",
539 Kokkos::RangePolicy<int, typename tp_matrix::node_type::execution_space>(0, endCellRange),
540 KOKKOS_LAMBDA(
const int cellNo) {
541 auto elemId = elementIds_d(elemIter+cellNo);
544 auto rangeLIDs_d = Kokkos::subview(rangeElementLIDs_d, elemId, Kokkos::ALL());
545 auto domainLIDs_d = Kokkos::subview(domainElementLIDs_d, elemId, Kokkos::ALL());
547#ifdef PANZER_INTERPOLATION_DEBUG_OUTPUT
548 std::cout <<
"\n" << elemOrts_d(elemIter+cellNo).to_string() << std::endl;
549 std::cout <<
"rangeLIDs" << std::endl;
550 for (
int i = 0; i < rangeCardinality; i++)
551 std::cout << rangeLIDs_d(i) <<
" ";
552 std::cout << std::endl <<
"domainLIDs" << std::endl;
553 for (
int i = 0; i < domainCardinality; i++)
554 std::cout << domainLIDs_d(i) <<
" ";
555 std::cout << std::endl;
558 for(
size_t rangeIter = 0; rangeIter < rangeCardinality; ++rangeIter) {
559 const LocalOrdinal range_row = rangeLIDs_d(rangeOffsets_d(rangeIter));
560 const bool isOwned = ((minLocalIndex <= range_row) && (range_row <= maxLocalIndex));
563 for(
size_t domainIter = 0; domainIter < domainCardinality; domainIter++) {
564 Scalar val = basisCoeffsLIOriented_d(cellNo, rangeIter, domainIter);
565 if (KAT::magnitude(val) > entryFilterTol) {
567#if defined(PANZER_INTERPOLATION_DEBUG_OUTPUT) || defined(PANZER_DEBUG)
570 auto row = lcl_tp_interp_matrix.rowConst(range_row);
571 for(LocalOrdinal kk = 0; kk<row.length; ++kk)
572 if (row.colidx(kk) == domainLIDs_d(domainOffsets_d(domainIter)))
573 if (!(KAT::magnitude(row.value(kk)-val) < entryFilterTol || KAT::magnitude(row.value(kk)) < entryFilterTol)) {
574#ifdef PANZER_INTERPOLATION_DEBUG_OUTPUT
575 std::cout <<
"Replacing (" << range_row <<
"," << row.colidx(kk) <<
") = " << row.value(kk) <<
" with " << val << std::endl;
578 Kokkos::abort(
"MiniEM interpolation worksetLoop failed!");
583#ifdef PANZER_INTERPOLATION_DEBUG_OUTPUT
584 std::cout <<
"Setting (" << range_row <<
"," << domainLIDs_d(domainOffsets_d(domainIter)) <<
") = " << val << std::endl;
586 lcl_tp_interp_matrix.replaceValues(range_row, &(domainLIDs_d(domainOffsets_d(domainIter))), 1, &val,
false,
true);
597 tp_interp_matrix->fillComplete(tp_domainmap, tp_rangemap);
599 if (op != Intrepid2::OPERATOR_VALUE) {
604 thyra_interp = Thyra::tpetraLinearOp<Scalar,LocalOrdinal,GlobalOrdinal,typename tp_matrix::node_type>(Thyra::createVectorSpace<Scalar,LocalOrdinal,GlobalOrdinal>(tp_rangemap),
605 Thyra::createVectorSpace<Scalar,LocalOrdinal,GlobalOrdinal>(tp_domainmap),
608#ifdef PANZER_HAVE_EPETRA_STACK
610 ep_interp_matrix->FillComplete(*ep_domainmap, *ep_rangemap);
762 applyNonTransposed (
const Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> &X,
763 Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> &Y,
769 using Teuchos::rcp_dynamic_cast;
770 using range_type = Kokkos::RangePolicy<LocalOrdinal, DeviceSpace>;
772 using ots = Intrepid2::OrientationTools<DeviceSpace>;
773 using li = Intrepid2::LagrangianInterpolation<DeviceSpace>;
774 using DynRankDeviceView = Kokkos::DynRankView<Scalar,DeviceSpace>;
775 using view_t =
typename Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::dual_view_type::t_dev;
776 using const_view_t =
typename Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::dual_view_type::t_dev::const_type;
778 Teuchos::TimeMonitor tm(*Teuchos::TimeMonitor::getNewTimer(std::string(
"Mini-EM: matrix-free apply no_trans ") + name));
780 using TST = Teuchos::ScalarTraits<Scalar>;
781 const Scalar ZERO = TST::zero();
784 colmapMV_->doImport(X, *import_,Tpetra::INSERT);
786 const_view_t lclX = colmapMV_->getLocalViewDevice(Tpetra::Access::ReadOnly);
787 view_t lclY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
788 size_t numVectors = lclY.extent(1);
789 LocalOrdinal lclTargetSize = getRangeMap()->getLocalNumElements();
792 auto domain_fieldPattern = domain_ugi->getFieldPattern(domain_basis_name);
793 auto domain_basis = rcp_dynamic_cast<const panzer::Intrepid2FieldPattern>(domain_fieldPattern,
true)->getIntrepidBasis();
794 auto range_fieldPattern = range_ugi->getFieldPattern(range_basis_name);
795 auto range_basis = rcp_dynamic_cast<const panzer::Intrepid2FieldPattern>(range_fieldPattern,
true)->getIntrepidBasis();
798 const size_t domainCardinality = domain_basis->getCardinality();
799 const size_t rangeCardinality = range_basis->getCardinality();
801 size_t maxNumElementsPerBlock = 0;
803 const int dim = range_basis->getBaseCellTopology().getDimension();
805 if (op == Intrepid2::OPERATOR_VALUE) {
806 TEUCHOS_ASSERT(rangeCardinality >= domainCardinality);
807 TEUCHOS_ASSERT_EQUALITY(domain_basis->getFunctionSpace(), range_basis->getFunctionSpace());
812 if (maxNumElementsPerBlock > 0)
813 numCells = std::min(maxNumElementsPerBlock, worksetSize);
815 numCells = worksetSize;
816 DynRankDeviceView range_dofCoords_d(
"range_dofCoords_d", rangeCardinality, dim);
817 DynRankDeviceView basisCoeffsLIOriented_d(
"basisCoeffsLIOriented_d", numCells, rangeCardinality, domainCardinality);
820 DynRankDeviceView valuesAtDofCoordsNonOriented_d;
821 DynRankDeviceView valuesAtDofCoordsOriented_d;
822 DynRankDeviceView reducedValuesAtDofCoordsOriented_d;
826 auto temp = domain_basis->allocateOutputView(rangeCardinality, op);
831 if (temp.rank() == 3) {
832 valuesAtDofCoordsNonOriented_d = DynRankDeviceView(
"valuesAtDofCoordsNonOriented_d", temp.extent(0), temp.extent(1), temp.extent(2));
833 valuesAtDofCoordsOriented_d = DynRankDeviceView(
"valuesAtDofCoordsOriented_d", numCells, temp.extent(0), temp.extent(1), temp.extent(2));
834 reducedValuesAtDofCoordsOriented_d = DynRankDeviceView(
"reducedValuesAtDofCoordsOriented_d", numCells, temp.extent(1), temp.extent(2), numVectors);
836 valuesAtDofCoordsNonOriented_d = DynRankDeviceView(
"valuesAtDofCoordsNonOriented_d", temp.extent(0), temp.extent(1));
837 valuesAtDofCoordsOriented_d = DynRankDeviceView(
"valuesAtDofCoordsOriented_d", numCells, temp.extent(0), temp.extent(1));
838 reducedValuesAtDofCoordsOriented_d = DynRankDeviceView(
"reducedValuesAtDofCoordsOriented_d", numCells, temp.extent(1), numVectors);
842 int fieldRank = Intrepid2::getFieldRank(range_basis->getFunctionSpace());
843 TEUCHOS_ASSERT((fieldRank == 0) || (fieldRank == 1));
845 auto rangeLIDs_d = range_ugi->getLIDs();
846 auto domainLIDs_d = domain_ugi->getLIDs();
849 range_basis->getDofCoords(range_dofCoords_d);
852 domain_basis->getValues(valuesAtDofCoordsNonOriented_d, range_dofCoords_d, op);
855 std::vector<std::string> elementBlockIds;
856 range_ugi->getElementBlockIds(elementBlockIds);
857 for(std::size_t blockIter = 0; blockIter < elementBlockIds.size(); ++blockIter) {
859 auto rangeOffsets_d = range_ugi->getGIDFieldOffsetsKokkos(elementBlockIds[blockIter], 0);
860 auto domainOffsets_d = domain_ugi->getGIDFieldOffsetsKokkos(elementBlockIds[blockIter], 0);
863 Kokkos::View<int *, DeviceSpace> elementIds_d;
865 std::vector<int> elementIds = range_ugi->getElementBlock(elementBlockIds[blockIter]);
866 Kokkos::View<int *, Kokkos::HostSpace> elementIds_h(elementIds.data(), elementIds.size());
867 elementIds_d = Kokkos::View<int *, DeviceSpace>(
"elementIds_d", elementIds_h.extent(0));
868 Kokkos::deep_copy(elementIds_d, elementIds_h);
872 auto elemOrts_d = orientations_.at(elementBlockIds[blockIter]);
874 for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) {
877 std::min(numCells, Teuchos::as<int>(elementIds_d.extent(0)) -
878 Teuchos::as<int>(elemIter));
881 auto ortsRange = Kokkos::make_pair(elemIter, std::min(elemIter + numCells, elemOrts_d.extent(0)));
882 auto elemOrtsWorkset_d = Kokkos::subview(elemOrts_d, ortsRange);
884 auto worksetRange = Kokkos::make_pair(0, endCellRange);
885 auto valuesAtDofCoordsOrientedWorkset_d = Kokkos::subview(valuesAtDofCoordsOriented_d, worksetRange, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
886 auto basisCoeffsLIOrientedWorkset_d = Kokkos::subview(basisCoeffsLIOriented_d, worksetRange, Kokkos::ALL(), Kokkos::ALL());
890 ots::modifyBasisByOrientation(valuesAtDofCoordsOrientedWorkset_d,
891 valuesAtDofCoordsNonOriented_d,
895 Kokkos::deep_copy(reducedValuesAtDofCoordsOriented_d, 0.0);
897 if (reducedValuesAtDofCoordsOriented_d.rank() == 4) {
898 Kokkos::parallel_for(
"miniEM:MatrixFreeInterpolationOp:cellLoop1",
899 range_type(0, std::min(numCells, elementIds_d.extent_int(0)-Teuchos::as<int>(elemIter))),
900 KOKKOS_LAMBDA(
const LocalOrdinal cellNo) {
901 LocalOrdinal cellNo2 = elemIter+cellNo;
902 LocalOrdinal elemId = elementIds_d(cellNo2);
903 for(
size_t domainIter=0; domainIter<domainCardinality; domainIter++) {
904 LocalOrdinal J = domainLIDs_d(elemId, domainOffsets_d(domainIter));
905 for(
size_t rangeIter=0; rangeIter<rangeCardinality; rangeIter++) {
906 for(
size_t d=0; d<valuesAtDofCoordsOriented_d.extent(3); d++) {
907 Scalar val = valuesAtDofCoordsOriented_d(cellNo, domainIter, rangeIter, d);
908 for (
size_t j = 0; j<numVectors; ++j)
909 reducedValuesAtDofCoordsOriented_d(cellNo, rangeIter, d, j) += val * lclX(J, j);
915 for (
size_t j = 0; j<numVectors; ++j)
916 li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOrientedWorkset_d, Kokkos::ALL(), Kokkos::ALL(), j),
917 Kokkos::subview(reducedValuesAtDofCoordsOriented_d, worksetRange, Kokkos::ALL(), Kokkos::ALL(), j),
922 Kokkos::parallel_for(
"miniEM:MatrixFreeInterpolationOp:cellLoop1",
923 range_type(0, std::min(numCells, elementIds_d.extent_int(0)-Teuchos::as<int>(elemIter))),
924 KOKKOS_LAMBDA(
const LocalOrdinal cellNo) {
925 LocalOrdinal cellNo2 = elemIter+cellNo;
926 LocalOrdinal elemId = elementIds_d(cellNo2);
927 for(
size_t domainIter=0; domainIter<domainCardinality; domainIter++) {
928 LocalOrdinal J = domainLIDs_d(elemId, domainOffsets_d(domainIter));
929 for(
size_t rangeIter=0; rangeIter<rangeCardinality; rangeIter++) {
930 Scalar val = valuesAtDofCoordsOriented_d(cellNo, domainIter, rangeIter);
931 for (
size_t j = 0; j<numVectors; ++j)
932 reducedValuesAtDofCoordsOriented_d(cellNo, rangeIter, j) += val * lclX(J, j);
937 for (
size_t j = 0; j<numVectors; ++j)
938 li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOrientedWorkset_d, Kokkos::ALL(), Kokkos::ALL(), j),
939 Kokkos::subview(reducedValuesAtDofCoordsOriented_d, worksetRange, Kokkos::ALL(), j),
945 auto owner_d = owner_d_;
947 Kokkos::parallel_for(
"miniEM::MatrixFreeInterpolationOp::cellLoop2",
948 range_type(elemIter, std::min(elemIter+numCells,
949 elementIds_d.extent(0))),
950 KOKKOS_LAMBDA(
const LocalOrdinal cellNo2) {
951 LocalOrdinal cellNo = cellNo2-elemIter;
952 LocalOrdinal elemId = elementIds_d(cellNo2);
955 for(
size_t rangeIter = 0; rangeIter < rangeLIDs_d.extent(1); ++rangeIter) {
956 LocalOrdinal range_row = rangeLIDs_d(elemId, rangeOffsets_d(rangeIter));
959 if ((range_row < lclTargetSize) && (owner_d(range_row) == elemId)) {
961 for (
size_t j = 0; j<numVectors; ++j) {
962 Scalar val = basisCoeffsLIOriented_d(cellNo, rangeIter, j);
963 lclY(range_row,j) = beta*lclY(range_row,j) + alpha*val;
976 applyTransposed (
const Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> &X,
977 Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> &Y,
983 using Teuchos::rcp_dynamic_cast;
984 using range_type = Kokkos::RangePolicy<LocalOrdinal, DeviceSpace>;
986 typedef Intrepid2::OrientationTools<DeviceSpace> ots;
987 typedef Intrepid2::LagrangianInterpolation<DeviceSpace> li;
988 typedef Kokkos::DynRankView<Scalar,DeviceSpace> DynRankDeviceView;
989 using TST = Teuchos::ScalarTraits<Scalar>;
990 const Scalar ZERO = TST::zero();
991 using view_t =
typename Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::dual_view_type::t_dev;
992 using const_view_t =
typename Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::dual_view_type::t_dev::const_type;
994 Teuchos::TimeMonitor tm(*Teuchos::TimeMonitor::getNewTimer(std::string(
"Mini-EM: matrix-free apply trans ") + name));
996 const_view_t lclX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
997 colmapMV_->putScalar(ZERO);
998 view_t lclYtemp = colmapMV_->getLocalViewDevice(Tpetra::Access::ReadWrite);
1001 auto domain_fieldPattern = domain_ugi->getFieldPattern(domain_basis_name);
1002 auto domain_basis = rcp_dynamic_cast<const panzer::Intrepid2FieldPattern>(domain_fieldPattern,
true)->getIntrepidBasis();
1003 auto range_fieldPattern = range_ugi->getFieldPattern(range_basis_name);
1004 auto range_basis = rcp_dynamic_cast<const panzer::Intrepid2FieldPattern>(range_fieldPattern,
true)->getIntrepidBasis();
1007 const size_t domainCardinality = domain_basis->getCardinality();
1008 const size_t rangeCardinality = range_basis->getCardinality();
1010 size_t maxNumElementsPerBlock = 0;
1012 const int dim = range_basis->getBaseCellTopology().getDimension();
1014 if (op == Intrepid2::OPERATOR_VALUE) {
1015 TEUCHOS_ASSERT(rangeCardinality >= domainCardinality);
1016 TEUCHOS_ASSERT_EQUALITY(domain_basis->getFunctionSpace(), range_basis->getFunctionSpace());
1021 if (maxNumElementsPerBlock > 0)
1022 numCells = std::min(maxNumElementsPerBlock, worksetSize);
1024 numCells = worksetSize;
1025 DynRankDeviceView range_dofCoords_d(
"range_dofCoords_d", rangeCardinality, dim);
1026 DynRankDeviceView basisCoeffsLIOriented_d(
"basisCoeffsLIOriented_d", numCells, rangeCardinality, domainCardinality);
1029 DynRankDeviceView valuesAtDofCoordsNonOriented_d;
1030 DynRankDeviceView valuesAtDofCoordsOriented_d;
1034 auto temp = domain_basis->allocateOutputView(rangeCardinality, op);
1039 if (temp.rank() == 3) {
1040 valuesAtDofCoordsNonOriented_d = DynRankDeviceView(
"valuesAtDofCoordsNonOriented_d", temp.extent(0), temp.extent(1), temp.extent(2));
1041 valuesAtDofCoordsOriented_d = DynRankDeviceView(
"valuesAtDofCoordsOriented_d", numCells, temp.extent(0), temp.extent(1), temp.extent(2));
1043 valuesAtDofCoordsNonOriented_d = DynRankDeviceView(
"valuesAtDofCoordsNonOriented_d", temp.extent(0), temp.extent(1));
1044 valuesAtDofCoordsOriented_d = DynRankDeviceView(
"valuesAtDofCoordsOriented_d", numCells, temp.extent(0), temp.extent(1));
1048 int fieldRank = Intrepid2::getFieldRank(range_basis->getFunctionSpace());
1049 TEUCHOS_ASSERT((fieldRank == 0) || (fieldRank == 1));
1051 auto rangeLIDs_d = range_ugi->getLIDs();
1052 auto domainLIDs_d = domain_ugi->getLIDs();
1056 range_basis->getDofCoords(range_dofCoords_d);
1059 domain_basis->getValues(valuesAtDofCoordsNonOriented_d, range_dofCoords_d, op);
1062 std::vector<std::string> elementBlockIds;
1063 range_ugi->getElementBlockIds(elementBlockIds);
1064 for(std::size_t blockIter = 0; blockIter < elementBlockIds.size(); ++blockIter) {
1066 auto rangeOffsets_d = range_ugi->getGIDFieldOffsetsKokkos(elementBlockIds[blockIter], 0);
1067 auto domainOffsets_d = domain_ugi->getGIDFieldOffsetsKokkos(elementBlockIds[blockIter], 0);
1070 Kokkos::View<int *, DeviceSpace> elementIds_d;
1072 std::vector<int> elementIds = range_ugi->getElementBlock(elementBlockIds[blockIter]);
1073 Kokkos::View<int *, Kokkos::HostSpace> elementIds_h(elementIds.data(), elementIds.size());
1074 elementIds_d = Kokkos::View<int *, DeviceSpace>(
"elementIds_d", elementIds_h.extent(0));
1075 Kokkos::deep_copy(elementIds_d, elementIds_h);
1079 auto elemOrts_d = orientations_.at(elementBlockIds[blockIter]);
1081 for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) {
1084 std::min(numCells, Teuchos::as<int>(elementIds_d.extent(0)) -
1085 Teuchos::as<int>(elemIter));
1088 auto ortsRange = Kokkos::make_pair(elemIter, std::min(elemIter + numCells, elemOrts_d.extent(0)));
1089 auto elemOrtsWorkset_d = Kokkos::subview(elemOrts_d, ortsRange);
1091 auto worksetRange = Kokkos::make_pair(0, endCellRange);
1092 auto valuesAtDofCoordsOrientedWorkset_d = Kokkos::subview(valuesAtDofCoordsOriented_d, worksetRange, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
1093 auto basisCoeffsLIOrientedWorkset_d = Kokkos::subview(basisCoeffsLIOriented_d, worksetRange, Kokkos::ALL(), Kokkos::ALL());
1097 ots::modifyBasisByOrientation(valuesAtDofCoordsOrientedWorkset_d,
1098 valuesAtDofCoordsNonOriented_d,
1100 domain_basis.get());
1104 for(
size_t domainIter=0; domainIter<domainCardinality; domainIter++)
1107 li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOrientedWorkset_d, Kokkos::ALL(), Kokkos::ALL(), domainIter),
1108 Kokkos::subview(valuesAtDofCoordsOrientedWorkset_d, Kokkos::ALL(), domainIter, Kokkos::ALL(), Kokkos::ALL()),
1109 range_basis.get(), elemOrtsWorkset_d);
1112 auto owner_d = owner_d_;
1115 Kokkos::parallel_for(
"miniEM::MatrixFreeInterpolationOp::cellLoop",
1116 range_type(elemIter, std::min(elemIter+numCells,
1117 elementIds_d.extent(0))),
1118 KOKKOS_LAMBDA(
const LocalOrdinal cellNo2) {
1119 LocalOrdinal cellNo = cellNo2-elemIter;
1120 LocalOrdinal elemId = elementIds_d(cellNo2);
1123 for(
size_t rangeIter = 0; rangeIter < rangeLIDs_d.extent(1); ++rangeIter) {
1124 LocalOrdinal range_row = rangeLIDs_d(elemId, rangeOffsets_d(rangeIter));
1127 if ((range_row < (LocalOrdinal) lclX.extent(0)) && (owner_d(range_row) == elemId)) {
1129 for(
size_t domainIter = 0; domainIter < domainLIDs_d.extent(1); domainIter++) {
1130 LocalOrdinal J = domainLIDs_d(elemId, domainOffsets_d(domainIter));
1131 Scalar val = basisCoeffsLIOriented_d(cellNo, rangeIter, domainIter);
1132 for (
size_t j = 0; j<lclYtemp.extent(1); ++j)
1133 Kokkos::atomic_add(&lclYtemp(J,j), alpha*val*lclX(range_row,j));
1146 Y.doExport(*colmapMV_, *import_, Tpetra::ADD_ASSIGN);