367 const dstBasisType* dstBasis,
368 const srcViewType srcCoeffs,
369 const srcBasisType* srcBasis,
370 const OrientationViewType cellOrientations){
373 INTREPID2_TEST_FOR_EXCEPTION(dstBasis->getFunctionSpace() != srcBasis->getFunctionSpace(), std::runtime_error,
374 "The source and destination bases are not compatible. They need to belong to the same function space");
375 INTREPID2_TEST_FOR_EXCEPTION(dstBasis->getBaseCellTopology().getKey() != srcBasis->getBaseCellTopology().getKey(), std::runtime_error,
376 "The source and destination bases are not compatible. They do not have the same basic cell topology");
382 ordinal_type numCells = cellOrientations.extent(0);
383 ordinal_type dim = srcBasis->getBaseCellTopology().getDimension();
384 ordinal_type srcBasisCardinality = srcBasis->getCardinality();
385 ordinal_type fieldDimension = (srcBasis->getFunctionSpace() == Intrepid2::FUNCTION_SPACE_HCURL || srcBasis->getFunctionSpace() == Intrepid2::FUNCTION_SPACE_HDIV) ? dim : 1;
388 ordinal_type numPoints = evaluationPoints.extent(0);
390 using outViewType = Kokkos::DynRankView<typename srcBasisType::OutputValueType, DeviceType>;
391 outViewType srcAtEvalPoints, refBasisAtEvalPoints, basisAtEvalPoints;
392 if(fieldDimension == dim) {
393 srcAtEvalPoints = outViewType(
"srcAtEvalPoints", numCells, numPoints, dim);
394 refBasisAtEvalPoints = outViewType(
"refBasisAtEvalPoints", srcBasisCardinality, numPoints, dim);
395 basisAtEvalPoints = outViewType(
"basisAtEvalPoints", numCells, srcBasisCardinality, numPoints, dim);
397 srcAtEvalPoints = outViewType(
"srcAtEvalPoints", numCells, numPoints);
398 refBasisAtEvalPoints = outViewType(
"refBasisAtEvalPoints", srcBasisCardinality, numPoints);
399 basisAtEvalPoints = outViewType(
"basisAtEvalPoints", numCells, srcBasisCardinality, numPoints);
402 srcBasis->getValues(refBasisAtEvalPoints,evaluationPoints);
406 refBasisAtEvalPoints,
410 Kokkos::parallel_for(Kokkos::RangePolicy<typename DeviceType::execution_space>(0,numCells),
411 KOKKOS_LAMBDA (
const int &ic) {
412 for(
int j=0; j<numPoints; ++j) {
413 for(
int k=0; k<srcBasisCardinality; ++k) {
414 for(
int d=0; d<fieldDimension; ++d)
415 srcAtEvalPoints.access(ic,j,d) += srcCoeffs(ic,k)*basisAtEvalPoints.access(ic,k,j,d);
419 ExecSpaceType().fence();
442 std::string systemName_;
443 bool matrixIndependentOfCell_;
452 ElemSystem (std::string systemName,
bool matrixIndependentOfCell) :
453 systemName_(systemName), matrixIndependentOfCell_(matrixIndependentOfCell){};
482 template<
typename ViewType1,
typename ViewType2,
typename ViewType3,
typename ViewType4>
483 void solve(ViewType1 basisCoeffs, ViewType2 elemMat, ViewType2 elemRhs, ViewType2 tau,
484 ViewType3 w,
const ViewType4 elemDof, ordinal_type n, ordinal_type m=0) {
485#ifdef HAVE_INTREPID2_KOKKOSKERNELS
486 solveDevice(basisCoeffs, elemMat, elemRhs, tau,
489 solveHost(basisCoeffs, elemMat, elemRhs, tau,
497#ifdef HAVE_INTREPID2_KOKKOSKERNELS
498 template<
typename ViewType1,
typename ViewType2,
typename ViewType3,
typename ViewType4>
499 void solveDevice(ViewType1 basisCoeffs, ViewType2 elemMat, ViewType2 elemRhs, ViewType2 taul,
500 ViewType3 work,
const ViewType4 elemDof, ordinal_type n, ordinal_type m) {
501 using HostDeviceType = Kokkos::Device<Kokkos::DefaultHostExecutionSpace,Kokkos::HostSpace>;
503 ordinal_type numCells = basisCoeffs.extent(0);
505 if(matrixIndependentOfCell_) {
506 auto A0 = Kokkos::subview(elemMat, 0, Kokkos::ALL(), Kokkos::ALL());
507 auto tau0 = Kokkos::subview(taul, 0, Kokkos::ALL());
509 Kokkos::DynRankView<typename ViewType2::value_type, HostDeviceType> A0_host(
"A0_host", A0.extent(0),A0.extent(1));
510 auto A0_device = Kokkos::create_mirror_view(
typename DeviceType::memory_space(), A0_host);
511 Kokkos::deep_copy(A0_device, A0);
512 Kokkos::deep_copy(A0_host, A0_device);
514 for(ordinal_type i=n; i<n+m; ++i)
515 for(ordinal_type j=0; j<n; ++j)
516 A0_host(i,j) = A0_host(j,i);
518 Kokkos::DynRankView<typename ViewType2::value_type, HostDeviceType> tau0_host(
"A0_host", tau0.extent(0));
519 auto tau0_device = Kokkos::create_mirror_view(
typename DeviceType::memory_space(), tau0_host);
520 auto w0_host = Kokkos::create_mirror_view(Kokkos::subview(work, 0, Kokkos::ALL()));
523 KokkosBatched::Impl::SerialQR_Internal::invoke
524 (A0_host.extent(0), A0_host.extent(1),
525 A0_host.data(), A0_host.stride(0), A0_host.stride(1),
526 tau0_host.data(), tau0_host.stride(0), w0_host.data());
528 Kokkos::deep_copy(A0_device, A0_host);
529 Kokkos::deep_copy(A0, A0_device);
530 Kokkos::deep_copy(tau0_device, tau0_host);
531 Kokkos::deep_copy(tau0, tau0_device);
533 Kokkos::parallel_for (systemName_,
534 Kokkos::RangePolicy<ExecSpaceType, int> (0, numCells),
535 KOKKOS_LAMBDA (
const size_t ic) {
536 auto w = Kokkos::subview(work, ic, Kokkos::ALL());
538 auto b = Kokkos::subview(elemRhs, ic, Kokkos::ALL());
541 KokkosBatched::SerialApplyQ_RightForwardInternal::invoke(
542 1, A0.extent(0), A0.extent(1),
543 A0.data(), A0.stride(0), A0.stride(1),
544 tau0.data(), tau0.stride(0),
545 b.data(), 1, b.stride(0),
549 KokkosBatched::SerialTrsv<KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsv::Unblocked>::invoke(1.0, A0, b);
552 for(ordinal_type i=0; i<n; ++i){
553 basisCoeffs(ic,elemDof(i)) = b(i);
559 Kokkos::parallel_for (systemName_,
560 Kokkos::RangePolicy<ExecSpaceType, int> (0, numCells),
561 KOKKOS_LAMBDA (
const size_t ic) {
563 auto A = Kokkos::subview(elemMat, ic, Kokkos::ALL(), Kokkos::ALL());
564 auto tau = Kokkos::subview(taul, ic, Kokkos::ALL());
565 auto w = Kokkos::subview(work, ic, Kokkos::ALL());
567 for(ordinal_type i=n; i<n+m; ++i)
568 for(ordinal_type j=0; j<n; ++j)
572 KokkosBatched::Impl::SerialQR_Internal::invoke
573 (A.extent(0), A.extent(1),
574 A.data(), A.stride(0), A.stride(1), tau.data(), tau.stride(0), w.data());
576 auto b = Kokkos::subview(elemRhs, ic, Kokkos::ALL());
579 KokkosBatched::SerialApplyQ_RightForwardInternal::invoke(
580 1, A.extent(0), A.extent(1),
581 A.data(), A.stride(0), A.stride(1),
582 tau.data(), tau.stride(0),
583 b.data(), 1, b.stride(0),
587 KokkosBatched::SerialTrsv<KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsv::Unblocked>::invoke(1.0, A, b);
590 for(ordinal_type i=0; i<n; ++i){
591 basisCoeffs(ic,elemDof(i)) = b(i);
601 template<
typename ViewType1,
typename ViewType2,
typename ViewType3,
typename ViewType4>
602 void solveHost(ViewType1 basisCoeffs, ViewType2 elemMat, ViewType2 elemRhs, ViewType2 ,
603 ViewType3,
const ViewType4 elemDof, ordinal_type n, ordinal_type m) {
604 using value_type =
typename ViewType2::value_type;
605 using device_type = DeviceType;
606 using host_exec_space = Kokkos::DefaultHostExecutionSpace;
607 using host_memory_space = Kokkos::HostSpace;
608 using host_device_type = Kokkos::Device<host_exec_space,host_memory_space>;
609 using vector_host_type = Kokkos::View<value_type*,host_device_type>;
610 using scratch_host_type = Kokkos::View<value_type*,host_exec_space::scratch_memory_space>;
611 using matrix_host_type = Kokkos::View<value_type**,Kokkos::LayoutLeft,host_device_type>;
612 using do_not_init_tag = Kokkos::ViewAllocateWithoutInitializing;
613 using host_team_policy_type = Kokkos::TeamPolicy<host_exec_space>;
614 using host_range_policy_type = Kokkos::RangePolicy<host_exec_space>;
620 const ordinal_type numCells = basisCoeffs.extent(0);
621 const ordinal_type numRows = m+n, numCols = n;
624 Teuchos::LAPACK<ordinal_type,value_type> lapack;
627 Kokkos::View<ordinal_type*,host_device_type> elemDof_host(do_not_init_tag(
"elemDof_host"), elemDof.extent(0));
629 auto elemDof_device = Kokkos::create_mirror_view(
typename device_type::memory_space(), elemDof_host);
630 Kokkos::deep_copy(elemDof_device, elemDof); Kokkos::fence();
631 Kokkos::deep_copy(elemDof_host, elemDof_device);
635 auto elemRhs_host = Kokkos::create_mirror_view_and_copy(host_memory_space(), elemRhs);
636 auto elemMat_host = Kokkos::create_mirror_view_and_copy(host_memory_space(), elemMat);
639 auto basisCoeffs_host = Kokkos::create_mirror_view_and_copy(host_memory_space(), basisCoeffs);
641 if (matrixIndependentOfCell_) {
643 matrix_host_type A(do_not_init_tag(
"A"), numRows, numRows);
645 for (ordinal_type j=0;j<numRows;++j)
646 for (ordinal_type i=0;i<numRows;++i)
647 A(i, j) = elemMat_host(0, i, j);
649 for (ordinal_type j=0;j<numCols;++j)
650 for (ordinal_type i=numCols;i<numRows;++i)
654 ordinal_type lwork(-1);
656 ordinal_type info(0);
659 numRows, numRows, numCells,
660 nullptr, std::max(1,numRows),
661 nullptr, std::max(1,numRows),
667 matrix_host_type C(do_not_init_tag(
"C"), numRows, numCells);
669 host_range_policy_type policy(0, numCells);
672 (
"ProjectionTools::solveHost::matrixIndependentOfCell::pack",
673 policy, [=](
const ordinal_type & ic) {
674 for (ordinal_type i=0;i<numRows;++i)
675 C(i,ic) = elemRhs_host(ic, i);
680 vector_host_type work(do_not_init_tag(
"work"), lwork);
681 ordinal_type info(0);
683 numRows, numRows, numCells,
684 A.data(), std::max(1,numRows),
685 C.data(), std::max(1,numRows),
688 INTREPID2_TEST_FOR_EXCEPTION
689 (info != 0, std::runtime_error,
"GELS return non-zero info code");
693 (
"ProjectionTools::solveHost::matrixIndependentOfCell::unpack",
694 policy, [=](
const ordinal_type & ic) {
695 for (ordinal_type i=0;i<numCols;++i)
696 basisCoeffs_host(ic,elemDof_host(i)) = C(i,ic);
700 const ordinal_type level(0);
701 host_team_policy_type policy(numCells, 1, 1);
704 ordinal_type lwork(-1);
706 ordinal_type info(0);
710 nullptr, std::max(1,numRows),
711 nullptr, std::max(1,numRows),
717 const ordinal_type per_team_extent = numRows*numRows + numRows + lwork;
718 const ordinal_type per_team_scratch = scratch_host_type::shmem_size(per_team_extent);
719 policy.set_scratch_size(level, Kokkos::PerTeam(per_team_scratch));
723 (
"ProjectionTools::solveHost::matrixDependentOfCell",
724 policy, [=](
const typename host_team_policy_type::member_type& member) {
725 const ordinal_type ic = member.league_rank();
727 scratch_host_type scratch(member.team_scratch(level), per_team_extent);
728 value_type * sptr = scratch.data();
731 matrix_host_type A(sptr, numRows, numRows); sptr += A.span();
732 for (ordinal_type j=0;j<numRows;++j)
733 for (ordinal_type i=0;i<numRows;++i)
734 A(i, j) = elemMat_host(ic, i, j);
736 for (ordinal_type j=0;j<numCols;++j)
737 for (ordinal_type i=numCols;i<numRows;++i)
740 vector_host_type c(sptr, numRows); sptr += c.span();
741 for (ordinal_type i=0;i<numRows;++i)
742 c(i) = elemRhs_host(ic, i);
744 vector_host_type work(sptr, lwork); sptr += work.span();
745 ordinal_type info(0);
748 A.data(), std::max(1,numRows),
749 c.data(), std::max(1,numRows),
752 INTREPID2_TEST_FOR_EXCEPTION
753 (info != 0, std::runtime_error,
"GELS return non-zero info code");
756 for (ordinal_type i=0;i<numCols;++i)
757 basisCoeffs_host(ic,elemDof_host(i)) = c(i);
760 Kokkos::deep_copy(basisCoeffs, basisCoeffs_host);