34 using ExecutionSpace =
typename DeviceType::execution_space;
35 using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
36 using TeamMember =
typename TeamPolicy::member_type;
38 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
39 IntegralViewType integralView_;
46 int leftComponentSpan_;
47 int rightComponentSpan_;
48 int numTensorComponents_;
49 int leftFieldOrdinalOffset_;
50 int rightFieldOrdinalOffset_;
51 bool forceNonSpecialized_;
53 size_t fad_size_output_ = 0;
55 Kokkos::Array<int, 7> offsetsForComponentOrdinal_;
59 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
60 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
61 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
63 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_;
64 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;
77 int leftFieldOrdinalOffset,
78 int rightFieldOrdinalOffset,
79 bool forceNonSpecialized)
81 integralView_(integralData.template getUnderlyingView<integralViewRank>()),
82 leftComponent_(leftComponent),
83 composedTransform_(composedTransform),
84 rightComponent_(rightComponent),
85 cellMeasures_(cellMeasures),
88 leftComponentSpan_(leftComponent.
extent_int(2)),
89 rightComponentSpan_(rightComponent.
extent_int(2)),
91 leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
92 rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
93 forceNonSpecialized_(forceNonSpecialized)
95 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.
numTensorComponents(), std::invalid_argument,
"Left and right components must have matching number of tensorial components");
98 const int FIELD_DIM = 0;
99 const int POINT_DIM = 1;
103 for (
int r=0; r<numTensorComponents_; r++)
106 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
108 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
110 maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
114 int leftRelativeEnumerationSpan = 1;
115 int rightRelativeEnumerationSpan = 1;
116 for (
int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
118 leftRelativeEnumerationSpan *= leftFieldBounds_[startingComponent];
119 rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
120 leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
121 rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
127 const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
128 if (allocateFadStorage)
133 const int R = numTensorComponents_ - 1;
136 int allocationSoFar = 0;
137 offsetsForComponentOrdinal_[R] = allocationSoFar;
140 for (
int r=R-1; r>0; r--)
145 num_ij *= leftFields * rightFields;
147 offsetsForComponentOrdinal_[r] = allocationSoFar;
148 allocationSoFar += num_ij;
150 offsetsForComponentOrdinal_[0] = allocationSoFar;
153 template<
size_t maxComponents,
size_t numComponents = maxComponents>
154 KOKKOS_INLINE_FUNCTION
155 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
156 const Kokkos::Array<int,maxComponents> &bounds)
const
158 if (numComponents == 0)
164 int r =
static_cast<int>(numComponents - 1);
165 while (arguments[r] + 1 >= bounds[r])
171 if (r >= 0) ++arguments[r];
177 KOKKOS_INLINE_FUNCTION
179 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
180 const int &numComponents)
const
182 if (numComponents == 0)
return -1;
183 int r =
static_cast<int>(numComponents - 1);
184 while (arguments[r] + 1 >= bounds[r])
190 if (r >= 0) ++arguments[r];
194 template<
size_t maxComponents,
size_t numComponents = maxComponents>
195 KOKKOS_INLINE_FUNCTION
196 int nextIncrementResult(
const Kokkos::Array<int,maxComponents> &arguments,
197 const Kokkos::Array<int,maxComponents> &bounds)
const
199 if (numComponents == 0)
205 int r =
static_cast<int>(numComponents - 1);
206 while (arguments[r] + 1 >= bounds[r])
216 KOKKOS_INLINE_FUNCTION
218 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
219 const int &numComponents)
const
221 if (numComponents == 0)
return -1;
222 int r = numComponents - 1;
223 while (arguments[r] + 1 >= bounds[r])
231 template<
size_t maxComponents,
size_t numComponents = maxComponents>
232 KOKKOS_INLINE_FUNCTION
233 int relativeEnumerationIndex(
const Kokkos::Array<int,maxComponents> &arguments,
234 const Kokkos::Array<int,maxComponents> &bounds,
235 const int startIndex)
const
238 if (numComponents == 0)
244 int enumerationIndex = 0;
245 for (
size_t r=numComponents-1; r>
static_cast<size_t>(startIndex); r--)
247 enumerationIndex += arguments[r];
248 enumerationIndex *= bounds[r-1];
250 enumerationIndex += arguments[startIndex];
251 return enumerationIndex;
265 KOKKOS_INLINE_FUNCTION
268 constexpr int numTensorComponents = 3;
270 Kokkos::Array<int,numTensorComponents> pointBounds;
271 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
272 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
273 for (
unsigned r=0; r<numTensorComponents; r++)
275 pointBounds[r] = pointBounds_[r];
276 leftFieldBounds[r] = leftFieldBounds_[r];
277 rightFieldBounds[r] = rightFieldBounds_[r];
280 const int cellDataOrdinal = teamMember.league_rank();
281 const int numThreads = teamMember.team_size();
282 const int scratchViewSize = offsetsForComponentOrdinal_[0];
284 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView;
285 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
286 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z;
287 if (fad_size_output_ > 0) {
288 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
289 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
290 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0], fad_size_output_);
291 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
292 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1], fad_size_output_);
293 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
294 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2], fad_size_output_);
295 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
298 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
299 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
300 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0]);
301 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
302 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1]);
303 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
304 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2]);
305 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
311 constexpr int R = numTensorComponents - 1;
313 const int composedTransformRank = composedTransform_.
rank();
315 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
317 const int a = a_offset_ + a_component;
318 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
320 const int b = b_offset_ + b_component;
325 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
326 const int numRightFieldsFinal = rightFinalComponent.
extent_int(0);
335 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
336 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (
const int& fieldOrdinalPointOrdinal) {
337 const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
338 const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
339 if ((fieldOrdinal < leftTensorComponent_x.
extent_int(0)) && (pointOrdinal < leftTensorComponent_x.
extent_int(1)))
341 const int leftRank = leftTensorComponent_x.rank();
342 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
344 if ((fieldOrdinal < leftTensorComponent_y.
extent_int(0)) && (pointOrdinal < leftTensorComponent_y.
extent_int(1)))
346 const int leftRank = leftTensorComponent_y.rank();
347 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
349 if ((fieldOrdinal < leftTensorComponent_z.
extent_int(0)) && (pointOrdinal < leftTensorComponent_z.
extent_int(1)))
351 const int leftRank = leftTensorComponent_z.rank();
352 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
354 if ((fieldOrdinal < rightTensorComponent_x.
extent_int(0)) && (pointOrdinal < rightTensorComponent_x.
extent_int(1)))
356 const int rightRank = rightTensorComponent_x.rank();
357 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
359 if ((fieldOrdinal < rightTensorComponent_y.
extent_int(0)) && (pointOrdinal < rightTensorComponent_y.
extent_int(1)))
361 const int rightRank = rightTensorComponent_y.rank();
362 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
364 if ((fieldOrdinal < rightTensorComponent_z.
extent_int(0)) && (pointOrdinal < rightTensorComponent_z.
extent_int(1)))
366 const int rightRank = rightTensorComponent_z.rank();
367 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
373 if (composedTransformRank == 4)
376 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (
const int& pointOrdinal) {
377 pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
383 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (
const int& pointOrdinal) {
384 pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
390 if (composedTransformRank == 4)
392 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
393 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
398 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
399 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
407 teamMember.team_barrier();
410 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
411 for (
int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
413 scratchView(i) = 0.0;
417 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (
const int& leftRightFieldEnumeration) {
418 const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
419 const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;
423 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
424 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
425 rightFieldArguments[R] = jz;
426 leftFieldArguments[R] = iz;
428 Kokkos::Array<int,numTensorComponents> pointArguments;
429 for (
int i=0; i<numTensorComponents; i++)
431 pointArguments[i] = 0;
434 for (
int lx=0; lx<pointBounds[0]; lx++)
436 pointArguments[0] = lx;
440 const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
441 const int Gy_end_index = scratchOffsetForThread + offsetsForComponentOrdinal_[0];
443 for (
int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
445 scratchView(Gy_index) = 0;
448 for (
int ly=0; ly<pointBounds[1]; ly++)
450 pointArguments[1] = ly;
452 Scalar * Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
455 for (
int lz=0; lz < pointBounds[2]; lz++)
457 const Scalar & leftValue = leftFields_z(iz,lz);
458 const Scalar & rightValue = rightFields_z(jz,lz);
460 pointArguments[2] = lz;
461 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
463 *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
468 for (
int iy=0; iy<leftFieldBounds_[1]; iy++)
470 leftFieldArguments[1] = iy;
471 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
473 const Scalar & leftValue = leftFields_y(iy,ly);
475 for (
int jy=0; jy<rightFieldBounds_[1]; jy++)
477 rightFieldArguments[1] = jy;
479 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
480 const Scalar & rightValue = rightFields_y(jy,ly);
482 const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
483 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
485 const int Gz_index = 0;
486 const Scalar & aGz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);
488 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
490 Gy += leftValue * aGz * rightValue;
495 for (
int ix=0; ix<leftFieldBounds_[0]; ix++)
497 leftFieldArguments[0] = ix;
498 const Scalar & leftValue = leftFields_x(ix,lx);
500 for (
int iy=0; iy<leftFieldBounds_[1]; iy++)
502 leftFieldArguments[1] = iy;
504 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
506 for (
int jx=0; jx<rightFieldBounds_[0]; jx++)
508 rightFieldArguments[0] = jx;
509 const Scalar & rightValue = rightFields_x(jx,lx);
511 for (
int jy=0; jy<rightFieldBounds_[1]; jy++)
513 rightFieldArguments[1] = jy;
514 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
516 const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
518 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
519 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
522 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
523 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
524 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
525 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
527 if (integralViewRank == 3)
549 integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
554 integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
568 template<
size_t numTensorComponents>
569 KOKKOS_INLINE_FUNCTION
570 void run(
const TeamMember & teamMember )
const
572 Kokkos::Array<int,numTensorComponents> pointBounds;
573 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
574 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
575 for (
unsigned r=0; r<numTensorComponents; r++)
577 pointBounds[r] = pointBounds_[r];
578 leftFieldBounds[r] = leftFieldBounds_[r];
579 rightFieldBounds[r] = rightFieldBounds_[r];
582 const int cellDataOrdinal = teamMember.league_rank();
583 const int numThreads = teamMember.team_size();
584 const int scratchViewSize = offsetsForComponentOrdinal_[0];
585 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView;
586 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
587 Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields;
588 if (fad_size_output_ > 0) {
589 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
590 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
591 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_, fad_size_output_);
592 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
595 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
596 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
597 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_);
598 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
604 constexpr int R = numTensorComponents - 1;
606 const int composedTransformRank = composedTransform_.
rank();
608 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
610 const int a = a_offset_ + a_component;
611 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
613 const int b = b_offset_ + b_component;
615 const Data<Scalar,DeviceType> & leftFinalComponent = leftComponent_.
getTensorComponent(R);
616 const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.
getTensorComponent(R);
618 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
619 const int numRightFieldsFinal = rightFinalComponent.extent_int(0);
621 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (
const int& r) {
622 const Data<Scalar,DeviceType> & leftTensorComponent = leftComponent_.
getTensorComponent(r);
623 const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.
getTensorComponent(r);
624 const int leftFieldCount = leftTensorComponent.
extent_int(0);
625 const int pointCount = leftTensorComponent.extent_int(1);
626 const int leftRank = leftTensorComponent.rank();
627 const int rightFieldCount = rightTensorComponent.extent_int(0);
628 const int rightRank = rightTensorComponent.rank();
629 for (
int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
632 const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
633 for (
int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
635 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
636 leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
639 for (
int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
642 const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
643 for (
int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
645 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
646 rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
651 if (composedTransformRank == 4)
653 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
654 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
659 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
660 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
666 teamMember.team_barrier();
668 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (
const int& leftRightFieldEnumeration) {
669 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
670 const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
671 const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;
675 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
676 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
677 rightFieldArguments[R] = j_R;
678 leftFieldArguments[R] = i_R;
681 for (
int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
683 scratchView(i) = 0.0;
685 Kokkos::Array<int,numTensorComponents> pointArguments;
686 for (
unsigned i=0; i<numTensorComponents; i++)
688 pointArguments[i] = 0;
695 const int pointBounds_R = pointBounds[R];
696 int & pointArgument_R = pointArguments[R];
697 for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
702 G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
706 const int leftFieldIndex = i_R + leftFieldOrdinalOffset_;
707 const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;
709 if (integralViewRank == 3)
712 G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
717 G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
721 const int & pointIndex_R = pointArguments[R];
723 const Scalar & leftValue = leftFields(R,i_R,pointIndex_R);
724 const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);
726 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
728 *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
733 const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds);
734 const int r_min = (r_next >= 0) ? r_next : 0;
736 for (
int s=R-1; s>=r_min; s--)
738 const int & pointIndex_s = pointArguments[s];
741 for (
int s1=s; s1<R; s1++)
743 leftFieldArguments[s1] = 0;
747 const int & i_s = leftFieldArguments[s];
748 const int & j_s = rightFieldArguments[s];
751 const int & rightEnumerationSpan_s = rightFieldRelativeEnumerationSpans_[s];
752 const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];
756 const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);
759 const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);
761 const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);
763 for (
int s1=s; s1<R; s1++)
765 rightFieldArguments[s1] = 0;
770 const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);
773 const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);
775 const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);
777 const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;
783 const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;
785 const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);
790 G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
795 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
796 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
797 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
798 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
800 if (integralViewRank == 3)
803 G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
808 G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
812 *G_s += leftValue * G_sp * rightValue;
817 sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
821 sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
828 const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
829 for (
int i=scratchOffsetForThread; i<endIndex; i++)
831 scratchView(i) = 0.0;
838 r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds);
846 KOKKOS_INLINE_FUNCTION
847 void operator()(
const TeamMember & teamMember )
const
849 switch (numTensorComponents_)
851 case 1: run<1>(teamMember);
break;
852 case 2: run<2>(teamMember);
break;
854 if (forceNonSpecialized_)
859 case 4: run<4>(teamMember);
break;
860 case 5: run<5>(teamMember);
break;
861 case 6: run<6>(teamMember);
break;
862 case 7: run<7>(teamMember);
break;
863#ifdef INTREPID2_HAVE_DEBUG
876 const int R = numTensorComponents_ - 1;
882 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
884 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
889 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
890 const int numRightFieldsFinal = rightFinalComponent.
extent_int(0);
892 flopCount += flopsPerPoint_ab * cellMeasures_.
extent_int(1);
894 int flopsPer_i_R_j_R = 0;
898 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
899 leftFieldArguments[R] = 0;
901 Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
902 for (
int i=0; i<numTensorComponents_; i++)
904 pointArguments[i] = 0;
909 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
910 rightFieldArguments[R] = 0;
916 for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
918 flopsPer_i_R_j_R += 4;
926 const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
927 const int r_min = (r_next >= 0) ? r_next : 0;
929 for (
int s=R-1; s>=r_min; s--)
932 int numLeftIterations = leftFieldRelativeEnumerationSpans_[s];
933 int numRightIterations = rightFieldRelativeEnumerationSpans_[s];
935 flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
939 r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
942 flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
950 int teamSize(
const int &maxTeamSizeFromKokkos)
const
952 const int R = numTensorComponents_ - 1;
953 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
954 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
961 size_t shmem_size = 0;
963 if (fad_size_output_ > 0)
965 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
966 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.
extent_int(1), fad_size_output_);
967 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_, fad_size_output_);
968 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
972 shmem_size += Kokkos::View<Scalar*, DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
973 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.
extent_int(1));
974 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_);
975 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
994 using ExecutionSpace =
typename DeviceType::execution_space;
995 using TeamPolicy = Kokkos::TeamPolicy<DeviceType>;
996 using TeamMember =
typename TeamPolicy::member_type;
998 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
999 IntegralViewType integralView_;
1006 int leftComponentSpan_;
1007 int rightComponentSpan_;
1008 int numTensorComponents_;
1009 int leftFieldOrdinalOffset_;
1010 int rightFieldOrdinalOffset_;
1012 size_t fad_size_output_ = 0;
1016 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
1017 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
1018 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
1021 int maxFieldsRight_;
1031 int leftFieldOrdinalOffset,
1032 int rightFieldOrdinalOffset)
1034 integralView_(integralData.template getUnderlyingView<integralViewRank>()),
1035 leftComponent_(leftComponent),
1036 composedTransform_(composedTransform),
1037 rightComponent_(rightComponent),
1038 cellMeasures_(cellMeasures),
1039 a_offset_(a_offset),
1040 b_offset_(b_offset),
1041 leftComponentSpan_(leftComponent.
extent_int(2)),
1042 rightComponentSpan_(rightComponent.
extent_int(2)),
1044 leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
1045 rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
1047 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.
numTensorComponents(), std::invalid_argument,
"Left and right components must have matching number of tensorial components");
1049 const int FIELD_DIM = 0;
1050 const int POINT_DIM = 1;
1052 maxFieldsRight_ = 0;
1054 for (
int r=0; r<numTensorComponents_; r++)
1057 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
1059 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
1061 maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
1067 const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
1068 if (allocateFadStorage)
1074 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1075 KOKKOS_INLINE_FUNCTION
1076 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
1077 const Kokkos::Array<int,maxComponents> &bounds)
const
1079 if (numComponents == 0)
return -1;
1080 int r =
static_cast<int>(numComponents - 1);
1081 while (arguments[r] + 1 >= bounds[r])
1087 if (r >= 0) ++arguments[r];
1092 KOKKOS_INLINE_FUNCTION
1094 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1095 const int &numComponents)
const
1097 if (numComponents == 0)
return -1;
1098 int r =
static_cast<int>(numComponents - 1);
1099 while (arguments[r] + 1 >= bounds[r])
1105 if (r >= 0) ++arguments[r];
1109 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1110 KOKKOS_INLINE_FUNCTION
1111 int nextIncrementResult(
const Kokkos::Array<int,maxComponents> &arguments,
1112 const Kokkos::Array<int,maxComponents> &bounds)
const
1114 if (numComponents == 0)
return -1;
1115 int r =
static_cast<int>(numComponents - 1);
1116 while (arguments[r] + 1 >= bounds[r])
1125 KOKKOS_INLINE_FUNCTION
1127 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1128 const int &numComponents)
const
1130 if (numComponents == 0)
return -1;
1131 int r = numComponents - 1;
1132 while (arguments[r] + 1 >= bounds[r])
1140 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1141 KOKKOS_INLINE_FUNCTION
1142 int relativeEnumerationIndex(
const Kokkos::Array<int,maxComponents> &arguments,
1143 const Kokkos::Array<int,maxComponents> &bounds,
1144 const int startIndex)
const
1147 if (numComponents == 0)
1151 int enumerationIndex = 0;
1152 for (
size_t r=numComponents-1; r>
static_cast<size_t>(startIndex); r--)
1154 enumerationIndex += arguments[r];
1155 enumerationIndex *= bounds[r-1];
1157 enumerationIndex += arguments[startIndex];
1158 return enumerationIndex;
1162 KOKKOS_INLINE_FUNCTION
1163 enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
1164 integralViewEntry(
const IntegralViewType& integralView,
const int &cellDataOrdinal,
const int &i,
const int &j)
const
1166 return integralView(cellDataOrdinal,i,j);
1170 KOKKOS_INLINE_FUNCTION
1171 enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
1172 integralViewEntry(
const IntegralViewType& integralView,
const int &cellDataOrdinal,
const int &i,
const int &j)
const
1174 return integralView(i,j);
1178 KOKKOS_INLINE_FUNCTION
1181 constexpr int numTensorComponents = 3;
1183 const int pointBounds_x = pointBounds_[0];
1184 const int pointBounds_y = pointBounds_[1];
1185 const int pointBounds_z = pointBounds_[2];
1186 const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;
1188 const int leftFieldBounds_x = leftFieldBounds_[0];
1189 const int rightFieldBounds_x = rightFieldBounds_[0];
1190 const int leftFieldBounds_y = leftFieldBounds_[1];
1191 const int rightFieldBounds_y = rightFieldBounds_[1];
1192 const int leftFieldBounds_z = leftFieldBounds_[2];
1193 const int rightFieldBounds_z = rightFieldBounds_[2];
1195 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1196 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1197 for (
unsigned r=0; r<numTensorComponents; r++)
1199 leftFieldBounds[r] = leftFieldBounds_[r];
1200 rightFieldBounds[r] = rightFieldBounds_[r];
1203 const auto integralView = integralView_;
1204 const auto leftFieldOrdinalOffset = leftFieldOrdinalOffset_;
1205 const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;
1207 const int cellDataOrdinal = teamMember.league_rank();
1208 const int threadNumber = teamMember.team_rank();
1210 const int numThreads = teamMember.team_size();
1211 const int GyEntryCount = pointBounds_z;
1212 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals;
1213 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals;
1214 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
1216 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
1217 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
1218 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
1219 if (fad_size_output_ > 0) {
1220 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1221 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
1222 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
1224 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x, fad_size_output_);
1225 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
1226 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y, fad_size_output_);
1227 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
1228 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z, fad_size_output_);
1229 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
1232 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1233 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
1234 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
1236 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x);
1237 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
1238 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y);
1239 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
1240 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z);
1241 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
1249 const int composedTransformRank = composedTransform_.
rank();
1252 teamMember.team_barrier();
1254 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
1256 const int a = a_offset_ + a_component;
1257 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
1259 const int b = b_offset_ + b_component;
1268 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
1269 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (
const int& fieldOrdinal) {
1270 if (fieldOrdinal < leftTensorComponent_x.
extent_int(0))
1272 const int pointCount = leftTensorComponent_x.extent_int(1);
1273 const int leftRank = leftTensorComponent_x.rank();
1274 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1276 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1279 if (fieldOrdinal < leftTensorComponent_y.
extent_int(0))
1281 const int pointCount = leftTensorComponent_y.extent_int(1);
1282 const int leftRank = leftTensorComponent_y.rank();
1283 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1285 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1288 if (fieldOrdinal < leftTensorComponent_z.extent_int(0))
1290 const int pointCount = leftTensorComponent_z.extent_int(1);
1291 const int leftRank = leftTensorComponent_z.rank();
1292 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1294 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1297 if (fieldOrdinal < rightTensorComponent_x.extent_int(0))
1299 const int pointCount = rightTensorComponent_x.extent_int(1);
1300 const int rightRank = rightTensorComponent_x.rank();
1301 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1303 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1306 if (fieldOrdinal < rightTensorComponent_y.extent_int(0))
1308 const int pointCount = rightTensorComponent_y.extent_int(1);
1309 const int rightRank = rightTensorComponent_y.rank();
1310 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1312 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1315 if (fieldOrdinal < rightTensorComponent_z.extent_int(0))
1317 const int pointCount = rightTensorComponent_z.extent_int(1);
1318 const int rightRank = rightTensorComponent_z.rank();
1319 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1321 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1326 if (composedTransformRank == 4)
1328 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
1329 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1334 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
1335 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1340 teamMember.team_barrier();
1342 for (
int i0=0; i0<leftFieldBounds_x; i0++)
1344 for (
int j0=0; j0<rightFieldBounds_x; j0++)
1346 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (
const int& pointEnumerationIndexLaterDimensions) {
1348 const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions;
1350 Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1353 if (fad_size_output_ == 0)
1356 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (
const int &x_pointOrdinal, Scalar &integralThusFar)
1358 integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1363 for (
int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1365 Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1371 teamMember.team_barrier();
1373 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (
const int& i1j1) {
1374 const int i1 = i1j1 % leftFieldBounds_y;
1375 const int j1 = i1j1 / leftFieldBounds_y;
1377 int Gy_index_offset = GyEntryCount * threadNumber;
1379 for (
int lz=0; lz<pointBounds_z; lz++)
1381 int pointEnumerationIndex = lz * pointBounds_y;
1382 if (fad_size_output_ == 0)
1384 Scalar Gy_local = 0;
1387 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_y), [&] (
const int &ly, Scalar &integralThusFar)
1389 const Scalar & leftValue = leftFields_y(i1,ly);
1390 const Scalar & rightValue = rightFields_y(j1,ly);
1392 integralThusFar += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
1395 GyIntegrals(Gy_index_offset + lz) = Gy_local;
1399 Scalar & Gy = GyIntegrals(Gy_index_offset + lz);
1400 for (
int ly=0; ly<pointBounds_y; ly++)
1402 const Scalar & leftValue = leftFields_y(i1,ly);
1403 const Scalar & rightValue = rightFields_y(j1,ly);
1405 Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
1410 for (
int i2=0; i2<leftFieldBounds_z; i2++)
1412 for (
int j2=0; j2<rightFieldBounds_z; j2++)
1416 if (fad_size_output_ == 0)
1419 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_z), [&] (
const int &lz, Scalar &integralThusFar)
1421 const Scalar & leftValue = leftFields_z(i2,lz);
1422 const Scalar & rightValue = rightFields_z(j2,lz);
1424 integralThusFar += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
1429 for (
int lz=0; lz<pointBounds_z; lz++)
1431 const Scalar & leftValue = leftFields_z(i2,lz);
1432 const Scalar & rightValue = rightFields_z(j2,lz);
1434 Gz += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
1438 const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1439 const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1444 Kokkos::single (Kokkos::PerThread(teamMember), [&] () {
1445 integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1451 teamMember.team_barrier();
1458 template<
size_t numTensorComponents>
1459 KOKKOS_INLINE_FUNCTION
1460 void run(
const TeamMember & teamMember )
const
1598 KOKKOS_INLINE_FUNCTION
1599 void operator()(
const TeamMember & teamMember )
const
1601 switch (numTensorComponents_)
1603 case 1: run<1>(teamMember);
break;
1604 case 2: run<2>(teamMember);
break;
1605 case 3: runSpecialized3(teamMember);
break;
1606 case 4: run<4>(teamMember);
break;
1607 case 5: run<5>(teamMember);
break;
1608 case 6: run<6>(teamMember);
break;
1609 case 7: run<7>(teamMember);
break;
1610#ifdef INTREPID2_HAVE_DEBUG
1623 constexpr int numTensorComponents = 3;
1624 Kokkos::Array<int,numTensorComponents> pointBounds;
1625 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1626 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1628 int pointsInNonzeroComponentDimensions = 1;
1629 for (
unsigned r=0; r<numTensorComponents; r++)
1631 pointBounds[r] = pointBounds_[r];
1632 if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1633 leftFieldBounds[r] = leftFieldBounds_[r];
1634 rightFieldBounds[r] = rightFieldBounds_[r];
1637 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
1639 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
1646 for (
int i0=0; i0<leftFieldBounds[0]; i0++)
1648 for (
int j0=0; j0<rightFieldBounds[0]; j0++)
1650 flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3;
1690 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3;
1691 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3;
1692 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1;
1768 const int R = numTensorComponents_ - 1;
1769 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1770 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1777 size_t shmem_size = 0;
1779 const int GyEntryCount = pointBounds_[2];
1781 int pointsInNonzeroComponentDimensions = 1;
1782 for (
int d=1; d<numTensorComponents_; d++)
1784 pointsInNonzeroComponentDimensions *= pointBounds_[d];
1787 if (fad_size_output_ > 0)
1789 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_);
1790 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_);
1791 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.
extent_int(1), fad_size_output_);
1793 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_);
1794 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_);
1795 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_);
1796 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_);
1797 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_);
1798 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_);
1802 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions);
1803 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads);
1804 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.
extent_int(1));
1806 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]);
1807 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]);
1808 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]);
1809 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]);
1810 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]);
1811 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]);
2027 double* approximateFlops)
2029 using ExecutionSpace =
typename DeviceType::execution_space;
2033 const bool leftHasOrdinalFilter = basisValuesLeft.basisValues().ordinalFilter().extent_int(0) > 0;
2034 const bool rightHasOrdinalFilter = basisValuesRight.basisValues().ordinalFilter().extent_int(0) > 0;
2035 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument,
"Ordinal filters for BasisValues are not yet supported by IntegrationTools");
2037 if (approximateFlops != NULL)
2039 *approximateFlops = 0;
2051 const int numFieldsLeft = basisValuesLeft.
numFields();
2052 const int numFieldsRight = basisValuesRight.
numFields();
2053 const int spaceDim = basisValuesLeft.
spaceDim();
2055 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.
spaceDim() != basisValuesRight.
spaceDim(), std::invalid_argument,
"vectorDataLeft and vectorDataRight must agree on the space dimension");
2057 const int leftFamilyCount = basisValuesLeft.basisValues().numFamilies();
2058 const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
2062 int numTensorComponentsLeft = -1;
2063 const bool leftIsVectorValued = basisValuesLeft.
vectorData().isValid();
2065 if (leftIsVectorValued)
2067 const auto &refVectorLeft = basisValuesLeft.
vectorData();
2068 int numFamiliesLeft = refVectorLeft.numFamilies();
2069 int numVectorComponentsLeft = refVectorLeft.numComponents();
2070 Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
2071 for (
int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
2073 for (
int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
2078 if (numTensorComponentsLeft == -1)
2082 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.
numTensorComponents(), std::invalid_argument,
"Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
2083 for (
int r=0; r<numTensorComponentsLeft; r++)
2093 numTensorComponentsLeft = basisValuesLeft.basisValues().tensorData(0).numTensorComponents();
2094 for (
int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
2096 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument,
"All families must match in the number of tensor components");
2099 int numTensorComponentsRight = -1;
2100 const bool rightIsVectorValued = basisValuesRight.
vectorData().isValid();
2102 if (rightIsVectorValued)
2104 const auto &refVectorRight = basisValuesRight.
vectorData();
2105 int numFamiliesRight = refVectorRight.numFamilies();
2106 int numVectorComponentsRight = refVectorRight.numComponents();
2107 Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
2108 for (
int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
2110 for (
int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
2112 const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
2113 if (tensorData.numTensorComponents() > 0)
2115 if (numTensorComponentsRight == -1)
2117 numTensorComponentsRight = tensorData.numTensorComponents();
2119 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument,
"Each valid entry in vectorDataRight must have the same number of tensor components as every other");
2120 for (
int r=0; r<numTensorComponentsRight; r++)
2122 maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
2127 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponentsRight != numTensorComponentsLeft, std::invalid_argument,
"Right families must match left in the number of tensor components");
2132 for (
int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
2134 INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument,
"Right families must match left in the number of tensor components");
2139 if ((numPointTensorComponents == numTensorComponentsLeft) && basisValuesLeft.
axisAligned() && basisValuesRight.
axisAligned())
2148 Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
2149 for (
int r=0; r<numPointTensorComponents; r++)
2156 Kokkos::View<Scalar**, DeviceType> integralView2;
2157 Kokkos::View<Scalar***, DeviceType> integralView3;
2158 if (integralViewRank == 3)
2166 for (
int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2169 int leftFieldOffset = basisValuesLeft.basisValues().familyFieldOrdinalOffset(leftFamilyOrdinal);
2171 const int leftVectorComponentCount = leftIsVectorValued ? basisValuesLeft.
vectorData().numComponents() : 1;
2172 for (
int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
2175 : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2181 const int leftDimSpan = leftComponent.
extent_int(2);
2183 const int leftComponentFieldCount = leftComponent.
extent_int(0);
2185 for (
int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2188 int rightFieldOffset = basisValuesRight.
vectorData().familyFieldOrdinalOffset(rightFamilyOrdinal);
2190 const int rightVectorComponentCount = rightIsVectorValued ? basisValuesRight.
vectorData().numComponents() : 1;
2191 for (
int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2194 : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2195 if (!rightComponent.
isValid())
2200 const int rightDimSpan = rightComponent.
extent_int(2);
2202 const int rightComponentFieldCount = rightComponent.
extent_int(0);
2205 if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset))
2207 b_offset += rightDimSpan;
2213 INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error,
"left and right dimension offsets should match.");
2214 INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument,
"left and right components must span the same number of dimensions.");
2216 const int d_start = a_offset;
2217 const int d_end = d_start + leftDimSpan;
2219 using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
2220 ComponentIntegralsArray componentIntegrals;
2221 for (
int r=0; r<numPointTensorComponents; r++)
2238 const int numPoints = pointDimensions[r];
2245 const int leftTensorComponentDimSpan = leftTensorComponent.
extent_int(2);
2246 const int leftTensorComponentFields = leftTensorComponent.
extent_int(0);
2247 const int rightTensorComponentDimSpan = rightTensorComponent.
extent_int(2);
2248 const int rightTensorComponentFields = rightTensorComponent.
extent_int(0);
2250 INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument,
"left and right components must span the same number of dimensions.");
2252 for (
int d=d_start; d<d_end; d++)
2254 ScalarView<Scalar,DeviceType> componentIntegralView;
2256 const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
2257 if (allocateFadStorage)
2260 componentIntegralView = ScalarView<Scalar,DeviceType>(
"componentIntegrals for tensor component " + std::to_string(r) +
", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output);
2264 componentIntegralView = ScalarView<Scalar,DeviceType>(
"componentIntegrals for tensor component " + std::to_string(r) +
", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields);
2267 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftTensorComponentFields,rightTensorComponentFields});
2270 leftTensorComponentDimSpan);
2271 Kokkos::parallel_for(
"compute componentIntegrals", policy, refSpaceIntegralFunctor);
2273 componentIntegrals[r][d] = componentIntegralView;
2275 if (approximateFlops != NULL)
2277 *approximateFlops += leftTensorComponentFields*rightTensorComponentFields*numPoints*(3);
2282 ExecutionSpace().fence();
2283 Impl::F_ComputeIntegral<Scalar,DeviceType> computeIntegralFunctor(integralView2, integralView3, leftComponent, rightComponent, cellMeasures, basisValuesLeft, basisValuesRight, componentIntegrals, d_start, d_end, numPointTensorComponents, leftFieldOffset, rightFieldOffset, integralViewRank);
2284 Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount};
2285 Kokkos::Array<int,3> lowerBounds {0,0,0};
2286 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2288 Kokkos::parallel_for(
"compute field integrals", policy, computeIntegralFunctor);
2289 b_offset += rightDimSpan;
2292 a_offset += leftDimSpan;
2296 if (approximateFlops != NULL)
2299 *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2307 const bool transposeLeft =
true;
2308 const bool transposeRight =
false;
2314 const int leftRank = leftTransform.
rank();
2315 const int rightRank = rightTransform.
rank();
2319 const bool bothRank4 = (leftRank == 4) && (rightRank == 4);
2320 const bool bothRank3 = (leftRank == 3) && (rightRank == 3);
2321 const bool bothRank2 = (leftRank == 2) && (rightRank == 2);
2322 const bool ranks32 = ((leftRank == 3) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 3));
2323 const bool ranks42 = ((leftRank == 4) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 4));
2328 composedTransform.
storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2331 if (approximateFlops != NULL)
2339 const int newRank = 4;
2342 extents[3] = extents[2];
2344 variationTypes[3] = variationTypes[2];
2346 auto leftTransformMatrix = leftTransform.
shallowCopy(newRank, extents, variationTypes);
2351 extents[3] = extents[2];
2353 variationTypes[3] = variationTypes[2];
2355 auto rightTransformMatrix = rightTransform.
shallowCopy(newRank, extents, variationTypes);
2358 composedTransform.
storeMatMat(transposeLeft, leftTransformMatrix, transposeRight, rightTransformMatrix);
2360 if (approximateFlops != NULL)
2371 const int newRank = 4;
2372 auto extents = composedTransform.
getExtents();
2374 composedTransform = composedTransform.
shallowCopy(newRank, extents, variationTypes);
2375 if (approximateFlops != NULL)
2382 const auto & rank3Transform = (leftRank == 3) ? leftTransform : rightTransform;
2383 const auto & rank2Transform = (leftRank == 2) ? leftTransform : rightTransform;
2385 composedTransform = DataTools::multiplyByCPWeights(rank3Transform, rank2Transform);
2391 const int newRank = 4;
2392 auto extents = composedTransform.
getExtents();
2402 extents[3] = extents[2];
2404 variationTypes[3] = variationTypes[2];
2407 composedTransform = composedTransform.
shallowCopy(newRank, extents, variationTypes);
2415 auto composedTransformTransposed = DataTools::multiplyByCPWeights(leftTransform, rightTransform);
2416 composedTransform = DataTools::transposeMatrix(composedTransformTransposed);
2420 composedTransform = DataTools::multiplyByCPWeights(rightTransform, leftTransform);
2425 INTREPID2_TEST_FOR_EXCEPTION(
true, std::invalid_argument,
"Unsupported transform combination");
2428 else if (leftTransform.
isValid())
2433 case 4: composedTransform = DataTools::transposeMatrix(leftTransform);
break;
2437 const int newRank = 4;
2441 composedTransform = leftTransform.
shallowCopy(newRank, extents, variationTypes);
2444 case 2: composedTransform = leftTransform;
break;
2446 INTREPID2_TEST_FOR_EXCEPTION(
true, std::invalid_argument,
"Unsupported transform combination");
2449 else if (rightTransform.
isValid())
2452 composedTransform = rightTransform;
2455 case 4: composedTransform = rightTransform;
break;
2459 const int newRank = 4;
2462 extents[3] = extents[2];
2463 variationTypes[3] = variationTypes[2];
2467 composedTransform = rightTransform.
shallowCopy(newRank, extents, variationTypes);
2470 case 2: composedTransform = rightTransform;
break;
2472 INTREPID2_TEST_FOR_EXCEPTION(
true, std::invalid_argument,
"Unsupported transform combination");
2478 Kokkos::Array<ordinal_type,4> extents {basisValuesLeft.
numCells(),basisValuesLeft.
numPoints(),spaceDim,spaceDim};
2481 Kokkos::View<Scalar*,DeviceType> identityUnderlyingView(
"Intrepid2::FST::integrate() - identity view",spaceDim);
2482 Kokkos::deep_copy(identityUnderlyingView, 1.0);
2490 const int leftComponentCount = leftIsVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
2491 const int rightComponentCount = rightIsVectorValued ? basisValuesRight.
vectorData().numComponents() : 1;
2493 int leftFieldOrdinalOffset = 0;
2494 for (
int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2499 bool haveLaunchedContributionToCurrentFamilyLeft =
false;
2500 for (
int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2503 : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2507 a_offset += basisValuesLeft.
vectorData().numDimsForComponent(leftComponentOrdinal);
2511 int rightFieldOrdinalOffset = 0;
2512 for (
int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2516 bool haveLaunchedContributionToCurrentFamilyRight =
false;
2518 for (
int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2521 : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2522 if (!rightComponent.
isValid())
2525 b_offset += basisValuesRight.
vectorData().numDimsForComponent(rightComponentOrdinal);
2531 const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2532 Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
2541 bool forceNonSpecialized =
false;
2542 bool usePointCacheForRank3Tensor =
true;
2546 if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2548 ExecutionSpace().fence();
2550 haveLaunchedContributionToCurrentFamilyLeft =
true;
2551 haveLaunchedContributionToCurrentFamilyRight =
true;
2553 if (integralViewRank == 2)
2557 auto functor =
Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2559 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2560 const int teamSize = functor.teamSize(recommendedTeamSize);
2562 policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2564 Kokkos::parallel_for(
"F_IntegratePointValueCache rank 2", policy, functor);
2566 if (approximateFlops != NULL)
2568 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.
getDataExtent(0);
2573 auto functor =
Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2575 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2576 const int teamSize = functor.teamSize(recommendedTeamSize);
2578 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2580 Kokkos::parallel_for(
"F_Integrate rank 2", policy, functor);
2582 if (approximateFlops != NULL)
2584 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.
getDataExtent(0);
2588 else if (integralViewRank == 3)
2592 auto functor =
Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2594 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2595 const int teamSize = functor.teamSize(recommendedTeamSize);
2597 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2599 Kokkos::parallel_for(
"F_IntegratePointValueCache rank 3", policy, functor);
2601 if (approximateFlops != NULL)
2603 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.
getDataExtent(0);
2608 auto functor =
Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2610 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2611 const int teamSize = functor.teamSize(recommendedTeamSize);
2613 policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2615 Kokkos::parallel_for(
"F_Integrate rank 3", policy, functor);
2617 if (approximateFlops != NULL)
2619 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.
getDataExtent(0);
2623 b_offset += rightIsVectorValued ? basisValuesRight.
vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
2625 rightFieldOrdinalOffset += rightIsVectorValued ? basisValuesRight.
vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
2627 a_offset += leftIsVectorValued ? basisValuesLeft.
vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
2629 leftFieldOrdinalOffset += leftIsVectorValued ? basisValuesLeft.
vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
2636 ExecutionSpace().fence();