Intrepid2_IntegrationToolsDef.hpp
// @HEADER
// *****************************************************************************
// Intrepid2 Package
//
// Copyright 2007 NTESS and the Intrepid2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
#define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__

#include "Teuchos_TimeMonitor.hpp"

namespace Intrepid2 {

  namespace Impl
  {
    //! Implementation functor for structured integration: integrates (transformed) tensor-product basis functions against each other using sum factorization.
    template<class Scalar, class DeviceType, int integralViewRank>
    class F_Integrate
    {
      using ExecutionSpace = typename DeviceType::execution_space;
      using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
      using TeamMember = typename TeamPolicy::member_type;

      using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
      IntegralViewType integralView_;
      TensorData<Scalar,DeviceType> leftComponent_;
      Data<Scalar,DeviceType> composedTransform_;
      TensorData<Scalar,DeviceType> rightComponent_;
      TensorData<Scalar,DeviceType> cellMeasures_;
      int a_offset_;
      int b_offset_;
      int leftComponentSpan_;   // leftComponentSpan tracks the dimensions spanned by the left component
      int rightComponentSpan_;  // rightComponentSpan tracks the dimensions spanned by the right component
      int numTensorComponents_;
      int leftFieldOrdinalOffset_;
      int rightFieldOrdinalOffset_;
      bool forceNonSpecialized_; // if true, don't use the specialized (more manual) implementation(s) for particular component counts. Primary use case is for testing.

      size_t fad_size_output_ = 0; // 0 if not a fad type

      Kokkos::Array<int, 7> offsetsForComponentOrdinal_;

      // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
      // (this also makes it easier to reorder loops, etc., for further optimizations)
      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;

      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_;  // total number of enumeration indices with arguments prior to the startingComponent fixed
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;

      int maxFieldsLeft_;
      int maxFieldsRight_;
      int maxPointCount_;
    public:
      F_Integrate(Data<Scalar,DeviceType> integralData,
                  TensorData<Scalar,DeviceType> leftComponent,
                  Data<Scalar,DeviceType> composedTransform,
                  TensorData<Scalar,DeviceType> rightComponent,
                  TensorData<Scalar,DeviceType> cellMeasures,
                  int a_offset,
                  int b_offset,
                  int leftFieldOrdinalOffset,
                  int rightFieldOrdinalOffset,
                  bool forceNonSpecialized)
      :
      integralView_(integralData.template getUnderlyingView<integralViewRank>()),
      leftComponent_(leftComponent),
      composedTransform_(composedTransform),
      rightComponent_(rightComponent),
      cellMeasures_(cellMeasures),
      a_offset_(a_offset),
      b_offset_(b_offset),
      leftComponentSpan_(leftComponent.extent_int(2)),
      rightComponentSpan_(rightComponent.extent_int(2)),
      numTensorComponents_(leftComponent.numTensorComponents()),
      leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
      rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
      forceNonSpecialized_(forceNonSpecialized)
      {
        INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");

        // set up bounds containers
        const int FIELD_DIM = 0;
        const int POINT_DIM = 1;
        maxFieldsLeft_  = 0;
        maxFieldsRight_ = 0;
        maxPointCount_  = 0;
        for (int r=0; r<numTensorComponents_; r++)
        {
          leftFieldBounds_[r]  = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsLeft_       = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
          rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsRight_      = std::max(maxFieldsRight_, rightFieldBounds_[r]);
          pointBounds_[r]      = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
          maxPointCount_       = std::max(maxPointCount_, pointBounds_[r]);
        }

        // set up relative enumeration spans: total number of enumeration indices with arguments prior to the startingComponent fixed. These are for *truncated* iterators; hence the -2 rather than -1 for the first startingComponent value.
        int leftRelativeEnumerationSpan  = 1;
        int rightRelativeEnumerationSpan = 1;
        for (int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
        {
          leftRelativeEnumerationSpan  *= leftFieldBounds_[startingComponent];
          rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
          leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
          rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
        }
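
        // Worked example (illustrative): for three components with field bounds {2,3,4}
        // from component 0 to component 2, the loop above fills the spans back-to-front:
        //   startingComponent = 1:  span = bounds[1]             = 3
        //   startingComponent = 0:  span = bounds[0] * bounds[1] = 6
        // i.e., leftFieldRelativeEnumerationSpans_[s] counts the enumeration indices for
        // the truncated multi-index (arguments s..R-1), with the final component R excluded.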

        // prepare for allocation of temporary storage
        // note: tempStorage goes "backward", starting from the final component, which needs just one entry

        const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
        if (allocateFadStorage)
        {
          fad_size_output_ = dimension_scalar(integralView_);
        }

        const int R = numTensorComponents_ - 1;

        int num_ij = 1; // this counts how many entries there are corresponding to components from r to R-1.
        int allocationSoFar = 0;
        offsetsForComponentOrdinal_[R] = allocationSoFar;
        allocationSoFar++; // we store one entry corresponding to R, the final component

        for (int r=R-1; r>0; r--) // last component is innermost in the for loops (requires least storage)
        {
          const int leftFields  = leftComponent.getTensorComponent(r).extent_int(0);
          const int rightFields = rightComponent.getTensorComponent(r).extent_int(0);

          num_ij *= leftFields * rightFields;

          offsetsForComponentOrdinal_[r] = allocationSoFar;
          allocationSoFar += num_ij;
        }
        offsetsForComponentOrdinal_[0] = allocationSoFar; // first component stores directly to final integralView.
      }
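
      // Scratch-layout sketch (illustrative): for three components with per-component
      // field counts leftFields = rightFields = {2,2,2}, the loop above produces
      //   offsetsForComponentOrdinal_[2] = 0   (one G_z entry)
      //   offsetsForComponentOrdinal_[1] = 1   (num_ij = 2*2 = 4 G_y entries)
      //   offsetsForComponentOrdinal_[0] = 5   (total per-thread scratch size; component-0
      //                                         sums are written directly to integralView_)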

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,maxComponents> &arguments,
                            const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0)
        {
          return -1;
        }
        else
        {
          int r = static_cast<int>(numComponents - 1);
          while (arguments[r] + 1 >= bounds[r])
          {
            arguments[r] = 0; // reset
            r--;
            if (r < 0) break;
          }
          if (r >= 0) ++arguments[r];
          return r;
        }
      }

      //! runtime-sized variant of incrementArgument; gets the component count as an argument rather than a template parameter.
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                            const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                            const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }
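
      // Illustration: incrementArgument behaves like a mixed-radix odometer with the
      // last component fastest-moving. For bounds {2,3}, successive calls visit
      //   {0,0} -> {0,1} -> {0,2} -> {1,0} -> {1,1} -> {1,2} -> exhausted.
      // The return value is the lowest component ordinal that was incremented
      // ({0,2} -> {1,0} returns 0), or -1 once every argument has wrapped around.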

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
                              const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0)
        {
          return -1;
        }
        else
        {
          int r = static_cast<int>(numComponents - 1);
          while (arguments[r] + 1 >= bounds[r])
          {
            r--;
            if (r < 0) break;
          }
          return r;
        }
      }

      //! runtime-sized variant of nextIncrementResult; gets the component count as an argument rather than a template parameter.
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                              const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                              const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = numComponents - 1;
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }
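
      // Illustration: nextIncrementResult is the non-mutating peek that matches the
      // increment above. For bounds {2,3} and arguments {0,2} it returns 0 (the next
      // increment will roll component 1 over and bump component 0); for {1,2} it
      // returns -1 (the iteration is exhausted). Callers use this to decide how much
      // partial-sum storage must be cleared before the next pass.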

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
                                   const Kokkos::Array<int,maxComponents> &bounds,
                                   const int startIndex) const
      {
        // the following mirrors what is done in TensorData
        if (numComponents == 0)
        {
          return 0;
        }
        else
        {
          int enumerationIndex = 0;
          for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
          {
            enumerationIndex += arguments[r];
            enumerationIndex *= bounds[r-1];
          }
          enumerationIndex += arguments[startIndex];
          return enumerationIndex;
        }
      }
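
      // Worked example (mirroring TensorData's enumeration): for bounds {b0,b1,b2} and
      // arguments {a0,a1,a2} with startIndex = 0, the loop computes
      //   index = (a2 * b1 + a1) * b0 + a0,
      // i.e., a mixed-radix number with component startIndex fastest-moving. With
      // bounds {2,3,4} and arguments {1,2,3}, that gives (3*3 + 2)*2 + 1 = 23.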

      // nvcc refuses to compile the below with error, "explicit specialization is not allowed in the current scope". Clang is OK with it. We just do a non-templated version below.
//      template<size_t numTensorComponents>
//      KOKKOS_INLINE_FUNCTION
//      void runSpecialized( const TeamMember & teamMember ) const;

//      template<>
//      KOKKOS_INLINE_FUNCTION
//      void runSpecialized<3>( const TeamMember & teamMember ) const
      KOKKOS_INLINE_FUNCTION
      void runSpecialized3( const TeamMember & teamMember ) const
      {
        constexpr int numTensorComponents = 3;

        Kokkos::Array<int,numTensorComponents> pointBounds;
        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          pointBounds[r]      = pointBounds_[r];
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const int cellDataOrdinal = teamMember.league_rank();
        const int numThreads      = teamMember.team_size(); // num threads
        const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread

        Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged> scratchView;  // for caching partial integration values
        Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
        Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z; // cache the field values for faster access
        if (fad_size_output_ > 0) {
          scratchView   = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
          pointWeights  = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0],  pointBounds[0], fad_size_output_);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1],  pointBounds[1], fad_size_output_);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2],  pointBounds[2], fad_size_output_);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
        }
        else {
          scratchView   = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
          pointWeights  = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));
          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0],  pointBounds[0]);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1],  pointBounds[1]);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2],  pointBounds[2]);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

        constexpr int R = numTensorComponents - 1;

        const int composedTransformRank = composedTransform_.rank();

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            const Data<Scalar,DeviceType> & leftTensorComponent_x  = leftComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & leftTensorComponent_y  = leftComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & leftTensorComponent_z  = leftComponent_.getTensorComponent(2);
            const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);

            const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (const int& fieldOrdinalPointOrdinal) {
              const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
              const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
              if ((fieldOrdinal < leftTensorComponent_x.extent_int(0)) && (pointOrdinal < leftTensorComponent_x.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_x.rank();
                leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < leftTensorComponent_y.extent_int(0)) && (pointOrdinal < leftTensorComponent_y.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_y.rank();
                leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < leftTensorComponent_z.extent_int(0)) && (pointOrdinal < leftTensorComponent_z.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_z.rank();
                leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < rightTensorComponent_x.extent_int(0)) && (pointOrdinal < rightTensorComponent_x.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_x.rank();
                rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
              }
              if ((fieldOrdinal < rightTensorComponent_y.extent_int(0)) && (pointOrdinal < rightTensorComponent_y.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_y.rank();
                rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
              }
              if ((fieldOrdinal < rightTensorComponent_z.extent_int(0)) && (pointOrdinal < rightTensorComponent_z.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_z.rank();
                rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
              }
            });

            if (composedTransform_.underlyingMatchesLogical())
            {
              if (composedTransformRank == 4) // (C,P,D,D)
              {
                const auto & composedTransformView = composedTransform_.getUnderlyingView4();
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
              else // rank 2, then: (C,P)
              {
                const auto & composedTransformView = composedTransform_.getUnderlyingView2();
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
            }
            else
            {
              if (composedTransformRank == 4) // (C,P,D,D)
              {
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
              else // rank 2, then: (C,P)
              {
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
                });
              }
            }

//            approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

            // synchronize threads
            teamMember.team_barrier();

            // Setting scratchView to 0 here is not necessary from an algorithmic point of view, but *might* help with performance (due to a first-touch policy)
            const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
            for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
            {
              scratchView(i) = 0.0;
            }
415
416 // TODO: consider adding an innerLoopRange that is sized to be the maximum of the size of the inner loops we'd like to parallelize over. (note that this means we do the work in the outer loop redundantly that many times…)
417 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
418 const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
419 const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;
420
421 // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
422 // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
423 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
424 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
425 rightFieldArguments[R] = jz;
426 leftFieldArguments[R] = iz;
427
428 Kokkos::Array<int,numTensorComponents> pointArguments;
429 for (int i=0; i<numTensorComponents; i++)
430 {
431 pointArguments[i] = 0;
432 }
433
434 for (int lx=0; lx<pointBounds[0]; lx++)
435 {
436 pointArguments[0] = lx;
437
438 // clear Gy scratch:
439 // in scratch, Gz (1 entry) comes first, then Gy entries.
440 const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
441 const int Gy_end_index = scratchOffsetForThread + offsetsForComponentOrdinal_[0];
442
443 for (int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
444 {
445 scratchView(Gy_index) = 0;
446 }
447
448 for (int ly=0; ly<pointBounds[1]; ly++)
449 {
450 pointArguments[1] = ly;
451
452 Scalar * Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
453 *Gz = 0;
454
455 for (int lz=0; lz < pointBounds[2]; lz++)
456 {
457 const Scalar & leftValue = leftFields_z(iz,lz);
458 const Scalar & rightValue = rightFields_z(jz,lz);
459
460 pointArguments[2] = lz;
461 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
462
463 *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
464
465 // approximateFlopCount += 3; // 2 multiplies, 1 sum
466 } // lz
467
468 for (int iy=0; iy<leftFieldBounds_[1]; iy++)
469 {
470 leftFieldArguments[1] = iy;
471 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
472
473 const Scalar & leftValue = leftFields_y(iy,ly);
474
475 for (int jy=0; jy<rightFieldBounds_[1]; jy++)
476 {
477 rightFieldArguments[1] = jy;
478
479 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
480 const Scalar & rightValue = rightFields_y(jy,ly);
481
482 const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
483 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
484
485 const int Gz_index = 0;
486 const Scalar & aGz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);
487
488 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
489
490 Gy += leftValue * aGz * rightValue;
491 // approximateFlopCount += 3; // 2 multiplies, 1 sum
492 }
493 }
494 } // ly
495 for (int ix=0; ix<leftFieldBounds_[0]; ix++)
496 {
497 leftFieldArguments[0] = ix;
498 const Scalar & leftValue = leftFields_x(ix,lx);
499
500 for (int iy=0; iy<leftFieldBounds_[1]; iy++)
501 {
502 leftFieldArguments[1] = iy;
503
504 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
505
506 for (int jx=0; jx<rightFieldBounds_[0]; jx++)
507 {
508 rightFieldArguments[0] = jx;
509 const Scalar & rightValue = rightFields_x(jx,lx);
510
511 for (int jy=0; jy<rightFieldBounds_[1]; jy++)
512 {
513 rightFieldArguments[1] = jy;
514 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
515
516 const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
517
518 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
519 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
520
521 // compute enumeration indices to get field indices into output view
522 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
523 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
524 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
525 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
526
527 if (integralViewRank == 3)
528 {
529// if ((leftFieldIndex==0) && (rightFieldIndex==2))
530// {
531// using std::cout;
532// using std::endl;
533// cout << "***** Contribution to (0,0,2) *****\n";
534// cout << "lx = " << lx << endl;
535// cout << "ix = " << ix << endl;
536// cout << "iy = " << iy << endl;
537// cout << "jx = " << jx << endl;
538// cout << "jy = " << jy << endl;
539// cout << "iz = " << iz << endl;
540// cout << "jz = " << jz << endl;
541// cout << " leftValue = " << leftValue << endl;
542// cout << "rightValue = " << rightValue << endl;
543// cout << "Gy = " << Gy << endl;
544//
545// cout << endl;
546// }
547
548 // shape (C,F1,F2)
549 integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
550 }
551 else
552 {
553 // shape (F1,F2)
554 integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
555 }
556 // approximateFlopCount += 3; // 2 multiplies, 1 sum
557 } // jy
558 } // ix
559 } // iy
560 } // ix
561 } // lx
562 }); // TeamThreadRange parallel_for - (iz,jz) loop
          }
        }
//        std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
      }
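
      // Summary of the sum factorization performed by runSpecialized3 above (sketch):
      // for each field pair (i,j) = ((ix,iy,iz),(jx,jy,jz)) and cell c, it accumulates
      //   integral(c,i,j) += sum_lx Lx(ix,lx) Rx(jx,lx) *
      //                        [ sum_ly Ly(iy,ly) Ry(jy,ly) *
      //                            [ sum_lz Lz(iz,lz) Rz(jz,lz) * w(c; lx,ly,lz) ] ],
      // where w is pointWeights (the composed transform times the cell measure). The
      // innermost bracket is Gz, the middle bracket fills the Gy scratch entries, and
      // the outer sum accumulates directly into integralView_; reusing Gz across
      // (iy,jy) pairs and Gy across (ix,jx) pairs is what saves work relative to the
      // naive nested quadrature loop.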

      template<size_t numTensorComponents>
      KOKKOS_INLINE_FUNCTION
      void run( const TeamMember & teamMember ) const
      {
        Kokkos::Array<int,numTensorComponents> pointBounds;
        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          pointBounds[r]      = pointBounds_[r];
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const int cellDataOrdinal = teamMember.league_rank();
        const int numThreads      = teamMember.team_size(); // num threads
        const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread
        Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged> scratchView;  // for caching partial integration values
        Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
        Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values for faster access
        if (fad_size_output_ > 0) {
          scratchView  = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
          pointWeights = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
          leftFields   = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_,  maxPointCount_, fad_size_output_);
          rightFields  = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
        }
        else {
          scratchView  = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
          pointWeights = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));
          leftFields   = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_,  maxPointCount_);
          rightFields  = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

        constexpr int R = numTensorComponents - 1;

        const int composedTransformRank = composedTransform_.rank();

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (const int& r) {
              const Data<Scalar,DeviceType> & leftTensorComponent  = leftComponent_.getTensorComponent(r);
              const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.getTensorComponent(r);
              const int leftFieldCount  = leftTensorComponent.extent_int(0);
              const int pointCount      = leftTensorComponent.extent_int(1);
              const int leftRank        = leftTensorComponent.rank();
              const int rightFieldCount = rightTensorComponent.extent_int(0);
              const int rightRank       = rightTensorComponent.rank();
              for (int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
              {
                // slightly weird logic here in effort to avoid branch divergence
                const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
                for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
                {
                  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
                  leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
                }
              }
              for (int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
              {
                // slightly weird logic here in effort to avoid branch divergence
                const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
                for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
                {
                  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
                  rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
                }
              }
            });

            if (composedTransformRank == 4)
            {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
              });
            }
            else
            {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
              });
            }
//            approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

            // synchronize threads
            teamMember.team_barrier();

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
              const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
              const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
              const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;

              // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
              // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
              Kokkos::Array<int,numTensorComponents> leftFieldArguments;
              Kokkos::Array<int,numTensorComponents> rightFieldArguments;
              rightFieldArguments[R] = j_R;
              leftFieldArguments[R]  = i_R;

              // TODO: I believe that this can be moved outside the thread parallel_for
              for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
              {
                scratchView(i) = 0.0;
              }
              Kokkos::Array<int,numTensorComponents> pointArguments;
              for (unsigned i=0; i<numTensorComponents; i++)
              {
                pointArguments[i] = 0;
              }

              int r = R;
              while (r >= 0)
              {
                // integrate in final component dimension; this is where we need the M weight, as well as the weighted measure
                const int pointBounds_R   = pointBounds[R];
                int &     pointArgument_R = pointArguments[R];
                for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
                {
                  Scalar * G_R;
                  if (R != 0)
                  {
                    G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
                  }
                  else
                  {
                    const int leftFieldIndex  = i_R + leftFieldOrdinalOffset_;
                    const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;

                    if (integralViewRank == 3)
                    {
                      // shape (C,F1,F2)
                      G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
                    }
                    else
                    {
                      // shape (F1,F2)
                      G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
                    }
                  }

                  const int & pointIndex_R = pointArguments[R];

                  const Scalar & leftValue  = leftFields(R,i_R,pointIndex_R);
                  const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);

                  int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);

                  *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;

//                  approximateFlopCount += 3; // 2 multiplies, 1 sum
                } // pointArgument_R

                const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents-1 means that we won't touch the [R] argument, which is treated in for loop above
                const int r_min  = (r_next >= 0) ? r_next : 0;

                for (int s=R-1; s>=r_min; s--)
                {
                  const int & pointIndex_s = pointArguments[s];

                  // want to cover all the multi-indices from s to R-1
                  for (int s1=s; s1<R; s1++)
                  {
                    leftFieldArguments[s1] = 0;
                  }

                  // i_s, j_s are the indices into the "current" tensor component; these are references, so they actually vary as the arguments are incremented.
                  const int & i_s = leftFieldArguments[s];
                  const int & j_s = rightFieldArguments[s];

                  int sLeft = s; // hereafter, sLeft is the return from the left field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indexes from rank s through R-1 (inclusive).
                  const int & rightEnumerationSpan_s  = rightFieldRelativeEnumerationSpans_[s];
                  const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];

                  while (sLeft >= s)
                  {
                    const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);

                    // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
                    const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);

                    const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);

                    for (int s1=s; s1<R; s1++)
                    {
                      rightFieldArguments[s1] = 0;
                    }
                    int sRight = s; // hereafter, sRight is the return from the right field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indexes from rank s through R-1 (inclusive).
                    while (sRight >= s)
                    {
                      const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);

                      // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
                      const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);

                      const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);

                      const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;

                      Scalar* G_s;

                      // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage
                      // (above, we have set the leftEnumerationIndex_sp, rightEnumerationIndex_sp to be 0 in this case, so G_sp_index will then also be 0)
                      const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;

                      const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);

                      if (s != 0)
                      {
                        G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
                      }
                      else
                      {
                        // compute enumeration indices
                        int leftEnumerationIndex  = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments,  leftFieldBounds, 0);
                        int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
                        const int leftFieldIndex  = leftEnumerationIndex  + leftFieldOrdinalOffset_;
                        const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;

                        if (integralViewRank == 3)
                        {
                          // shape (C,F1,F2)
                          G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
                        }
                        else
                        {
                          // shape (F1,F2)
                          G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
                        }
                      }

                      *G_s += leftValue * G_sp * rightValue;

//                      approximateFlopCount += 3; // 2 multiplies, 1 sum

                      // increment rightField
                      sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
                    }

                    // increment leftField
                    sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
                  }
                }

                // clear tempStorage for r_next+1 to R
                if (r_min+1 <= R)
                {
                  const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
                  for (int i=scratchOffsetForThread; i<endIndex; i++)
                  {
                    scratchView(i) = 0.0;
                  }
//                  auto tempStorageSubview = Kokkos::subview(scratchView, Kokkos::pair<int,int>{0,offsetsForComponentOrdinal_[r_min]});
//                  Kokkos::deep_copy(tempStorageSubview, 0.0);
                }

                // proceed to next point
                r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents-1 means that we won't touch the [R] argument, which is treated in the G_R integration for loop above
              }
            }); // TeamThreadRange parallel_for
          }
        }
//        std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
      }
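
      // Traversal sketch for the general run() above: pointArguments acts as a
      // mixed-radix odometer over the quadrature points of components 0..R-1, while
      // component R is integrated in the innermost G_R loop. After each pass,
      // nextIncrementResult identifies the lowest component r_next that will change
      // next; the completed partial sums G_{s+1} for s in [r_min, R-1] are folded into
      // G_s (or into integralView_ when s = 0), the scratch entries for components
      // r_min+1..R are zeroed, and the odometer advances.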

      KOKKOS_INLINE_FUNCTION
      void operator()( const TeamMember & teamMember ) const
      {
        switch (numTensorComponents_)
        {
          case 1: run<1>(teamMember); break;
          case 2: run<2>(teamMember); break;
          case 3:
            if (forceNonSpecialized_)
              run<3>(teamMember);
            else
              runSpecialized3(teamMember);
            break;
          case 4: run<4>(teamMember); break;
          case 5: run<5>(teamMember); break;
          case 6: run<6>(teamMember); break;
          case 7: run<7>(teamMember); break;
#ifdef INTREPID2_HAVE_DEBUG
          default:
            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
#endif
        }
      }

      //! returns an approximate count of the floating point operations performed per cell (counting sums and multiplies, each of which counts as one operation).
      long approximateFlopCountPerCell() const
      {
        // compute flop count on a single cell, then multiply by the number of cells
        int flopCount = 0;

        const int R = numTensorComponents_ - 1;

        // we cache the value of M_ab * cellMeasure at each point.
        // access to cellMeasures involves cellMeasures.numTensorComponents() - 1 multiplies, so total is the below:
        const int flopsPerPoint_ab = cellMeasures_.numTensorComponents(); // the access involves multiplying all the components together

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            flopCount += flopsPerPoint_ab * cellMeasures_.extent_int(1);

            int flopsPer_i_R_j_R = 0;

            // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
            // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
            Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
            leftFieldArguments[R] = 0;

            Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
            for (int i=0; i<numTensorComponents_; i++)
            {
              pointArguments[i] = 0;
            }

            // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
            // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
            Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
            rightFieldArguments[R] = 0;

            int r = R;
            while (r >= 0)
            {
              // integrate in final component dimension
              for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
              {
                flopsPer_i_R_j_R += 4;
              }
              // TODO: figure out why the below is not the same thing as the above -- the below overcounts, somehow
//              if (0 < pointBounds_[R])
//              {
//                flopsPer_i_R_j_R += pointBounds_[R] * 4;
//              }

              const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
              const int r_min  = (r_next >= 0) ? r_next : 0;

              for (int s=R-1; s>=r_min; s--)
              {
                // want to cover all the multi-indices from s to R-1: for each we have 2 multiplies and one add (3 flops)
                int numLeftIterations  = leftFieldRelativeEnumerationSpans_[s];
                int numRightIterations = rightFieldRelativeEnumerationSpans_[s];

                flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
              }

              // proceed to next point
              r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
            }

            flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
          }
        }
//        std::cout << "flop count per cell: " << flopCount << std::endl;
        return flopCount;
      }

      //! returns the team size to use, given the maximum team size that Kokkos allows for this functor.
      int teamSize(const int &maxTeamSizeFromKokkos) const
      {
        const int R = numTensorComponents_ - 1;
        const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
        return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
      }

      //! Provide the shared memory capacity; used by Kokkos to size the per-team scratch allocations.
      size_t team_shmem_size (int team_size) const
      {
        // we use shared memory to create a fast buffer for intermediate values, as well as fast access to the current-cell's field values
        size_t shmem_size = 0;

        if (fad_size_output_ > 0)
        {
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1), fad_size_output_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_,  maxPointCount_, fad_size_output_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
        }
        else
        {
          shmem_size += Kokkos::View<Scalar*,   DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1));
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_,  maxPointCount_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
        }

        return shmem_size;
      }
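
      // Usage sketch (hypothetical caller, by analogy with IntegrationTools::integrate;
      // the variable names below are illustrative, not part of this file):
      //
      //   using FunctorType = Impl::F_Integrate<Scalar,DeviceType,3>;
      //   FunctorType functor(integralData, leftComponent, composedTransform, rightComponent,
      //                       cellMeasures, a_offset, b_offset, 0, 0, false);
      //   const int cellCount = integralData.extent_int(0);
      //   Kokkos::TeamPolicy<ExecutionSpace> policy(cellCount, Kokkos::AUTO());
      //   const int maxTeamSize = policy.team_size_max(functor, Kokkos::ParallelForTag());
      //   policy = Kokkos::TeamPolicy<ExecutionSpace>(cellCount, functor.teamSize(maxTeamSize));
      //   Kokkos::parallel_for("structured integral", policy, functor);
      //
      // team_shmem_size() above is what Kokkos consults (through the functor) to reserve
      // the per-team scratch that run() / runSpecialized3() then carve into views.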
    };

    //! Implementation functor for structured integration; variant that caches the pointwise (composed transform times cell measure) weights.
    template<class Scalar, class DeviceType, int integralViewRank>
    class F_IntegratePointValueCache
    {
      using ExecutionSpace = typename DeviceType::execution_space;
      using TeamPolicy = Kokkos::TeamPolicy<DeviceType>;
      using TeamMember = typename TeamPolicy::member_type;

      using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
      IntegralViewType integralView_;
      TensorData<Scalar,DeviceType> leftComponent_;
      Data<Scalar,DeviceType> composedTransform_;
      TensorData<Scalar,DeviceType> rightComponent_;
      TensorData<Scalar,DeviceType> cellMeasures_;
      int a_offset_;
      int b_offset_;
      int leftComponentSpan_;  // leftComponentSpan tracks the dimensions spanned by the left component
      int rightComponentSpan_; // rightComponentSpan tracks the dimensions spanned by the right component
      int numTensorComponents_;
      int leftFieldOrdinalOffset_;
      int rightFieldOrdinalOffset_;

      size_t fad_size_output_ = 0; // 0 if not a fad type

      // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
      // (this also makes it easier to reorder loops, etc., for further optimizations)
      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;

      int maxFieldsLeft_;
      int maxFieldsRight_;
      int maxPointCount_;
    public:
      F_IntegratePointValueCache(Data<Scalar,DeviceType> integralData,
                                 TensorData<Scalar,DeviceType> leftComponent,
                                 Data<Scalar,DeviceType> composedTransform,
                                 TensorData<Scalar,DeviceType> rightComponent,
                                 TensorData<Scalar,DeviceType> cellMeasures,
                                 int a_offset,
                                 int b_offset,
                                 int leftFieldOrdinalOffset,
                                 int rightFieldOrdinalOffset)
      :
      integralView_(integralData.template getUnderlyingView<integralViewRank>()),
      leftComponent_(leftComponent),
      composedTransform_(composedTransform),
      rightComponent_(rightComponent),
      cellMeasures_(cellMeasures),
      a_offset_(a_offset),
      b_offset_(b_offset),
      leftComponentSpan_(leftComponent.extent_int(2)),
      rightComponentSpan_(rightComponent.extent_int(2)),
      numTensorComponents_(leftComponent.numTensorComponents()),
      leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
      rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
      {
        INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");

        const int FIELD_DIM = 0;
        const int POINT_DIM = 1;
        maxFieldsLeft_  = 0;
        maxFieldsRight_ = 0;
        maxPointCount_  = 0;
        for (int r=0; r<numTensorComponents_; r++)
        {
          leftFieldBounds_[r]  = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsLeft_       = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
          rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsRight_      = std::max(maxFieldsRight_, rightFieldBounds_[r]);
          pointBounds_[r]      = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
          maxPointCount_       = std::max(maxPointCount_, pointBounds_[r]);
        }

        // prepare for allocation of temporary storage
        // note: tempStorage goes "backward", starting from the final component, which needs just one entry

        const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
        if (allocateFadStorage)
        {
          fad_size_output_ = dimension_scalar(integralView_);
        }
      }
1073
1074 template<size_t maxComponents, size_t numComponents = maxComponents>
1075 KOKKOS_INLINE_FUNCTION
1076 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
1077 const Kokkos::Array<int,maxComponents> &bounds) const
1078 {
1079 if (numComponents == 0) return -1;
1080 int r = static_cast<int>(numComponents - 1);
1081 while (arguments[r] + 1 >= bounds[r])
1082 {
1083 arguments[r] = 0; // reset
1084 r--;
1085 if (r < 0) break;
1086 }
1087 if (r >= 0) ++arguments[r];
1088 return r;
1089 }
1090
1092 KOKKOS_INLINE_FUNCTION
1093 int incrementArgument( Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
1094 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1095 const int &numComponents) const
1096 {
1097 if (numComponents == 0) return -1;
1098 int r = static_cast<int>(numComponents - 1);
1099 while (arguments[r] + 1 >= bounds[r])
1100 {
1101 arguments[r] = 0; // reset
1102 r--;
1103 if (r < 0) break;
1104 }
1105 if (r >= 0) ++arguments[r];
1106 return r;
1107 }
1108
1109 template<size_t maxComponents, size_t numComponents = maxComponents>
1110 KOKKOS_INLINE_FUNCTION
1111 int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
1112 const Kokkos::Array<int,maxComponents> &bounds) const
1113 {
1114 if (numComponents == 0) return -1;
1115 int r = static_cast<int>(numComponents - 1);
1116 while (arguments[r] + 1 >= bounds[r])
1117 {
1118 r--;
1119 if (r < 0) break;
1120 }
1121 return r;
1122 }
1123
1125 KOKKOS_INLINE_FUNCTION
1126 int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
1127 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1128 const int &numComponents) const
1129 {
1130 if (numComponents == 0) return -1;
1131 int r = numComponents - 1;
1132 while (arguments[r] + 1 >= bounds[r])
1133 {
1134 r--;
1135 if (r < 0) break;
1136 }
1137 return r;
1138 }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
                                   const Kokkos::Array<int,maxComponents> &bounds,
                                   const int startIndex) const
      {
        // the following mirrors what is done in TensorData
        if (numComponents == 0)
        {
          return 0;
        }
        int enumerationIndex = 0;
        for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
        {
          enumerationIndex += arguments[r];
          enumerationIndex *= bounds[r-1];
        }
        enumerationIndex += arguments[startIndex];
        return enumerationIndex;
      }

      template<int rank>
      KOKKOS_INLINE_FUNCTION
      enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
      integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
      {
        return integralView(cellDataOrdinal,i,j);
      }

      template<int rank>
      KOKKOS_INLINE_FUNCTION
      enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
      integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
      {
        return integralView(i,j);
      }
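
      // These two overloads use enable_if_t on integralViewRank so that only the
      // overload matching the compile-time view rank participates in overload
      // resolution: rank 3 corresponds to (C,F1,F2) storage and rank 2 to (F1,F2)
      // (one value per cell versus values shared across cells). A caller writes, e.g.,
      //   integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += ...;
      // and the indexing difference is resolved at compile time rather than per entry.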

      //! Hand-coded 3-component version
      KOKKOS_INLINE_FUNCTION
      void runSpecialized3( const TeamMember & teamMember ) const
      {
        constexpr int numTensorComponents = 3;

        const int pointBounds_x = pointBounds_[0];
        const int pointBounds_y = pointBounds_[1];
        const int pointBounds_z = pointBounds_[2];
        const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;

        const int leftFieldBounds_x  = leftFieldBounds_[0];
        const int rightFieldBounds_x = rightFieldBounds_[0];
        const int leftFieldBounds_y  = leftFieldBounds_[1];
        const int rightFieldBounds_y = rightFieldBounds_[1];
        const int leftFieldBounds_z  = leftFieldBounds_[2];
        const int rightFieldBounds_z = rightFieldBounds_[2];

        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const auto integralView            = integralView_;
        const auto leftFieldOrdinalOffset  = leftFieldOrdinalOffset_;
        const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;

        const int cellDataOrdinal = teamMember.league_rank();
        const int threadNumber    = teamMember.team_rank();

        const int numThreads   = teamMember.team_size(); // num threads
        const int GyEntryCount = pointBounds_z; // for each thread: store one Gy value per z coordinate
        Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals; // for caching Gx values: we integrate out the first component dimension for each coordinate in the remaining dimensions
        Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals; // for caching Gy values (each thread gets a stack, of the same height as tensorComponents - 1)
        Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team

        Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
        Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
        Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
        if (fad_size_output_ > 0) {
          GxIntegrals  = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
          GyIntegrals  = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
          pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);

          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x,  pointBounds_x, fad_size_output_);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y,  pointBounds_y, fad_size_output_);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z,  pointBounds_z, fad_size_output_);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
        }
        else {
          GxIntegrals  = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
          GyIntegrals  = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
          pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));

          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x,  pointBounds_x);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y,  pointBounds_y);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z,  pointBounds_z);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

//        approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

        const int composedTransformRank = composedTransform_.rank();

        // synchronize threads
        teamMember.team_barrier();

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> & leftTensorComponent_x  = leftComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & leftTensorComponent_y  = leftComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & leftTensorComponent_z  = leftComponent_.getTensorComponent(2);
            const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);

            const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (const int& fieldOrdinal) {
              if (fieldOrdinal < leftTensorComponent_x.extent_int(0))
              {
                const int pointCount = leftTensorComponent_x.extent_int(1);
                const int leftRank   = leftTensorComponent_x.rank();
                for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
                {
                  leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
                }
              }
              if (fieldOrdinal < leftTensorComponent_y.extent_int(0))
              {
                const int pointCount = leftTensorComponent_y.extent_int(1);
                const int leftRank   = leftTensorComponent_y.rank();
                for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
                {
                  leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
                }
              }
              if (fieldOrdinal < leftTensorComponent_z.extent_int(0))
              {
                const int pointCount = leftTensorComponent_z.extent_int(1);
                const int leftRank   = leftTensorComponent_z.rank();
                for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
                {
                  leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
                }
              }
1297 if (fieldOrdinal < rightTensorComponent_x.extent_int(0))
1298 {
1299 const int pointCount = rightTensorComponent_x.extent_int(1);
1300 const int rightRank = rightTensorComponent_x.rank();
1301 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1302 {
1303 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
1304 }
1305 }
1306 if (fieldOrdinal < rightTensorComponent_y.extent_int(0))
1307 {
1308 const int pointCount = rightTensorComponent_y.extent_int(1);
1309 const int rightRank = rightTensorComponent_y.rank();
1310 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1311 {
1312 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
1313 }
1314 }
1315 if (fieldOrdinal < rightTensorComponent_z.extent_int(0))
1316 {
1317 const int pointCount = rightTensorComponent_z.extent_int(1);
1318 const int rightRank = rightTensorComponent_z.rank();
1319 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1320 {
1321 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
1322 }
1323 }
1324 });
1325
1326 if (composedTransformRank == 4) // (C,P,D,D)
1327 {
1328 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1329 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1330 });
1331 }
1332 else // (C,P)
1333 {
1334 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1335 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1336 });
1337 }
1338
1339 // synchronize threads
1340 teamMember.team_barrier();
1341
1342 for (int i0=0; i0<leftFieldBounds_x; i0++)
1343 {
1344 for (int j0=0; j0<rightFieldBounds_x; j0++)
1345 {
1346 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1347 // first component is fastest-moving; we can get a tensorPointEnumerationOffset just by multiplying by the pointBounds in x
1348 const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions; // compute offset for pointWeights container, for which x is the fastest-moving
1349
1350 Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1351
1352 Gx = 0;
1353 if (fad_size_output_ == 0)
1354 {
1355 // not a Fad type; we're allowed to use a vector range
1356 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1357 {
1358 integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1359 }, Gx);
1360 }
1361 else
1362 {
1363 for (int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1364 {
1365 Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1366 }
1367 }
1368 });
1369
1370 // synchronize threads
1371 teamMember.team_barrier();
1372
1373 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (const int& i1j1) {
1374 const int i1 = i1j1 % leftFieldBounds_y;
1375 const int j1 = i1j1 / leftFieldBounds_y;
1376
1377 int Gy_index_offset = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1378
1379 for (int lz=0; lz<pointBounds_z; lz++)
1380 {
1381 int pointEnumerationIndex = lz * pointBounds_y;
1382 if (fad_size_output_ == 0)
1383 {
1384 Scalar Gy_local = 0;
1385
1386 // not a Fad type; we're allowed to use a vector range
1387 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_y), [&] (const int &ly, Scalar &integralThusFar)
1388 {
1389 const Scalar & leftValue = leftFields_y(i1,ly);
1390 const Scalar & rightValue = rightFields_y(j1,ly);
1391
1392 integralThusFar += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
1393 }, Gy_local);
1394
1395 GyIntegrals(Gy_index_offset + lz) = Gy_local;
1396 }
1397 else
1398 {
1399 Scalar & Gy = GyIntegrals(Gy_index_offset + lz);
1400 for (int ly=0; ly<pointBounds_y; ly++)
1401 {
1402 const Scalar & leftValue = leftFields_y(i1,ly);
1403 const Scalar & rightValue = rightFields_y(j1,ly);
1404
1405 Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
1406 }
1407 }
1408 }
1409
1410 for (int i2=0; i2<leftFieldBounds_z; i2++)
1411 {
1412 for (int j2=0; j2<rightFieldBounds_z; j2++)
1413 {
1414 Scalar Gz = 0.0;
1415
1416 if (fad_size_output_ == 0)
1417 {
1418 // not a Fad type; we're allowed to use a vector range
1419 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_z), [&] (const int &lz, Scalar &integralThusFar)
1420 {
1421 const Scalar & leftValue = leftFields_z(i2,lz);
1422 const Scalar & rightValue = rightFields_z(j2,lz);
1423
1424 integralThusFar += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
1425 }, Gz);
1426 }
1427 else
1428 {
1429 for (int lz=0; lz<pointBounds_z; lz++)
1430 {
1431 const Scalar & leftValue = leftFields_z(i2,lz);
1432 const Scalar & rightValue = rightFields_z(j2,lz);
1433
1434 Gz += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
1435 }
1436 }
1437
1438 const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1439 const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1440 // the lines above are an optimization of the commented-out lines below, introduced to work around an Intel compiler segfault (possibly a compiler bug):
1441// const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset;
1442// const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset;
1443
1444 Kokkos::single (Kokkos::PerThread(teamMember), [&] () {
1445 integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1446 });
1447 }
1448 }
1449 });
1450 // synchronize threads
1451 teamMember.team_barrier();
1452 }
1453 }
1454 }
1455 }
1456 }
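// A minimal serial sketch of the sum-factorized recurrence that runSpecialized3() evaluates in
// parallel above (hypothetical names; scalar-valued, single-cell case). Each stage reuses the
// partial integrals of the previous one, which is what makes the factorization cheap:
//
// for (i0, j0): // x-direction field pairs
// Gx(ly,lz) = sum_lx Lx(i0,lx) * Rx(j0,lx) * w(lx,ly,lz); // integrate away x
// for (i1, j1): // y-direction field pairs
// Gy(lz) = sum_ly Ly(i1,ly) * Ry(j1,ly) * Gx(ly,lz); // integrate away y
// for (i2, j2): // z-direction field pairs
// Gz = sum_lz Lz(i2,lz) * Rz(j2,lz) * Gy(lz); // integrate away z
// integral(cell, i(i0,i1,i2), j(j0,j1,j2)) += Gz;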
1457
1458 template<size_t numTensorComponents>
1459 KOKKOS_INLINE_FUNCTION
1460 void run( const TeamMember & teamMember ) const
1461 {
1462 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::logic_error, "implementation incomplete");
1463// Kokkos::Array<int,numTensorComponents> pointBounds;
1464// Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1465// Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1466//
1467// int pointsInNonzeroComponentDimensions = 1;
1468// for (unsigned r=0; r<numTensorComponents; r++)
1469// {
1470// pointBounds[r] = pointBounds_[r];
1471// if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1472// leftFieldBounds[r] = leftFieldBounds_[r];
1473// rightFieldBounds[r] = rightFieldBounds_[r];
1474// }
1475//
1476// const int cellDataOrdinal = teamMember.league_rank();
1477// const int numThreads = teamMember.team_size(); // num threads
1478// const int G_k_StackHeight = numTensorComponents - 1; // per thread
1479// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_0_IntegralsView; // for caching G0 values: we integrate out the first component dimension for each coordinate in the remaining dimensions
1480// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_k_StackView; // for caching G_k values (each thread gets a stack, of the same height as tensorComponents - 1)
1481// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1482// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values at each level for faster access
1483// if (fad_size_output_ > 0) {
1484// G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads, fad_size_output_);
1485// G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1486// pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1487// leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1488// rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1489// }
1490// else {
1491// G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads);
1492// G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1493// pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1494// leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1495// rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1496// }
1497//
1500//
1501// constexpr int R = numTensorComponents - 1;
1502//
1503// // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1504//
1505// // synchronize threads
1506// teamMember.team_barrier();
1507//
1508// for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1509// {
1510// const int a = a_offset_ + a_component;
1511// for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1512// {
1513// const int b = b_offset_ + b_component;
1514//
1515// const Data<Scalar,DeviceType> & leftFirstComponent = leftComponent_.getTensorComponent(0);
1516// const Data<Scalar,DeviceType> & rightFirstComponent = rightComponent_.getTensorComponent(0);
1517//
1518// const int numLeftFieldsFirst = leftFirstComponent.extent_int(0); // shape (F,P[,D])
1519// const int numRightFieldsFirst = rightFirstComponent.extent_int(0); // shape (F,P[,D])
1520//
1521// const int numPointsFirst = leftFirstComponent.extent_int(1);
1522//
1523// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1524// pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1525// });
1526//
1527// // synchronize threads
1528// teamMember.team_barrier();
1529//
1530// for (int i0=0; i0<numLeftFieldsFirst; i0++)
1531// {
1532// for (int j0=0; j0<numRightFieldsFirst; j0++)
1533// {
1534// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1535// const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1536// const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1537// leftFields(pointOrdinal) = leftFirstComponent(fieldOrdinal,pointOrdinal);
1538// });
1539//
1540// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numRightFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1541// const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1542// const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1543// rightFields(pointOrdinal) = rightFirstComponent(fieldOrdinal,pointOrdinal);
1544// });
1545//
1546// // synchronize threads
1547// teamMember.team_barrier();
1548//
1549// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1550// Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1551// int remainingIndex = pointEnumerationIndexLaterDimensions;
1552//
1553// for (int d=R-1; d>0; d--) // last component (z in 3D hypercube) is fastest-moving // TODO: consider doing first component as fastest-moving. That would make indexing into pointWeights simpler
1554// {
1555// pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1];
1556// remainingIndex /= pointBounds[d+1];
1557// }
1558// pointArgumentsInLaterDimensions[0] = remainingIndex;
1559//
1560// int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1561// for (int d=R; d>0; d--)
1562// {
1563// tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1564// tensorPointEnumerationOffset *= pointBounds[d-1];
1565// }
1566//
1567// Scalar integralValue = 0;
1568// if (fad_size_output_ == 0)
1569// {
1570// // not a Fad type; we're allowed to use a vector range
1571// Kokkos::parallel_reduce("first component integral", Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1572// {
1573// integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1574// }, integralValue);
1575// }
1576// else
1577// {
1578// for (int pointOrdinal=0; pointOrdinal<numPointsFirst; pointOrdinal++)
1579// {
1580// integralValue += leftFields(pointOrdinal) * rightFields(pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1581// }
1582// }
1583//
1584// G_0_IntegralsView(pointEnumerationIndexLaterDimensions) = integralValue;
1585// });
1586//
1587// // synchronize threads
1588// teamMember.team_barrier();
1589//
1590// // TODO: finish this, probably after having written up the algorithm for arbitrary component count. (I have it written down for 3D.)
1591// }
1592// }
1593// }
1594// }
1596 }
1597
1598 KOKKOS_INLINE_FUNCTION
1599 void operator()( const TeamMember & teamMember ) const
1600 {
1601 switch (numTensorComponents_)
1602 {
1603 case 1: run<1>(teamMember); break;
1604 case 2: run<2>(teamMember); break;
1605 case 3: runSpecialized3(teamMember); break;
1606 case 4: run<4>(teamMember); break;
1607 case 5: run<5>(teamMember); break;
1608 case 6: run<6>(teamMember); break;
1609 case 7: run<7>(teamMember); break;
1610#ifdef INTREPID2_HAVE_DEBUG
1611 default:
1612 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
1613#endif
1614 }
1615 }
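// Only the three-component case has a specialized implementation at present; the generic run<>()
// above raises a device-safe exception, so component counts other than 3 are not yet usable here.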
1616
1618 long approximateFlopCountPerCell() const
1619 {
1620 // compute flop count on a single cell
1621 int flopCount = 0;
1622
1623 constexpr int numTensorComponents = 3;
1624 Kokkos::Array<int,numTensorComponents> pointBounds;
1625 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1626 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1627
1628 int pointsInNonzeroComponentDimensions = 1;
1629 for (unsigned r=0; r<numTensorComponents; r++)
1630 {
1631 pointBounds[r] = pointBounds_[r];
1632 if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1633 leftFieldBounds[r] = leftFieldBounds_[r];
1634 rightFieldBounds[r] = rightFieldBounds_[r];
1635 }
1636
1637 for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1638 {
1639 for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1640 {
1641// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1642// pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1643// });
1644 flopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1645
1646 for (int i0=0; i0<leftFieldBounds[0]; i0++)
1647 {
1648 for (int j0=0; j0<rightFieldBounds[0]; j0++)
1649 {
1650 flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3; // 3 flops per integration point in the loop commented out below
1651// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1652// Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1653// int remainingIndex = pointEnumerationIndexLaterDimensions;
1654//
1655// for (int d=0; d<R-1; d++) // first component is fastest-moving; this is determined by order of access in the lz/ly loop to compute Gy (integrals in y dimension)
1656// {
1657// pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1]; // d+1 because x dimension is being integrated away
1658// remainingIndex /= pointBounds[d+1];
1659// }
1660// pointArgumentsInLaterDimensions[R-1] = remainingIndex;
1661//
1662// int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1663// for (int d=R; d>0; d--)
1664// {
1665// tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1666// tensorPointEnumerationOffset *= pointBounds[d-1];
1667// }
1668//
1669// Scalar integralValue = 0;
1670// if (fad_size_output_ == 0)
1671// {
1672// // not a Fad type; we're allowed to use a vector range
1673// Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1674// {
1675// integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1676// }, integralValue);
1677// }
1678// else
1679// {
1680// for (int x_pointOrdinal=0; x_pointOrdinal<numPointsFirst; x_pointOrdinal++)
1681// {
1682// integralValue += leftFields_x(x_pointOrdinal) * rightFields_x(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1683// }
1684// }
1685//
1686// GxIntegrals(pointEnumerationIndexLaterDimensions) = integralValue;
1687// });
1688
1689
1690 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3; // 3 flops for each Gy += line in the commented-out loop below
1691 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3; // 3 flops for each Gz += line in the commented-out loop below
1692 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1; // 1 flop for the integralView += line below
1693
1694// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds[1] * rightFieldBounds[1]), [&] (const int& i1j1) {
1695// const int i1 = i1j1 % leftFieldBounds[1];
1696// const int j1 = i1j1 / leftFieldBounds[1];
1697//
1698// int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1699//
1700// int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
1701// for (int lz=0; lz<pointBounds[2]; lz++)
1702// {
1703// Scalar & Gy = GyIntegrals(Gy_index);
1704// Gy = 0.0;
1705//
1706// const bool leftRankIs3 = ( leftFields_y.rank() == 3);
1707// const bool rightRankIs3 = (rightFields_y.rank() == 3);
1708// for (int ly=0; ly<pointBounds[1]; ly++)
1709// {
1710// const Scalar & leftValue = leftRankIs3 ? leftFields_y(i1,ly,a_component) : leftFields_y(i1,ly);
1711// const Scalar & rightValue = rightRankIs3 ? rightFields_y(j1,ly,b_component) : rightFields_y(j1,ly);
1712//
1713// Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1714//
1715// pointEnumerationIndex++;
1716// }
1717// Gy_index++;
1718// }
1719//
1720// for (int i2=0; i2<leftFieldBounds[2]; i2++)
1721// {
1722// for (int j2=0; j2<rightFieldBounds[2]; j2++)
1723// {
1724// Scalar Gz = 0.0;
1725//
1726// int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1727//
1728// const bool leftRankIs3 = ( leftFields_z.rank() == 3);
1729// const bool rightRankIs3 = (rightFields_z.rank() == 3);
1730// for (int lz=0; lz<pointBounds[2]; lz++)
1731// {
1732// const Scalar & leftValue = leftRankIs3 ? leftFields_z(i2,lz,a_component) : leftFields_z(i2,lz);
1733// const Scalar & rightValue = rightRankIs3 ? rightFields_z(j2,lz,b_component) : rightFields_z(j2,lz);
1734//
1735// Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1736//
1737// Gy_index++;
1738// }
1739//
1740// Kokkos::Array<int,3> leftArguments {i0,i1,i2};
1741// Kokkos::Array<int,3> rightArguments {j0,j1,j2};
1742//
1743// const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset_;
1744// const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset_;
1745//
1746// if (integralViewRank == 2)
1747// {
1748// integralView_.access(i,j,0) += Gz;
1749// }
1750// else
1751// {
1752// integralView_.access(cellDataOrdinal,i,j) += Gz;
1753// }
1754// }
1755// }
1756// });
1757 }
1758 }
1759 }
1760 }
1761 return flopCount;
1762 }
1763
1765 int teamSize(const int &maxTeamSizeFromKokkos) const
1766 {
1767 // TODO: fix this to match the actual parallelism expressed
1768 const int R = numTensorComponents_ - 1;
1769 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1770 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1771 }
1772
1774 size_t team_shmem_size (int numThreads) const
1775 {
1776 // we use shared memory to create a fast buffer for intermediate values, as well as fast access to the current-cell's field values
1777 size_t shmem_size = 0;
1778
1779 const int GyEntryCount = pointBounds_[2]; // for each thread: store one Gy value per z coordinate
1780
1781 int pointsInNonzeroComponentDimensions = 1;
1782 for (int d=1; d<numTensorComponents_; d++)
1783 {
1784 pointsInNonzeroComponentDimensions *= pointBounds_[d];
1785 }
1786
1787 if (fad_size_output_ > 0)
1788 {
1789 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_); // GxIntegrals: entries with x integrated away
1790 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_); // GyIntegrals: entries with x,y integrated away
1791 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1), fad_size_output_); // pointWeights
1792
1793 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_); // leftFields_x
1794 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_); // rightFields_x
1795 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_); // leftFields_y
1796 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_); // rightFields_y
1797 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_); // leftFields_z
1798 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_); // rightFields_z
1799 }
1800 else
1801 {
1802 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions); // GxIntegrals: entries with x integrated away
1803 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads); // GyIntegrals: entries with x,y integrated away
1804 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1)); // pointWeights
1805
1806 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]); // leftFields_x
1807 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]); // rightFields_x
1808 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]); // leftFields_y
1809 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]); // rightFields_y
1810 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]); // leftFields_z
1811 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]); // rightFields_z
1812 }
1813
1814 return shmem_size;
1815 }
1816 };
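// A usage sketch for this functor (hypothetical names): when a functor defines team_shmem_size(),
// Kokkos calls it to size each team's scratch pad at launch; the teamSize() hint above is meant to
// be applied by the caller when constructing the policy. Something like:
//
// using ExecutionSpace = typename DeviceType::execution_space;
// Kokkos::TeamPolicy<ExecutionSpace> policy(cellDataExtent, Kokkos::AUTO(), vectorSize);
// Kokkos::parallel_for("sum-factorized integral", policy, integrateFunctor);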
1817
1818 template<class Scalar, class DeviceType>
1819 class F_RefSpaceIntegral
1820 {
1821 ScalarView<Scalar,DeviceType> integral_;
1822 Data<Scalar,DeviceType> left_;
1823 Data<Scalar,DeviceType> right_;
1824 Data<Scalar,DeviceType> weights_;
1825 ordinal_type dimSpan_;
1826 ordinal_type leftRank_;
1827 ordinal_type rightRank_;
1828 ordinal_type numPoints_;
1829 public:
1830 F_RefSpaceIntegral(ScalarView<Scalar,DeviceType> integralView,
1831 Data<Scalar,DeviceType> left, Data<Scalar,DeviceType> right, Data<Scalar,DeviceType> weights,
1832 ordinal_type dimSpan)
1833 :
1834 integral_(integralView),
1835 left_(left),
1836 right_(right),
1837 weights_(weights),
1838 dimSpan_(dimSpan)
1839 {
1840 leftRank_ = left.rank();
1841 rightRank_ = right.rank();
1842 numPoints_ = weights.extent_int(0);
1843 }
1844
1845 KOKKOS_INLINE_FUNCTION
1846 void operator()( const ordinal_type & i, const ordinal_type & j ) const
1847 {
1848 Scalar refSpaceIntegral = 0.0;
1849 for (int ptOrdinal=0; ptOrdinal<numPoints_; ptOrdinal++)
1850 {
1851 const Scalar & weight = weights_(ptOrdinal);
1852 for (int a=0; a<dimSpan_; a++)
1853 {
1854 const Scalar & leftValue = ( leftRank_ == 2) ? left_(i,ptOrdinal) : left_(i,ptOrdinal,a);
1855 const Scalar & rightValue = (rightRank_ == 2) ? right_(j,ptOrdinal) : right_(j,ptOrdinal,a);
1856 refSpaceIntegral += leftValue * rightValue * weight;
1857 }
1858 }
1859 integral_(i,j) = refSpaceIntegral;
1860 }
1861 };
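// For each field pair (i,j), F_RefSpaceIntegral computes the reference-space integral
// integral_(i,j) = sum_p w(p) * sum_a left(i,p,a) * right(j,p,a),
// with the sum over a collapsing to a single term when the components are rank-2 (scalar-valued).
// It is launched over a rank-2 MDRangePolicy in (i,j); see "compute componentIntegrals" below.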
1862
1863 template <class Scalar, class DeviceType>
1864 class F_ComputeIntegral
1865 {
1866 // Member variables to capture the necessary data
1867
1868 using ComponentIntegralsArray = Kokkos::Array<Kokkos::Array<ScalarView<Scalar, DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
1869
1870 Kokkos::View<Scalar **, DeviceType> integralView2_;
1871 Kokkos::View<Scalar ***, DeviceType> integralView3_;
1872 const TensorData<Scalar, DeviceType> leftComponent_;
1873 const TensorData<Scalar, DeviceType> rightComponent_;
1874 const TensorData<Scalar, DeviceType> cellMeasures_;
1875 const TransformedBasisValues<Scalar, DeviceType> basisValuesLeft_;
1876 const TransformedBasisValues<Scalar, DeviceType> basisValuesRight_;
1877 const ComponentIntegralsArray componentIntegrals_;
1878 const ordinal_type d_start_;
1879 const ordinal_type d_end_;
1880 const ordinal_type numPointTensorComponents_;
1881 const ordinal_type leftFieldOffset_;
1882 const ordinal_type rightFieldOffset_;
1883 const ordinal_type integralViewRank_;
1884
1885 public:
1886 // Constructor to initialize the functor
1887 F_ComputeIntegral(
1888 Kokkos::View<Scalar **, DeviceType> integralView2,
1889 Kokkos::View<Scalar ***, DeviceType> integralView3,
1890 const TensorData<Scalar, DeviceType> leftComponent,
1891 const TensorData<Scalar, DeviceType> rightComponent,
1892 const TensorData<Scalar, DeviceType> cellMeasures,
1893 const TransformedBasisValues<Scalar, DeviceType> basisValuesLeft,
1894 const TransformedBasisValues<Scalar, DeviceType> basisValuesRight,
1895 const ComponentIntegralsArray componentIntegrals,
1896 const ordinal_type d_start,
1897 const ordinal_type d_end,
1898 const ordinal_type numPointTensorComponents,
1899 const ordinal_type leftFieldOffset,
1900 const ordinal_type rightFieldOffset,
1901 const ordinal_type integralViewRank)
1902 :
1903 integralView2_(integralView2),
1904 integralView3_(integralView3),
1905 leftComponent_(leftComponent),
1906 rightComponent_(rightComponent),
1907 cellMeasures_(cellMeasures),
1908 basisValuesLeft_(basisValuesLeft),
1909 basisValuesRight_(basisValuesRight),
1910 componentIntegrals_(componentIntegrals),
1911 d_start_(d_start),
1912 d_end_(d_end),
1913 numPointTensorComponents_(numPointTensorComponents),
1914 leftFieldOffset_(leftFieldOffset),
1915 rightFieldOffset_(rightFieldOffset),
1916 integralViewRank_(integralViewRank) {}
1917
1918 KOKKOS_INLINE_FUNCTION
1919 void operator()(const ordinal_type &cellDataOrdinal, const ordinal_type &leftFieldOrdinal, const ordinal_type &rightFieldOrdinal) const
1920 {
1921 const Scalar &cellMeasureWeight = cellMeasures_.getTensorComponent(0)(cellDataOrdinal);
1922
1923 TensorArgumentIterator leftTensorIterator(leftComponent_, 0); // shape is (F,P), and we walk the F dimension of the container
1924 leftTensorIterator.setEnumerationIndex(leftFieldOrdinal);
1925
1926 TensorArgumentIterator rightTensorIterator(rightComponent_, 0); // shape is (F,P), and we walk the F dimension of the container
1927 rightTensorIterator.setEnumerationIndex(rightFieldOrdinal);
1928
1929 Scalar integralSum = 0.0;
1930 for (ordinal_type d = d_start_; d < d_end_; d++)
1931 {
1932 const Scalar &transformLeft_d = basisValuesLeft_.transformWeight(cellDataOrdinal, 0, d, d);
1933 const Scalar &transformRight_d = basisValuesRight_.transformWeight(cellDataOrdinal, 0, d, d);
1934
1935 const Scalar &leftRightTransform_d = transformLeft_d * transformRight_d;
1936 // approximateFlopCount++;
1937
1938 Scalar integral_d = 1.0;
1939
1940 for (ordinal_type r = 0; r < numPointTensorComponents_; r++)
1941 {
1942 integral_d *= componentIntegrals_[r][d](leftTensorIterator.argument(r), rightTensorIterator.argument(r));
1943 // approximateFlopCount++; // product
1944 }
1945 integralSum += leftRightTransform_d * integral_d;
1946 // approximateFlopCount += 2; // multiply and sum
1947 } // d loop: the sum over d completes before accumulating, so each (cell,i,j) entry is updated exactly once
1948
1949 const ordinal_type i = leftFieldOrdinal + leftFieldOffset_;
1950 const ordinal_type j = rightFieldOrdinal + rightFieldOffset_;
1951
1952 if (integralViewRank_ == 3)
1953 {
1954 integralView3_(cellDataOrdinal, i, j) += cellMeasureWeight * integralSum;
1955 }
1956 else
1957 {
1958 integralView2_(i, j) += cellMeasureWeight * integralSum;
1959 }
1960 }
1961 };
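// In the axis-aligned case served by F_ComputeIntegral, the transforms are diagonal and (as the
// transformWeight(cellDataOrdinal, 0, d, d) calls assume) point-invariant, so the integral factors as
// integral(c,i,j) += w_c * sum_d T_L(c,d,d) * T_R(c,d,d) * prod_r I_{r,d}(i_r, j_r),
// where w_c is the rank-1 cell measure and I_{r,d} is the reference-space integral for tensor
// component r in dimension d, computed by F_RefSpaceIntegral above.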
1962 }
1963
1964template<typename DeviceType>
1965template<class Scalar>
1966Data<Scalar,DeviceType> IntegrationTools<DeviceType>::allocateIntegralData(const TransformedBasisValues<Scalar,DeviceType> &vectorDataLeft,
1967 const TensorData<Scalar,DeviceType> &cellMeasures,
1968 const TransformedBasisValues<Scalar,DeviceType> &vectorDataRight)
1969{
1970 // allocates a (C,F,F) container for storing integral data
1971
1972 // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1973 // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1974 const bool leftHasOrdinalFilter = vectorDataLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1975 const bool rightHasOrdinalFilter = vectorDataRight.basisValues().ordinalFilter().extent_int(0) > 0;
1976 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1977
1978 // determine cellDataExtent and variation type. We currently support CONSTANT, MODULAR, and GENERAL as possible output variation types, depending on the inputs.
1979 // If cellMeasures has non-trivial tensor structure, the rank-1 cell Data object is the first component.
1980 // If cellMeasures has trivial tensor structure, then the first and only component has the cell index in its first dimension.
1981 // I.e., either way the relevant Data object is cellMeasures.getTensorComponent(0)
1982 const int CELL_DIM = 0;
1983 const auto cellMeasureData = cellMeasures.getTensorComponent(0);
1984 const auto leftTransform = vectorDataLeft.transform();
1985
1986 DimensionInfo combinedCellDimInfo = cellMeasureData.getDimensionInfo(CELL_DIM);
1987 // transforms may be invalid, indicating an identity transform. If so, it will not constrain the output at all.
1988 if (vectorDataLeft.transform().isValid())
1989 {
1990 combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataLeft.transform().getDimensionInfo(CELL_DIM));
1991 }
1992 if (vectorDataRight.transform().isValid())
1993 {
1994 combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataRight.transform().getDimensionInfo(CELL_DIM));
1995 }
1996
1997 DataVariationType cellVariationType = combinedCellDimInfo.variationType;
1998 int cellDataExtent = combinedCellDimInfo.dataExtent;
1999
2000 const int numCells = vectorDataLeft.numCells();
2001 const int numFieldsLeft = vectorDataLeft.numFields();
2002 const int numFieldsRight = vectorDataRight.numFields();
2003
2004 Kokkos::Array<int,3> extents {numCells, numFieldsLeft, numFieldsRight};
2005 Kokkos::Array<DataVariationType,3> variationTypes {cellVariationType,GENERAL,GENERAL};
2006
2007 if (cellVariationType != CONSTANT)
2008 {
2009 Kokkos::View<Scalar***,DeviceType> data("Intrepid2 integral data",cellDataExtent,numFieldsLeft,numFieldsRight);
2010 return Data<Scalar,DeviceType>(data, extents, variationTypes);
2011 }
2012 else
2013 {
2014 Kokkos::View<Scalar**,DeviceType> data("Intrepid2 integral data",numFieldsLeft,numFieldsRight);
2015 return Data<Scalar,DeviceType>(data, extents, variationTypes);
2016 }
2017}
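// A usage sketch (hypothetical variable names): allocateIntegralData() is intended to pair with
// integrate() below, e.g. for a mass-matrix-style integral of transformed basis values:
//
// auto integrals = IntegrationTools<DeviceType>::allocateIntegralData(transformedValues, cellMeasures, transformedValues);
// IntegrationTools<DeviceType>::integrate(integrals, transformedValues, cellMeasures, transformedValues, false, nullptr);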
2018
2022template<typename DeviceType>
2023template<class Scalar>
2024void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals, const TransformedBasisValues<Scalar,DeviceType> & basisValuesLeft,
2025 const TensorData<Scalar,DeviceType> & cellMeasures,
2026 const TransformedBasisValues<Scalar,DeviceType> & basisValuesRight, const bool sumInto,
2027 double* approximateFlops)
2028{
2029 using ExecutionSpace = typename DeviceType::execution_space;
2030
2031 // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
2032 // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
2033 const bool leftHasOrdinalFilter = basisValuesLeft.basisValues().ordinalFilter().extent_int(0) > 0;
2034 const bool rightHasOrdinalFilter = basisValuesRight.basisValues().ordinalFilter().extent_int(0) > 0;
2035 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
2036
2037 if (approximateFlops != NULL)
2038 {
2039 *approximateFlops = 0;
2040 }
2041
2042 // integral data may have shape (C,F1,F2) or (if the variation type is CONSTANT in the cell dimension) shape (F1,F2)
2043 const int integralViewRank = integrals.getUnderlyingViewRank();
2044
2045 if (!sumInto)
2046 {
2047 integrals.clear();
2048 }
2049
2050 const int cellDataExtent = integrals.getDataExtent(0);
2051 const int numFieldsLeft = basisValuesLeft.numFields();
2052 const int numFieldsRight = basisValuesRight.numFields();
2053 const int spaceDim = basisValuesLeft.spaceDim();
2054
2055 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.spaceDim() != basisValuesRight.spaceDim(), std::invalid_argument, "vectorDataLeft and vectorDataRight must agree on the space dimension");
2056
2057 const int leftFamilyCount = basisValuesLeft.basisValues().numFamilies();
2058 const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
2059
2060 // we require that the number of tensor components in the vectors are the same for each vector entry
2061 // this is not strictly necessary, but it makes implementation easier, and we don't at present anticipate other use cases
2062 int numTensorComponentsLeft = -1;
2063 const bool leftIsVectorValued = basisValuesLeft.vectorData().isValid();
2064
2065 if (leftIsVectorValued)
2066 {
2067 const auto &refVectorLeft = basisValuesLeft.vectorData();
2068 int numFamiliesLeft = refVectorLeft.numFamilies();
2069 int numVectorComponentsLeft = refVectorLeft.numComponents();
2070 Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
2071 for (int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
2072 {
2073 for (int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
2074 {
2075 const TensorData<Scalar,DeviceType> &tensorData = refVectorLeft.getComponent(familyOrdinal,vectorComponent);
2076 if (tensorData.numTensorComponents() > 0)
2077 {
2078 if (numTensorComponentsLeft == -1)
2079 {
2080 numTensorComponentsLeft = tensorData.numTensorComponents();
2081 }
2082 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
2083 for (int r=0; r<numTensorComponentsLeft; r++)
2084 {
2085 maxFieldsForComponentLeft[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentLeft[r]);
2086 }
2087 }
2088 }
2089 }
2090 }
2091 else
2092 {
2093 numTensorComponentsLeft = basisValuesLeft.basisValues().tensorData(0).numTensorComponents(); // family ordinal 0
2094 for (int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
2095 {
2096 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "All families must match in the number of tensor components");
2097 }
2098 }
2099 int numTensorComponentsRight = -1;
2100 const bool rightIsVectorValued = basisValuesRight.vectorData().isValid();
2101
2102 if (rightIsVectorValued)
2103 {
2104 const auto &refVectorRight = basisValuesRight.vectorData();
2105 int numFamiliesRight = refVectorRight.numFamilies();
2106 int numVectorComponentsRight = refVectorRight.numComponents();
2107 Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
2108 for (int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
2109 {
2110 for (int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
2111 {
2112 const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
2113 if (tensorData.numTensorComponents() > 0)
2114 {
2115 if (numTensorComponentsRight == -1)
2116 {
2117 numTensorComponentsRight = tensorData.numTensorComponents();
2118 }
2119 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataRight must have the same number of tensor components as every other");
2120 for (int r=0; r<numTensorComponentsRight; r++)
2121 {
2122 maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
2123 }
2124 }
2125 }
2126 }
2127 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponentsRight != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
2128 }
2129 else
2130 {
2131 // check that right tensor component count agrees with left
2132 for (int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
2133 {
2134 INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
2135 }
2136 }
2137 const int numPointTensorComponents = cellMeasures.numTensorComponents() - 1;
2138
2139 if ((numPointTensorComponents == numTensorComponentsLeft) && basisValuesLeft.axisAligned() && basisValuesRight.axisAligned())
2140 {
2141 // cellMeasures is a non-trivial tensor product, and the pullbacks are all diagonals.
2142
2143 // in this case, the integrals in each tensorial direction are entirely separable
2144 // allocate some temporary storage for the integrals in each tensorial direction
2145
2146 // if cellMeasures is a nontrivial tensor product, that means that all cells have the same shape, up to scaling.
2147
2148 Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
2149 for (int r=0; r<numPointTensorComponents; r++)
2150 {
2151 // first tensorial component of cell measures is the cell dimension; after that we have (P1,P2,…)
2152 pointDimensions[r] = cellMeasures.getTensorComponent(r+1).extent_int(0);
2153 }
2154
2155 // only one of these will be a valid container:
2156 Kokkos::View<Scalar**, DeviceType> integralView2;
2157 Kokkos::View<Scalar***, DeviceType> integralView3;
2158 if (integralViewRank == 3)
2159 {
2160 integralView3 = integrals.getUnderlyingView3();
2161 }
2162 else
2163 {
2164 integralView2 = integrals.getUnderlyingView2();
2165 }
2166 for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2167 {
2168 int a_offset = 0; // left vector component offset
2169 int leftFieldOffset = basisValuesLeft.basisValues().familyFieldOrdinalOffset(leftFamilyOrdinal);
2170
2171 const int leftVectorComponentCount = leftIsVectorValued ? basisValuesLeft.vectorData().numComponents() : 1;
2172 for (int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
2173 {
2174 const TensorData<Scalar,DeviceType> & leftComponent = leftIsVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal)
2175 : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2176 if (!leftComponent.isValid())
2177 {
2178 a_offset++; // empty components are understood to take up one dimension
2179 continue;
2180 }
2181 const int leftDimSpan = leftComponent.extent_int(2);
2182
2183 const int leftComponentFieldCount = leftComponent.extent_int(0);
2184
2185 for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2186 {
2187 int b_offset = 0; // right vector component offset
2188 int rightFieldOffset = basisValuesRight.vectorData().familyFieldOrdinalOffset(rightFamilyOrdinal);
2189
2190 const int rightVectorComponentCount = rightIsVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2191 for (int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2192 {
2193 const TensorData<Scalar,DeviceType> & rightComponent = rightIsVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal)
2194 : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2195 if (!rightComponent.isValid())
2196 {
2197 b_offset++; // empty components are understood to take up one dimension
2198 continue;
2199 }
2200 const int rightDimSpan = rightComponent.extent_int(2);
2201
2202 const int rightComponentFieldCount = rightComponent.extent_int(0);
2203
2204 // we only accumulate for a == b (since this is the axis-aligned case). Do the a, b spans intersect?
2205 if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset)) // no intersection
2206 {
2207 b_offset += rightDimSpan;
2208
2209 continue;
2210 }
2211
2212 // if the a, b spans intersect, by assumption they should align exactly
2213 INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error, "left and right dimension offsets should match.");
2214 INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2215
2216 const int d_start = a_offset;
2217 const int d_end = d_start + leftDimSpan;
2218
2219 using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
2220 ComponentIntegralsArray componentIntegrals;
2221 for (int r=0; r<numPointTensorComponents; r++)
2222 {
2223 /*
2224 Four vector data cases to consider:
2225 1. Both vector data containers are filled with axial components - first component in 3D has form (f,0,0), second (0,f,0), third (0,0,f).
2226 2. Both vector data containers have arbitrary components - in 3D: (f1,f2,f3) where f1 is given by the first component, f2 by the second, f3 by the third.
2227 3. First container is axial, second arbitrary.
2228 4. First is arbitrary, second axial.
2229
2230 But note that in all four cases, the structure of the integral is the same: you have three vector component integrals that get summed. The actual difference between
2231 the cases does not show up in the reference-space integrals here, but in the accumulation in physical space below, where the tensor field numbering comes into play.
2232
2233 The choice between axial and arbitrary affects the way the fields are numbered; the arbitrary components' indices refer to the same vector function, so they correspond,
2234 while the axial components refer to distinct scalar functions, so their numbering in the data container is cumulative.
2235 */
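// For example, in 3D the axial case numbers fields cumulatively -- the (f,0,0) fields precede the
// (0,g,0) fields, which precede the (0,0,h) fields -- whereas in the arbitrary case a single field
// ordinal refers to one vector function (f1,f2,f3) spanning all three components.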
2236
2237 Data<Scalar,DeviceType> quadratureWeights = cellMeasures.getTensorComponent(r+1);
2238 const int numPoints = pointDimensions[r];
2239
2240 // It may be worth considering the possibility that some of these components point to the same data -- if so, we could possibly get better data locality by making the corresponding componentIntegral entries point to the same location as well. (And we could avoid some computations here.)
2241
2242 Data<Scalar,DeviceType> leftTensorComponent = leftComponent.getTensorComponent(r);
2243 Data<Scalar,DeviceType> rightTensorComponent = rightComponent.getTensorComponent(r);
2244
2245 const int leftTensorComponentDimSpan = leftTensorComponent.extent_int(2);
2246 const int leftTensorComponentFields = leftTensorComponent.extent_int(0);
2247 const int rightTensorComponentDimSpan = rightTensorComponent.extent_int(2);
2248 const int rightTensorComponentFields = rightTensorComponent.extent_int(0);
2249
2250 INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2251
2252 for (int d=d_start; d<d_end; d++)
2253 {
2254 ScalarView<Scalar,DeviceType> componentIntegralView;
2255
2256 const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
2257 if (allocateFadStorage)
2258 {
2259 auto fad_size_output = dimension_scalar(integrals.getUnderlyingView());
2260 componentIntegralView = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output);
2261 }
2262 else
2263 {
2264 componentIntegralView = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields);
2265 }
2266
2267 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftTensorComponentFields,rightTensorComponentFields});
2268
2269 Impl::F_RefSpaceIntegral<Scalar, DeviceType> refSpaceIntegralFunctor(componentIntegralView, leftTensorComponent, rightTensorComponent, quadratureWeights,
2270 leftTensorComponentDimSpan);
2271 Kokkos::parallel_for("compute componentIntegrals", policy, refSpaceIntegralFunctor);
2272
2273 componentIntegrals[r][d] = componentIntegralView;
2274
2275 if (approximateFlops != NULL)
2276 {
2277 *approximateFlops += leftTensorComponentFields*rightTensorComponentFields*numPoints*(3); // two multiplies, one add in innermost loop
2278 }
2279 } // d
2280 } // r
2281
2282 ExecutionSpace().fence();
2283 Impl::F_ComputeIntegral<Scalar,DeviceType> computeIntegralFunctor(integralView2, integralView3, leftComponent, rightComponent, cellMeasures, basisValuesLeft, basisValuesRight, componentIntegrals, d_start, d_end, numPointTensorComponents, leftFieldOffset, rightFieldOffset, integralViewRank);
2284 Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount}; // separately declared in effort to get around Intel 17.0.1 compiler weirdness.
2285 Kokkos::Array<int,3> lowerBounds {0,0,0};
2286 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2287 // TODO: note that for best performance, especially with Fad types, we should replace this parallel for with a Functor and use hierarchical parallelism
2288 Kokkos::parallel_for("compute field integrals", policy, computeIntegralFunctor);
2289 b_offset += rightDimSpan;
2290 } // rightVectorComponentOrdinal
2291 } // rightFamilyOrdinal
2292 a_offset += leftDimSpan;
2293 } // leftVectorComponentOrdinal
2294 } // leftFamilyOrdinal
2295
2296 if (approximateFlops != NULL)
2297 {
2298 // TODO: check the accuracy of this
2299 *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2300 }
2301 }
2302 else // general case (not axis-aligned + affine tensor-product structure)
2303 {
2304 // prepare composed transformation matrices
2305 const Data<Scalar,DeviceType> & leftTransform = basisValuesLeft.transform();
2306 const Data<Scalar,DeviceType> & rightTransform = basisValuesRight.transform();
2307 const bool transposeLeft = true;
2308 const bool transposeRight = false;
2309// auto timer = Teuchos::TimeMonitor::getNewTimer("mat-mat");
2310// timer->start();
2311 // transforms can be matrices -- (C,P,D,D): rank 4 -- or scalar weights -- (C,P): rank 2 -- or vector weights -- (C,P,D): rank 3
2312 Data<Scalar,DeviceType> composedTransform;
2313 // invalid/empty transforms are used when the identity is intended.
2314 const int leftRank = leftTransform.rank();
2315 const int rightRank = rightTransform.rank();
2316
2317 if (leftTransform.isValid() && rightTransform.isValid())
2318 {
2319 const bool bothRank4 = (leftRank == 4) && (rightRank == 4);
2320 const bool bothRank3 = (leftRank == 3) && (rightRank == 3);
2321 const bool bothRank2 = (leftRank == 2) && (rightRank == 2);
2322 const bool ranks32 = ((leftRank == 3) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 3));
2323 const bool ranks42 = ((leftRank == 4) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 4));
2324
2325 if (bothRank4) // (C,P,D,D)
2326 {
2327 composedTransform = Data<Scalar,DeviceType>::allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
2328 composedTransform.storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2329
2330 // if the composedTransform matrices are full, the following is a good estimate. If they have some diagonal portions, this will overcount.
2331 if (approximateFlops != NULL)
2332 {
2333 *approximateFlops += composedTransform.getUnderlyingViewSize() * (spaceDim - 1) * 2;
2334 }
2335 }
2336 else if (bothRank3) // (C,P,D)
2337 {
2338 // re-cast leftTransform as a rank 4 (C,P,1,D) object -- a 1 x D matrix at each (C,P).
2339 const int newRank = 4;
2340 auto extents = leftTransform.getExtents();
2341 auto variationTypes = leftTransform.getVariationTypes();
2342 extents[3] = extents[2];
2343 extents[2] = 1;
2344 variationTypes[3] = variationTypes[2];
2345 variationTypes[2] = CONSTANT;
2346 auto leftTransformMatrix = leftTransform.shallowCopy(newRank, extents, variationTypes);
2347
2348 // re-cast rightTransform as a rank 4 (C,P,1,D) object -- a 1 x D matrix at each (C,P)
2349 extents = rightTransform.getExtents();
2350 variationTypes = rightTransform.getVariationTypes();
2351 extents[3] = extents[2];
2352 extents[2] = 1;
2353 variationTypes[3] = variationTypes[2];
2354 variationTypes[2] = CONSTANT;
2355 auto rightTransformMatrix = rightTransform.shallowCopy(newRank, extents, variationTypes);
2356
2357 composedTransform = Data<Scalar,DeviceType>::allocateMatMatResult(transposeLeft, leftTransformMatrix, transposeRight, rightTransformMatrix);
2358 composedTransform.storeMatMat(transposeLeft, leftTransformMatrix, transposeRight, rightTransformMatrix);
2359
2360 if (approximateFlops != NULL)
2361 {
2362 *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
2363 }
2364 }
2365 else if (bothRank2)
2366 {
2367 composedTransform = leftTransform.allocateInPlaceCombinationResult(leftTransform, rightTransform);
2368 composedTransform.storeInPlaceProduct(leftTransform, rightTransform);
2369
2370 // re-cast composedTransform as a rank 4 (C,P,1,1) object -- a 1 x 1 matrix at each (C,P).
2371 const int newRank = 4;
2372 auto extents = composedTransform.getExtents();
2373 auto variationTypes = composedTransform.getVariationTypes();
2374 composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes);
2375 if (approximateFlops != NULL)
2376 {
2377 *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
2378 }
2379 }
2380 else if (ranks32) // rank 2 / rank 3 combination.
2381 {
2382 const auto & rank3Transform = (leftRank == 3) ? leftTransform : rightTransform;
2383 const auto & rank2Transform = (leftRank == 2) ? leftTransform : rightTransform;
2384
2385 composedTransform = DataTools::multiplyByCPWeights(rank3Transform, rank2Transform);
2386
2387 // re-cast composedTransform as a rank 4 object:
2388 // logically, the original rank-3 transform can be understood as a 1xD matrix. The composed transform is leftTransform^T * rightTransform, so:
2389 // - if left has the rank-3 transform, composedTransform should be a (C,P,D,1) object -- a D x 1 matrix at each (C,P).
2390 // - if right has the rank-3 transform, composedTransform should be a (C,P,1,D) object -- a 1 x D matrix at each (C,P).
2391 const int newRank = 4;
2392 auto extents = composedTransform.getExtents();
2393 auto variationTypes = composedTransform.getVariationTypes();
2394 if (leftRank == 3)
2395 {
2396 // extents[3] and variationTypes[3] will already be 1 and CONSTANT, respectively
2397 // extents[3] = 1;
2398 // variationTypes[3] = CONSTANT;
2399 }
2400 else
2401 {
2402 extents[3] = extents[2];
2403 extents[2] = 1;
2404 variationTypes[3] = variationTypes[2];
2405 variationTypes[2] = CONSTANT;
2406 }
2407 composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes);
2408 }
2409 else if (ranks42) // rank 4 / rank 2 combination.
2410 {
2411 if (leftRank == 4)
2412 {
2413 // want to transpose left matrix, and multiply by the values from rightTransform
2414 // start with the multiplication:
2415 auto composedTransformTransposed = DataTools::multiplyByCPWeights(leftTransform, rightTransform);
2416 composedTransform = DataTools::transposeMatrix(composedTransformTransposed);
2417 }
2418 else // (leftRank == 2)
2419 {
2420 composedTransform = DataTools::multiplyByCPWeights(rightTransform, leftTransform);
2421 }
2422 }
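// In both ranks42 branches, the rank-2 transform acts as a pointwise scalar weight on the rank-4
// matrix; the explicit transpose is required only when the matrix comes from the left, because
// the composed transform represents leftTransform^T * rightTransform.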
2423 else
2424 {
2425 INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
2426 }
2427 }
2428 else if (leftTransform.isValid())
2429 {
2430 // rightTransform is the identity
2431 switch (leftRank)
2432 {
2433 case 4: composedTransform = DataTools::transposeMatrix(leftTransform); break;
2434 case 3:
2435 {
2436 // - if left has the rank-3 transform, composedTransform should be a (C,P,D,1) object -- a D x 1 matrix at each (C,P).
2437 const int newRank = 4;
2438 auto extents = leftTransform.getExtents();
2439 auto variationTypes = leftTransform.getVariationTypes();
2440
2441 composedTransform = leftTransform.shallowCopy(newRank, extents, variationTypes);
2442 }
2443 break;
2444 case 2: composedTransform = leftTransform; break;
2445 default:
2446 INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
2447 }
2448 }
2449 else if (rightTransform.isValid())
2450 {
2451 // leftTransform is the identity
2452 // (composedTransform is assigned in every non-throwing case of the switch below)
2453 switch (rightRank)
2454 {
2455 case 4: composedTransform = rightTransform; break;
2456 case 3:
2457 {
2458 // - if right has the rank-3 transform, composedTransform should be a (C,P,1,D) object -- a 1 x D matrix at each (C,P).
2459 const int newRank = 4;
2460 auto extents = rightTransform.getExtents();
2461 auto variationTypes = rightTransform.getVariationTypes();
2462 extents[3] = extents[2];
2463 variationTypes[3] = variationTypes[2];
2464 extents[2] = 1;
2465 variationTypes[2] = CONSTANT;
2466
2467 composedTransform = rightTransform.shallowCopy(newRank, extents, variationTypes);
2468 }
2469 break;
2470 case 2: composedTransform = rightTransform; break;
2471 default:
2472 INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
2473 }
2474 }
2475 else
2476 {
2477 // both left and right transforms are identity
2478 Kokkos::Array<ordinal_type,4> extents {basisValuesLeft.numCells(),basisValuesLeft.numPoints(),spaceDim,spaceDim};
2479 Kokkos::Array<DataVariationType,4> variationTypes {CONSTANT,CONSTANT,BLOCK_PLUS_DIAGONAL,BLOCK_PLUS_DIAGONAL};
2480
2481 Kokkos::View<Scalar*,DeviceType> identityUnderlyingView("Intrepid2::FST::integrate() - identity view",spaceDim);
2482 Kokkos::deep_copy(identityUnderlyingView, 1.0);
2483 composedTransform = Data<Scalar,DeviceType>(identityUnderlyingView,extents,variationTypes);
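// This stores the D x D identity at every (C,P) compactly: a length-D view of ones, marked
// CONSTANT in the cell and point dimensions and BLOCK_PLUS_DIAGONAL in the matrix dimensions,
// so the storage cost is O(D) rather than O(C*P*D*D).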
2484 }
2485
2486// timer->stop();
2487// std::cout << "Completed mat-mat in " << timer->totalElapsedTime() << " seconds.\n";
2488// timer->reset();
2489
2490 const int leftComponentCount = leftIsVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
2491 const int rightComponentCount = rightIsVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2492
2493 int leftFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2494 for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2495 {
2496 // "a" keeps track of the spatial dimension over which we are integrating in the left vector
2497 // components are allowed to span several dimensions; we keep track of the offset for the component in a_offset
2498 int a_offset = 0;
2499 bool haveLaunchedContributionToCurrentFamilyLeft = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2500 for (int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2501 {
2502 const TensorData<Scalar,DeviceType> & leftComponent = leftIsVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal)
2503 : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2504 if (!leftComponent.isValid())
2505 {
2506 // represents zero
2507 a_offset += basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal);
2508 continue;
2509 }
2510
2511 int rightFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2512 for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2513 {
2514 // "b" keeps track of the spatial dimension over which we are integrating in the right vector
2515 // components are allowed to span several dimensions; we keep track of the offset for the component in b_offset
2516 bool haveLaunchedContributionToCurrentFamilyRight = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2517 int b_offset = 0;
2518 for (int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2519 {
2520 const TensorData<Scalar,DeviceType> & rightComponent = rightIsVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal)
2521 : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2522 if (!rightComponent.isValid())
2523 {
2524 // represents zero
2525 b_offset += basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal);
2526 continue;
2527 }
2528
2529 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(leftComponent.numTensorComponents() != rightComponent.numTensorComponents(), std::invalid_argument, "left TensorData and right TensorData have different number of tensor components. This is not supported.");
2530
2531 const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2532 Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
2533
2534 // TODO: expose the options for forceNonSpecialized and usePointCacheForRank3Tensor through an IntegrationAlgorithm enumeration (a hypothetical sketch follows this list).
2535 // AUTOMATIC: let Intrepid2 choose an algorithm based on the inputs (and, perhaps, the execution space)
2536 // STANDARD: don't use sum factorization or axis alignment -- just do the simple contraction, a (p+1)^9 algorithm in 3D
2537 // SUM_FACTORIZATION // (p+1)^7 algorithm in 3D
2538 // SUM_FACTORIZATION_AXIS_ALIGNED // (p+1)^6 algorithm in 3D
2539 // SUM_FACTORIZATION_FORCE_GENERIC_IMPLEMENTATION // mainly intended for testing purposes (specialized implementations perform better when they are provided)
2540 // SUM_FACTORIZATION_WITH_POINT_CACHE // novel (p+1)^7 (in 3D) algorithm in Intrepid2; unclear as yet when and whether this may be a superior approach
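// A hypothetical sketch of such an enumeration (not part of the current interface):
// enum class IntegrationAlgorithm
// {
//   AUTOMATIC,
//   STANDARD,
//   SUM_FACTORIZATION,
//   SUM_FACTORIZATION_AXIS_ALIGNED,
//   SUM_FACTORIZATION_FORCE_GENERIC_IMPLEMENTATION,
//   SUM_FACTORIZATION_WITH_POINT_CACHE
// };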
2541 bool forceNonSpecialized = false; // We might expose this through integrate()'s arguments in the future; if we do, it should default to false.
2542 bool usePointCacheForRank3Tensor = true; // EXPERIMENTAL; performs better under CUDA, but slightly worse than the standard algorithm on serial CPU
2543
2544 // in one branch or another below, we will launch a parallel kernel that contributes to (leftFamily, rightFamily) field ordinal pairs.
2545 // if we have already launched something that contributes to that part of the integral container, we need a Kokkos fence() to ensure that these do not interfere with each other.
2546 if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2547 {
2548 ExecutionSpace().fence();
2549 }
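// (Without the fence, two kernels accumulating into the same (leftField, rightField) block of
// the integrals container could run concurrently and race on their updates.)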
2550 haveLaunchedContributionToCurrentFamilyLeft = true;
2551 haveLaunchedContributionToCurrentFamilyRight = true;
2552
2553 if (integralViewRank == 2)
2554 {
2555 if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2556 {
2557 auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2558
2559 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2560 const int teamSize = functor.teamSize(recommendedTeamSize);
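// (functor.teamSize() adjusts the recommended team size to what the functor requires, given the
// Kokkos maximum; the same pattern recurs in each branch below.)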
2561
2562 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2563
2564 Kokkos::parallel_for("F_IntegratePointValueCache rank 2", policy, functor);
2565
2566 if (approximateFlops != NULL)
2567 {
2568 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2569 }
2570 }
2571 else
2572 {
2573 auto functor = Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2574
2575 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2576 const int teamSize = functor.teamSize(recommendedTeamSize);
2577
2578 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2579
2580 Kokkos::parallel_for("F_Integrate rank 2", policy, functor);
2581
2582 if (approximateFlops != NULL)
2583 {
2584 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2585 }
2586 }
2587 }
2588 else if (integralViewRank == 3)
2589 {
2590 if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2591 {
2592 auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2593
2594 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2595 const int teamSize = functor.teamSize(recommendedTeamSize);
2596
2597 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2598
2599 Kokkos::parallel_for("F_IntegratePointValueCache rank 3", policy, functor);
2600
2601 if (approximateFlops != NULL)
2602 {
2603 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2604 }
2605 }
2606 else
2607 {
2608 auto functor = Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2609
2610 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2611 const int teamSize = functor.teamSize(recommendedTeamSize);
2612
2613 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2614
2615 Kokkos::parallel_for("F_Integrate rank 3", policy, functor);
2616
2617 if (approximateFlops != NULL)
2618 {
2619 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2620 }
2621 }
2622 }
2623 b_offset += rightIsVectorValued ? basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
2624 }
2625 rightFieldOrdinalOffset += rightIsVectorValued ? basisValuesRight.vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
2626 }
2627 a_offset += leftIsVectorValued ? basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
2628 }
2629 leftFieldOrdinalOffset += leftIsVectorValued ? basisValuesLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
2630 }
2631 }
2632// if (approximateFlops != NULL)
2633// {
2634// std::cout << "Approximate flop count (new): " << *approximateFlops << std::endl;
2635// }
2636 ExecutionSpace().fence(); // make sure we've finished writing to integrals container before we return
2637}
2638
2639} // end namespace Intrepid2
2640
2641#endif