Compadre 1.6.4
Compadre_ApplyTargetEvaluations.hpp
// @HEADER
// *****************************************************************************
// Compadre: COMpatible PArticle Discretization and REmap Toolkit
//
// Copyright 2018 NTESS and the Compadre contributors.
// SPDX-License-Identifier: BSD-2-Clause
// *****************************************************************************
// @HEADER
#ifndef _COMPADRE_APPLY_TARGET_EVALUATIONS_HPP_
#define _COMPADRE_APPLY_TARGET_EVALUATIONS_HPP_

#include "Compadre_GMLS.hpp"
namespace Compadre {

/*! \brief For applying the evaluations from a target functional to the polynomial coefficients
    \param data         [in/out] - GMLSSolutionData struct (stores solution in data._d_ss._alphas)
    \param teamMember   [in] - Kokkos::TeamPolicy member type (created by parallel_for)
    \param Q            [in] - 2D Kokkos View containing the polynomial coefficients
    \param P_target_row [in] - 2D Kokkos View where the evaluation of the polynomial basis is stored
*/
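/* Illustrative usage sketch (not part of the original header): this kernel is
   intended to be called once per target from inside a Kokkos::TeamPolicy
   parallel_for, after Q and P_target_row have been filled by earlier stages of
   the GMLS pipeline. The names `solution_data`, `num_targets`, `scratch_bytes`,
   and the scratch-view dimensions below are hypothetical placeholders for
   objects constructed elsewhere in the library.

       Kokkos::parallel_for(Kokkos::TeamPolicy<>(num_targets, Kokkos::AUTO)
               .set_scratch_size(0, Kokkos::PerTeam(scratch_bytes)),
               KOKKOS_LAMBDA(const member_type& teamMember) {
           // team-scratch views; dimensions are placeholders
           scratch_matrix_right_type Q(teamMember.team_scratch(0), num_rows_Q, num_cols_Q);
           scratch_matrix_right_type P_target_row(teamMember.team_scratch(0), num_rows_P, num_cols_P);
           // ... fill Q and P_target_row for this team's target ...
           applyTargetsToCoefficients(solution_data, teamMember, Q, P_target_row);
       });
*/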
template <typename SolutionData>
KOKKOS_INLINE_FUNCTION
void applyTargetsToCoefficients(const SolutionData& data, const member_type& teamMember, scratch_matrix_right_type Q, scratch_matrix_right_type P_target_row) {

    const int target_index = data._initial_index_for_batch + teamMember.league_rank();

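    // Each team handles the single target selected above via league_rank().
    // The two branches below compute the same alpha coefficients: a
    // hierarchically parallel path for CUDA builds and a serial-per-team
    // path for CPU builds.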
#if defined(COMPADRE_USE_CUDA)
//    // GPU
//    for (int j=0; j<_operations.size(); ++j) {
//        for (int k=0; k<_lro_output_tile_size[j]; ++k) {
//            for (int m=0; m<_lro_input_tile_size[j]; ++m) {
//                const int alpha_offset = (_lro_total_offsets[j] + m*_lro_output_tile_size[j] + k)*_neighbor_lists(target_index,0);
//                const int P_offset = _basis_multiplier*target_NP*(_lro_total_offsets[j] + m*_lro_output_tile_size[j] + k);
//                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,
//                        _pc._nla.getNumberOfNeighborsDevice(target_index)), [=] (const int i) {
//
//                    double alpha_ij = 0;
//                    if (_sampling_multiplier>1 && m<_sampling_multiplier) {
//                        const int m_neighbor_offset = i+m*_pc._nla.getNumberOfNeighborsDevice(target_index);
//                        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember,
//                                _basis_multiplier*target_NP), [=] (const int l, double &talpha_ij) {
//                            //for (int l=0; l<_basis_multiplier*target_NP; ++l) {
//                            talpha_ij += P_target_row(P_offset + l)*Q(ORDER_INDICES(m_neighbor_offset,l));
//                        }, alpha_ij);
//                        //}
//                    } else if (_sampling_multiplier == 1) {
//                        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember,
//                                _basis_multiplier*target_NP), [=] (const int l, double &talpha_ij) {
//                            //for (int l=0; l<_basis_multiplier*target_NP; ++l) {
//                            talpha_ij += P_target_row(P_offset + l)*Q(ORDER_INDICES(i,l));
//                        }, alpha_ij);
//                        //}
//                    }
//                    Kokkos::single(Kokkos::PerThread(teamMember), [&] () {
//                        t1(i) = alpha_ij;
//                    });
//                });
//                Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember,
//                        _pc._nla.getNumberOfNeighborsDevice(target_index)), [=] (const int i) {
//                    _alphas(ORDER_INDICES(target_index, alpha_offset + i)) = t1(i);
//                });
//                teamMember.team_barrier();
//            }
//        }
//    }

    // GPU
    const auto n_evaluation_sites_per_target = data.additional_number_of_neighbors_list(target_index) + 1;
    const auto nn = data.number_of_neighbors_list(target_index);
    auto alphas = data._d_ss._alphas;
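    // The TeamThreadRange below distributes neighbor indices across the
    // threads of the team; for each (evaluation site e, operation j, output
    // component k, input component m) tuple, a ThreadVectorRange reduction
    // over the this_num_cols basis columns accumulates one alpha value.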
    Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,
            nn + data._d_ss._added_alpha_size), [&] (const int i) {
        for (int e=0; e<n_evaluation_sites_per_target; ++e) {
            for (int j=0; j<(int)data.operations_size; ++j) {
                for (int k=0; k<data._d_ss._lro_output_tile_size[j]; ++k) {
                    for (int m=0; m<data._d_ss._lro_input_tile_size[j]; ++m) {
                        const int offset_index_jmke = data._d_ss.getTargetOffsetIndex(j,m,k,e);
                        const int alphas_index = data._d_ss.getAlphaIndex(target_index, offset_index_jmke);
                        double alpha_ij = 0;
                        if (data._sampling_multiplier>1 && m<data._sampling_multiplier) {
                            const int m_neighbor_offset = i+m*nn;
                            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, data.this_num_cols),
                                    [&] (int& l, double& t_alpha_ij) {
                                t_alpha_ij += P_target_row(offset_index_jmke, l)*Q(l, m_neighbor_offset);

                                compadre_kernel_assert_extreme_debug(P_target_row(offset_index_jmke, l)==P_target_row(offset_index_jmke, l)
                                        && "NaN in P_target_row matrix.");
                                compadre_kernel_assert_extreme_debug(Q(l, m_neighbor_offset)==Q(l, m_neighbor_offset)
                                        && "NaN in Q coefficient matrix.");

                            }, alpha_ij);
                        } else if (data._sampling_multiplier == 1) {
                            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, data.this_num_cols),
                                    [&] (int& l, double& t_alpha_ij) {
                                t_alpha_ij += P_target_row(offset_index_jmke, l)*Q(l, i);

                                compadre_kernel_assert_extreme_debug(P_target_row(offset_index_jmke, l)==P_target_row(offset_index_jmke, l)
                                        && "NaN in P_target_row matrix.");
                                compadre_kernel_assert_extreme_debug(Q(l, i)==Q(l, i)
                                        && "NaN in Q coefficient matrix.");

                            }, alpha_ij);
                        }
                        // could use a PerThread here, but performance takes a hit
                        // and it isn't necessary
                        alphas(alphas_index+i) = alpha_ij;
                        compadre_kernel_assert_extreme_debug(alpha_ij==alpha_ij && "NaN in alphas.");
                    }
                }
            }
        }
    });
#else

    // CPU
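    // With one thread per team on CPU, plain serial loops are fastest. Writes
    // go through this_alphas, a 2D unmanaged view aliasing this target's slab
    // of the global alphas array (rows indexed by offset_index_jmke, columns
    // by neighbor index).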
    const int alphas_per_tile_per_target = data.number_of_neighbors_list(target_index) + data._d_ss._added_alpha_size;
    const global_index_type base_offset_index_jmke = data._d_ss.getTargetOffsetIndex(0,0,0,0);
    const global_index_type base_alphas_index = data._d_ss.getAlphaIndex(target_index, base_offset_index_jmke);

    scratch_matrix_right_type this_alphas(data._d_ss._alphas.data() + TO_GLOBAL(base_alphas_index), data._d_ss._total_alpha_values*data._d_ss._max_evaluation_sites_per_target, alphas_per_tile_per_target);

    auto n_evaluation_sites_per_target = data.additional_number_of_neighbors_list(target_index) + 1;
    const auto nn = data.number_of_neighbors_list(target_index);
    for (int e=0; e<n_evaluation_sites_per_target; ++e) {
        // evaluating alpha_ij
        for (size_t j=0; j<data.operations_size; ++j) {
            for (int k=0; k<data._d_ss._lro_output_tile_size[j]; ++k) {
                for (int m=0; m<data._d_ss._lro_input_tile_size[j]; ++m) {
                    const int offset_index_jmke = data._d_ss.getTargetOffsetIndex(j,m,k,e);
                    for (int i=0; i<nn + data._d_ss._added_alpha_size; ++i) {
                        double alpha_ij = 0;
                        const int Q_col = i+m*nn;
                        // one thread per team on CPU, so no reason to add inner parallel reductions
                        // or Kokkos::single here; either would cause significant slowdown
                        for (int l=0; l<data.this_num_cols; ++l) {
                            if (data._sampling_multiplier>1 && m<data._sampling_multiplier) {

                                alpha_ij += P_target_row(offset_index_jmke, l)*Q(l, Q_col);

                                compadre_kernel_assert_extreme_debug(P_target_row(offset_index_jmke, l)==P_target_row(offset_index_jmke, l)
                                        && "NaN in P_target_row matrix.");
                                compadre_kernel_assert_extreme_debug(Q(l, Q_col)==Q(l, Q_col)
                                        && "NaN in Q coefficient matrix.");

                            } else if (data._sampling_multiplier == 1) {

                                alpha_ij += P_target_row(offset_index_jmke, l)*Q(l, i);

                                compadre_kernel_assert_extreme_debug(P_target_row(offset_index_jmke, l)==P_target_row(offset_index_jmke, l)
                                        && "NaN in P_target_row matrix.");
                                compadre_kernel_assert_extreme_debug(Q(l, i)==Q(l, i)
                                        && "NaN in Q coefficient matrix.");

                            } else {
                                alpha_ij += 0;
                            }
                        }
                        this_alphas(offset_index_jmke,i) = alpha_ij;
                    }
                }
            }
        }
    }
#endif

    teamMember.team_barrier();
}

} // Compadre
#endif