#if KOKKOS_VERSION >= 40799
  using impl_scalar_type = typename KokkosKernels::ArithTraits<Scalar>::val_type;
#else
  using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
#endif
  using exec_space   = typename Node::execution_space;
  using memory_space = typename Node::memory_space;
  using range_policy = Kokkos::RangePolicy<exec_space>;

  Kokkos::View<impl_scalar_type *, memory_space> a("a", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> b("b", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> c("c", VECTOR_SIZE);
  double total_test_time = 0.0;

  impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();

  // Initialize the input vectors (b is seeded from a so the add kernel reads defined data).
  Kokkos::parallel_for(
      "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t i) {
        a(i) = ONE * (double)i;
        b(i) = a(i);
      });
  exec_space().fence();

  using clock = std::chrono::high_resolution_clock;
  clock::time_point start, stop;

  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    // STREAM "add" kernel: c = a + b.
    Kokkos::parallel_for(
        "stream/add", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t j) {
          c(j) = a(j) + b(j);
        });
    exec_space().fence();  // Wait for the kernel to finish before stopping the timer.
    stop = clock::now();

    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  return total_test_time / KERNEL_REPEATS;
}
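// Note on interpreting the result: the "stream/add" kernel reads a and b and
// writes c, so each repeat moves roughly 3 * VECTOR_SIZE * sizeof(impl_scalar_type)
// bytes.  Dividing that byte count by the average time returned above gives an
// effective memory-bandwidth estimate for this execution/memory space.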
#if KOKKOS_VERSION >= 40799
  using impl_scalar_type = typename KokkosKernels::ArithTraits<Scalar>::val_type;
#else
  using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
#endif
  using exec_space   = typename Node::execution_space;
  using memory_space = typename Node::memory_space;
  using range_policy = Kokkos::RangePolicy<exec_space>;

  Kokkos::View<impl_scalar_type *, memory_space> a("a", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> b("b", VECTOR_SIZE);
  double total_test_time = 0.0;

  impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();

  // Initialize the source vector.
  Kokkos::parallel_for(
      "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t i) {
        a(i) = ONE * (double)i;
      });
  exec_space().fence();

  using clock = std::chrono::high_resolution_clock;
  clock::time_point start, stop;

  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    // STREAM "copy" kernel: b = a.
    Kokkos::parallel_for(
        "stream/copy", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t j) {
          b(j) = a(j);
        });
    exec_space().fence();  // Wait for the kernel to finish before stopping the timer.
    stop = clock::now();

    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  return total_test_time / KERNEL_REPEATS;
}
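// Minimal helper sketch (an assumption, not one of the routines above): converts
// an average kernel time and a byte count into GB/s.  For the add test the byte
// count is roughly 3.0 * VECTOR_SIZE * sizeof(impl_scalar_type); for the copy
// test it is 2.0 * VECTOR_SIZE * sizeof(impl_scalar_type).
inline double stream_bandwidth_GBs(double avg_time_sec, double bytes_moved) {
  return (avg_time_sec > 0.0) ? bytes_moved / avg_time_sec / 1.0e9 : 0.0;
}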
  // Ping-pong message benchmark body; assumes the enclosing routine provides
  // KERNEL_REPEATS, MAX_SIZE, a Teuchos::Comm<int> named comm, the memory_space
  // alias, and the output vectors sizes / times.
  int rank  = comm.getRank();
  int nproc = comm.getSize();

  if (nproc < 2)
    return;

  const int buff_size = (int)pow(2, MAX_SIZE);

  sizes.resize(MAX_SIZE + 1);
  times.resize(MAX_SIZE + 1);

  // Send and receive buffers.
  Kokkos::View<char *, memory_space> r_buf("recv", buff_size), s_buf("send", buff_size);
  Kokkos::deep_copy(s_buf, 1);

  // Pair each rank with a neighbor: odd ranks talk to rank-1, even ranks to rank+1.
  int odd   = rank % 2;
  int buddy = odd ? rank - 1 : rank + 1;

  for (int i = 0; i < MAX_SIZE + 1; i++) {
    int msg_size = (int)pow(2, i);

    comm.barrier();
    double t0 = MPI_Wtime();
    for (int j = 0; j < KERNEL_REPEATS; j++) {
      if (buddy < nproc) {
        // The two sides of a pair use opposite send/receive orders so the
        // blocking calls cannot deadlock.
        if (odd) {
          comm.send(msg_size, (char *)s_buf.data(), buddy);
          comm.receive(buddy, msg_size, (char *)r_buf.data());
        } else {
          comm.receive(buddy, msg_size, (char *)r_buf.data());
          comm.send(msg_size, (char *)s_buf.data(), buddy);
        }
      }
    }

    double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
    sizes[i]             = msg_size;
    times[i]             = time_per_call;
  }
}
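// Note on the timing above: each repeat is one full round trip (a send and a
// receive on each side), so dividing the elapsed time by 2 * KERNEL_REPEATS
// estimates the one-way time per message; at the smallest message sizes this is
// essentially the point-to-point latency.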
template <class LocalOrdinal, class GlobalOrdinal, class Node>
void halopong_basic(int KERNEL_REPEATS, int MAX_SIZE,
                    const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import,
                    std::vector<int> &sizes, std::vector<double> &times) {
  int nproc = import->getSourceMap()->getComm()->getSize();
  if (nproc < 2)
    return;
#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
  using memory_space = typename Node::memory_space;

  // Recover the underlying Tpetra import and the raw MPI communicator.
  using x_import_type              = Xpetra::TpetraImport<LocalOrdinal, GlobalOrdinal, Node>;
  RCP<const x_import_type> Ximport = Teuchos::rcp_dynamic_cast<const x_import_type>(import);
  RCP<const Teuchos::MpiComm<int> > mcomm =
      Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(import->getSourceMap()->getComm());

  // Bail out before dereferencing either cast result.
  if (Ximport.is_null() || mcomm.is_null())
    return;
  MPI_Comm communicator = *mcomm->getRawMpiComm();

  auto Timport = Ximport->getTpetra_Import();
  auto distor  = Timport->getDistributor();
  Teuchos::ArrayView<const int> procsFrom = distor.getProcsFrom();
  Teuchos::ArrayView<const int> procsTo   = distor.getProcsTo();
  int num_recvs                           = (int)distor.getNumReceives();
  int num_sends                           = (int)distor.getNumSends();

  const int buff_size_per_msg = (int)pow(2, MAX_SIZE);
  sizes.resize(MAX_SIZE + 1);
  times.resize(MAX_SIZE + 1);

  Kokkos::View<char *, memory_space> f_recv_buf("forward_recv", buff_size_per_msg * num_recvs),
      f_send_buf("forward_send", buff_size_per_msg * num_sends);
  Kokkos::View<char *, memory_space> r_recv_buf("reverse_recv", buff_size_per_msg * num_sends),
      r_send_buf("reverse_send", buff_size_per_msg * num_recvs);
  Kokkos::deep_copy(f_send_buf, 1);
  Kokkos::deep_copy(r_send_buf, 1);

  std::vector<MPI_Request> requests(num_sends + num_recvs);
  std::vector<MPI_Status> status(num_sends + num_recvs);
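  // The same request/status arrays serve both phases: each phase posts at most
  // num_sends + num_recvs nonblocking operations before its MPI_Waitall.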
  for (int i = 0; i < MAX_SIZE + 1; i++) {
    int msg_size = (int)pow(2, i);

    MPI_Barrier(communicator);
    double t0 = MPI_Wtime();
    for (int j = 0; j < KERNEL_REPEATS; j++) {
      int ct = 0;
      // Forward exchange: post receives from procsFrom and sends to procsTo.
      for (int r = 0; r < num_recvs; r++) {
        const int tag = 1000 + j;
        MPI_Irecv(f_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsFrom[r], tag, communicator, &requests[ct]);
        ct++;
      }
      for (int s = 0; s < num_sends; s++) {
        const int tag = 1000 + j;
        MPI_Isend(f_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsTo[s], tag, communicator, &requests[ct]);
        ct++;
      }
      MPI_Waitall(ct, requests.data(), status.data());

      // Reverse exchange: the roles of senders and receivers are swapped.
      ct = 0;
      for (int r = 0; r < num_sends; r++) {
        const int tag = 2000 + j;
        MPI_Irecv(r_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsTo[r], tag, communicator, &requests[ct]);
        ct++;
      }
      for (int s = 0; s < num_recvs; s++) {
        const int tag = 2000 + j;
        MPI_Isend(r_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsFrom[s], tag, communicator, &requests[ct]);
        ct++;
      }
      MPI_Waitall(ct, requests.data(), status.data());
    }

    double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
    sizes[i]             = msg_size;
    times[i]             = time_per_call;
  }
#endif
}
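// Note on the timing above: each repeat performs one forward and one reverse
// halo exchange, so dividing by 2 * KERNEL_REPEATS gives the average time for a
// single exchange of msg_size bytes per neighbor over this import's
// communication pattern.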