  // STREAM-style "add" benchmark body: times c(i) = a(i) + b(i) over KERNEL_REPEATS trials
  // and returns the average time per kernel launch.
  using impl_scalar_type = typename KokkosKernels::ArithTraits<Scalar>::val_type;
  using exec_space       = typename Node::execution_space;
  using memory_space     = typename Node::memory_space;
  using range_policy     = Kokkos::RangePolicy<exec_space>;

  Kokkos::View<impl_scalar_type *, memory_space> a("a", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> b("b", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> c("c", VECTOR_SIZE);
  double total_test_time = 0.0;

  impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();

  // Fill the input vectors before timing.
  Kokkos::parallel_for(
      "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t i) {
        a(i) = ONE * (double)i;
        b(i) = a(i);  // assumed: b is initialized from a
      });
  exec_space().fence();

  using clock = std::chrono::high_resolution_clock;
  clock::time_point start, stop;

  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    Kokkos::parallel_for(
        "stream/add", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t j) {
          c(j) = a(j) + b(j);  // assumed kernel body: element-wise add
        });
    exec_space().fence();
    stop = clock::now();

    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  return total_test_time / KERNEL_REPEATS;
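// STREAM-style "copy" benchmark body (fragment). Assumed context: the same template
// parameters (Scalar, Node) and arguments (KERNEL_REPEATS, VECTOR_SIZE) as the "add"
// benchmark above; this variant times b(j) = a(j), one read and one write per element.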
  using impl_scalar_type = typename KokkosKernels::ArithTraits<Scalar>::val_type;
  using exec_space       = typename Node::execution_space;
  using memory_space     = typename Node::memory_space;
  using range_policy     = Kokkos::RangePolicy<exec_space>;

  Kokkos::View<impl_scalar_type *, memory_space> a("a", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> b("b", VECTOR_SIZE);
  double total_test_time = 0.0;

  impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();

  // Fill the source vector before timing.
  Kokkos::parallel_for(
      "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t i) {
        a(i) = ONE * (double)i;  // assumed fill body, mirroring the add benchmark above
      });
  exec_space().fence();

  using clock = std::chrono::high_resolution_clock;
  clock::time_point start, stop;

  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    Kokkos::parallel_for(
        "stream/copy", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t j) {
          b(j) = a(j);  // assumed kernel body: element-wise copy
        });
    exec_space().fence();
    stop = clock::now();

    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  return total_test_time / KERNEL_REPEATS;
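// Ping-pong benchmark body (fragment). Assumed context: arguments KERNEL_REPEATS and
// MAX_SIZE, a Teuchos::Comm<int> named comm, and output vectors sizes/times, as in
// halopong_basic below. For each message size 2^i it times blocking send/receive
// round trips between paired ranks.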
  int rank  = comm.getRank();
  int nproc = comm.getSize();

  if (nproc < 2)
    return;

  const int buff_size = (int)pow(2, MAX_SIZE);

  sizes.resize(MAX_SIZE + 1);
  times.resize(MAX_SIZE + 1);

  // Device-side send and receive buffers.
  Kokkos::View<char *, memory_space> r_buf("recv", buff_size), s_buf("send", buff_size);
  Kokkos::deep_copy(s_buf, 1);

  // Pair ranks off: each odd rank exchanges with the even rank below it.
  bool odd  = (rank % 2 == 1);  // assumed definition of the odd/even flag
  int buddy = odd ? rank - 1 : rank + 1;

  for (int i = 0; i < MAX_SIZE + 1; i++) {
    int msg_size = (int)pow(2, i);

    comm.barrier();  // assumed: synchronize before starting the timer
    double t0 = MPI_Wtime();
    for (int j = 0; j < KERNEL_REPEATS; j++) {
      if (buddy < nproc) {  // assumed guard: the last rank has no buddy when nproc is odd
        if (odd) {
          comm.send(msg_size, (char *)s_buf.data(), buddy);
          comm.receive(buddy, msg_size, (char *)r_buf.data());
        } else {
          comm.receive(buddy, msg_size, (char *)r_buf.data());
          comm.send(msg_size, (char *)s_buf.data(), buddy);
        }
      }
    }

    double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
    sizes[i] = msg_size;  // assumed: record the message size alongside its time
    times[i] = time_per_call;
  }
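// Halo-exchange ping-pong: replays the communication pattern of an Xpetra::Import in
// both forward and reverse directions using nonblocking MPI messages of size 2^i per
// neighbor, and reports the average time per (forward + reverse) exchange.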
void halopong_basic(int KERNEL_REPEATS, int MAX_SIZE,
                    const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import,
                    std::vector<int> &sizes, std::vector<double> &times) {
  int nproc = import->getSourceMap()->getComm()->getSize();
  if (nproc < 2)
    return;
#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
  // Extract the Tpetra Import and the raw MPI communicator from the Xpetra wrappers.
  using x_import_type = Xpetra::TpetraImport<LocalOrdinal, GlobalOrdinal, Node>;
  RCP<const x_import_type> Ximport = Teuchos::rcp_dynamic_cast<const x_import_type>(import);
  RCP<const Teuchos::MpiComm<int> > mcomm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(import->getSourceMap()->getComm());

  if (Ximport.is_null() || mcomm.is_null())
    return;
  MPI_Comm communicator = *mcomm->getRawMpiComm();
  auto Timport = Ximport->getTpetra_Import();
  auto distor  = Timport->getDistributor();

  // Neighbor lists and message counts from the Distributor.
  Teuchos::ArrayView<const int> procsFrom = distor.getProcsFrom();
  Teuchos::ArrayView<const int> procsTo   = distor.getProcsTo();
  int num_recvs = (int)distor.getNumReceives();
  int num_sends = (int)distor.getNumSends();

  const int buff_size_per_msg = (int)pow(2, MAX_SIZE);
  sizes.resize(MAX_SIZE + 1);
  times.resize(MAX_SIZE + 1);

  // Buffers for the forward (source-to-target) and reverse (target-to-source) exchanges.
  Kokkos::View<char *, memory_space> f_recv_buf("forward_recv", buff_size_per_msg * num_recvs),
      f_send_buf("forward_send", buff_size_per_msg * num_sends);
  Kokkos::View<char *, memory_space> r_recv_buf("reverse_recv", buff_size_per_msg * num_sends),
      r_send_buf("reverse_send", buff_size_per_msg * num_recvs);
  Kokkos::deep_copy(f_send_buf, 1);
  Kokkos::deep_copy(r_send_buf, 1);

  std::vector<MPI_Request> requests(num_sends + num_recvs);
  std::vector<MPI_Status> status(num_sends + num_recvs);

  for (int i = 0; i < MAX_SIZE + 1; i++) {
    int msg_size = (int)pow(2, i);

    MPI_Barrier(communicator);
    double t0 = MPI_Wtime();
    for (int j = 0; j < KERNEL_REPEATS; j++) {
      // Forward exchange: post all receives, then all sends, then wait.
      int ct = 0;  // assumed: running index into the requests array
      for (int r = 0; r < num_recvs; r++) {
        const int tag = 1000 + j;
        MPI_Irecv(f_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsFrom[r], tag, communicator, &requests[ct]);
        ct++;
      }
      for (int s = 0; s < num_sends; s++) {
        const int tag = 1000 + j;
        MPI_Isend(f_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsTo[s], tag, communicator, &requests[ct]);
        ct++;
      }
      MPI_Waitall(ct, requests.data(), status.data());

      // Reverse exchange: swap the roles of the send and receive neighbors.
      ct = 0;
      for (int r = 0; r < num_sends; r++) {
        const int tag = 2000 + j;
        MPI_Irecv(r_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsTo[r], tag, communicator, &requests[ct]);
        ct++;
      }
      for (int s = 0; s < num_recvs; s++) {
        const int tag = 2000 + j;
        MPI_Isend(r_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsFrom[s], tag, communicator, &requests[ct]);
        ct++;
      }
      MPI_Waitall(ct, requests.data(), status.data());
    }

    double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
    sizes[i] = msg_size;  // assumed: record the message size alongside its time
    times[i] = time_per_call;
  }
#endif
}