MueLu Version of the Day
MueLu_PerfModels_def.hpp
1// @HEADER
2// *****************************************************************************
3// MueLu: A package for multigrid based preconditioning
4//
5// Copyright 2012 NTESS and the MueLu contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
11
12#include <cstdio>
13#include <cmath>
14#include <numeric>
15#include <utility>
16#include <chrono>
17#include <iomanip>
18#include <Teuchos_ScalarTraits.hpp>
19#if KOKKOS_VERSION >= 40799
20#include <KokkosKernels_ArithTraits.hpp>
21#else
22#include <Kokkos_ArithTraits.hpp>
23#endif
24#include <Xpetra_Import.hpp>
25#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
26#include <Xpetra_TpetraImport.hpp>
27#include <Tpetra_Import.hpp>
28#include <Tpetra_Distributor.hpp>
29#include <mpi.h>
30#endif
31
32#ifdef HAVE_MPI
33#include <mpi.h>
34#endif
35
36namespace MueLu {
37
38namespace PerfDetails {
39template <class Scalar, class Node>
40double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE) {
41 // PerfDetails' STREAM routines need to be instantiated on impl_scalar_type, not Scalar
42#if KOKKOS_VERSION >= 40799
43 using impl_scalar_type = typename KokkosKernels::ArithTraits<Scalar>::val_type;
44#else
45 using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
46#endif
47
48 using exec_space = typename Node::execution_space;
49 using memory_space = typename Node::memory_space;
50 using range_policy = Kokkos::RangePolicy<exec_space>;
51
52 Kokkos::View<impl_scalar_type *, memory_space> a("a", VECTOR_SIZE);
53 Kokkos::View<impl_scalar_type *, memory_space> b("b", VECTOR_SIZE);
54 Kokkos::View<impl_scalar_type *, memory_space> c("c", VECTOR_SIZE);
55 double total_test_time = 0.0;
56
57 impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();
58
59 Kokkos::parallel_for(
60 "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t i) {
61 a(i) = ONE * (double)i;
62 b(i) = a(i);
63 });
64 exec_space().fence();
65
66 using clock = std::chrono::high_resolution_clock;
67
68 clock::time_point start, stop;
69
70 for (int i = 0; i < KERNEL_REPEATS; i++) {
71 start = clock::now();
72 Kokkos::parallel_for(
73 "stream/add", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t j) { // Vector Addition
74 c(j) = a(j) + b(j);
75 });
76
77 exec_space().fence();
78 stop = clock::now();
79 double my_test_time = std::chrono::duration<double>(stop - start).count();
80 total_test_time += my_test_time;
81 }
82
83 return total_test_time / KERNEL_REPEATS;
84}
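// Illustrative usage (not part of the original source; the repeat count, the
// 2^20-element length, and the choice of Scalar=double are assumptions):
// average the "add" kernel over 100 repeats and convert the result to an
// effective bandwidth.  The add kernel touches three arrays per element
// (read a, read b, write c), hence the factor of 3 in the bytes-per-call estimate.
//
//   double t  = PerfDetails::stream_vector_add<double, Node>(100, 1 << 20);
//   double bw = PerfDetails::convert_time_to_bandwidth_gbs(t, 1, 3.0 * (1 << 20) * sizeof(double));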
85
86template <class Scalar, class Node>
87double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE) {
88 // PerfDetails' STREAM routines need to be instantiated on impl_scalar_type, not Scalar
89#if KOKKOS_VERSION >= 40799
90 using impl_scalar_type = typename KokkosKernels::ArithTraits<Scalar>::val_type;
91#else
92 using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
93#endif
94
95 using exec_space = typename Node::execution_space;
96 using memory_space = typename Node::memory_space;
97 using range_policy = Kokkos::RangePolicy<exec_space>;
98
99 Kokkos::View<impl_scalar_type *, memory_space> a("a", VECTOR_SIZE);
100 Kokkos::View<impl_scalar_type *, memory_space> b("b", VECTOR_SIZE);
101 double total_test_time = 0.0;
102
103 impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();
104
105 Kokkos::parallel_for(
106 "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t i) {
107 a(i) = ONE;
108 });
109 exec_space().fence();
110
111 using clock = std::chrono::high_resolution_clock;
112 clock::time_point start, stop;
113
114 for (int i = 0; i < KERNEL_REPEATS; i++) {
115 start = clock::now();
116 Kokkos::parallel_for(
117 "stream/copy", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t j) { // Vector Copy
118 b(j) = a(j);
119 });
120
121 exec_space().fence();
122 stop = clock::now();
123 double my_test_time = std::chrono::duration<double>(stop - start).count();
124 total_test_time += my_test_time;
125 }
126
127 return total_test_time / KERNEL_REPEATS;
128}
129
130double table_lookup(const std::vector<int> &x, const std::vector<double> &y, int value) {
131 // If there's no table, return NaN
132 if (x.size() == 0) return Teuchos::ScalarTraits<double>::nan();
133
134 // NOTE: This should probably be a binary search, but this isn't performance sensitive, so we'll go simple
135 int N = (int)x.size();
136 int hi = 0;
137 for (; hi < N; hi++) {
138 if (x[hi] > value)
139 break;
140 }
141
142 if (hi == 0) {
143 // Lower end (return the min time)
144 // printf("Lower end: %d < %d\n",value,x[0]);
145 return y[0];
146 } else if (hi == N) {
147 // Higher end (extrapolate from the last two points)
148 // printf("Upper end: %d > %d\n",value,x[N-1]);
149 hi = N - 1;
150 int run = x[hi] - x[hi - 1];
151 double rise = y[hi] - y[hi - 1];
152 double slope = rise / run;
153 int diff = value - x[hi - 1];
154
155 return y[hi - 1] + slope * diff;
156 } else {
157 // Interpolate
158 // printf("Middle: %d < %d < %d\n",x[hi-1],value,x[hi]);
159 int run = x[hi] - x[hi - 1];
160 double rise = y[hi] - y[hi - 1];
161 double slope = rise / run;
162 int diff = value - x[hi - 1];
163
164 return y[hi - 1] + slope * diff;
165 }
166}
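// Worked example of table_lookup with hypothetical table values: for
// x = {1024, 2048, 4096} and y = {1.0e-6, 1.8e-6, 3.4e-6}, a query of
// value = 3072 falls between x[1] and x[2], so the slope is
// (3.4e-6 - 1.8e-6) / (4096 - 2048) = 7.8125e-10 and the result is
// 1.8e-6 + 7.8125e-10 * (3072 - 2048) = 2.6e-6 seconds.  Queries below
// x.front() return y.front(); queries above x.back() extrapolate from the
// last two points using the same slope formula.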
167
168// Report bandwidth in GB / sec
169const double GB = 1024.0 * 1024.0 * 1024.0;
170double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes) {
171 double time_per_call = time / num_calls;
172 return memory_per_call_bytes / GB / time_per_call;
173}
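// Example (hypothetical numbers): 10 calls that together take 1.0e-3 seconds
// and move 8 MiB each give time_per_call = 1.0e-4 s and a bandwidth of
// (8 * 1024 * 1024) / GB / 1.0e-4 = 0.0078125 / 1.0e-4 = 78.125 GB/s.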
174
175template <class exec_space, class memory_space>
176void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE, const Teuchos::Comm<int> &comm, std::vector<int> &sizes, std::vector<double> &times) {
177#ifdef HAVE_MPI
178 int rank = comm.getRank();
179 int nproc = comm.getSize();
180
181 if (nproc < 2) return;
182
183 const int buff_size = (int)pow(2, MAX_SIZE);
184
185 sizes.resize(MAX_SIZE + 1);
186 times.resize(MAX_SIZE + 1);
187
188 // Allocate memory for the buffers (and fill send)
189 Kokkos::View<char *, memory_space> r_buf("recv", buff_size), s_buf("send", buff_size);
190 Kokkos::deep_copy(s_buf, 1);
191
192 // Send and receive.
193 // NOTE: Pair consecutive ranks as buddies here for simplicity. We could be smarter about pairing later.
194 int odd = rank % 2;
195 int buddy = odd ? rank - 1 : rank + 1;
196
197 for (int i = 0; i < MAX_SIZE + 1; i++) {
198 int msg_size = (int)pow(2, i);
199 comm.barrier();
200
201 double t0 = MPI_Wtime();
202 for (int j = 0; j < KERNEL_REPEATS; j++) {
203 if (buddy < nproc) {
204 if (odd) {
205 comm.send(msg_size, (char *)s_buf.data(), buddy);
206 comm.receive(buddy, msg_size, (char *)r_buf.data());
207 } else {
208 comm.receive(buddy, msg_size, (char *)r_buf.data());
209 comm.send(msg_size, (char *)s_buf.data(), buddy);
210 }
211 }
212 }
213
214 double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
215 sizes[i] = msg_size;
216 times[i] = time_per_call;
217 }
218#else
219 return;
220#endif
221}
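// Note on the timing above: each repeat performs one full round trip (a send
// plus a matching receive on both buddies), so dividing the elapsed time by
// 2 * KERNEL_REPEATS yields an estimate of the one-way time per message.
// For example (hypothetical numbers), if 100 repeats of an 8-byte exchange
// take 4.0e-4 seconds, the stored time is 4.0e-4 / (2 * 100) = 2.0e-6 s,
// i.e. roughly 2 microseconds one way.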
222
223template <class exec_space, class memory_space, class LocalOrdinal, class GlobalOrdinal, class Node>
224void halopong_basic(int KERNEL_REPEATS, int MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import, std::vector<int> &sizes, std::vector<double> &times) {
225 int nproc = import->getSourceMap()->getComm()->getSize();
226 if (nproc < 2) return;
227#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
228 // NOTE: We need to get the distributor here, which means we need Tpetra, since Xpetra does
229 // not have a distributor interface
230 using x_import_type = Xpetra::TpetraImport<LocalOrdinal, GlobalOrdinal, Node>;
231 RCP<const x_import_type> Ximport = Teuchos::rcp_dynamic_cast<const x_import_type>(import);
232 RCP<const Teuchos::MpiComm<int> > mcomm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(import->getSourceMap()->getComm());
233 MPI_Comm communicator = *mcomm->getRawMpiComm();
234
235 if (Ximport.is_null() || mcomm.is_null()) return;
236 auto Timport = Ximport->getTpetra_Import();
237 auto distor = Timport->getDistributor();
238
239 // Distributor innards
240 Teuchos::ArrayView<const int> procsFrom = distor.getProcsFrom();
241 Teuchos::ArrayView<const int> procsTo = distor.getProcsTo();
242 int num_recvs = (int)distor.getNumReceives();
243 int num_sends = (int)distor.getNumSends();
244
245 const int buff_size_per_msg = (int)pow(2, MAX_SIZE);
246 sizes.resize(MAX_SIZE + 1);
247 times.resize(MAX_SIZE + 1);
248
249 // Allocate memory for the buffers (and fill send)
250 Kokkos::View<char *, memory_space> f_recv_buf("forward_recv", buff_size_per_msg * num_recvs), f_send_buf("forward_send", buff_size_per_msg * num_sends);
251 Kokkos::View<char *, memory_space> r_recv_buf("reverse_recv", buff_size_per_msg * num_sends), r_send_buf("reverse_send", buff_size_per_msg * num_recvs);
252 Kokkos::deep_copy(f_send_buf, 1);
253 Kokkos::deep_copy(r_send_buf, 1);
254
255 std::vector<MPI_Request> requests(num_sends + num_recvs);
256 std::vector<MPI_Status> status(num_sends + num_recvs);
257
258 for (int i = 0; i < MAX_SIZE + 1; i++) {
259 int msg_size = (int)pow(2, i);
260
261 MPI_Barrier(communicator);
262
263 double t0 = MPI_Wtime();
264 for (int j = 0; j < KERNEL_REPEATS; j++) {
265 int ct = 0;
266 // Recv/Send the forward messages
267 for (int r = 0; r < num_recvs; r++) {
268 const int tag = 1000 + j;
269 MPI_Irecv(f_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsFrom[r], tag, communicator, &requests[ct]);
270 ct++;
271 }
272 for (int s = 0; s < num_sends; s++) {
273 const int tag = 1000 + j;
274 MPI_Isend(f_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsTo[s], tag, communicator, &requests[ct]);
275 ct++;
276 }
277 // Wait for the forward messages
278 MPI_Waitall(ct, requests.data(), status.data());
279
280 ct = 0;
281 // Recv/Send the reverse messages
282 for (int r = 0; r < num_sends; r++) {
283 const int tag = 2000 + j;
284 MPI_Irecv(r_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsTo[r], tag, communicator, &requests[ct]);
285 ct++;
286 }
287 for (int s = 0; s < num_recvs; s++) {
288 const int tag = 2000 + j;
289 MPI_Isend(r_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsFrom[s], tag, communicator, &requests[ct]);
290 ct++;
291 }
292 // Wait for the reverse messages
293 MPI_Waitall(ct, requests.data(), status.data());
294 }
295
296 double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
297 sizes[i] = msg_size;
298 times[i] = time_per_call;
299 }
300
301#endif
302}
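// Note on halopong_basic: each repeat performs a forward halo exchange (one
// MPI_Irecv per process in procsFrom and one MPI_Isend per process in
// procsTo) followed by the reverse exchange with the roles swapped, so the
// stored time, elapsed / (2 * KERNEL_REPEATS), is the average cost of a
// single one-direction halo exchange at that message size.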
303
304} // end namespace PerfDetails
305
306template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
309
310template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
312
313/****************************************************************************************/
314/****************************************************************************************/
315/****************************************************************************************/
316
317template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
318void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE) {
319 // We need a launch/wait latency estimate for the latency-corrected stream times
320 launch_latency_make_table(KERNEL_REPEATS);
321 double latency = launch_latency_lookup();
322
323 if (LOG_MAX_SIZE < 2)
324 LOG_MAX_SIZE = 20;
325
326 stream_sizes_.resize(LOG_MAX_SIZE + 1);
327 stream_copy_times_.resize(LOG_MAX_SIZE + 1);
328 stream_add_times_.resize(LOG_MAX_SIZE + 1);
329 latency_corrected_stream_copy_times_.resize(LOG_MAX_SIZE + 1);
330 latency_corrected_stream_add_times_.resize(LOG_MAX_SIZE + 1);
331
332 for (int i = 0; i < LOG_MAX_SIZE + 1; i++) {
333 int size = (int)pow(2, i);
334 double c_time = PerfDetails::stream_vector_copy<Scalar, Node>(KERNEL_REPEATS, size);
335 double a_time = PerfDetails::stream_vector_add<Scalar, Node>(KERNEL_REPEATS, size);
336
337 stream_sizes_[i] = size;
338
339 // Correct for the difference in memory transactions per element
340 stream_copy_times_[i] = c_time / 2.0;
341 stream_add_times_[i] = a_time / 3.0;
342
343 // Correct for launch latency too. We'll note that sometimes the latency estimate
344 // is higher than the actual copy/add time estimate. If so, we don't correct
345 latency_corrected_stream_copy_times_[i] = (c_time - latency <= 0.0) ? c_time / 2.0 : ((c_time - latency) / 2.0);
346 latency_corrected_stream_add_times_[i] = (a_time - latency <= 0.0) ? a_time / 3.0 : ((a_time - latency) / 3.0);
347 }
348}
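// Note on the stored times above: the raw copy time is divided by 2 and the
// raw add time by 3 because copy moves two words per element (one read, one
// write) and add moves three (two reads, one write).  When these
// per-transaction times are later converted to bandwidth with a memory size
// of size * sizeof(Scalar), the result equals the total bytes actually moved
// divided by the raw kernel time.  For example (hypothetical numbers), a copy
// of 2^20 doubles measured at 1.0e-3 s is stored as 5.0e-4 s; the reported
// bandwidth, (2^20 * 8) / GB / 5.0e-4 = 15.625 GB/s, matches the 16 MiB
// actually moved in 1.0e-3 s.  The launch/wait latency is subtracted first
// for the latency-corrected tables, unless that would make the time
// non-positive, in which case the uncorrected value is kept.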
349
350template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
351double
352PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_copy_lookup(int SIZE_IN_BYTES) {
353 return PerfDetails::table_lookup(stream_sizes_, stream_copy_times_, SIZE_IN_BYTES / sizeof(Scalar));
354}
355
356template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
357double
358PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_add_lookup(int SIZE_IN_BYTES) {
359 return PerfDetails::table_lookup(stream_sizes_, stream_add_times_, SIZE_IN_BYTES / sizeof(Scalar));
360}
361
362template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
363double
364PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_lookup(int SIZE_IN_BYTES) {
365 return std::min(stream_vector_copy_lookup(SIZE_IN_BYTES), stream_vector_add_lookup(SIZE_IN_BYTES));
366}
367
368template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
369double
370PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES) {
371 return PerfDetails::table_lookup(stream_sizes_, latency_corrected_stream_copy_times_, SIZE_IN_BYTES / sizeof(Scalar));
372}
373
374template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
375double
376PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES) {
377 return PerfDetails::table_lookup(stream_sizes_, latency_corrected_stream_add_times_, SIZE_IN_BYTES / sizeof(Scalar));
378}
379
380template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
381double
382PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES) {
383 return std::min(latency_corrected_stream_vector_copy_lookup(SIZE_IN_BYTES), latency_corrected_stream_vector_add_lookup(SIZE_IN_BYTES));
384}
385
386template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
387void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table(std::ostream &out, const std::string &prefix) {
388 print_stream_vector_table_impl(out, false, prefix);
389}
390
391template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
392void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix) {
393 print_stream_vector_table_impl(out, true, prefix);
394}
395
396template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
397void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix) {
398 using namespace std;
399 std::ios old_format(NULL);
400 old_format.copyfmt(out);
401
402 out << prefix
403 << setw(20) << "Length in Scalars" << setw(1) << " "
404 << setw(20) << "COPY (us)" << setw(1) << " "
405 << setw(20) << "ADD (us)" << setw(1) << " "
406 << setw(20) << "COPY (GB/s)" << setw(1) << " "
407 << setw(20) << "ADD (GB/s)" << std::endl;
408
409 out << prefix
410 << setw(20) << "-----------------" << setw(1) << " "
411 << setw(20) << "---------" << setw(1) << " "
412 << setw(20) << "--------" << setw(1) << " "
413 << setw(20) << "-----------" << setw(1) << " "
414 << setw(20) << "----------" << std::endl;
415
416 for (int i = 0; i < (int)stream_sizes_.size(); i++) {
417 int size = stream_sizes_[i];
418 double c_time = use_latency_correction ? latency_corrected_stream_copy_times_[i] : stream_copy_times_[i];
419 double a_time = use_latency_correction ? latency_corrected_stream_add_times_[i] : stream_add_times_[i];
420 // We've already corrected for the transactions per element difference
421 double c_bw = PerfDetails::convert_time_to_bandwidth_gbs(c_time, 1, size * sizeof(Scalar));
422 double a_bw = PerfDetails::convert_time_to_bandwidth_gbs(a_time, 1, size * sizeof(Scalar));
423
424 out << prefix
425 << setw(20) << size << setw(1) << " "
426 << setw(20) << fixed << setprecision(4) << (c_time * 1e6) << setw(1) << " "
427 << setw(20) << fixed << setprecision(4) << (a_time * 1e6) << setw(1) << " "
428 << setw(20) << fixed << setprecision(4) << c_bw << setw(1) << " "
429 << setw(20) << fixed << setprecision(4) << a_bw << std::endl;
430 }
431
432 out.copyfmt(old_format);
433}
434
435/****************************************************************************************/
436/****************************************************************************************/
437/****************************************************************************************/
438
439template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
440void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm) {
441 PerfDetails::pingpong_basic<Kokkos::HostSpace::execution_space, Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, pingpong_sizes_, pingpong_host_times_);
442
443 PerfDetails::pingpong_basic<typename Node::execution_space, typename Node::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, pingpong_sizes_, pingpong_device_times_);
444}
445
446template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
447double
448PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_host_lookup(int SIZE_IN_BYTES) {
449 return PerfDetails::table_lookup(pingpong_sizes_, pingpong_host_times_, SIZE_IN_BYTES);
450}
451
452template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
453double
454PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_device_lookup(int SIZE_IN_BYTES) {
455 return PerfDetails::table_lookup(pingpong_sizes_, pingpong_device_times_, SIZE_IN_BYTES);
456}
457
458template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
459void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_pingpong_table(std::ostream &out, const std::string &prefix) {
460 if (pingpong_sizes_.size() == 0) return;
461
462 using namespace std;
463 std::ios old_format(NULL);
464 old_format.copyfmt(out);
465
466 out << prefix
467 << setw(20) << "Message Size" << setw(1) << " "
468 << setw(20) << "Host (us)" << setw(1) << " "
469 << setw(20) << "Device (us)" << std::endl;
470
471 out << prefix
472 << setw(20) << "------------" << setw(1) << " "
473 << setw(20) << "---------" << setw(1) << " "
474 << setw(20) << "-----------" << std::endl;
475
476 for (int i = 0; i < (int)pingpong_sizes_.size(); i++) {
477 int size = pingpong_sizes_[i];
478 double h_time = pingpong_host_times_[i];
479 double d_time = pingpong_device_times_[i];
480
481 out << prefix
482 << setw(20) << size << setw(1) << " "
483 << setw(20) << fixed << setprecision(4) << (h_time * 1e6) << setw(1) << " "
484 << setw(20) << fixed << setprecision(4) << (d_time * 1e6) << setw(1) << std::endl;
485 }
486
487 out.copyfmt(old_format);
488}
489
490/****************************************************************************************/
491/****************************************************************************************/
492/****************************************************************************************/
493template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
494void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import) {
495 PerfDetails::halopong_basic<Kokkos::HostSpace::execution_space, Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, import, halopong_sizes_, halopong_host_times_);
496
497 PerfDetails::halopong_basic<typename Node::execution_space, typename Node::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, import, halopong_sizes_, halopong_device_times_);
498}
499
500template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
501double
502PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE) {
503 return PerfDetails::table_lookup(halopong_sizes_, halopong_host_times_, SIZE_IN_BYTES);
504}
505
506template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
507double
508PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE) {
509 return PerfDetails::table_lookup(halopong_sizes_, halopong_device_times_, SIZE_IN_BYTES);
510}
511
512template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
513void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_halopong_table(std::ostream &out, const std::string &prefix) {
514 if (halopong_sizes_.size() == 0) return;
515
516 using namespace std;
517 std::ios old_format(NULL);
518 old_format.copyfmt(out);
519
520 out << prefix
521 << setw(20) << "Message Size" << setw(1) << " "
522 << setw(20) << "Host (us)" << setw(1) << " "
523 << setw(20) << "Device (us)" << std::endl;
524
525 out << prefix
526 << setw(20) << "------------" << setw(1) << " "
527 << setw(20) << "---------" << setw(1) << " "
528 << setw(20) << "-----------" << std::endl;
529
530 for (int i = 0; i < (int)halopong_sizes_.size(); i++) {
531 int size = halopong_sizes_[i];
532 double h_time = halopong_host_times_[i];
533 double d_time = halopong_device_times_[i];
534
535 out << prefix
536 << setw(20) << size << setw(1) << " "
537 << setw(20) << fixed << setprecision(4) << (h_time * 1e6) << setw(1) << " "
538 << setw(20) << fixed << setprecision(4) << (d_time * 1e6) << setw(1) << std::endl;
539 }
540
541 out.copyfmt(old_format);
542}
543
544/****************************************************************************************/
545/****************************************************************************************/
546/****************************************************************************************/
547
548template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
549void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::launch_latency_make_table(int KERNEL_REPEATS) {
550 using exec_space = typename Node::execution_space;
551 using range_policy = Kokkos::RangePolicy<exec_space>;
552 using clock = std::chrono::high_resolution_clock;
553
554 double total_test_time = 0;
555 clock::time_point start, stop;
556 for (int i = 0; i < KERNEL_REPEATS; i++) {
557 start = clock::now();
558 Kokkos::parallel_for(
559 "empty kernel", range_policy(0, 1), KOKKOS_LAMBDA(const size_t j) {
560 ;
561 });
562 exec_space().fence();
563 stop = clock::now();
564 double my_test_time = std::chrono::duration<double>(stop - start).count();
565 total_test_time += my_test_time;
566 }
567
568 launch_and_wait_latency_ = total_test_time / KERNEL_REPEATS;
569}
570
571template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
572double
573PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::launch_latency_lookup() {
574  return launch_and_wait_latency_;
575}
576
577template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
578void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_launch_latency_table(std::ostream &out, const std::string &prefix) {
579 using namespace std;
580 std::ios old_format(NULL);
581 old_format.copyfmt(out);
582
583 out << prefix
584 << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " "
585 << setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_ * 1e6) << std::endl;
586
587 out.copyfmt(old_format);
588}
589
590} // namespace MueLu