MueLu_PerfModels_def.hpp
// @HEADER
// *****************************************************************************
// MueLu: A package for multigrid based preconditioning
//
// Copyright 2012 NTESS and the MueLu contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#include "MueLu_PerfModels_decl.hpp"

#include <cstdio>
#include <cmath>
#include <numeric>
#include <utility>
#include <chrono>
#include <iomanip>
#include <Teuchos_ScalarTraits.hpp>
#include <KokkosKernels_ArithTraits.hpp>
#include <Xpetra_Import.hpp>
#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
#include <Xpetra_TpetraImport.hpp>
#include <Tpetra_Import.hpp>
#include <Tpetra_Distributor.hpp>
#include <mpi.h>
#endif

#ifdef HAVE_MPI
#include <mpi.h>
#endif

namespace MueLu {

namespace PerfDetails {
template <class Scalar, class Node>
double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE) {
  // PerfDetails' STREAM routines need to be instantiated on impl_scalar_type, not Scalar
  using impl_scalar_type = typename KokkosKernels::ArithTraits<Scalar>::val_type;

  using exec_space   = typename Node::execution_space;
  using memory_space = typename Node::memory_space;
  using range_policy = Kokkos::RangePolicy<exec_space>;

  Kokkos::View<impl_scalar_type *, memory_space> a("a", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> b("b", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> c("c", VECTOR_SIZE);
  double total_test_time = 0.0;

  impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();

  Kokkos::parallel_for(
      "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t i) {
        a(i) = ONE * (double)i;
        b(i) = a(i);
      });
  exec_space().fence();

  using clock = std::chrono::high_resolution_clock;

  clock::time_point start, stop;

  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    Kokkos::parallel_for(
        "stream/add", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t j) {  // Vector addition
          c(j) = a(j) + b(j);
        });

    exec_space().fence();
    stop = clock::now();
    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  return total_test_time / KERNEL_REPEATS;
}
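
// Example (illustrative sketch, not from the original file): converting the averaged
// kernel time returned above into an effective bandwidth. The add kernel touches three
// Scalars per element (two loads, one store), hence the factor of 3 below; `Node`
// stands for whatever Kokkos node type the caller instantiates.
//
//   double t  = PerfDetails::stream_vector_add<double, Node>(100, 1 << 20);
//   double bw = PerfDetails::convert_time_to_bandwidth_gbs(t, 1, 3.0 * (1 << 20) * sizeof(double));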

template <class Scalar, class Node>
double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE) {
  // PerfDetails' STREAM routines need to be instantiated on impl_scalar_type, not Scalar
  using impl_scalar_type = typename KokkosKernels::ArithTraits<Scalar>::val_type;

  using exec_space   = typename Node::execution_space;
  using memory_space = typename Node::memory_space;
  using range_policy = Kokkos::RangePolicy<exec_space>;

  Kokkos::View<impl_scalar_type *, memory_space> a("a", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> b("b", VECTOR_SIZE);
  double total_test_time = 0.0;

  impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();

  Kokkos::parallel_for(
      "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t i) {
        a(i) = ONE;
      });
  exec_space().fence();

  using clock = std::chrono::high_resolution_clock;
  clock::time_point start, stop;

  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    Kokkos::parallel_for(
        "stream/copy", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t j) {  // Vector copy
          b(j) = a(j);
        });

    exec_space().fence();
    stop = clock::now();
    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  return total_test_time / KERNEL_REPEATS;
}
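
// Note (illustrative): the copy kernel touches two Scalars per element (one load, one
// store) versus three for the add kernel. For 2^20 doubles, one copy trip therefore
// moves 2 * 2^20 * 8 bytes = 16 MiB while one add trip moves 24 MiB; dividing the raw
// times by 2 and 3 respectively (as stream_vector_make_table() does below) puts the two
// kernels on a common per-transaction footing.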

double table_lookup(const std::vector<int> &x, const std::vector<double> &y, int value) {
  // If there's no table, return NaN
  if (x.size() == 0) return Teuchos::ScalarTraits<double>::nan();

  // NOTE: This should probably be a binary search, but this isn't performance sensitive, so we'll go simple
  int N = (int)x.size();
  int hi = 0;
  for (; hi < N; hi++) {
    if (x[hi] > value)
      break;
  }

  if (hi == 0) {
    // Lower end (return the min time)
    // printf("Lower end: %d < %d\n",value,x[0]);
    return y[0];
  } else if (hi == N) {
    // Upper end (extrapolate from the last two points)
    // printf("Upper end: %d > %d\n",value,x[N-1]);
    hi = N - 1;
    int run = x[hi] - x[hi - 1];
    double rise = y[hi] - y[hi - 1];
    double slope = rise / run;
    int diff = value - x[hi - 1];

    return y[hi - 1] + slope * diff;
  } else {
    // Interpolate
    // printf("Middle: %d < %d < %d\n",x[hi-1],value,x[hi]);
    int run = x[hi] - x[hi - 1];
    double rise = y[hi] - y[hi - 1];
    double slope = rise / run;
    int diff = value - x[hi - 1];

    return y[hi - 1] + slope * diff;
  }
}
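
// Example (illustrative, not from the original file): with x = {1024, 2048, 4096} and
// y = {1.0e-6, 1.5e-6, 2.5e-6}:
//   table_lookup(x, y, 3072) -> 1.5e-6 + (2.5e-6 - 1.5e-6) / (4096 - 2048) * (3072 - 2048) = 2.0e-6
//   table_lookup(x, y,  512) -> y[0] = 1.0e-6   (below the table: clamp to the smallest entry)
//   table_lookup(x, y, 8192) -> 4.5e-6          (above the table: extrapolate the last segment)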

// Report bandwidth in GB / sec
const double GB = 1024.0 * 1024.0 * 1024.0;
double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes) {
  double time_per_call = time / num_calls;
  return memory_per_call_bytes / GB / time_per_call;
}
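
// Example (illustrative): 10 calls taking 0.5 s total, each moving 1 GiB:
//   convert_time_to_bandwidth_gbs(0.5, 10, 1024.0 * 1024.0 * 1024.0)
//     = (1 GiB / GB) / (0.5 s / 10) = 1.0 / 0.05 = 20 GB/s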

template <class exec_space, class memory_space>
void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE, const Teuchos::Comm<int> &comm, std::vector<int> &sizes, std::vector<double> &times) {
#ifdef HAVE_MPI
  int rank  = comm.getRank();
  int nproc = comm.getSize();

  if (nproc < 2) return;

  const int buff_size = (int)pow(2, MAX_SIZE);

  sizes.resize(MAX_SIZE + 1);
  times.resize(MAX_SIZE + 1);

  // Allocate memory for the buffers (and fill send)
  Kokkos::View<char *, memory_space> r_buf("recv", buff_size), s_buf("send", buff_size);
  Kokkos::deep_copy(s_buf, 1);

  // Send and receive.
  // NOTE: We pair consecutive ranks as buddies here for simplicity. We should be smarter later
  int odd   = rank % 2;
  int buddy = odd ? rank - 1 : rank + 1;

  for (int i = 0; i < MAX_SIZE + 1; i++) {
    int msg_size = (int)pow(2, i);
    comm.barrier();

    double t0 = MPI_Wtime();
    for (int j = 0; j < KERNEL_REPEATS; j++) {
      if (buddy < nproc) {
        if (odd) {
          comm.send(msg_size, (char *)s_buf.data(), buddy);
          comm.receive(buddy, msg_size, (char *)r_buf.data());
        } else {
          comm.receive(buddy, msg_size, (char *)r_buf.data());
          comm.send(msg_size, (char *)s_buf.data(), buddy);
        }
      }
    }

    double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
    sizes[i]             = msg_size;
    times[i]             = time_per_call;
  }
#else
  return;
#endif
}
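
// Interpretation note (illustrative): each repeat above is a full round trip, so the
// divide by 2.0 * KERNEL_REPEATS makes times[i] the approximate one-way cost of a
// 2^i byte message. Feeding the (sizes, times) pair to table_lookup() then yields a
// simple message-cost model, e.g.:
//
//   double t = PerfDetails::table_lookup(sizes, times, 100);  // estimated cost of a 100-byte message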

template <class exec_space, class memory_space, class LocalOrdinal, class GlobalOrdinal, class Node>
void halopong_basic(int KERNEL_REPEATS, int MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import, std::vector<int> &sizes, std::vector<double> &times) {
  int nproc = import->getSourceMap()->getComm()->getSize();
  if (nproc < 2) return;
#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
  // NOTE: We need to get the distributor here, which means we need Tpetra, since Xpetra does
  // not have a distributor interface
  using x_import_type = Xpetra::TpetraImport<LocalOrdinal, GlobalOrdinal, Node>;
  RCP<const x_import_type> Ximport = Teuchos::rcp_dynamic_cast<const x_import_type>(import);
  RCP<const Teuchos::MpiComm<int> > mcomm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(import->getSourceMap()->getComm());
  if (Ximport.is_null() || mcomm.is_null()) return;
  MPI_Comm communicator = *mcomm->getRawMpiComm();

  auto Timport = Ximport->getTpetra_Import();
  auto distor  = Timport->getDistributor();

  // Distributor innards
  Teuchos::ArrayView<const int> procsFrom = distor.getProcsFrom();
  Teuchos::ArrayView<const int> procsTo   = distor.getProcsTo();
  int num_recvs = (int)distor.getNumReceives();
  int num_sends = (int)distor.getNumSends();

  const int buff_size_per_msg = (int)pow(2, MAX_SIZE);
  sizes.resize(MAX_SIZE + 1);
  times.resize(MAX_SIZE + 1);

  // Allocate memory for the buffers (and fill send)
  Kokkos::View<char *, memory_space> f_recv_buf("forward_recv", buff_size_per_msg * num_recvs), f_send_buf("forward_send", buff_size_per_msg * num_sends);
  Kokkos::View<char *, memory_space> r_recv_buf("reverse_recv", buff_size_per_msg * num_sends), r_send_buf("reverse_send", buff_size_per_msg * num_recvs);
  Kokkos::deep_copy(f_send_buf, 1);
  Kokkos::deep_copy(r_send_buf, 1);

  std::vector<MPI_Request> requests(num_sends + num_recvs);
  std::vector<MPI_Status> status(num_sends + num_recvs);

  for (int i = 0; i < MAX_SIZE + 1; i++) {
    int msg_size = (int)pow(2, i);

    MPI_Barrier(communicator);

    double t0 = MPI_Wtime();
    for (int j = 0; j < KERNEL_REPEATS; j++) {
      int ct = 0;
      // Recv/Send the forward messages
      for (int r = 0; r < num_recvs; r++) {
        const int tag = 1000 + j;
        MPI_Irecv(f_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsFrom[r], tag, communicator, &requests[ct]);
        ct++;
      }
      for (int s = 0; s < num_sends; s++) {
        const int tag = 1000 + j;
        MPI_Isend(f_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsTo[s], tag, communicator, &requests[ct]);
        ct++;
      }
      // Wait for the forward messages
      MPI_Waitall(ct, requests.data(), status.data());

      ct = 0;
      // Recv/Send the reverse messages
      for (int r = 0; r < num_sends; r++) {
        const int tag = 2000 + j;
        MPI_Irecv(r_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsTo[r], tag, communicator, &requests[ct]);
        ct++;
      }
      for (int s = 0; s < num_recvs; s++) {
        const int tag = 2000 + j;
        MPI_Isend(r_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsFrom[s], tag, communicator, &requests[ct]);
        ct++;
      }
      // Wait for the reverse messages
      MPI_Waitall(ct, requests.data(), status.data());
    }

    double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
    sizes[i]             = msg_size;
    times[i]             = time_per_call;
  }

#endif
}
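
// Interpretation note (illustrative): each repeat is a forward halo exchange followed by
// a reverse one, so dividing by 2.0 * KERNEL_REPEATS makes times[i] the approximate cost
// of a single halo exchange in which every message carries 2^i bytes. Unlike the pingpong
// test, this uses the actual communication pattern (procsFrom/procsTo) of the Import object.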

}  // end namespace PerfDetails

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::PerfModels()
  : launch_and_wait_latency_(-1.0) {}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::~PerfModels() = default;

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE) {
  // We need launch/wait latency estimates for the corrected stream numbers
  launch_latency_make_table(KERNEL_REPEATS);
  double latency = launch_latency_lookup();

  if (LOG_MAX_SIZE < 2)
    LOG_MAX_SIZE = 20;

  stream_sizes_.resize(LOG_MAX_SIZE + 1);
  stream_copy_times_.resize(LOG_MAX_SIZE + 1);
  stream_add_times_.resize(LOG_MAX_SIZE + 1);
  latency_corrected_stream_copy_times_.resize(LOG_MAX_SIZE + 1);
  latency_corrected_stream_add_times_.resize(LOG_MAX_SIZE + 1);

  for (int i = 0; i < LOG_MAX_SIZE + 1; i++) {
    int size      = (int)pow(2, i);
    double c_time = PerfDetails::stream_vector_copy<Scalar, Node>(KERNEL_REPEATS, size);
    double a_time = PerfDetails::stream_vector_add<Scalar, Node>(KERNEL_REPEATS, size);

    stream_sizes_[i] = size;

    // Correct for the difference in memory transactions per element
    stream_copy_times_[i] = c_time / 2.0;
    stream_add_times_[i]  = a_time / 3.0;

    // Correct for launch latency too. Sometimes the latency estimate is higher than
    // the measured copy/add time; if so, we don't correct.
    latency_corrected_stream_copy_times_[i] = (c_time - latency <= 0.0) ? c_time / 2.0 : ((c_time - latency) / 2.0);
    latency_corrected_stream_add_times_[i]  = (a_time - latency <= 0.0) ? a_time / 3.0 : ((a_time - latency) / 3.0);
  }
}
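
// Example (illustrative usage sketch; SC/LO/GO/NO are hypothetical placeholders for the
// caller's Scalar/LocalOrdinal/GlobalOrdinal/Node types):
//
//   MueLu::PerfModels<SC, LO, GO, NO> model;
//   model.stream_vector_make_table(100, 20);          // 100 repeats per size, sizes 2^0..2^20 Scalars
//   double t = model.stream_vector_lookup(n_bytes);   // interpolated time for an n_bytes-sized vector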

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_copy_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, stream_copy_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_add_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, stream_add_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_lookup(int SIZE_IN_BYTES) {
  // Use the faster of the copy and add estimates as the achievable stream time
  return std::min(stream_vector_copy_lookup(SIZE_IN_BYTES), stream_vector_add_lookup(SIZE_IN_BYTES));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, latency_corrected_stream_copy_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, latency_corrected_stream_add_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES) {
  // Use the faster of the copy and add estimates as the achievable stream time
  return std::min(latency_corrected_stream_vector_copy_lookup(SIZE_IN_BYTES), latency_corrected_stream_vector_add_lookup(SIZE_IN_BYTES));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table(std::ostream &out, const std::string &prefix) {
  print_stream_vector_table_impl(out, false, prefix);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix) {
  print_stream_vector_table_impl(out, true, prefix);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix) {
  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Length in Scalars" << setw(1) << " "
      << setw(20) << "COPY (us)" << setw(1) << " "
      << setw(20) << "ADD (us)" << setw(1) << " "
      << setw(20) << "COPY (GB/s)" << setw(1) << " "
      << setw(20) << "ADD (GB/s)" << std::endl;

  out << prefix
      << setw(20) << "-----------------" << setw(1) << " "
      << setw(20) << "---------" << setw(1) << " "
      << setw(20) << "--------" << setw(1) << " "
      << setw(20) << "-----------" << setw(1) << " "
      << setw(20) << "----------" << std::endl;

  for (int i = 0; i < (int)stream_sizes_.size(); i++) {
    int size      = stream_sizes_[i];
    double c_time = use_latency_correction ? latency_corrected_stream_copy_times_[i] : stream_copy_times_[i];
    double a_time = use_latency_correction ? latency_corrected_stream_add_times_[i] : stream_add_times_[i];
    // We've already corrected for the transactions-per-element difference
    double c_bw = PerfDetails::convert_time_to_bandwidth_gbs(c_time, 1, size * sizeof(Scalar));
    double a_bw = PerfDetails::convert_time_to_bandwidth_gbs(a_time, 1, size * sizeof(Scalar));

    out << prefix
        << setw(20) << size << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (c_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (a_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << c_bw << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << a_bw << std::endl;
  }

  out.copyfmt(old_format);
}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm) {
  PerfDetails::pingpong_basic<Kokkos::HostSpace::execution_space, Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, pingpong_sizes_, pingpong_host_times_);

  PerfDetails::pingpong_basic<typename Node::execution_space, typename Node::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, pingpong_sizes_, pingpong_device_times_);
}
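
// Design note (illustrative): the table is built twice, once with host buffers and once
// with buffers in the Node's memory space. On GPU builds the second run exercises
// device-resident messaging (e.g. GPU-aware MPI), so the two tables let the model
// distinguish host-staged from device-direct communication costs.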

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_host_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(pingpong_sizes_, pingpong_host_times_, SIZE_IN_BYTES);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_device_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(pingpong_sizes_, pingpong_device_times_, SIZE_IN_BYTES);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_pingpong_table(std::ostream &out, const std::string &prefix) {
  if (pingpong_sizes_.size() == 0) return;

  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Message Size" << setw(1) << " "
      << setw(20) << "Host (us)" << setw(1) << " "
      << setw(20) << "Device (us)" << std::endl;

  out << prefix
      << setw(20) << "------------" << setw(1) << " "
      << setw(20) << "---------" << setw(1) << " "
      << setw(20) << "-----------" << std::endl;

  for (int i = 0; i < (int)pingpong_sizes_.size(); i++) {
    int size      = pingpong_sizes_[i];
    double h_time = pingpong_host_times_[i];
    double d_time = pingpong_device_times_[i];

    out << prefix
        << setw(20) << size << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (h_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (d_time * 1e6) << std::endl;
  }

  out.copyfmt(old_format);
}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import) {
  PerfDetails::halopong_basic<Kokkos::HostSpace::execution_space, Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, import, halopong_sizes_, halopong_host_times_);

  PerfDetails::halopong_basic<typename Node::execution_space, typename Node::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, import, halopong_sizes_, halopong_device_times_);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_host_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(halopong_sizes_, halopong_host_times_, SIZE_IN_BYTES);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_device_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(halopong_sizes_, halopong_device_times_, SIZE_IN_BYTES);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_halopong_table(std::ostream &out, const std::string &prefix) {
  if (halopong_sizes_.size() == 0) return;

  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Message Size" << setw(1) << " "
      << setw(20) << "Host (us)" << setw(1) << " "
      << setw(20) << "Device (us)" << std::endl;

  out << prefix
      << setw(20) << "------------" << setw(1) << " "
      << setw(20) << "---------" << setw(1) << " "
      << setw(20) << "-----------" << std::endl;

  for (int i = 0; i < (int)halopong_sizes_.size(); i++) {
    int size      = halopong_sizes_[i];
    double h_time = halopong_host_times_[i];
    double d_time = halopong_device_times_[i];

    out << prefix
        << setw(20) << size << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (h_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (d_time * 1e6) << std::endl;
  }

  out.copyfmt(old_format);
}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::launch_latency_make_table(int KERNEL_REPEATS) {
  using exec_space   = typename Node::execution_space;
  using range_policy = Kokkos::RangePolicy<exec_space>;
  using clock        = std::chrono::high_resolution_clock;

  double total_test_time = 0;
  clock::time_point start, stop;
  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    Kokkos::parallel_for(
        "empty kernel", range_policy(0, 1), KOKKOS_LAMBDA(const size_t j) {
          ;
        });
    exec_space().fence();
    stop = clock::now();
    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  launch_and_wait_latency_ = total_test_time / KERNEL_REPEATS;
}
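
// Example (illustrative): timing an empty kernel isolates the fixed launch + fence cost.
// stream_vector_make_table() subtracts this from the measured copy/add times, so that,
// e.g., a 10 us copy measurement with a 4 us launch latency is tabulated as
// (10 us - 4 us) / 2 = 3 us per memory transaction rather than 5 us.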

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::launch_latency_lookup() {
  return launch_and_wait_latency_;
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_launch_latency_table(std::ostream &out, const std::string &prefix) {
  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " "
      << setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_ * 1e6) << std::endl;

  out.copyfmt(old_format);
}

}  // namespace MueLu