MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_PerfModels_decl.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// MueLu: A package for multigrid based preconditioning
4//
5// Copyright 2012 NTESS and the MueLu contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef MUELU_PERFMODELS_HPP
11#define MUELU_PERFMODELS_HPP
12
13#include "MueLu_ConfigDefs.hpp"
14#include "Xpetra_Import_fwd.hpp"
15
16#include <vector>
17#include <ostream>
18#include <Teuchos_DefaultComm.hpp>
19
21
22namespace MueLu {
23
24template <class Scalar,
27 class Node = DefaultNode>
29 public:
30 PerfModels();
32
33 /* Single Node tests based upon the STREAM benchmark for measuring memory
34 * bandwidth and computation rate. These processes compute either the addition
35 * of two vectors or the multiplication of dense matrices of any given size.
36 * Many iterations occur which then return a vector containing the individual
37 * lengths of time per iteration.
38 *
39 * See further here:
40 * - https://www.cs.virginia.edu/stream/ref.html
41 * - https://github.com/UoB-HPC/BabelStream
42 */
43
44 /* This version is for table interpolation and works on chars, so the LOG_MAX_SIZE is for bytes */
45 void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE = 20);
46 bool has_stream_vector_table() const { return stream_sizes_.size() > 0; }
47
48 /* Lookup in the stream_vector table */
49 double stream_vector_copy_lookup(int SIZE_IN_BYTES);
50 double stream_vector_add_lookup(int SIZE_IN_BYTES);
51 double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES);
52 double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES);
53
54 // Uses the faster of the tables. The time is then divided by the number of memory transactions
55 // per element in the kernel (e.g. 2 for COPY and 3 for ADD).
56 double stream_vector_lookup(int SIZE_IN_BYTES);
57 double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES);
58
59 /* Print table */
60 void print_stream_vector_table(std::ostream &out, const std::string &prefix = "");
61 void print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix = "");
62
63 /* A latency test between two processes based upon the MVAPICH OSU Micro-Benchmarks.
64 * The sender process sends a message and then waits for confirmation of reception.
65 * Many iterations occur with various message sizes and the average latency values
66 * are returned within a map. Utilizes blocking send and receive.
67 *
68 * See further: https://mvapich.cse.ohio-state.edu/benchmarks/
69 */
70 void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm);
71 bool has_pingpong_table() const { return pingpong_sizes_.size() > 0; }
72
73 /* Lookup in the pingpong table */
74 double pingpong_host_lookup(int SIZE_IN_BYTES);
75 double pingpong_device_lookup(int SIZE_IN_BYTES);
76
77 /* Print table */
78 void print_pingpong_table(std::ostream &out, const std::string &prefix = "");
79
80 /* A halo-exchange based ping-pong, inspired by halo-mode in MPPTEST from ANL.
81 * Here we use exactly the communication pattern specified in the import object
82 * and send messages accordingly. We vary the size in bytes sent per message,
83 * which should capture max-rate effects to some degree.
84 *
85 * See further: https://www.mcs.anl.gov/research/projects/mpi/mpptest/
86 */
87 void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import);
88 bool has_halopong_table() const { return halopong_sizes_.size() > 0; }
89
90 /* Lookup in the halopong table */
91 double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE);
92 double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE);
93
94 /* Print table */
95 void print_halopong_table(std::ostream &out, const std::string &prefix = "");
96
97 /* Estimate launch latency based on the cost of submitting an empty Kokkos::parallel_for.
98 * This is necessary to correct the memory bandwidth costs for models on high latency platforms,
99 * e.g., GPUs.
100 */
101 void launch_latency_make_table(int KERNEL_REPEATS);
103
104 /* Lookup launch latency */
105 double launch_latency_lookup();
106
107 /* Print table */
108 void print_launch_latency_table(std::ostream &out, const std::string &prefix = "");
109
110 private:
111 void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix);
112
113 std::vector<int> stream_sizes_;
114 std::vector<double> stream_copy_times_;
115 std::vector<double> stream_add_times_;
118
119 std::vector<int> pingpong_sizes_;
120 std::vector<double> pingpong_host_times_;
121 std::vector<double> pingpong_device_times_;
122
123 std::vector<int> halopong_sizes_;
124 std::vector<double> halopong_host_times_;
125 std::vector<double> halopong_device_times_;
126
128
129}; // class PerfModels
130
131} // namespace MueLu
132
133#endif // ifndef MUELU_PERFMODELS_HPP
MueLu::DefaultLocalOrdinal LocalOrdinal
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
MueLu::DefaultNode Node
std::vector< int > halopong_sizes_
void print_halopong_table(std::ostream &out, const std::string &prefix="")
std::vector< int > pingpong_sizes_
void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Xpetra::Import< LocalOrdinal, GlobalOrdinal, Node > > &import)
void print_launch_latency_table(std::ostream &out, const std::string &prefix="")
std::vector< double > halopong_device_times_
void print_stream_vector_table(std::ostream &out, const std::string &prefix="")
std::vector< int > stream_sizes_
double stream_vector_copy_lookup(int SIZE_IN_BYTES)
void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20)
bool has_launch_latency_table() const
double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES)
void print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix="")
void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix)
std::vector< double > latency_corrected_stream_copy_times_
void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Teuchos::Comm< int > > &comm)
std::vector< double > pingpong_device_times_
bool has_stream_vector_table() const
double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES)
double pingpong_device_lookup(int SIZE_IN_BYTES)
std::vector< double > latency_corrected_stream_add_times_
double pingpong_host_lookup(int SIZE_IN_BYTES)
double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES)
double stream_vector_add_lookup(int SIZE_IN_BYTES)
void print_pingpong_table(std::ostream &out, const std::string &prefix="")
double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
std::vector< double > stream_copy_times_
double stream_vector_lookup(int SIZE_IN_BYTES)
std::vector< double > halopong_host_times_
std::vector< double > stream_add_times_
std::vector< double > pingpong_host_times_
void launch_latency_make_table(int KERNEL_REPEATS)
Namespace for MueLu classes and methods.
Tpetra::KokkosClassic::DefaultNode::DefaultNodeType DefaultNode