ParallelMatrixOperations.hpp
#pragma once

#ifndef PRECICE_NO_MPI

#include <Eigen/Core>
#include <memory>
#include <stddef.h>
#include <string>
#include <vector>

#include "com/Communication.hpp"
#include "com/Request.hpp"
#include "com/SharedPointer.hpp"
#include "logging/LogMacros.hpp"
#include "logging/Logger.hpp"
#include "precice/impl/Types.hpp"
#include "utils/IntraComm.hpp"
#include "utils/assertion.hpp"

namespace precice::acceleration::impl {

class ParallelMatrixOperations {
public:

  /// Initializes the acceleration.
  void initialize(const bool needCyclicComm);

  template <typename Derived1, typename Derived2>
  void multiply(
      Eigen::PlainObjectBase<Derived1> &leftMatrix,
      Eigen::PlainObjectBase<Derived2> &rightMatrix,
      Eigen::PlainObjectBase<Derived2> &result,
      const std::vector<int> &offsets,
      int p, int q, int r,
      bool cyclicComm = true,
      bool dotProductComputation = true)
  {
    PRECICE_TRACE();
    PRECICE_ASSERT(result.cols() == rightMatrix.cols(), result.cols(), rightMatrix.cols());
    PRECICE_ASSERT(leftMatrix.cols() == rightMatrix.rows(), leftMatrix.cols(), rightMatrix.rows());

    // serial computation on a single processor
    if (!utils::IntraComm::isParallel()) {
      result.noalias() = leftMatrix * rightMatrix;

      // parallel computation on p processors
    } else {
      PRECICE_ASSERT(utils::IntraComm::getCommunication() != nullptr);
      PRECICE_ASSERT(utils::IntraComm::getCommunication()->isConnected());

      // The result matrix is of size (p x r).
      // If the cyclic communication is needed, we use block-wise matrix-matrix multiplication.
      if (cyclicComm) {
        PRECICE_ASSERT(_needCyclicComm);
        PRECICE_ASSERT(_cyclicCommLeft != nullptr);
        PRECICE_ASSERT(_cyclicCommLeft->isConnected());
        PRECICE_ASSERT(_cyclicCommRight != nullptr);
        PRECICE_ASSERT(_cyclicCommRight->isConnected());

        _multiply_cyclic(leftMatrix, rightMatrix, result, offsets, p, q, r);

        // if the cyclic communication is not needed (usually when p equals the number of
        // columns of the least-squares system), perform the parallel multiplication based
        // on dot products
      } else {
        if (dotProductComputation)
          _multiplyNM_dotProduct(leftMatrix, rightMatrix, result, offsets, p, q, r);
        else
          _multiplyNM_block(leftMatrix, rightMatrix, result, offsets, p, q, r);
      }
    }
  }
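
  // A minimal usage sketch (hypothetical names and sizes; assumes initialize(true) was
  // called so that the cyclic communication ring exists, and that offsets holds the row
  // offsets of all ranks with offsets[getSize()] == p):
  //
  //   ParallelMatrixOperations ops;
  //   ops.initialize(true);
  //   ops.multiply(Wtil, Z, J, offsets, p, q, r); // distributed product J = Wtil * Z
  //
  // Passing cyclicComm = false selects the dot-product or SAXPY-like fallback instead.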

  /**
   * @brief Method computes the matrix-matrix/matrix-vector product of a (p x q) matrix that
   *        is distributed column-wise among the ranks; the local partial products are summed
   *        up across all ranks, so every rank holds the complete (p x r) result.
   */
  template <typename Derived1, typename Derived2, typename Derived3>
  void multiply(
      const Eigen::MatrixBase<Derived1> &leftMatrix,
      const Eigen::MatrixBase<Derived2> &rightMatrix,
      Eigen::PlainObjectBase<Derived3> &result,
      int p, int q, int r)
  {
    PRECICE_TRACE();
    PRECICE_ASSERT(leftMatrix.rows() == p, leftMatrix.rows(), p);
    PRECICE_ASSERT(leftMatrix.cols() == rightMatrix.rows(), leftMatrix.cols(), rightMatrix.rows());
    PRECICE_ASSERT(result.rows() == p, result.rows(), p);
    PRECICE_ASSERT(result.cols() == r, result.cols(), r);

    Eigen::MatrixXd localResult(result.rows(), result.cols());
    localResult.noalias() = leftMatrix * rightMatrix;

    // serial computation on a single processor
    if (!utils::IntraComm::isParallel()) {
      result = localResult;
    } else {
      utils::IntraComm::allreduceSum(localResult, result);
    }
  }
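
  // Sketch of the data layout above: each rank owns a block of columns of leftMatrix and
  // the matching block of rows of rightMatrix, so every local product is a (p x r) partial
  // sum; allreduceSum adds the partial sums, e.g., for two ranks
  //   leftMatrix * rightMatrix = [A1 A2] * [B1; B2] = A1*B1 + A2*B2.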

private:
  logging::Logger _log{"acceleration::ParallelMatrixOperations"};

  /// Multiplies matrices based on a cyclic communication and block-wise matrix multiplication.
  template <typename Derived1, typename Derived2>
  void _multiply_cyclic(
      Eigen::PlainObjectBase<Derived1> &leftMatrix,
      Eigen::PlainObjectBase<Derived2> &rightMatrix,
      Eigen::PlainObjectBase<Derived2> &result,
      const std::vector<int> &offsets,
      int p, int q, int r)
  {
    PRECICE_TRACE();
    /*
     * For multiplication W_til * Z = J
     * -----------------------------------------------------------------------
     * p = n_global, q = m, r = n_global_primary
     *
     * leftMatrix:  local: (n_local x m)                 global: (n_global x m)
     * rightMatrix: local: (m x n_local_primary)         global: (m x n_global_primary)
     * result:      local: (n_global x n_local_primary)  global: (n_global x n_global_primary)
     * -----------------------------------------------------------------------
     */
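    /*
     * Example (hypothetical sizes): with two ranks, n_local = {2, 3} and n_local_primary = {1, 2},
     * we get offsets = {0, 2, 5}, p = n_global = 5, q = m, r = n_global_primary = 3; each rank
     * holds an (n_local x m) slice of W_til, an (m x n_local_primary) slice of Z, and assembles
     * the full (5 x n_local_primary) column slice of J.
     */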

    PRECICE_ASSERT(_needCyclicComm);
    PRECICE_ASSERT(leftMatrix.cols() == q, leftMatrix.cols(), q);
    PRECICE_ASSERT(result.rows() == p, result.rows(), p);

    // ranks form a ring: each rank sends to (rank + 1) % size via _cyclicCommRight and
    // receives from the previous rank via _cyclicCommLeft
    int prevProc = (utils::IntraComm::getRank() - 1 < 0) ? utils::IntraComm::getSize() - 1 : utils::IntraComm::getRank() - 1;
    int rows_rcv = (prevProc > 0) ? offsets[prevProc + 1] - offsets[prevProc] : offsets[1];
    Eigen::MatrixXd leftMatrix_rcv(rows_rcv, q);
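
    // Example of the ring indexing (assuming 3 ranks): rank 0 receives from rank 2, rank 1
    // from rank 0, rank 2 from rank 1; rows_rcv is the row count owned by the sending rank,
    // read from the offsets table (offsets[prevProc + 1] - offsets[prevProc], or offsets[1]
    // for prevProc == 0).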

    com::PtrRequest requestSend;
    com::PtrRequest requestRcv;

    // initiate asynchronous send operation of leftMatrix (W_til) --> nextProc (this data is needed in cycle 1) dim: n_local x cols
    if (leftMatrix.size() > 0)
      requestSend = _cyclicCommRight->aSend(leftMatrix, 0);

    // initiate asynchronous receive operation for leftMatrix (W_til) from previous processor --> W_til dim: rows_rcv x cols
    if (leftMatrix_rcv.size() > 0)
      requestRcv = _cyclicCommLeft->aReceive(leftMatrix_rcv, 0);

    // compute diagonal blocks where all data is local and no communication is needed
    // compute block matrices of J_inv of size (n_til x n_til), n_til = local n
    Eigen::MatrixXd diagBlock(leftMatrix.rows(), leftMatrix.rows());
    diagBlock.noalias() = leftMatrix * rightMatrix;

    // set block at corresponding row-index on proc
    int off = offsets[utils::IntraComm::getRank()];
    PRECICE_ASSERT(result.cols() == diagBlock.cols(), result.cols(), diagBlock.cols());
    result.block(off, 0, diagBlock.rows(), diagBlock.cols()) = diagBlock;

    /*
     * cyclic send-receive operations: in each cycle, the local block of W_til is passed one
     * rank further around the ring and multiplied with the local part of the right matrix
     */
    for (int cycle = 1; cycle < utils::IntraComm::getSize(); cycle++) {

      // wait until the send of the local block and the receive of W_til from the previous
      // processor have completed
      if (requestSend != nullptr)
        requestSend->wait();
      if (requestRcv != nullptr)
        requestRcv->wait();

      // leftMatrix_rcv is available - needed for the local multiplication and to hand over to the next proc
      Eigen::MatrixXd leftMatrix_copy(leftMatrix_rcv);

      // initiate async send to hand leftMatrix (W_til) over to the next proc (this data will be needed in the next cycle) dim: n_local x cols
      if (cycle < utils::IntraComm::getSize() - 1) {
        if (leftMatrix_copy.size() > 0)
          requestSend = _cyclicCommRight->aSend(leftMatrix_copy, 0);
      }

      // determine the rank that originally owned the block just received (sourceProc) and the
      // rank that owned the block arriving in the next cycle (sourceProc_nextCycle)
      int sourceProc_nextCycle = (utils::IntraComm::getRank() - (cycle + 1) < 0) ? utils::IntraComm::getSize() + (utils::IntraComm::getRank() - (cycle + 1)) : utils::IntraComm::getRank() - (cycle + 1);

      int sourceProc = (utils::IntraComm::getRank() - cycle < 0) ? utils::IntraComm::getSize() + (utils::IntraComm::getRank() - cycle) : utils::IntraComm::getRank() - cycle;

      int rows_rcv_nextCycle = (sourceProc_nextCycle > 0) ? offsets[sourceProc_nextCycle + 1] - offsets[sourceProc_nextCycle] : offsets[1];
      rows_rcv = (sourceProc > 0) ? offsets[sourceProc + 1] - offsets[sourceProc] : offsets[1];
      leftMatrix_rcv = Eigen::MatrixXd::Zero(rows_rcv_nextCycle, q);

      // initiate asynchronous receive operation for leftMatrix (W_til) from the previous processor (this data is needed in the next cycle)
      if (cycle < utils::IntraComm::getSize() - 1) {
        if (leftMatrix_rcv.size() > 0) // only receive data if data has been sent
          requestRcv = _cyclicCommLeft->aReceive(leftMatrix_rcv, 0);
      }

      if (requestSend != nullptr)
        requestSend->wait();
      // compute block with the newly received data
      Eigen::MatrixXd block(rows_rcv, rightMatrix.cols());
      block.noalias() = leftMatrix_copy * rightMatrix;

      // set block at the corresponding index in J_inv
      // the row offset of the current block is determined by the proc that sent this part of the W_til matrix
      // note: the direction and ordering of the cyclic send operation is chosen s.t. the computed block is
      // local on the current processor (in J_inv).
      off = offsets[sourceProc];
      PRECICE_ASSERT(result.cols() == block.cols(), result.cols(), block.cols());
      result.block(off, 0, block.rows(), block.cols()) = block;
    }
  }

  /// Multiplies matrices based on a dot-product computation with a rectangular result matrix.
  template <typename Derived1, typename Derived2>
  void _multiplyNM_dotProduct(
      Eigen::PlainObjectBase<Derived1> &leftMatrix,
      Eigen::PlainObjectBase<Derived2> &rightMatrix,
      Eigen::PlainObjectBase<Derived2> &result,
      const std::vector<int> &offsets,
      int p, int q, int r)
  {
    PRECICE_TRACE();
    for (int i = 0; i < leftMatrix.rows(); i++) {
      Rank rank = 0;
      // find the rank of the processor that stores result row i.
      // Note: the '>=' here is crucial: in case some procs do not own any vertices,
      // consecutive entries in offsets are equal and the loop keeps incrementing rank,
      // i.e., it runs on to the next non-empty proc.
      while (i >= offsets[rank + 1])
        rank++;
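
      // Example (hypothetical offsets = {0, 3, 3, 5}): rows 0-2 live on rank 0, rank 1 owns
      // no vertices, and rows 3-4 live on rank 2; for i = 3 the loop steps past the empty
      // rank 1 because offsets[2] == offsets[1].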

      Eigen::VectorXd lMRow = leftMatrix.row(i);

      for (int j = 0; j < r; j++) {

        Eigen::VectorXd rMCol = rightMatrix.col(j);
        double res_ij = utils::IntraComm::dot(lMRow, rMCol);

        // store the entry on the rank that owns row i
        if (utils::IntraComm::getRank() == rank) {
          int local_row = i - offsets[rank];
          result(local_row, j) = res_ij;
        }
      }
    }
  }

  /// Multiplies matrices based on a SAXPY-like block-wise computation with a rectangular result matrix.
  template <typename Derived1, typename Derived2>
  void _multiplyNM_block(
      Eigen::PlainObjectBase<Derived1> &leftMatrix,
      Eigen::PlainObjectBase<Derived2> &rightMatrix,
      Eigen::PlainObjectBase<Derived2> &result,
      const std::vector<int> &offsets,
      int p, int q, int r)
  {
    PRECICE_TRACE();

    // ensure that both matrices are stored in the same order; important for the reduce
    // function, which adds up serialized data
    PRECICE_ASSERT(static_cast<int>(leftMatrix.IsRowMajor) == static_cast<int>(rightMatrix.IsRowMajor),
                   static_cast<int>(leftMatrix.IsRowMajor), static_cast<int>(rightMatrix.IsRowMajor));

    // multiply the local block (SAXPY-based approach)
    // dimension: (n_global x n_local) * (n_local x m) = (n_global x m)
    Eigen::MatrixXd block = Eigen::MatrixXd::Zero(p, r);
    block.noalias() = leftMatrix * rightMatrix;

    // all blocks have size (n_global x m)
    // Note: if procs own no vertices, the block size remains (n_global x m); since such
    // blocks are initialized with zeros, those procs simply contribute zeros to the sum

    // sum up the blocks on the primary rank (reduce)
    Eigen::MatrixXd summarizedBlocks = Eigen::MatrixXd::Zero(p, r);
    utils::IntraComm::reduceSum(block, summarizedBlocks);

    // secondary ranks wait to receive their local part of the result
    if (utils::IntraComm::isSecondary()) {
      if (result.size() > 0)
        utils::IntraComm::getCommunication()->receive(result, 0);
    }

    // the primary rank distributes the sub-blocks of the result
    if (utils::IntraComm::isPrimary()) {
      // distribute the blocks of summarizedBlocks (the result of the multiplication) to the corresponding secondary ranks
      result = summarizedBlocks.block(0, 0, offsets[1], r);

      for (Rank secondaryRank : utils::IntraComm::allSecondaryRanks()) {
        int off = offsets[secondaryRank];
        int send_rows = offsets[secondaryRank + 1] - offsets[secondaryRank];

        if (summarizedBlocks.block(off, 0, send_rows, r).size() > 0) {
          // the matrix block that is to be sent must be copied into a temporary matrix object;
          // otherwise, the send routine would walk over the bounds of the block (the matrix
          // structure is still that of the entire matrix)
          Eigen::MatrixXd sendBlock = summarizedBlocks.block(off, 0, send_rows, r);
          utils::IntraComm::getCommunication()->send(sendBlock, secondaryRank);
        }
      }
    }
  }
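
  // Data-flow sketch for the block variant above (assuming 2 ranks and offsets = {0, 2, 5}):
  // both ranks compute a full (5 x r) partial product, reduceSum accumulates the two on the
  // primary rank, which keeps rows 0-1 as its own result and sends rows 2-4 to rank 1, so
  // every rank ends up with exactly its offsets-slice of the final product.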

  /// Communication between neighboring ranks, backwards.
  com::PtrCommunication _cyclicCommLeft;

  /// Communication between neighboring ranks, forward.
  com::PtrCommunication _cyclicCommRight;

  bool _needCyclicComm = true;
};

} // namespace precice::acceleration::impl

#endif