preCICE v3.1.2
ParallelMatrixOperations.hpp
#pragma once

#ifndef PRECICE_NO_MPI

#include <Eigen/Core>
#include <memory>
#include <stddef.h>
#include <string>
#include <vector>

#include "com/Communication.hpp"
#include "com/MPIPortsCommunication.hpp"
#include "com/Request.hpp"
#include "com/SharedPointer.hpp"
#include "logging/LogMacros.hpp"
#include "logging/Logger.hpp"
#include "precice/impl/Types.hpp"
#include "utils/IntraComm.hpp"
#include "utils/assertion.hpp"

namespace precice {
namespace acceleration {
namespace impl {

class ParallelMatrixOperations {
public:
  ~ParallelMatrixOperations();

  /// Initializes the acceleration.
  void initialize(const bool needCyclicComm);
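  // needCyclicComm indicates whether the quadratic (p == r) multiplication path will
  // be used; only then are the cyclic communication channels between neighboring
  // ranks required (cf. the assertions in multiply() below).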

  template <typename Derived1, typename Derived2>
  void multiply(
      Eigen::PlainObjectBase<Derived1> &leftMatrix,
      Eigen::PlainObjectBase<Derived2> &rightMatrix,
      Eigen::PlainObjectBase<Derived2> &result,
      const std::vector<int> &offsets,
      int p, int q, int r,
      bool dotProductComputation = true)
  {
    PRECICE_TRACE();
    PRECICE_ASSERT(result.cols() == rightMatrix.cols(), result.cols(), rightMatrix.cols());
    PRECICE_ASSERT(leftMatrix.cols() == rightMatrix.rows(), leftMatrix.cols(), rightMatrix.rows());

    // serial computation on a single processor
    if (!utils::IntraComm::isParallel()) {
      result.noalias() = leftMatrix * rightMatrix;

      // parallel computation on p processors
    } else {
      PRECICE_ASSERT(utils::IntraComm::getCommunication() != nullptr);
      PRECICE_ASSERT(utils::IntraComm::getCommunication()->isConnected());

      // The result matrix is of size (p x r).
      // If p equals r (and p = global_n), we have to perform the
      // cyclic communication with block-wise matrix-matrix multiplication.
      if (p == r) {
        PRECICE_ASSERT(_needCyclicComm);
        PRECICE_ASSERT(_cyclicCommLeft != nullptr);
        PRECICE_ASSERT(_cyclicCommLeft->isConnected());
        PRECICE_ASSERT(_cyclicCommRight != nullptr);
        PRECICE_ASSERT(_cyclicCommRight->isConnected());

        _multiplyNN(leftMatrix, rightMatrix, result, offsets, p, q, r);

        // case p != r, i.e., usually p = number of columns of the least-squares system:
        // perform the parallel multiplication based on dot products
      } else {
        if (dotProductComputation)
          _multiplyNM_dotProduct(leftMatrix, rightMatrix, result, offsets, p, q, r);
        else
          _multiplyNM_block(leftMatrix, rightMatrix, result, offsets, p, q, r);
      }
    }
  }
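
  // Convention for the offsets vector used throughout (illustrative values):
  // offsets[k] is the global row index at which rank k's partition starts, and
  // offsets[k + 1] - offsets[k] is the number of rows owned by rank k. For example,
  // three ranks owning 2, 3, and 2 rows use offsets = {0, 2, 5, 7}.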

  /**
   * @brief Computes the matrix-matrix/matrix-vector product of a (p x q) matrix that is
   *        distributed column-wise over all ranks with a (q x r) matrix whose rows are
   *        partitioned accordingly. The partial products are summed up on all ranks
   *        (allreduce), such that every rank holds the complete (p x r) result.
   */
  template <typename Derived1, typename Derived2, typename Derived3>
  void multiply(
      const Eigen::MatrixBase<Derived1> &leftMatrix,
      const Eigen::MatrixBase<Derived2> &rightMatrix,
      Eigen::PlainObjectBase<Derived3> &result,
      int p, int q, int r)
  {
    PRECICE_TRACE();
    PRECICE_ASSERT(leftMatrix.rows() == p, leftMatrix.rows(), p);
    PRECICE_ASSERT(leftMatrix.cols() == rightMatrix.rows(), leftMatrix.cols(), rightMatrix.rows());
    PRECICE_ASSERT(result.rows() == p, result.rows(), p);
    PRECICE_ASSERT(result.cols() == r, result.cols(), r);

    Eigen::MatrixXd localResult(result.rows(), result.cols());
    localResult.noalias() = leftMatrix * rightMatrix;

    // serial computation on a single processor
    if (!utils::IntraComm::isParallel()) {
      result = localResult;
    } else {
      // sum up the partial products of all ranks
      utils::IntraComm::allreduceSum(localResult, result);
    }
  }
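
  // Usage sketch (illustrative, variable names are hypothetical): every rank holds a
  // column block of the left factor and the matching row block of the right factor;
  // after the call, every rank holds the complete (p x r) product:
  //
  //   Eigen::MatrixXd leftLocal(p, qLocal), rightLocal(qLocal, r), result(p, r);
  //   parMatOps.multiply(leftLocal, rightLocal, result, p, q, r);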

private:
  logging::Logger _log{"acceleration::ParallelMatrixOperations"};

  // @brief Multiplies matrices based on a cyclic communication and block-wise matrix multiplication with a quadratic result matrix
  template <typename Derived1, typename Derived2>
  void _multiplyNN(
      Eigen::PlainObjectBase<Derived1> &leftMatrix,
      Eigen::PlainObjectBase<Derived2> &rightMatrix,
      Eigen::PlainObjectBase<Derived2> &result,
      const std::vector<int> &offsets,
      int p, int q, int r)
  {
    PRECICE_TRACE();
    /*
     * For the multiplication W_til * Z = J
     * -----------------------------------------------------------------------
     * p = r = n_global, q = m
     *
     * leftMatrix:  local: (n_local x m)         global: (n_global x m)
     * rightMatrix: local: (m x n_local)         global: (m x n_global)
     * result:      local: (n_global x n_local)  global: (n_global x n_global)
     * -----------------------------------------------------------------------
     */

    PRECICE_ASSERT(_needCyclicComm);
    PRECICE_ASSERT(leftMatrix.cols() == q, leftMatrix.cols(), q);
    PRECICE_ASSERT(leftMatrix.rows() == rightMatrix.cols(), leftMatrix.rows(), rightMatrix.cols());
    PRECICE_ASSERT(result.rows() == p, result.rows(), p);

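    // Ring communication scheme: each rank passes its block of W_til to the right
    // neighbor (_cyclicCommRight) and receives the next block from the left neighbor
    // (_cyclicCommLeft). The receive of the next block is posted before the current
    // block is multiplied, so communication overlaps with computation; after
    // getSize() - 1 cycles, every rank has multiplied its local part of Z against
    // every block of W_til.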
    // int nextProc = (utils::IntraComm::getRank() + 1) % utils::IntraComm::getSize();
    int prevProc = (utils::IntraComm::getRank() - 1 < 0) ? utils::IntraComm::getSize() - 1 : utils::IntraComm::getRank() - 1;
    int rows_rcv = (prevProc > 0) ? offsets[prevProc + 1] - offsets[prevProc] : offsets[1];
    // Eigen::MatrixXd leftMatrix_rcv = Eigen::MatrixXd::Zero(rows_rcv, q);
    Eigen::MatrixXd leftMatrix_rcv(rows_rcv, q);

    com::PtrRequest requestSend;
    com::PtrRequest requestRcv;

    // initiate the asynchronous send of leftMatrix (W_til) to nextProc (this data is needed in cycle 1), dim: n_local x cols
    if (leftMatrix.size() > 0)
      requestSend = _cyclicCommRight->aSend(leftMatrix, 0);

    // initiate the asynchronous receive of leftMatrix (W_til) from the previous processor, dim: rows_rcv x cols
    if (leftMatrix_rcv.size() > 0)
      requestRcv = _cyclicCommLeft->aReceive(leftMatrix_rcv, 0);

    // compute the diagonal block, for which all data is local and no communication is needed;
    // these are the block matrices of J_inv of size (n_til x n_til), n_til = local n
    Eigen::MatrixXd diagBlock(leftMatrix.rows(), leftMatrix.rows());
    diagBlock.noalias() = leftMatrix * rightMatrix;

    // set the block at the corresponding row index on this rank
    int off = offsets[utils::IntraComm::getRank()];
    PRECICE_ASSERT(result.cols() == diagBlock.cols(), result.cols(), diagBlock.cols());
    result.block(off, 0, diagBlock.rows(), diagBlock.cols()) = diagBlock;

    // cyclic send-receive operations
    for (int cycle = 1; cycle < utils::IntraComm::getSize(); cycle++) {

      // wait until the send to the next processor and the receive from the previous processor have completed
      if (requestSend != nullptr)
        requestSend->wait();
      if (requestRcv != nullptr)
        requestRcv->wait();

      // leftMatrix_rcv is now available; copy it, as it is needed both for the local
      // multiplication and for handing over to the next processor
      Eigen::MatrixXd leftMatrix_copy(leftMatrix_rcv);

      // initiate the asynchronous send handing leftMatrix (W_til) over to the next processor (this data is needed in the next cycle), dim: n_local x cols
      if (cycle < utils::IntraComm::getSize() - 1) {
        if (leftMatrix_copy.size() > 0)
          requestSend = _cyclicCommRight->aSend(leftMatrix_copy, 0);
      }

      // compute the rank that owned leftMatrix_rcv (Wtil_rcv) at the very beginning, for this cycle and the next one
      int sourceProc_nextCycle = (utils::IntraComm::getRank() - (cycle + 1) < 0) ? utils::IntraComm::getSize() + (utils::IntraComm::getRank() - (cycle + 1)) : utils::IntraComm::getRank() - (cycle + 1);

      int sourceProc = (utils::IntraComm::getRank() - cycle < 0) ? utils::IntraComm::getSize() + (utils::IntraComm::getRank() - cycle) : utils::IntraComm::getRank() - cycle;

      int rows_rcv_nextCycle = (sourceProc_nextCycle > 0) ? offsets[sourceProc_nextCycle + 1] - offsets[sourceProc_nextCycle] : offsets[1];
      rows_rcv = (sourceProc > 0) ? offsets[sourceProc + 1] - offsets[sourceProc] : offsets[1];
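      // e.g., with 4 ranks, rank 1 in cycle 2 processes the block that originated on
      // rank (1 - 2 + 4) % 4 = 3 and therefore fills rows offsets[3] .. offsets[4] - 1
      // of the result (illustrative values)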
      leftMatrix_rcv = Eigen::MatrixXd::Zero(rows_rcv_nextCycle, q);

      // initiate the asynchronous receive of leftMatrix (W_til) from the previous processor (this data is needed in the next cycle)
      if (cycle < utils::IntraComm::getSize() - 1) {
        if (leftMatrix_rcv.size() > 0) // only receive data if data has been sent
          requestRcv = _cyclicCommLeft->aReceive(leftMatrix_rcv, 0);
      }

      if (requestSend != nullptr)
        requestSend->wait();

      // compute the block with the received data
      Eigen::MatrixXd block(rows_rcv, rightMatrix.cols());
      block.noalias() = leftMatrix_copy * rightMatrix;

      // set the block at the corresponding index in J_inv;
      // the row offset of the current block is determined by the rank that initially owned this part of the W_til matrix.
      // Note: the direction and ordering of the cyclic send operations are chosen such that the computed block is
      // local on the current processor (in J_inv).
      off = offsets[sourceProc];
      PRECICE_ASSERT(result.cols() == block.cols(), result.cols(), block.cols());
      result.block(off, 0, block.rows(), block.cols()) = block;
    }
  }

  // @brief Multiplies matrices based on a dot-product computation with a rectangular result matrix
  template <typename Derived1, typename Derived2>
  void _multiplyNM_dotProduct(
      Eigen::PlainObjectBase<Derived1> &leftMatrix,
      Eigen::PlainObjectBase<Derived2> &rightMatrix,
      Eigen::PlainObjectBase<Derived2> &result,
      const std::vector<int> &offsets,
      int p, int q, int r)
  {
    PRECICE_TRACE();
    for (int i = 0; i < leftMatrix.rows(); i++) {
      Rank rank = 0;
      // find the rank of the processor that stores result row i.
      // Note: the >= is crucial: if some ranks own no vertices, consecutive entries
      // of offsets are equal, and the loop keeps incrementing rank until it reaches
      // the next non-empty rank.
      while (i >= offsets[rank + 1])
        rank++;

      Eigen::VectorXd lMRow = leftMatrix.row(i);

      for (int j = 0; j < r; j++) {

        Eigen::VectorXd rMCol = rightMatrix.col(j);
        // distributed (collective) dot product over all ranks
        double res_ij = utils::IntraComm::dot(lMRow, rMCol);

        // the rank that owns row i stores the result
        if (utils::IntraComm::getRank() == rank) {
          int local_row = i - offsets[rank];
          result(local_row, j) = res_ij;
        }
      }
    }
  }
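
  // Note: this variant performs one collective dot product per entry of the (p x r)
  // result, whereas _multiplyNM_block below reduces a single (p x r) block on the
  // primary rank and then scatters the row ranges; which variant is cheaper depends
  // on the matrix shapes, hence the dotProductComputation switch in multiply().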

  /// Multiplies matrices based on a SAXPY-like block-wise computation with a rectangular result matrix.
  template <typename Derived1, typename Derived2>
  void _multiplyNM_block(
      Eigen::PlainObjectBase<Derived1> &leftMatrix,
      Eigen::PlainObjectBase<Derived2> &rightMatrix,
      Eigen::PlainObjectBase<Derived2> &result,
      const std::vector<int> &offsets,
      int p, int q, int r)
  {
    PRECICE_TRACE();

    // ensure that both matrices are stored in the same order; important for the reduce function, which adds serialized data
    PRECICE_ASSERT(static_cast<int>(leftMatrix.IsRowMajor) == static_cast<int>(rightMatrix.IsRowMajor),
                   static_cast<int>(leftMatrix.IsRowMajor), static_cast<int>(rightMatrix.IsRowMajor));

    // multiply the local block (SAXPY-based approach)
    // dimension: (n_global x n_local) * (n_local x m) = (n_global x m)
    Eigen::MatrixXd block = Eigen::MatrixXd::Zero(p, r);
    block.noalias() = leftMatrix * rightMatrix;

    // all blocks have size (n_global x m)
    // note: if a rank owns no vertices, its block still has size (n_global x m) but
    // is initialized with zeros, so only zeros are added for that rank

    // sum up all blocks on the primary rank (reduce)
    Eigen::MatrixXd summarizedBlocks = Eigen::MatrixXd::Zero(p, r);
    utils::IntraComm::reduceSum(block, summarizedBlocks);

    // secondary ranks wait to receive their local result
    if (utils::IntraComm::isSecondary()) {
      if (result.size() > 0)
        utils::IntraComm::getCommunication()->receive(result, 0);
    }

    // the primary rank distributes the sub-blocks of the result
    if (utils::IntraComm::isPrimary()) {
      // distribute the blocks of summarizedBlocks (the result of the multiplication) to the corresponding secondary ranks
      result = summarizedBlocks.block(0, 0, offsets[1], r);

      for (Rank secondaryRank : utils::IntraComm::allSecondaryRanks()) {
        int off       = offsets[secondaryRank];
        int send_rows = offsets[secondaryRank + 1] - offsets[secondaryRank];

        if (summarizedBlocks.block(off, 0, send_rows, r).size() > 0) {
          // the matrix block to be sent must be copied into a temporary matrix object;
          // otherwise the send routine would read beyond the bounds of the block (the
          // storage layout is still that of the entire matrix)
          Eigen::MatrixXd sendBlock = summarizedBlocks.block(off, 0, send_rows, r);
          utils::IntraComm::getCommunication()->send(sendBlock, secondaryRank);
        }
      }
    }
  }

  /// Communication between neighboring ranks, backwards.
  com::PtrCommunication _cyclicCommLeft;

  /// Communication between neighboring ranks, forward.
  com::PtrCommunication _cyclicCommRight;

  bool _needCyclicComm = true;
};

} // namespace impl
} // namespace acceleration
} // namespace precice

#endif