preCICE v3.4.1
Loading...
Searching...
No Matches
KokkosPUMKernels_Impl.hpp
Go to the documentation of this file.
1#include <KokkosBatched_LU_Decl.hpp>
2#include <KokkosBatched_Util.hpp>
3
4#include <KokkosBatched_ApplyPivot_Decl.hpp>
5#include <KokkosBatched_ApplyQ_Decl.hpp>
6#include <KokkosBatched_QR_WithColumnPivoting_Decl.hpp>
7#include <KokkosBatched_Trsv_Decl.hpp>
8#include <KokkosBlas2_gemv.hpp>
9
11#include "math/math.hpp"
12
13#include <functional>
14
17
19
20namespace impl {
21
22// Enum to use when computing the team size
// NOTE(review): the declaration of the Pow2Mode enum (with the rounding modes
// Smaller/Larger and a "closest" fallback referenced throughout this
// namespace) was lost in this extraction -- restore from the original source.
29// Helper to compute the power of 2^x for the given n and the above,
30// i.e., either the flooring, ceiling, or the closest one
31// used only below to determine a reasonable team size
32inline int powerOfTwo(int n, const Pow2Mode mode)
33{
34 if (n < 1)
35 return 1;
36 // find highest power-of-two <= n
37 int pow2 = 1;
38 while (pow2 <= n) {
39 pow2 = pow2 * 2;
40 }
41 // now that we have the value larger, we look for the pow2 below n
42 int lower = pow2 / 2;
43
44 if (mode == Pow2Mode::Larger) {
45 return pow2;
46 } else if (mode == Pow2Mode::Smaller) {
47 return lower;
48 } else {
50 // decide whether n is closer to lower or upper (pow2)
51 if (n - lower < pow2 - n) {
52 return lower;
53 } else {
54 return pow2;
55 }
56 }
57}
58
// Try to find a good team size based on the cluster size.
// avgWork is in our case simply the average cluster size, as it determines
// the local matrix sizes. The result is either Kokkos::AUTO (host path) or
// a concrete int team size, clamped between one full vector/warp width and
// the maximum Kokkos recommends for this functor/policy combination.
template <typename ExecSpace, typename FunctorType, typename Policy>
auto findTeamSize(int avgWork, const Pow2Mode mode, const FunctorType &functor, const Policy &policy)
{
  // If using OpenMP, Kokkos::AUTO works best
  // NOTE(review): the opening of this branch (presumably an `if constexpr`
  // selecting the host/OpenMP backend, original line 66) was lost in this
  // extraction -- confirm against the original source before editing.
    return Kokkos::AUTO;
  } else {
    // Compute the power of two according to the configuration
    int teamSize = powerOfTwo(avgWork, mode);
    // Ensure minimum of one warp for the GPU to avoid partial-warp inefficiency
    int warpSize = Kokkos::TeamPolicy<ExecSpace>::vector_length_max();
    if (teamSize < warpSize) {
      teamSize = warpSize;
    }
    // Query Kokkos for the max recommended team size for this functor
    // (takes also memory constraints into account)
    int maxRecommended = policy.team_size_recommended(functor, Kokkos::ParallelForTag());
    if (teamSize > maxRecommended) {
      teamSize = maxRecommended;
    }
    return teamSize;
  }
}
85
86// small helper function to make the compiler handle variables in lambdas which are only conditionally used
87template <typename... Args>
88KOKKOS_INLINE_FUNCTION constexpr void capture_conditional_variables(const Args &...) {}
89} // namespace impl
90
// For within the kernels.
// The batch matrix has a default layout, which is consistent across kernels.
// Both aliases are unmanaged views: they alias memory owned by larger flat
// allocations and are constructed on the fly inside the kernels.
template <typename MemorySpace = ExecutionSpace>
using BatchMatrix = Kokkos::View<double **, MemorySpace, UnmanagedMemory>;
// Rank-1 counterpart; T defaults to double* but is also instantiated with
// int* below (e.g., for the QR pivot indices).
template <typename T = double *, typename MemorySpace = ExecutionSpace>
using BatchVector = Kokkos::View<T, MemorySpace, UnmanagedMemory>;
97
// Computes, per cluster, the partition-of-unity weight of each contained mesh
// vertex, w_i = max(phi(dist(vertex, center)), eps), and then normalizes each
// weight by the per-vertex sum over all clusters containing that vertex.
//
// Returns false if any mesh vertex ended up with a zero weight sum, i.e., it
// is not covered by any cluster (pure sanity check); true otherwise.
//
// NOTE(review): several parameter lines (original 104/106/107: the cluster
// offset view, the global-ID view and the mesh view used in the body) were
// lost in this extraction -- confirm against the original header.
template <typename MemorySpace>
bool compute_weights(const int nCenters,
                     const int avgOutClusterSize,
                     const offset_1d_type nWeights,
                     const int nMeshVertices,
                     const int dim,
                     MeshView<MemorySpace> centers,
                     const CompactPolynomialC2 &w,
                     VectorView<MemorySpace> normalizedWeights)
{
  using TeamPolicy = Kokkos::TeamPolicy<MemorySpace>;
  using TeamMember = typename TeamPolicy::member_type;

  // Per-vertex accumulator for the sum of all cluster weights touching it
  VectorView<MemorySpace> weightSum("weightSum", nMeshVertices);
  Kokkos::deep_copy(weightSum, 0.0);
  Kokkos::fence();

  const auto rbf_params = w.getFunctionParameters();

  // We launch one team per local system
  auto kernel = KOKKOS_LAMBDA(const TeamMember &team)
  {
    const int  batch = team.league_rank();
    const auto begin = offsets(batch);
    const auto end   = offsets(batch + 1);

    Kokkos::parallel_for(
        Kokkos::TeamThreadRange(team, begin, end),
        [&](offset_1d_type i) {
          auto   globalID = globalIDs(i);
          double dist     = 0.0;
          for (int d = 0; d < dim; ++d) {
            double diff = mesh(globalID, d) - centers(batch, d);
            dist += diff * diff;
          }
          dist = Kokkos::sqrt(dist);
          double val = w(dist, rbf_params);
          // is NUMERICAL_ZERO_DIFFERENCE_DEVICE; keeps weights strictly positive
          double res = std::max(val, 1.0e-14);
          normalizedWeights(i) = res;

          // Several clusters may contain the same vertex, hence the atomic
          Kokkos::atomic_add(&weightSum(globalID), res);
        }); // TeamThreadRange
  };

  // auto teamSize = impl::findTeamSize<typename MemorySpace::execution_space>(avgOutClusterSize, impl::Pow2Mode::Larger, kernel, TeamPolicy(nCenters, Kokkos::AUTO));
  Kokkos::parallel_for("compute_weights", TeamPolicy(nCenters, Kokkos::AUTO), kernel);

  // Check for output mesh vertices which are unassigned
  // This check is a pure sanity check
  bool hasZero = false;
  Kokkos::parallel_reduce(
      "check_zero",
      nMeshVertices,
      KOKKOS_LAMBDA(int i, bool &local) {
        if (weightSum(i) == 0.0)
          local = true;
      },
      Kokkos::LOr<bool>(hasZero));

  if (hasZero) {
    return false;
  }

  // Now scale back the sum
  Kokkos::parallel_for(
      "scale_weights",
      Kokkos::RangePolicy<MemorySpace>(0, nWeights),
      KOKKOS_LAMBDA(const int i) {
        const int id = globalIDs(i);
        normalizedWeights(i) /= weightSum(id);
      });

  return true;
}
176
// Computes, per cluster, the column-pivoted QR decomposition of the local
// polynomial matrix [x_0 .. x_{dim-1} | 1] (one row per cluster vertex) and
// stores the factors, the Householder taus and the pivots plus the detected
// rank into the flat qrMatrix/qrTau/qrP views.
//
// NOTE(review): the trailing parameter lines of this signature (original
// 182-187: the offset, global-ID, mesh and qrMatrix/qrTau/qrP views) as well
// as the declaration of `policy` used near the end (original line 268,
// presumably a std::unique_ptr<TeamPolicy>) were lost in this extraction --
// confirm against the original header.
template <typename MemorySpace>
void do_batched_qr(int nCluster,
                   int dim,
                   int avgClusterSize,
                   int maxClusterSize,
{
  using TeamPolicy  = Kokkos::TeamPolicy<MemorySpace>;
  using MemberType  = typename TeamPolicy::member_type;
  using ScratchView = Kokkos::View<double *, typename MemorySpace::scratch_memory_space, UnmanagedMemory>;

  // First, we fill the entire matrix with ones such that we don't need to fill the 1 separately later
  Kokkos::deep_copy(qrMatrix, 1.0);
  Kokkos::fence();

  auto kernel = KOKKOS_LAMBDA(const MemberType &team)
  {
    // Step 1: define some pointers we need
    const int batch = team.league_rank();

    // For the batch
    const auto begin              = offsets(batch);
    const int  verticesPerCluster = offsets(batch + 1) - begin; // the local cluster size
    const int  matrixCols         = dim + 1;                    // our polyParams

    const offset_1d_type matrixBegin = begin * matrixCols;

    // Step 2: fill the polynomial matrix
    BatchMatrix<MemorySpace> qr(&qrMatrix(matrixBegin), verticesPerCluster, matrixCols);

    Kokkos::parallel_for(
        Kokkos::TeamThreadRange(team, verticesPerCluster),
        [&](int i) {
          auto globalID = globalIDs(i + begin);
          // the 1 is already set in the last column
          for (int d = 0; d < dim; ++d) {
            qr(i, d) = mesh(globalID, d);
          }
        });

    // Step 3: Compute the QR decomposition
    const offset_1d_type tauBegin = batch * matrixCols;
    // the 1 here is for the local rank and has nothing to do with the permutation itself
    const offset_1d_type PBegin = batch * (matrixCols + 1);

    BatchVector<double *, MemorySpace> tau(&qrTau(tauBegin), matrixCols);
    BatchVector<int *, MemorySpace>    P(&qrP(PBegin), matrixCols);

    // Essentially P.end() - 1 = matrixCols + 1 - 1, i.e., the rank slot after the pivots
    int &rank = qrP(PBegin + matrixCols);

    // The scratch memory (shared memory for the device)
    ScratchView work(team.team_scratch(0), 2 * verticesPerCluster);

    // auto b = Kokkos::subview(work, std::pair<int, int>(0, n));
    // auto res = Kokkos::subview(work, std::pair<int, int>(n, n + m));
    // A Serial QR_WithColumnPivoting would probably be better suited here, but this is as of now not available
    // The workload to put a Team on the whole matrix is most likely not a good balance
    KokkosBatched::TeamVectorQR_WithColumnPivoting<MemberType,
                                                   KokkosBatched::Algo::QR::Unblocked>::invoke(team, qr, tau,
                                                                                               P, work,
                                                                                               rank);
    // We have to define our own criterion for the rank, as the one provided is not stable enough
    // A pivot will be considered nonzero if its absolute value is strictly greater than |pivot|<=threshold*|maxpivot| where maxpivot is the biggest pivot.
    // We use 1e-6 as threshold, which is much stricter than the Kokkos internal criterion
    // The kokkos internal criterion can be found in KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp
    // if diagonal value is smaller than threshold(10 * max_diag * ats::epsilon())
    // note that the pivoted algorithm aborts once the rank deficiency is detected, thus, values larger than rank just contain rubbish
    // NOTE(review): the comment above says 1e-6 while the code uses 1e-5 -- confirm which is intended.
    double threshold = 1e-5;
    if (team.team_rank() == 0) {
      const double maxp = Kokkos::abs(qr(0, 0)); // largest pivot
      int          r    = 0;
      for (int i = 0; i < rank; ++i) {
        r += static_cast<int>(Kokkos::abs(qr(i, i)) > (threshold * maxp));
      }
      rank = r;
    }
    // parallel_for
  };

  // Required as workspace for the pivoted QR, see code comment
  // workspace (norm and householder application, 2 * max(m,n) is needed)

  auto scratchSize = ScratchView::shmem_size(2 * maxClusterSize);
  auto teamSize    = impl::findTeamSize<typename MemorySpace::execution_space>(avgClusterSize, impl::Pow2Mode::Smaller, kernel, TeamPolicy(nCluster, Kokkos::AUTO).set_scratch_size(0, Kokkos::PerTeam(scratchSize)));
  // const int VL = TeamPolicy::vector_length_max();
  // The inner loop uses vector parallelism, so we should try to configure accordingly
  if constexpr (std::is_same_v<decltype(teamSize), Kokkos::AUTO_t>) {
    policy = std::make_unique<TeamPolicy>(nCluster, Kokkos::AUTO);
  } else {
    policy = std::make_unique<TeamPolicy>(nCluster, 4, teamSize / 4);
  }
  policy->set_scratch_size(/* level = */ 0, Kokkos::PerTeam(scratchSize));
  Kokkos::parallel_for("do_batched_qr", *policy, kernel);
}
277
// Computes the flat matrix offsets per cluster via a parallel scan:
// dst(i + 1) = sum over j <= i of nrows_j * ncols_j, where rows come from
// src1 and columns from src2 (both are length-(nCluster+1) offset views).
//
// NOTE(review): the first signature line (function name plus the src1/src2
// parameters) was lost in this extraction; the kernel label suggests the name
// "compute_offsets". dst(0) is not written here and is presumably
// zero-initialized by the caller -- confirm.
template <typename MemorySpace>
                     MatrixOffsetView<MemorySpace> dst, int nCluster)
{
  PRECICE_ASSERT(src1.extent(0) == src2.extent(0));
  PRECICE_ASSERT(src2.extent(0) == dst.extent(0));
  Kokkos::parallel_scan("compute_offsets", nCluster, KOKKOS_LAMBDA(const int i, offset_2d_type &update, const bool final) {
    // Number of rows for local system i
    int nrows = src1(i + 1) - src1(i);
    // Number of columns for local system i
    int ncols = src2(i + 1) - src2(i);

    // Number of entries in the i-th local matrix
    int localSize = nrows * ncols;

    // Add to running sum
    update += static_cast<offset_2d_type>(localSize);

    // 'final == true' indicates we should write to matrixOffsets
    if (final) {
      // matrixOffsets(i+1) = partial sum up to i
      dst(i + 1) = update;
    }
    // end parallel_for
  });
}
304
// Assembles, per cluster, the square RBF interpolation system
// A(r, c) = f(||x_r - x_c||) into the flat `matrices` view. The cluster's
// input coordinates are first staged into team scratch memory, since every
// (r, c) pair reads them.
//
// NOTE(review): the function name line (original 306) was lost in this
// extraction; the kernel label suggests "do_input_assembly".
template <typename EvalFunctionType, typename MemorySpace>
     int nCluster, // Number of local systems
     int dim,      // Dimension of points
     int avgClusterSize,
     int maxInClusterSize,
     EvalFunctionType f,
     const VectorOffsetView<MemorySpace> &inOffsets, // vertex offsets (length N+1)
     const GlobalIDView<MemorySpace> &globalInIDs,
     const MeshView<MemorySpace> &inCoords, // meshes
     const MatrixOffsetView<MemorySpace> &matrixOffsets,
     VectorView<MemorySpace> matrices) // 1D view of batched matrices
{
  using ExecSpace  = typename MemorySpace::execution_space;
  using TeamPolicy = Kokkos::TeamPolicy<ExecSpace>;
  using MemberType = typename TeamPolicy::member_type;

  using ScratchSpace  = typename MemorySpace::scratch_memory_space;
  using ScratchView1d = Kokkos::View<double *, ScratchSpace, UnmanagedMemory>;
  using ScratchMatrix = Kokkos::View<double **, Kokkos::LayoutRight, ScratchSpace, UnmanagedMemory>;

  const auto rbf_params = f.getFunctionParameters();
  auto       kernel     = KOKKOS_LAMBDA(const MemberType &team)
  {
    const int batch = team.league_rank();
    // Ranges
    const auto inBegin = inOffsets(batch);
    const auto inEnd   = inOffsets(batch + 1);
    const int  n       = inEnd - inBegin;

    // Stage the cluster coordinates in scratch (shared) memory
    ScratchMatrix mesh(team.team_scratch(0), n, dim);

    Kokkos::parallel_for(
        Kokkos::TeamThreadRange(team, n), [&](int i) {
          auto globalID = globalInIDs(i + inBegin);
          for (int d = 0; d < dim; ++d) {
            mesh(i, d) = inCoords(globalID, d);
          }
        });

    // The matrix offset
    const auto matrixBegin = matrixOffsets(batch);

    // All coordinates must be staged before anyone reads them
    team.team_barrier();

    // Create an unmanaged 2D subview pointing into matrices
    // This constructor: View(pointer, layout)
    BatchMatrix<MemorySpace> localMatrix(&matrices(matrixBegin), n, n);

    // Now fill localMatrix(r,c). We'll do a standard 2D nested parallel loop
    Kokkos::parallel_for(
        Kokkos::TeamThreadMDRange(team, n, n),
        [=](int r, int c) {
          // global indices in the original support/target arrays
          // 1) Compute Euclidean distance
          double dist = 0;
          for (int d = 0; d < dim; ++d) {
            double diff = mesh(r, d) - mesh(c, d);
            dist += diff * diff;
          }
          dist = Kokkos::sqrt(dist);

          // 2) Evaluate the RBF
          double val = f(dist, rbf_params);

          // 3) Store into localMatrix (2D)
          // NOTE(review): indices are written transposed (c, r); for this
          // symmetric system the result is the same either way -- confirm.
          localMatrix(c, r) = val;
        }); // TeamThreadRange
  };

  // We put the solution and the in data values into shared memory
  auto inBytes = ScratchView1d::shmem_size(maxInClusterSize);

  auto teamSize = impl::findTeamSize<ExecSpace>(avgClusterSize, impl::Pow2Mode::Larger, kernel, TeamPolicy(nCluster, Kokkos::AUTO).set_scratch_size(/* level = */ 0, Kokkos::PerTeam(dim * inBytes)));
  Kokkos::parallel_for("do_input_assembly", TeamPolicy(nCluster, teamSize).set_scratch_size(/* level = */ 0, Kokkos::PerTeam(dim * inBytes)), kernel);
}
381
// TODO: Using Kokkos::LayoutRight for the Coords performs a bit better for the assembly,
// but it might deteriorate performance related to the polynomials. Especially for the gemv
// for the polynomial contributions etc
// For the full GPU porting, the Layout can only be LayoutRight, as we don't access the coordinates
// coalesced, but rather first pick what we need
//
// Assembles, per cluster, the rectangular evaluation matrix
// A(r, c) = f(||target_r - in_c||) into the flat `matrices` view, reading
// both coordinate sets directly from global memory.
//
// NOTE(review): the function name line (original 388) was lost in this
// extraction; the kernel label suggests "do_batched_assembly".
template <typename EvalFunctionType, typename MemorySpace>
     int nCluster, // Number of local systems
     int dim,      // Dimension of points
     int avgClusterSize,
     EvalFunctionType f,
     const VectorOffsetView<MemorySpace> &inOffsets, // vertex offsets (length N+1)
     const GlobalIDView<MemorySpace> &globalInIDs,
     const MeshView<MemorySpace> &inCoords, // meshes
     const VectorOffsetView<MemorySpace> &targetOffsets,
     const GlobalIDView<MemorySpace> &globalTargetIDs,
     const MeshView<MemorySpace> &targetCoords,
     const MatrixOffsetView<MemorySpace> &matrixOffsets,
     VectorView<MemorySpace> matrices) // 1D view of batched matrices
{
  using ExecSpace  = typename MemorySpace::execution_space;
  using TeamPolicy = Kokkos::TeamPolicy<ExecSpace>;
  using MemberType = typename TeamPolicy::member_type;

  const auto rbf_params = f.getFunctionParameters();
  auto       kernel     = KOKKOS_LAMBDA(const MemberType &team)
  {
    const int batch = team.league_rank();
    // Ranges
    const auto inBegin     = inOffsets(batch);
    const auto inEnd       = inOffsets(batch + 1);
    const auto targetBegin = targetOffsets(batch);
    const auto targetEnd   = targetOffsets(batch + 1);

    // For our batched matrix, this results in
    const int nrows = targetEnd - targetBegin;
    const int ncols = inEnd - inBegin;

    // The matrix offset
    const auto matrixBegin = matrixOffsets(batch);
    // const size_t matrixEnd = matrixOffsets(batch + 1);

    // Create an unmanaged 2D subview pointing into matrices
    // This constructor: View(pointer, layout)
    BatchMatrix<MemorySpace> localMatrix(&matrices(matrixBegin), nrows, ncols);

    // Now fill localMatrix(r,c). We'll do a standard 2D nested parallel loop
    Kokkos::parallel_for(
        Kokkos::TeamThreadMDRange(team, nrows, ncols),
        [=](int r, int c) {
          // global indices in the original support/target arrays
          offset_1d_type targetIdx = targetBegin + r;
          offset_1d_type inIdx     = inBegin + c;

          auto globalIn     = globalInIDs(inIdx);
          auto globalTarget = globalTargetIDs(targetIdx);
          // 1) Compute Euclidean distance
          double dist = 0;
          for (int d = 0; d < dim; ++d) {
            double diff = inCoords(globalIn, d) - targetCoords(globalTarget, d);
            dist += diff * diff;
          }
          dist = Kokkos::sqrt(dist);

          // 2) Evaluate the RBF
          double val = f(dist, rbf_params);

          // 3) Store into localMatrix (2D)
          localMatrix(r, c) = val;
        }); // ThreadVectorRange
  };

  auto teamSize = impl::findTeamSize<ExecSpace>(avgClusterSize, impl::Pow2Mode::Larger, kernel, TeamPolicy(nCluster, Kokkos::AUTO));
  Kokkos::parallel_for("do_batched_assembly", TeamPolicy(nCluster, teamSize), kernel);
}
457
// Computes, per cluster, the in-place LU decomposition of the square local
// system stored in the flat `matrices` view; the local size n is recovered
// as the square root of the entry count between consecutive matrix offsets.
//
// NOTE(review): the function name line (original 459) and the trailing
// `matrices` parameter were lost in this extraction; the kernel label
// suggests "do_batched_lu".
template <typename MemorySpace>
    int nCluster,
    int avgClusterSize,
    const MatrixOffsetView<MemorySpace> &matrixOffsets,
{
  using ExecSpace  = typename MemorySpace::execution_space;
  using TeamPolicy = Kokkos::TeamPolicy<ExecSpace>;
  using MemberType = typename TeamPolicy::member_type;

  auto kernel = KOKKOS_LAMBDA(const MemberType &team)
  {
    const int i     = team.league_rank();
    auto      start = matrixOffsets(i);
    auto      end   = matrixOffsets(i + 1);
    // The local systems are square, hence n = sqrt(number of entries)
    int n = static_cast<int>(Kokkos::sqrt(end - start));

    BatchMatrix<MemorySpace> A(&matrices(start), n, n);

    // In-place (unpivoted) blocked LU, executed by the whole team
    KokkosBatched::TeamLU<MemberType, KokkosBatched::Algo::LU::Blocked>::invoke(team, A);
    // Parallel end
  };

  // Using Kokkos::AUTO resulted in the best performance for all cases
  auto teamSize = impl::findTeamSize<ExecSpace>(avgClusterSize, impl::Pow2Mode::Larger, kernel, TeamPolicy(nCluster, Kokkos::AUTO));
  Kokkos::parallel_for("do_batched_lu", TeamPolicy(nCluster, teamSize), kernel);
}
486
// Per-cluster solve-and-evaluate kernel: optionally subtracts a linear
// polynomial part (solved via the precomputed pivoted QR), solves the
// LU-factorized RBF system through two triangular solves, and scatters the
// weighted result into the global output vector -- either by multiplying
// with a precomputed evaluation matrix (evaluation_op_available) or by
// evaluating the RBF on the fly against the output vertices.
//
// NOTE(review): the function name line (original 492) and two parameters --
// the global input data read as `rhs(globalID)` (original 501) and the global
// output vector `out` (original 509) -- were lost in this extraction, as was
// the alias for the ScratchMatrix type used for `work` (original 528).
// Confirm against the original header.
template <bool polynomial, bool evaluation_op_available, typename EvalFunctionType, typename MemorySpace>
     int nCluster,
     int dim,
     int avgInClusterSize,
     int maxInClusterSize,
     int maxOutClusterSize,
     EvalFunctionType f,
     const VectorOffsetView<MemorySpace> &rhsOffsets,
     const GlobalIDView<MemorySpace> &globalRhsIDs,
     const MatrixOffsetView<MemorySpace> &matrixOffsets,
     const VectorView<MemorySpace> &matrices,
     const VectorView<MemorySpace> &normalizedWeights,
     const MatrixOffsetView<MemorySpace> &evalOffsets,
     const VectorView<MemorySpace> &evalMat,
     const VectorOffsetView<MemorySpace> &outOffsets,
     const GlobalIDView<MemorySpace> &globalOutIDs,
     // For the polynomial required in addition
     const MeshView<MemorySpace> &inMesh,
     const MeshView<MemorySpace> &outMesh,
     const VectorView<MemorySpace> &qrMatrix,
     const VectorView<MemorySpace> &qrTau,
     const PivotView<MemorySpace> &qrP)
{
  using ExecSpace  = typename MemorySpace::execution_space;
  using TeamPolicy = Kokkos::TeamPolicy<ExecSpace>;
  using MemberType = typename TeamPolicy::member_type;

  using ScratchSpace = typename MemorySpace::scratch_memory_space;
  // Layout is important for how we use these matrices: we need to ensure that cols are contiguous in memory
  // We use the scratch memory for the input data and potentially for workspace required for the
  // polynomial or for the input mesh in case we have to compute the evaluation on the fly
  using ScratchView1d = Kokkos::View<double *[1], Kokkos::LayoutLeft, ScratchSpace, UnmanagedMemory>;
  using ScratchView4d = Kokkos::View<double *[4], Kokkos::LayoutLeft, ScratchSpace, UnmanagedMemory>;
  using ScratchVector = Kokkos::View<double *, ScratchSpace, UnmanagedMemory>;
  using ScratchMesh   = Kokkos::View<double **, Kokkos::LayoutRight, ScratchSpace, UnmanagedMemory>;

  // TODO: Add checks for the matrices (extent() > 0 with the template params)
  // PRECICE_ASSERT()

  const auto rbf_params = f.getFunctionParameters();
  // We define the lambda here such that we can query the recommended team size from Kokkos
  // Launch policy is then handled below
  auto kernel = KOKKOS_LAMBDA(const MemberType &team)
  {
    // Required for correct capturing (mostly by device compilers), as these variables are only conditionally used further down
    impl::capture_conditional_variables(dim, qrMatrix, qrTau, qrP, inMesh, outMesh, evalOffsets, evalMat, globalOutIDs, f, rbf_params, normalizedWeights, out);

    // Step 1: Define some pointers
    // TODO: We could potentially remove the rhsOffsets here and use a sqrt instead
    const int  batch   = team.league_rank();
    const auto inBegin = rhsOffsets(batch);
    const int  inSize  = rhsOffsets(batch + 1) - inBegin;
    const auto outBegin = outOffsets(batch);
    const int  outSize  = outOffsets(batch + 1) - outBegin;

    // Step 2: Allocate shared memory for the team and fill it with the inData of this cluster
    // The scratch memory (shared memory for the device)
    // At least 4 rows are needed for the polynomial workspace columns below
    ScratchMatrix work(team.team_scratch(0), Kokkos::max(4, inSize));
    auto          in = Kokkos::subview(work, std::pair<int, int>(0, inSize), 0);

    Kokkos::parallel_for(
        Kokkos::TeamThreadRange(team, inSize), [&](int i) {
          auto globalID = globalRhsIDs(i + inBegin);
          in(i) = rhs(globalID);
        });
    team.team_barrier();

    // Per-thread copy of the (up to 4) polynomial coefficients
    Kokkos::Array<double, 4> qrCoeffs = {0., 0., 0., 0.};

    // Step 3: Solve the polynomial QR system, if we have one
    if constexpr (polynomial) {

      // Step 3a: Backup the current in data, since we solve the QR in place
      // In principle, we need a vector here (just as in), but the ApplyQ routine expects a Rank2 matrix,
      // so we have to stick to this particular syntax (keeping it rank 2 with one column)
      auto in_cp = Kokkos::subview(work, std::pair<int, int>(0, inSize), std::pair<int, int>(1, 2));
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(team, inSize), [&](int i) { in_cp(i, 0) = in(i); });
      team.team_barrier();

      // Step 3b: Define pointers and matrices
      const int            matrixCols = dim + 1;
      const offset_1d_type qrBegin    = inBegin * matrixCols;
      const offset_1d_type tauBegin   = batch * matrixCols;
      const offset_1d_type PBegin     = batch * (matrixCols + 1);
      const int            rank       = qrP(PBegin + matrixCols);

      BatchMatrix<MemorySpace>           qr(&qrMatrix(qrBegin), inSize, matrixCols);
      BatchVector<double *, MemorySpace> tau(&qrTau(tauBegin), matrixCols);
      BatchVector<int *, MemorySpace>    P(&qrP(PBegin), matrixCols);

      // Step 3c: Apply Q on the left of in, i.e., y = Q^T * in
      if (team.team_rank() == 0) {

        // tmp size might be insufficient: there was no size requirement specified for the workspace
        // however, it needs to be contiguous
        auto tmp = Kokkos::subview(work, Kokkos::ALL, 2);
        KokkosBatched::ApplyQ<MemberType,
                              KokkosBatched::Side::Left,
                              KokkosBatched::Trans::Transpose,
                              KokkosBatched::Mode::Serial,
                              KokkosBatched::Algo::ApplyQ::Unblocked>::invoke(team, qr, tau, in_cp, tmp);

        // Step 3d: Solve triangular solve R z = y
        auto in_r = Kokkos::subview(in_cp, std::pair<int, int>(0, rank), 0);
        auto R    = Kokkos::subview(qr, std::pair<int, int>(0, rank), std::pair<int, int>(0, rank));

        KokkosBatched::Trsv<
            MemberType,
            KokkosBatched::Uplo::Upper,
            KokkosBatched::Trans::NoTranspose,
            KokkosBatched::Diag::NonUnit,
            KokkosBatched::Mode::Serial,
            KokkosBatched::Algo::Trsv::Unblocked>::invoke(team, 1.0, R, in_r);
      }

      team.team_barrier();

      // Step 3e: Copy the result over into memory for each thread
      for (int r = 0; r < rank; ++r) {
        qrCoeffs[r] = in_cp(r, 0);
      }

      // Step 3f: Apply pivoting x = P z
      // There is also convenience routines for the pivoting, but we let every thread just
      // apply the pivoting on its own, as it is more compact and the routine doesn't allow
      // using an Array The below is the equivalent of the following (but qrCoeffs would need to be a view):
      // KokkosBatched::TeamVectorApplyPivot<MemberType, Side::Left, Direct::Backward>::invoke(team, P, qrCoeffs);
      for (int i = (matrixCols - 1); i >= 0; --i) {
        Kokkos::kokkos_swap(qrCoeffs[i], qrCoeffs[i + P(i)]);
      }
    }

    // Step 3g: Subtract polynomial portion from the input data: in -= Q * p
    // In case we have to compute the evaluation operator further down, we
    // also pull the in mesh into local shared memory
    // threading over inSize
    // This vector uses memory we have in principle managed by "work" (for the QR solve as tmp),
    // but its layout is different (LayoutRight to have the coordinates aligned in memory).
    // We have to declare it here outsie the "if" to be able to use it further down then.
    // The memory here might point to null (or rather the end of "work"), in case polynomial = false
    // and the evaluation_op is available but then it also remains unused. Using a std::optional
    // is not portable, so the view here is unintitalized and only assigned in the "if"
    // branch below
    ScratchMesh localInMesh;

    if constexpr (!evaluation_op_available || polynomial) {
      localInMesh = ScratchMesh(&work(0, 1), inSize, dim);

      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(team, inSize),
          [&](int i) {
            auto globalID = globalRhsIDs(i + inBegin);
            // The "1"/constant term is the last value in the result
            // dim is here matrixCols - 1
            double sum = qrCoeffs[dim];
            // ... and the linear polynomial
            for (int d = 0; d < dim; ++d) {
              sum += inMesh(globalID, d) * qrCoeffs[d];
              // Put it in shared memory as we later need it in the output evaluation
              localInMesh(i, d) = inMesh(globalID, d);
            }
            in(i) -= sum;
          });
      team.team_barrier();
    }

    // Step 4: Solve the LU decomposition
    // The lu inplace lu decomposition computed with KokkosBatched
    // There is also a convenience routine for the LU solve, but it
    // uses Trsm under the hood, which is quite a bit slower
    auto                     matStart = matrixOffsets(batch);
    BatchMatrix<MemorySpace> A(&matrices(matStart), inSize, inSize);

    // Forward substitution: solve L * y = b and
    KokkosBatched::Trsv<
        MemberType,
        KokkosBatched::Uplo::Lower,
        KokkosBatched::Trans::NoTranspose,
        KokkosBatched::Diag::Unit,
        KokkosBatched::Mode::Team,
        KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, A, in);

    team.team_barrier();

    // Backward substitution: solve U * x = y
    KokkosBatched::Trsv<
        MemberType,
        KokkosBatched::Uplo::Upper,
        KokkosBatched::Trans::NoTranspose,
        KokkosBatched::Diag::NonUnit,
        KokkosBatched::Mode::Team,
        KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, A, in);
    team.team_barrier();

    // Step 5: Apply the output operator
    // If we have the evaluation operator, we use it (might be slower though)
    if constexpr (evaluation_op_available) {
      // Step 5a: Allocate and zero out a local result vector (more of a safety feature)
      ScratchVector res(team.team_scratch(1), outSize);
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(team, outSize),
          [&](int i) { res(i) = 0; });
      team.team_barrier();

      // Step 5b: Multiply by the evaluation operator
      // the evaluation matrix
      auto                     startEval = evalOffsets(batch);
      BatchMatrix<MemorySpace> eval(&evalMat(startEval), outSize, inSize);
      // res := 1.0 * eval * b + 0.0 * res
      KokkosBlas::Experimental::Gemv<
          KokkosBlas::Mode::Team,
          KokkosBlas::Algo::Gemv::Blocked>::invoke(team, 'N', 1.0, eval, in, 0.0, res);

      team.team_barrier();

      // Step 5c: write the weightes result back to the global vector
      // potentially applying the polynomial term alongside
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(team, outSize),
          [&](int i) {
            auto   globalID = globalOutIDs(i + outBegin);
            double sum      = res(i);
            // Add polynomial portion to the output data: out += V * p
            if constexpr (polynomial) {
              // The "1"/constant term is the last value in the result
              // dim is here matrixCols - 1
              sum += qrCoeffs[dim];
              // ... and the linear polynomial
              for (int d = 0; d < dim; ++d) {
                sum += outMesh(globalID, d) * qrCoeffs[d];
              }
            }
            auto w = normalizedWeights(i + outBegin);
            // Output vertices may belong to several clusters, hence the atomic
            Kokkos::atomic_add(&out(globalID), sum * w);
          }); // TeamThreadRange
    } else {
      // Alternative approach: do all in one go:
      // Step 5a: each thread takes care of one output vertex
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(team, outSize), [&](int r) {
            auto globalID = globalOutIDs(r + outBegin);

            // we first extract the vertex coordinates
            Kokkos::Array<double, 3> outVertex = {0., 0., 0.};
            for (int d = 0; d < dim; ++d) {
              outVertex[d] = outMesh(globalID, d);
            }

            // Step 5b: The matrix vector multiplication res = A * in
            // Accumulate partial dot product in a thread-parallel manner
            double sum = 0.0;
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(team, inSize),
                [&](int c, double &localSum) {
                  // compute the local output coefficients
                  double dist = 0;
                  for (int d = 0; d < dim; ++d) {
                    double diff = outVertex[d] - localInMesh(c, d);
                    dist += diff * diff;
                  }
                  dist = Kokkos::sqrt(dist);
                  // Evaluate the RBF
                  double val = f(dist, rbf_params);
                  localSum += val * in(c);
                },
                sum); // ThreadVectorRange

            // Step 5c: if we have the polynomial as well, we have to apply it here
            if constexpr (polynomial) {
              // The "1"/constant term is the last value in the result
              // dim is here matrixCols - 1
              sum += qrCoeffs[dim];
              // ... and the linear polynomial
              for (int d = 0; d < dim; ++d) {
                sum += outVertex[d] * qrCoeffs[d];
              }
            }
            // Step 5d: Store final (weighted result) in the global out vector
            auto w = normalizedWeights(r + outBegin);
            Kokkos::atomic_add(&out(globalID), sum * w);
          }); // End TeamThreadRange
    }         // end if-merged-evaluation
    // End Team parallel loop
  };

  // We allocate one vector for indata and once for outdata
  // We need at least four entries for the polynomial, seems unlikely for maxInClusterSize to be lower
  // but we should be certain
  auto inBytes  = ScratchVector::shmem_size(std::max(4, maxInClusterSize));
  auto outBytes = ScratchVector::shmem_size(maxOutClusterSize);
  if (!evaluation_op_available || polynomial) {
    // and additional storage if we have the polynomial
    inBytes = 4 * inBytes;
  }
  // We use the outBytes only for the per-cluster output vector, which
  // we don't need if we evaluate everything on the fly
  if (!evaluation_op_available) {
    outBytes = 0;
  }

  // We put the solution and the in data values into shared memory
  // TODO: Avoid the duplicate memory definitions here, currently needed as we
  // cannot change the Kokkos::AUTO to the actual recommendataion later
  auto tmpPol = TeamPolicy(nCluster, Kokkos::AUTO)
                    .set_scratch_size(
                        /* level = */ 0, Kokkos::PerTeam(inBytes))
                    .set_scratch_size(
                        /* level = */ 1, Kokkos::PerTeam(outBytes));

  auto teamSize = impl::findTeamSize<ExecSpace>(avgInClusterSize, impl::Pow2Mode::Smaller, kernel, tmpPol);

  auto policy = TeamPolicy(nCluster, teamSize)
                    .set_scratch_size(
                        /* level = */ 0, Kokkos::PerTeam(inBytes))
                    .set_scratch_size(
                        /* level = */ 1, Kokkos::PerTeam(outBytes));

  Kokkos::parallel_for("do_batched_solve", policy, kernel);
}
815
816// Currently not used at all, solves the QR decomposition used for the polynomial
817// but cannot be applied standalone, as we subtract the polynomial part on a per-cluster
818// basis from the inData. Still useful for development purposes
819template <typename MemorySpace>
820void do_qr_solve(int nCluster,
821 int dim,
822 int maxInClusterSize,
824 GlobalIDView<MemorySpace> globalInIDs,
830 const VectorView<MemorySpace> weights,
832 GlobalIDView<MemorySpace> globalOutIDs,
834 MeshView<MemorySpace> outMesh)
835{
836 using TeamPolicy = Kokkos::TeamPolicy<MemorySpace>;
837 using MemberType = typename TeamPolicy::member_type;
838 using ScratchView = Kokkos::View<double *[2], typename MemorySpace::scratch_memory_space, UnmanagedMemory>;
839
840 // Used for the inData solution vector and the QR solving
841 auto scratchSize = ScratchView::shmem_size(2 * maxInClusterSize);
842 Kokkos::parallel_for("do_qr_solve", TeamPolicy(nCluster, Kokkos::AUTO).set_scratch_size(
843 /* level = */ 0, Kokkos::PerTeam(scratchSize)),
844 KOKKOS_LAMBDA(const MemberType &team) {
845 // Step 1: Some pointers we need
846 const int batch = team.league_rank();
847 // Ranges
848 const auto inBegin = inOffsets(batch);
849 const auto inEnd = inOffsets(batch + 1);
850 const int inSize = inEnd - inBegin;
851 const auto outBegin = outOffsets(batch);
852 const auto outEnd = outOffsets(batch + 1);
853 const int outSize = outEnd - outBegin;
854
855 // Step 2: Collect the inData
856 ScratchView tmp(team.team_scratch(0), inSize, 2);
857 auto in = Kokkos::subview(tmp, Kokkos::ALL, 0);
858 auto work = Kokkos::subview(tmp, Kokkos::ALL, 1); // work size is just a guess at the moment
859
860 Kokkos::parallel_for(
861 Kokkos::TeamThreadRange(team, inSize),
862 [&](int i) {
863 auto globalID = globalInIDs(i + inBegin);
864 in(i) = inData(globalID);
865 });
866
867 team.team_barrier();
868
869 // Step 3: Gather the QR data structures
870 const int matrixCols = dim + 1;
871 const offset_1d_type matrixBegin = inBegin * matrixCols;
872 const offset_1d_type tauBegin = batch * matrixCols;
873 const offset_1d_type PBegin = batch * (matrixCols + 1);
874 const int rank = qrP(PBegin + matrixCols);
875
876 BatchMatrix<MemorySpace> qr(&qrMatrix(matrixBegin), inSize, matrixCols);
877 BatchVector<double *, MemorySpace> tau(&qrTau(tauBegin), matrixCols);
878 BatchVector<int *, MemorySpace> P(&qrP(PBegin), matrixCols);
879
880 // Step 4: Solve the linear least-square system A x = b, in our case Q R P^T x = in
881
882 // Step 4a: Apply Q on the left of in, i.e., y = Q^T * in
883 KokkosBatched::TeamVectorApplyQ<MemberType,
884 KokkosBatched::Side::Left,
885 KokkosBatched::Trans::Transpose,
886 KokkosBatched::Algo::ApplyQ::Unblocked>::invoke(team, qr, tau, in, work);
887 team.team_barrier();
888
889 auto in_r = Kokkos::subview(in, std::pair<int, int>(0, rank));
890 auto R = Kokkos::subview(qr, std::pair<int, int>(0, rank), std::pair<int, int>(0, rank));
891
892 // Step 4b: Solve triangular solve R z = y
893 KokkosBatched::Trsv<
894 MemberType,
895 KokkosBatched::Uplo::Upper,
896 KokkosBatched::Trans::NoTranspose,
897 KokkosBatched::Diag::NonUnit,
898 KokkosBatched::Mode::Team,
899 KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, R, in_r);
900 team.team_barrier();
901
902 auto res = Kokkos::subview(in, std::pair<int, int>(0, matrixCols));
903
904 // Steo 4c: zero out the entries which are not within the rank region
905 Kokkos::parallel_for(
906 Kokkos::TeamThreadRange(team, rank, matrixCols), [&](int i) { res(i) = 0; });
907
908 team.team_barrier();
909
910 // Step 4d: Apply pivoting x = P z
911 KokkosBatched::TeamVectorApplyPivot<MemberType,
912 KokkosBatched::Side::Left,
913 KokkosBatched::Direct::Backward>::invoke(team, P, res);
914 team.team_barrier();
915
916 // Step 5: Subtract polynomial portion from the input data: in -= Q * p
917 // threading over inSize
918 Kokkos::parallel_for(
919 Kokkos::TeamThreadRange(team, inBegin, inEnd),
920 [&](int i) {
921 auto globalID = globalInIDs(i);
922
923 // The "1"/constant term is the last value in the result
924 double tmp = res(matrixCols - 1);
925 // ... and the linear polynomial
926 for (int d = 0; d < dim; ++d)
927 tmp += inMesh(globalID, d) * res(d);
928
929 Kokkos::atomic_sub(&inData(globalID), tmp);
930 });
931 // no barrier needed here
932
933 // Step 6: Add polynomial portion to the output data: out += V * p
934 // threading over outSize
935 Kokkos::parallel_for(
936 Kokkos::TeamThreadRange(team, outBegin, outEnd),
937 [&](offset_1d_type i) {
938 auto globalID = globalOutIDs(i);
939
940 // The "1"/constant term is the last value in the result
941 double tmp = res(matrixCols - 1);
942 // ... and the linear polynomial
943 for (int d = 0; d < dim; ++d)
944 tmp += outMesh(globalID, d) * res(d);
945
946 // and the weight as usual
947 double w = weights(i);
948 Kokkos::atomic_add(&outData(globalID), tmp * w);
949 });
950 // end parallel_for
951 });
952}
953} // namespace precice::mapping::kernel
#define PRECICE_ASSERT(...)
Definition assertion.hpp:85
Wendland radial basis function with compact support.
RadialBasisParameters getFunctionParameters() const
T is_same_v
T make_unique(T... args)
T max(T... args)
auto findTeamSize(int avgWork, const Pow2Mode mode, const FunctorType &functor, const Policy &policy)
int powerOfTwo(int n, const Pow2Mode mode)
KOKKOS_INLINE_FUNCTION constexpr void capture_conditional_variables(const Args &...)
void do_batched_qr(int nCluster, int dim, int avgClusterSize, int maxClusterSize, VectorOffsetView< MemorySpace > inOffsets, GlobalIDView< MemorySpace > globalInIDs, MeshView< MemorySpace > inMesh, VectorView< MemorySpace > qrMatrix, VectorView< MemorySpace > qrTau, PivotView< MemorySpace > qrP)
void do_batched_assembly(int nCluster, int dim, int avgClusterSize, EvalFunctionType f, const VectorOffsetView< MemorySpace > &inOffsets, const GlobalIDView< MemorySpace > &globalInIDs, const MeshView< MemorySpace > &inCoords, const VectorOffsetView< MemorySpace > &targetOffsets, const GlobalIDView< MemorySpace > &globalTargetIDs, const MeshView< MemorySpace > &targetCoords, const MatrixOffsetView< MemorySpace > &matrixOffsets, VectorView< MemorySpace > matrices)
bool compute_weights(const int nCluster, const int avgOutClusterSize, const offset_1d_type nWeights, const int nMeshVertices, const int dim, VectorOffsetView< MemorySpace > offsets, MeshView< MemorySpace > centers, GlobalIDView< MemorySpace > globalIDs, MeshView< MemorySpace > mesh, const CompactPolynomialC2 &w, VectorView< MemorySpace > normalizedWeights)
void do_input_assembly(int nCluster, int dim, int avgClusterSize, int maxInClusterSize, EvalFunctionType f, const VectorOffsetView< MemorySpace > &inOffsets, const GlobalIDView< MemorySpace > &globalInIDs, const MeshView< MemorySpace > &inCoords, const MatrixOffsetView< MemorySpace > &matrixOffsets, VectorView< MemorySpace > matrices)
void do_qr_solve(int nCluster, int dim, int maxInClusterSize, VectorOffsetView< MemorySpace > inOffsets, GlobalIDView< MemorySpace > globalInIDs, VectorView< MemorySpace > inData, MeshView< MemorySpace > inMesh, VectorView< MemorySpace > qrMatrix, VectorView< MemorySpace > qrTau, PivotView< MemorySpace > qrP, const VectorView< MemorySpace > weights, VectorOffsetView< MemorySpace > outOffsets, GlobalIDView< MemorySpace > globalOutIDs, VectorView< MemorySpace > outData, MeshView< MemorySpace > outMesh)
void do_batched_solve(int nCluster, int dim, int avgInClusterSize, int maxInClusterSize, int maxOutClusterSize, EvalFunctionType f, const VectorOffsetView< MemorySpace > &rhsOffsets, const GlobalIDView< MemorySpace > &globalRhsIDs, VectorView< MemorySpace > rhs, const MatrixOffsetView< MemorySpace > &matrixOffsets, const VectorView< MemorySpace > &matrices, const VectorView< MemorySpace > &normalizedWeights, const MatrixOffsetView< MemorySpace > &evalOffsets, const VectorView< MemorySpace > &evalMat, const VectorOffsetView< MemorySpace > &outOffsets, const GlobalIDView< MemorySpace > &globalOutIDs, VectorView< MemorySpace > out, const MeshView< MemorySpace > &inMesh, const MeshView< MemorySpace > &outMesh, const VectorView< MemorySpace > &qrMatrix, const VectorView< MemorySpace > &qrTau, const PivotView< MemorySpace > &qrP)
void compute_offsets(const VectorOffsetView< MemorySpace > src1, const VectorOffsetView< MemorySpace > src2, MatrixOffsetView< MemorySpace > dst, int nCluster)
void do_batched_lu(int nCluster, int avgClusterSize, const MatrixOffsetView< MemorySpace > &matrixOffsets, VectorView< MemorySpace > matrices)
Kokkos::View< T, MemorySpace, UnmanagedMemory > BatchVector
Kokkos::View< double **, MemorySpace, UnmanagedMemory > BatchMatrix
ExecutionSpace::size_type offset_2d_type
Kokkos::View< int *, MemorySpace > PivotView
Kokkos::View< offset_2d_type *, MemorySpace > MatrixOffsetView
Kokkos::View< offset_1d_type *, MemorySpace > VectorOffsetView
Kokkos::View< VertexID *, MemorySpace > GlobalIDView
ExecutionSpace::size_type offset_1d_type
Kokkos::View< double *, MemorySpace > VectorView
Kokkos::View< double **, Kokkos::LayoutRight, MemorySpace > MeshView
constexpr T pow_int(const T base)
Computes the power of a given number by an integral exponent given at compile time,...
Definition math.hpp:22
provides Mesh, Data and primitives.
Wrapper struct that is used to transfer RBF-specific parameters to the GPU.