189 using TeamPolicy = Kokkos::TeamPolicy<MemorySpace>;
190 using MemberType =
typename TeamPolicy::member_type;
191 using ScratchView = Kokkos::View<double *, typename MemorySpace::scratch_memory_space, UnmanagedMemory>;
194 Kokkos::deep_copy(qrMatrix, 1.0);
197 auto kernel = KOKKOS_LAMBDA(
const MemberType &team)
200 const int batch = team.league_rank();
203 const auto begin = offsets(batch);
204 const int verticesPerCluster = offsets(batch + 1) - begin;
205 const int matrixCols = dim + 1;
211 Kokkos::parallel_for(
212 Kokkos::TeamThreadRange(team, verticesPerCluster),
214 auto globalID = globalIDs(i + begin);
216 for (
int d = 0; d < dim; ++d) {
217 qr(i, d) =
mesh(globalID, d);
230 int &rank = qrP(PBegin + matrixCols);
233 ScratchView work(team.team_scratch(0), 2 * verticesPerCluster);
239 KokkosBatched::TeamVectorQR_WithColumnPivoting<MemberType,
240 KokkosBatched::Algo::QR::Unblocked>::invoke(team, qr, tau,
249 double threshold = 1e-5;
250 if (team.team_rank() == 0) {
251 const double maxp = Kokkos::abs(qr(0, 0));
253 for (
int i = 0; i < rank; ++i) {
254 r +=
static_cast<int>(Kokkos::abs(qr(i, i)) > (threshold * maxp));
264 auto scratchSize = ScratchView::shmem_size(2 * maxClusterSize);
269 if constexpr (
std::is_same_v<
decltype(teamSize), Kokkos::AUTO_t>) {
274 policy->set_scratch_size( 0, Kokkos::PerTeam(scratchSize));
275 Kokkos::parallel_for(
"do_batched_qr", *policy,
kernel);
310 int maxInClusterSize,
318 using ExecSpace =
typename MemorySpace::execution_space;
319 using TeamPolicy = Kokkos::TeamPolicy<ExecSpace>;
320 using MemberType =
typename TeamPolicy::member_type;
322 using ScratchSpace =
typename MemorySpace::scratch_memory_space;
323 using ScratchView1d = Kokkos::View<double *, ScratchSpace, UnmanagedMemory>;
324 using ScratchMatrix = Kokkos::View<double **, Kokkos::LayoutRight, ScratchSpace, UnmanagedMemory>;
326 const auto rbf_params = f.getFunctionParameters();
327 auto kernel = KOKKOS_LAMBDA(
const MemberType &team)
329 const int batch = team.league_rank();
331 const auto inBegin = inOffsets(batch);
332 const auto inEnd = inOffsets(batch + 1);
333 const int n = inEnd - inBegin;
335 ScratchMatrix
mesh(team.team_scratch(0), n, dim);
337 Kokkos::parallel_for(
338 Kokkos::TeamThreadRange(team, n), [&](
int i) {
339 auto globalID = globalInIDs(i + inBegin);
340 for (
int d = 0; d < dim; ++d) {
341 mesh(i, d) = inCoords(globalID, d);
346 const auto matrixBegin = matrixOffsets(batch);
355 Kokkos::parallel_for(
356 Kokkos::TeamThreadMDRange(team, n, n),
361 for (
int d = 0; d < dim; ++d) {
362 double diff =
mesh(r, d) -
mesh(c, d);
365 dist = Kokkos::sqrt(dist);
368 double val = f(dist, rbf_params);
371 localMatrix(c, r) = val;
376 auto inBytes = ScratchView1d::shmem_size(maxInClusterSize);
379 Kokkos::parallel_for(
"do_input_assembly", TeamPolicy(nCluster, teamSize).set_scratch_size( 0, Kokkos::PerTeam(dim * inBytes)),
kernel);
495 int avgInClusterSize,
496 int maxInClusterSize,
497 int maxOutClusterSize,
517 using ExecSpace =
typename MemorySpace::execution_space;
518 using TeamPolicy = Kokkos::TeamPolicy<ExecSpace>;
519 using MemberType =
typename TeamPolicy::member_type;
521 using ScratchSpace =
typename MemorySpace::scratch_memory_space;
525 using ScratchView1d = Kokkos::View<double *[1], Kokkos::LayoutLeft, ScratchSpace, UnmanagedMemory>;
526 using ScratchView4d = Kokkos::View<double *[4], Kokkos::LayoutLeft, ScratchSpace, UnmanagedMemory>;
527 using ScratchVector = Kokkos::View<double *, ScratchSpace, UnmanagedMemory>;
529 using ScratchMesh = Kokkos::View<double **, Kokkos::LayoutRight, ScratchSpace, UnmanagedMemory>;
534 const auto rbf_params = f.getFunctionParameters();
537 auto kernel = KOKKOS_LAMBDA(
const MemberType &team)
540 impl::capture_conditional_variables(dim, qrMatrix, qrTau, qrP, inMesh, outMesh, evalOffsets, evalMat, globalOutIDs, f, rbf_params, normalizedWeights, out);
544 const int batch = team.league_rank();
545 const auto inBegin = rhsOffsets(batch);
546 const int inSize = rhsOffsets(batch + 1) - inBegin;
547 const auto outBegin = outOffsets(batch);
548 const int outSize = outOffsets(batch + 1) - outBegin;
552 ScratchMatrix work(team.team_scratch(0), Kokkos::max(4, inSize));
555 Kokkos::parallel_for(
556 Kokkos::TeamThreadRange(team, inSize), [&](
int i) {
557 auto globalID = globalRhsIDs(i + inBegin);
558 in(i) = rhs(globalID);
562 Kokkos::Array<double, 4> qrCoeffs = {0., 0., 0., 0.};
565 if constexpr (polynomial) {
571 Kokkos::parallel_for(
572 Kokkos::TeamThreadRange(team, inSize), [&](
int i) { in_cp(i, 0) = in(i); });
576 const int matrixCols = dim + 1;
580 const int rank = qrP(PBegin + matrixCols);
587 if (team.team_rank() == 0) {
591 auto tmp = Kokkos::subview(work, Kokkos::ALL, 2);
592 KokkosBatched::ApplyQ<MemberType,
593 KokkosBatched::Side::Left,
594 KokkosBatched::Trans::Transpose,
595 KokkosBatched::Mode::Serial,
596 KokkosBatched::Algo::ApplyQ::Unblocked>::invoke(team, qr, tau, in_cp, tmp);
604 KokkosBatched::Uplo::Upper,
605 KokkosBatched::Trans::NoTranspose,
606 KokkosBatched::Diag::NonUnit,
607 KokkosBatched::Mode::Serial,
608 KokkosBatched::Algo::Trsv::Unblocked>::invoke(team, 1.0, R, in_r);
614 for (
int r = 0; r < rank; ++r) {
615 qrCoeffs[r] = in_cp(r, 0);
623 for (
int i = (matrixCols - 1); i >= 0; --i) {
624 Kokkos::kokkos_swap(qrCoeffs[i], qrCoeffs[i + P(i)]);
639 ScratchMesh localInMesh;
641 if constexpr (!evaluation_op_available || polynomial) {
642 localInMesh = ScratchMesh(&work(0, 1), inSize, dim);
644 Kokkos::parallel_for(
645 Kokkos::TeamThreadRange(team, inSize),
647 auto globalID = globalRhsIDs(i + inBegin);
650 double sum = qrCoeffs[dim];
652 for (
int d = 0; d < dim; ++d) {
653 sum += inMesh(globalID, d) * qrCoeffs[d];
655 localInMesh(i, d) = inMesh(globalID, d);
666 auto matStart = matrixOffsets(batch);
672 KokkosBatched::Uplo::Lower,
673 KokkosBatched::Trans::NoTranspose,
674 KokkosBatched::Diag::Unit,
675 KokkosBatched::Mode::Team,
676 KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, A, in);
683 KokkosBatched::Uplo::Upper,
684 KokkosBatched::Trans::NoTranspose,
685 KokkosBatched::Diag::NonUnit,
686 KokkosBatched::Mode::Team,
687 KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, A, in);
692 if constexpr (evaluation_op_available) {
694 ScratchVector res(team.team_scratch(1), outSize);
695 Kokkos::parallel_for(
696 Kokkos::TeamThreadRange(team, outSize),
697 [&](
int i) { res(i) = 0; });
702 auto startEval = evalOffsets(batch);
705 KokkosBlas::Experimental::Gemv<
706 KokkosBlas::Mode::Team,
707 KokkosBlas::Algo::Gemv::Blocked>::invoke(team,
'N', 1.0, eval, in, 0.0, res);
713 Kokkos::parallel_for(
714 Kokkos::TeamThreadRange(team, outSize),
716 auto globalID = globalOutIDs(i + outBegin);
719 if constexpr (polynomial) {
722 sum += qrCoeffs[dim];
724 for (
int d = 0; d < dim; ++d) {
725 sum += outMesh(globalID, d) * qrCoeffs[d];
728 auto w = normalizedWeights(i + outBegin);
729 Kokkos::atomic_add(&out(globalID), sum * w);
734 Kokkos::parallel_for(
735 Kokkos::TeamThreadRange(team, outSize), [&](
int r) {
736 auto globalID = globalOutIDs(r + outBegin);
739 Kokkos::Array<double, 3> outVertex = {0., 0., 0.};
740 for (
int d = 0; d < dim; ++d) {
741 outVertex[d] = outMesh(globalID, d);
747 Kokkos::parallel_reduce(
748 Kokkos::ThreadVectorRange(team, inSize),
749 [&](
int c,
double &localSum) {
752 for (
int d = 0; d < dim; ++d) {
753 double diff = outVertex[d] - localInMesh(c, d);
756 dist = Kokkos::sqrt(dist);
758 double val = f(dist, rbf_params);
759 localSum += val * in(c);
764 if constexpr (polynomial) {
767 sum += qrCoeffs[dim];
769 for (
int d = 0; d < dim; ++d) {
770 sum += outVertex[d] * qrCoeffs[d];
774 auto w = normalizedWeights(r + outBegin);
775 Kokkos::atomic_add(&out(globalID), sum * w);
784 auto inBytes = ScratchVector::shmem_size(
std::max(4, maxInClusterSize));
785 auto outBytes = ScratchVector::shmem_size(maxOutClusterSize);
786 if (!evaluation_op_available || polynomial) {
788 inBytes = 4 * inBytes;
792 if (!evaluation_op_available) {
799 auto tmpPol = TeamPolicy(nCluster, Kokkos::AUTO)
801 0, Kokkos::PerTeam(inBytes))
803 1, Kokkos::PerTeam(outBytes));
807 auto policy = TeamPolicy(nCluster, teamSize)
809 0, Kokkos::PerTeam(inBytes))
811 1, Kokkos::PerTeam(outBytes));
813 Kokkos::parallel_for(
"do_batched_solve", policy,
kernel);
822 int maxInClusterSize,
836 using TeamPolicy = Kokkos::TeamPolicy<MemorySpace>;
837 using MemberType =
typename TeamPolicy::member_type;
838 using ScratchView = Kokkos::View<double *[2], typename MemorySpace::scratch_memory_space, UnmanagedMemory>;
841 auto scratchSize = ScratchView::shmem_size(2 * maxInClusterSize);
842 Kokkos::parallel_for(
"do_qr_solve", TeamPolicy(nCluster, Kokkos::AUTO).set_scratch_size(
843 0, Kokkos::PerTeam(scratchSize)),
844 KOKKOS_LAMBDA(
const MemberType &team) {
846 const int batch = team.league_rank();
848 const auto inBegin = inOffsets(batch);
849 const auto inEnd = inOffsets(batch + 1);
850 const int inSize = inEnd - inBegin;
851 const auto outBegin = outOffsets(batch);
852 const auto outEnd = outOffsets(batch + 1);
853 const int outSize = outEnd - outBegin;
856 ScratchView tmp(team.team_scratch(0), inSize, 2);
857 auto in = Kokkos::subview(tmp, Kokkos::ALL, 0);
858 auto work = Kokkos::subview(tmp, Kokkos::ALL, 1);
860 Kokkos::parallel_for(
861 Kokkos::TeamThreadRange(team, inSize),
863 auto globalID = globalInIDs(i + inBegin);
864 in(i) = inData(globalID);
870 const int matrixCols = dim + 1;
874 const int rank = qrP(PBegin + matrixCols);
883 KokkosBatched::TeamVectorApplyQ<MemberType,
884 KokkosBatched::Side::Left,
885 KokkosBatched::Trans::Transpose,
886 KokkosBatched::Algo::ApplyQ::Unblocked>::invoke(team, qr, tau, in, work);
895 KokkosBatched::Uplo::Upper,
896 KokkosBatched::Trans::NoTranspose,
897 KokkosBatched::Diag::NonUnit,
898 KokkosBatched::Mode::Team,
899 KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, R, in_r);
905 Kokkos::parallel_for(
906 Kokkos::TeamThreadRange(team, rank, matrixCols), [&](
int i) { res(i) = 0; });
911 KokkosBatched::TeamVectorApplyPivot<MemberType,
912 KokkosBatched::Side::Left,
913 KokkosBatched::Direct::Backward>::invoke(team, P, res);
918 Kokkos::parallel_for(
919 Kokkos::TeamThreadRange(team, inBegin, inEnd),
921 auto globalID = globalInIDs(i);
924 double tmp = res(matrixCols - 1);
926 for (
int d = 0; d < dim; ++d)
927 tmp += inMesh(globalID, d) * res(d);
929 Kokkos::atomic_sub(&inData(globalID), tmp);
935 Kokkos::parallel_for(
936 Kokkos::TeamThreadRange(team, outBegin, outEnd),
938 auto globalID = globalOutIDs(i);
941 double tmp = res(matrixCols - 1);
943 for (
int d = 0; d < dim; ++d)
944 tmp += outMesh(globalID, d) * res(d);
947 double w = weights(i);
948 Kokkos::atomic_add(&outData(globalID), tmp * w);