Ifpack2 Templated Preconditioning Package  Version 1.0
Ifpack2_BlockTriDiContainer_impl.hpp
1 /*@HEADER
2 // ***********************************************************************
3 //
4 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ***********************************************************************
40 //@HEADER
41 */
42 
43 #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
44 #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
45 
47 
48 #include <Tpetra_Distributor.hpp>
49 #include <Tpetra_BlockMultiVector.hpp>
50 
51 #include <Kokkos_ArithTraits.hpp>
52 #include <KokkosBatched_Util.hpp>
53 #include <KokkosBatched_Vector.hpp>
54 #include <KokkosBatched_Copy_Decl.hpp>
55 #include <KokkosBatched_Copy_Impl.hpp>
56 #include <KokkosBatched_AddRadial_Decl.hpp>
57 #include <KokkosBatched_AddRadial_Impl.hpp>
58 #include <KokkosBatched_Gemm_Decl.hpp>
59 #include <KokkosBatched_Gemm_Serial_Impl.hpp>
60 #include <KokkosBatched_Gemm_Team_Impl.hpp>
61 #include <KokkosBatched_Gemv_Decl.hpp>
62 #include <KokkosBatched_Gemv_Serial_Impl.hpp>
63 #include <KokkosBatched_Gemv_Team_Impl.hpp>
64 #include <KokkosBatched_Trsm_Decl.hpp>
65 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
66 #include <KokkosBatched_Trsm_Team_Impl.hpp>
67 #include <KokkosBatched_Trsv_Decl.hpp>
68 #include <KokkosBatched_Trsv_Serial_Impl.hpp>
69 #include <KokkosBatched_Trsv_Team_Impl.hpp>
70 #include <KokkosBatched_LU_Decl.hpp>
71 #include <KokkosBatched_LU_Serial_Impl.hpp>
72 #include <KokkosBatched_LU_Team_Impl.hpp>
73 
74 #include <memory>
75 
76 // this should be exposed as a CMake variable (or the flag enabled only when it is necessary)
77 //#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
78 #undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
79 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
80 #include "cuda_profiler_api.h"
81 #endif
82 
83 namespace Ifpack2 {
84 
85  namespace BlockTriDiContainerDetails {
86 
90  using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
91 
92  template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
93  using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::Unmanaged |
94  MemoryTraitsType::RandomAccess |
95  flag>;
96 
97  template <typename ViewType>
98  using Unmanaged = Kokkos::View<typename ViewType::data_type,
99  typename ViewType::array_layout,
100  typename ViewType::device_type,
101  MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
102  template <typename ViewType>
103  using Const = Kokkos::View<typename ViewType::const_data_type,
104  typename ViewType::array_layout,
105  typename ViewType::device_type,
106  typename ViewType::memory_traits>;
107  template <typename ViewType>
108  using ConstUnmanaged = Const<Unmanaged<ViewType> >;
109 
113  template<typename T> struct is_cuda { enum : bool { value = false }; };
114 #if defined(KOKKOS_ENABLE_CUDA)
115  template<> struct is_cuda<Kokkos::Cuda> { enum : bool { value = true }; };
116 #endif
117 
121  template<typename CommPtrType>
122  std::string get_msg_prefix (const CommPtrType &comm) {
123  const auto rank = comm->getRank();
124  const auto nranks = comm->getSize();
125  std::stringstream ss;
126  ss << "Rank " << rank << " of " << nranks << ": ";
127  return ss.str();
128  }
129 
133  template<typename T, int N>
134  struct ArrayValueType {
135  T v[N];
136  KOKKOS_INLINE_FUNCTION
137  ArrayValueType() {
138  for (int i=0;i<N;++i)
139  this->v[i] = 0;
140  }
141  KOKKOS_INLINE_FUNCTION
142  ArrayValueType(const ArrayValueType &b) {
143  for (int i=0;i<N;++i)
144  this->v[i] = b.v[i];
145  }
146  };
147  template<typename T, int N>
148  static
149  KOKKOS_INLINE_FUNCTION
150  void
151  operator+=(volatile ArrayValueType<T,N> &a,
152  volatile const ArrayValueType<T,N> &b) {
153  for (int i=0;i<N;++i)
154  a.v[i] += b.v[i];
155  }
156  template<typename T, int N>
157  static
158  KOKKOS_INLINE_FUNCTION
159  void
160  operator+=(ArrayValueType<T,N> &a,
161  const ArrayValueType<T,N> &b) {
162  for (int i=0;i<N;++i)
163  a.v[i] += b.v[i];
164  }
165 
169  template<typename T, int N, typename ExecSpace>
170  struct SumReducer {
171  typedef SumReducer reducer;
172  typedef ArrayValueType<T,N> value_type;
173  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
174  value_type *value;
175 
176  KOKKOS_INLINE_FUNCTION
177  SumReducer(value_type &val) : value(&val) {}
178 
179  KOKKOS_INLINE_FUNCTION
180  void join(value_type &dst, value_type &src) const {
181  for (int i=0;i<N;++i)
182  dst.v[i] += src.v[i];
183  }
184  KOKKOS_INLINE_FUNCTION
185  void join(volatile value_type &dst, const volatile value_type &src) const {
186  for (int i=0;i<N;++i)
187  dst.v[i] += src.v[i];
188  }
189  KOKKOS_INLINE_FUNCTION
190  void init(value_type &val) const {
191  for (int i=0;i<N;++i)
192  val.v[i] = Kokkos::reduction_identity<T>::sum();
193  }
194  KOKKOS_INLINE_FUNCTION
195  value_type& reference() {
196  return *value;
197  }
198  KOKKOS_INLINE_FUNCTION
199  result_view_type view() const {
200  return result_view_type(value);
201  }
202  };
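// Illustrative sketch (not part of the source): SumReducer<T,N,...> lets a single
// parallel_reduce accumulate N counters at once. Names such as n, pred0, pred1 are
// hypothetical placeholders.
//
//   using reducer_type = SumReducer<size_t,3,Kokkos::DefaultHostExecutionSpace>;
//   typename reducer_type::value_type counts; // ArrayValueType<size_t,3>, zero-initialized
//   Kokkos::parallel_reduce
//     (Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n),
//      KOKKOS_LAMBDA(const int i, typename reducer_type::value_type &update) {
//        if      (pred0(i)) ++update.v[0];
//        else if (pred1(i)) ++update.v[1];
//        else               ++update.v[2];
//      }, reducer_type(counts));
//   // counts.v[0], counts.v[1], counts.v[2] now hold the three tallies.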
203 
207  template <typename MatrixType>
208  struct ImplType {
212  typedef size_t size_type;
213  typedef typename MatrixType::scalar_type scalar_type;
214  typedef typename MatrixType::local_ordinal_type local_ordinal_type;
215  typedef typename MatrixType::global_ordinal_type global_ordinal_type;
216  typedef typename MatrixType::node_type node_type;
217 
221  typedef typename Kokkos::Details::ArithTraits<scalar_type>::val_type impl_scalar_type;
222  typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;
223 
227  typedef Kokkos::DefaultHostExecutionSpace host_execution_space;
228 
232  typedef typename node_type::device_type device_type;
233  typedef typename device_type::execution_space execution_space;
234  typedef typename device_type::memory_space memory_space;
235  typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
236  typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
237  typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
238  typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
239  typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
240  typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
241  typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
242  typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_type local_crs_graph_type;
243 
247  template<typename T, int l> using Vector = KokkosBatched::Experimental::Vector<T,l>;
248  template<typename T> using SIMD = KokkosBatched::Experimental::SIMD<T>;
249  template<typename T, typename M> using DefaultVectorLength = KokkosBatched::Experimental::DefaultVectorLength<T,M>;
250 
251  // Note (Kyungjoo): the enum form below does not compile on some platforms (e.g., hansen); reason unknown, so constexpr is used instead
252  // enum : int { vector_length = DefaultVectorLength<impl_scalar_type,memory_space>::value };
253  static constexpr int vector_length = DefaultVectorLength<impl_scalar_type,memory_space>::value;
254  typedef Vector<SIMD<impl_scalar_type>,vector_length> vector_type;
255 
259  typedef Kokkos::View<size_type*,device_type> size_type_1d_view;
260  typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
261  // tpetra block crs values
262  typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
263  // tpetra multivector values (layout left): may need to change the typename more explicitly
264  typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
265  // packed data always use layout right
266  typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
267  typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
268  typedef Kokkos::View<impl_scalar_type****,Kokkos::LayoutRight,device_type> impl_scalar_type_4d_view;
269  };
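// Note on the packed views above: vector_type is a SIMD pack of vector_length scalars,
// so a vector_type_3d_view of extents (num_blocks, blocksize, blocksize) interleaves
// vector_length little block matrices per entry; impl_scalar_type_4d_view is the same
// storage viewed with the SIMD lane as an explicit fourth index.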
270 
274  template<typename MatrixType>
275  Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type>
276  createBlockCrsTpetraImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
277  using impl_type = ImplType<MatrixType>;
278  using tpetra_map_type = typename impl_type::tpetra_map_type;
279  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
280  using tpetra_import_type = typename impl_type::tpetra_import_type;
281 
282  const auto g = A->getCrsGraph(); // tpetra crs graph object
283  const auto blocksize = A->getBlockSize();
284  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
285  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
286 
287  return Teuchos::rcp(new tpetra_import_type(src, tgt));
288  }
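// Illustrative use of the returned importer (hypothetical variable names): it maps the
// domain point map to the column point map of the block matrix, so an overlapped point
// MultiVector can be filled from an owned one with the usual Tpetra import:
//
//   const auto importer = createBlockCrsTpetraImporter<MatrixType>(A);
//   // X_owned lives on the domain point map, X_overlap on the column point map.
//   X_overlap.doImport(X_owned, *importer, Tpetra::REPLACE);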
289 
290  // Partial replacement for forward-mode MultiVector::doImport.
291  // Permits overlapped communication and computation, but also supports a synchronous mode.
292  // Overlapped comm/comp can give quite poor performance on some platforms,
293  // so it cannot simply be used unconditionally.
294 
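// Illustrative usage sketch of the AsyncableImport defined below (hypothetical names).
// The overlapped path splits the import into an asynchronous post and a later
// completion so owned work can proceed in between; the synchronous path does both:
//
//   importer->asyncSendRecv(X_owned_view); // pack send buffers, post Isends/Irecvs
//   compute_owned_part();                  // overlapped with communication
//   importer->syncRecv();                  // wait, unpack into remote_multivector
//   // or, without overlap:
//   importer->syncExchange(X_owned_view);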
295  template<typename MatrixType>
296  struct AsyncableImport {
297  public:
298  using impl_type = ImplType<MatrixType>;
299 
300  private:
304 #if !defined(HAVE_IFPACK2_MPI)
305  typedef int MPI_Request;
306  typedef int MPI_Comm;
307 #endif
308  using scalar_type = typename impl_type::scalar_type;
311 
312  template <typename T>
313  static int isend(const MPI_Comm comm, const T* buf, int count, int dest, int tag, MPI_Request* ireq) {
314 #ifdef HAVE_IFPACK2_MPI
315  MPI_Request ureq;
316  const auto dt = Teuchos::Details::MpiTypeTraits<scalar_type>::getType();
317  int ret = MPI_Isend(const_cast<T*>(buf), count, dt, dest, tag, comm, ireq == NULL ? &ureq : ireq);
318  if (ireq == NULL) MPI_Request_free(&ureq);
319  return ret;
320 #else
321  return 0;
322 #endif
323  }
324 
325  template <typename T>
326  static int irecv(const MPI_Comm comm, T* buf, int count, int src, int tag, MPI_Request* ireq) {
327 #ifdef HAVE_IFPACK2_MPI
328  MPI_Request ureq;
329  const auto dt = Teuchos::Details::MpiTypeTraits<scalar_type>::getType();
330  int ret = MPI_Irecv(buf, count, dt, src, tag, comm, ireq == NULL ? &ureq : ireq);
331  if (ireq == NULL) MPI_Request_free(&ureq);
332  return ret;
333 #else
334  return 0;
335 #endif
336  }
337 
338  static int waitany(int count, MPI_Request* reqs, int* index) {
339 #ifdef HAVE_IFPACK2_MPI
340  return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
341 #else
342  return 0;
343 #endif
344  }
345 
346  static int waitall(int count, MPI_Request* reqs) {
347 #ifdef HAVE_IFPACK2_MPI
348  return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
349 #else
350  return 0;
351 #endif
352  }
353 
354  public:
355  using tpetra_map_type = typename impl_type::tpetra_map_type;
356  using tpetra_import_type = typename impl_type::tpetra_import_type;
357 
358  using local_ordinal_type = typename impl_type::local_ordinal_type;
359  using global_ordinal_type = typename impl_type::global_ordinal_type;
360  using size_type = typename impl_type::size_type;
361  using impl_scalar_type = typename impl_type::impl_scalar_type;
362 
363  using host_execution_space = typename impl_type::host_execution_space;
364  using int_1d_view_host = Kokkos::View<int*,host_execution_space>;
365  using local_ordinal_type_1d_view_host = typename impl_type::local_ordinal_type_1d_view::HostMirror;
366 
367  using execution_space = typename impl_type::execution_space;
368  using memory_space = typename impl_type::memory_space;
369  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
370  using size_type_1d_view = typename impl_type::size_type_1d_view;
371  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
372  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
373 
374 #ifdef HAVE_IFPACK2_MPI
375  MPI_Comm comm;
376 #endif
377  impl_scalar_type_2d_view remote_multivector;
378  local_ordinal_type blocksize;
379 
380  template<typename T>
381  struct SendRecvPair {
382  T send, recv;
383  };
384 
385  // (s)end and (r)eceive data:
386  SendRecvPair<int_1d_view_host> pids; // mpi ranks
387  SendRecvPair<std::vector<MPI_Request> > reqs; // MPI_Request is an opaque pointer type, so a Kokkos view cannot be used here
388  SendRecvPair<size_type_1d_view> offset; // offsets to local id list and data buffer
389  SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
390  SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer
391 
392  local_ordinal_type_1d_view dm2cm; // permutation
393 
394  // for cuda
395  public:
396  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
397  const size_type_1d_view &offs) {
398  // wrap lens to kokkos view and deep copy to device
399  Kokkos::View<size_t*,host_execution_space> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
400  const auto lens_device = Kokkos::create_mirror_view(memory_space(), lens_host);
401  Kokkos::deep_copy(lens_device, lens_host);
402 
403  // exclusive scan
404  const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
405  const local_ordinal_type lens_size = lens_device.extent(0);
406  Kokkos::parallel_scan
407  ("AsyncableImport::RangePolicy::setOffsetValues",
408  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
409  if (final)
410  offs(i) = update;
411  update += (i < lens_size ? lens_device[i] : 0);
412  });
413  }
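// For example (illustrative): lens = {3, 1, 2} with offs.extent(0) == 4 yields the
// exclusive prefix sum offs = {0, 3, 4, 6}; offs(i) and offs(i+1) then bracket the
// local-id list and buffer segment exchanged with the i'th neighbor rank.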
414 
415  private:
416  void createMpiRequests(const tpetra_import_type &import) {
417  Tpetra::Distributor &distributor = import.getDistributor();
418 
419  // copy pids from distributor
420  const auto pids_from = distributor.getProcsFrom();
421  pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
422  memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));
423 
424  const auto pids_to = distributor.getProcsTo();
425  pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
426  memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));
427 
428  // mpi requests
429  reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
430  reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));
431 
432  // construct offsets
433  const auto lengths_to = distributor.getLengthsTo();
434  offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
435 
436  const auto lengths_from = distributor.getLengthsFrom();
437  offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
438 
439  setOffsetValues(lengths_to, offset.send);
440  setOffsetValues(lengths_from, offset.recv);
441  }
442 
443  void createSendRecvIDs(const tpetra_import_type &import) {
444  // For each remote PID, the list of LIDs to receive.
445  const auto remote_lids = import.getRemoteLIDs();
446  const local_ordinal_type_1d_view_host
447  remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
448  lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
449  Kokkos::deep_copy(lids.recv, remote_lids_view_host);
450 
451  // For each export PID, the list of LIDs to send.
452  auto epids = import.getExportPIDs();
453  auto elids = import.getExportLIDs();
454  TEUCHOS_ASSERT(epids.size() == elids.size());
455  lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
456  auto lids_send_host = Kokkos::create_mirror_view(lids.send);
457 
458  // naive search (not sure if pids or epids are sorted)
459  for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
460  const auto pid_send_value = pids.send[i];
461  for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
462  if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
463 #if !defined(__CUDA_ARCH__)
464  TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset.send[i+1]);
465 #endif
466  }
467  Kokkos::deep_copy(lids.send, lids_send_host);
468  }
469 
470  public:
471  // for cuda, all tag types are public
472  struct ToBuffer {};
473  struct ToMultiVector {};
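// PackTag selects the copy direction in copy() below: ToBuffer packs multivector
// entries into the contiguous communication buffer (before the sends), and
// ToMultiVector unpacks a received buffer segment back into the multivector.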
474 
475  // for cuda, kernel launch should be public too.
476  template<typename PackTag>
477  static
478  void copy(const local_ordinal_type_1d_view &lids_,
479  const impl_scalar_type_1d_view &buffer_,
480  const local_ordinal_type &ibeg_,
481  const local_ordinal_type &iend_,
482  const impl_scalar_type_2d_view &multivector_,
483  const local_ordinal_type blocksize_) {
484  const local_ordinal_type num_vectors = multivector_.extent(1);
485  const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
486  const local_ordinal_type idiff = iend_ - ibeg_;
487  const auto abase = buffer_.data() + mv_blocksize*ibeg_;
488 
489  if (is_cuda<execution_space>::value) {
490 #if defined(KOKKOS_ENABLE_CUDA)
491  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
492  const team_policy_type policy(idiff, num_vectors == 1 ? 1 : 2);
493  Kokkos::parallel_for
494  ("AsyncableImport::TeamPolicy::copy",
495  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
496  const local_ordinal_type i = member.league_rank();
497  Kokkos::parallel_for
498  (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
499  auto aptr = abase + blocksize_*(i + idiff*j);
500  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
501  if (std::is_same<PackTag,ToBuffer>::value)
502  Kokkos::parallel_for
503  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
504  aptr[k] = bptr[k];
505  });
506  else
507  Kokkos::parallel_for
508  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
509  bptr[k] = aptr[k];
510  });
511  });
512  });
513 #endif
514  } else {
515 #if defined(__CUDA_ARCH__)
516  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
517 #else
518  if (num_vectors == 1) {
519  const Kokkos::RangePolicy<execution_space> policy(ibeg_, iend_);
520  Kokkos::parallel_for
521  ("AsyncableImport::RangePolicy::copy",
522  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
523  auto aptr = buffer_.data() + blocksize_*i;
524  auto bptr = multivector_.data() + blocksize_*lids_(i);
525  if (std::is_same<PackTag,ToBuffer>::value)
526  memcpy(aptr, bptr, sizeof(impl_scalar_type)*blocksize_);
527  else
528  memcpy(bptr, aptr, sizeof(impl_scalar_type)*blocksize_);
529  });
530 
531  } else {
532  Kokkos::MDRangePolicy
533  <execution_space, Kokkos::Rank<2>, Kokkos::IndexType<local_ordinal_type> >
534  policy( { 0, 0 }, { idiff, num_vectors } );
535 
536  Kokkos::parallel_for
537  ("AsyncableImport::MDRangePolicy::copy",
538  policy, KOKKOS_LAMBDA(const local_ordinal_type &i,
539  const local_ordinal_type &j) {
540  auto aptr = abase + blocksize_*(i + idiff*j);
541  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
542  if (std::is_same<PackTag,ToBuffer>::value)
543  for (local_ordinal_type k=0;k<blocksize_;++k) aptr[k] = bptr[k];
544  else
545  for (local_ordinal_type k=0;k<blocksize_;++k) bptr[k] = aptr[k];
546  });
547  }
548 #endif
549  }
550  Kokkos::fence();
551  }
552 
553  void createDataBuffer(const local_ordinal_type &num_vectors) {
554  const size_type extent_0 = lids.recv.extent(0)*blocksize;
555  const size_type extent_1 = num_vectors;
556  if (remote_multivector.extent(0) == extent_0 &&
557  remote_multivector.extent(1) == extent_1) {
558  // skip
559  } else {
560  remote_multivector =
561  impl_scalar_type_2d_view(do_not_initialize_tag("remote multivector"), extent_0, extent_1);
562 
563  const auto send_buffer_size = offset.send[offset.send.extent(0)-1]*blocksize*num_vectors;
564  buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
565 
566  const auto recv_buffer_size = offset.recv[offset.recv.extent(0)-1]*blocksize*num_vectors;
567  buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
568  }
569  }
570 
571  AsyncableImport (const Teuchos::RCP<const tpetra_map_type>& src_map,
572  const Teuchos::RCP<const tpetra_map_type>& tgt_map,
573  const local_ordinal_type blocksize_,
574  const local_ordinal_type_1d_view dm2cm_) {
575  blocksize = blocksize_;
576  dm2cm = dm2cm_;
577 
578 #ifdef HAVE_IFPACK2_MPI
579  const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(tgt_map->getComm());
580  TEUCHOS_ASSERT(!mpi_comm.is_null());
581  comm = *mpi_comm->getRawMpiComm();
582 #endif
583  const tpetra_import_type import(src_map, tgt_map);
584 
585  createMpiRequests(import);
586  createSendRecvIDs(import);
587  }
588 
589  void asyncSendRecv(const impl_scalar_type_2d_view &mv) {
590 #ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
591  TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::Setup::AsyncSendRecv");
592 #endif
593 #ifdef HAVE_IFPACK2_MPI
594  // constants and reallocate data buffers if necessary
595  const local_ordinal_type num_vectors = mv.extent(1);
596  const local_ordinal_type mv_blocksize = blocksize*num_vectors;
597 
598  // send async
599  for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
600  copy<ToBuffer>(lids.send, buffer.send, offset.send(i), offset.send(i+1),
601  mv, blocksize);
602  isend(comm,
603  buffer.send.data() + offset.send[i]*mv_blocksize,
604  (offset.send[i+1] - offset.send[i])*mv_blocksize,
605  pids.send[i],
606  42,
607  &reqs.send[i]);
608  }
609 
610  // receive async
611  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
612  irecv(comm,
613  buffer.recv.data() + offset.recv[i]*mv_blocksize,
614  (offset.recv[i+1] - offset.recv[i])*mv_blocksize,
615  pids.recv[i],
616  42,
617  &reqs.recv[i]);
618  }
619 
620  // I find that issuing an Iprobe nudges some MPI implementations into making progress,
621  // which helps overlapped comm/comp performance.
622  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
623  int flag;
624  MPI_Status stat;
625  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
626  }
627 #endif
628  }
629 
630  void cancel () {
631 #ifdef HAVE_IFPACK2_MPI
632  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i)
633  MPI_Cancel(&reqs.recv[i]);
634 #endif
635  }
636 
637  void syncRecv() {
638 #ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
639  TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::Setup::SyncRecv");
640 #endif
641 #ifdef HAVE_IFPACK2_MPI
642  // receive async.
643  for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
644  local_ordinal_type idx = i;
645  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
646  copy<ToMultiVector>(lids.recv, buffer.recv, offset.recv(idx), offset.recv(idx+1),
647  remote_multivector, blocksize);
648  }
649  // wait on the sends to match all Isends with a cleanup operation.
650  waitall(reqs.send.size(), reqs.send.data());
651 #endif
652  }
653 
654  void syncExchange(const impl_scalar_type_2d_view &mv) {
655  asyncSendRecv(mv);
656  syncRecv();
657  }
658 
659  impl_scalar_type_2d_view getRemoteMultiVectorLocalView() const { return remote_multivector; }
660  };
661 
665  template<typename MatrixType>
666  Teuchos::RCP<AsyncableImport<MatrixType> >
667  createBlockCrsAsyncImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
668  using impl_type = ImplType<MatrixType>;
669  using tpetra_map_type = typename impl_type::tpetra_map_type;
670  using local_ordinal_type = typename impl_type::local_ordinal_type;
671  using global_ordinal_type = typename impl_type::global_ordinal_type;
672  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
673 
674  const auto g = A->getCrsGraph(); // tpetra crs graph object
675  const auto blocksize = A->getBlockSize();
676  const auto domain_map = g.getDomainMap();
677  const auto column_map = g.getColMap();
678 
679  std::vector<global_ordinal_type> gids;
680  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
681  for (size_t i=0;i<column_map->getNodeNumElements();++i) {
682  const global_ordinal_type gid = column_map->getGlobalElement(i);
683  if (!domain_map->isNodeGlobalElement(gid)) {
684  found_first = true;
685  gids.push_back(gid);
686  } else if (found_first) {
687  separate_remotes = false;
688  break;
689  }
690  if (!need_owned_permutation &&
691  domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
692  // The owned part of the domain and column maps are different
693  // orderings. We *could* do a super efficient impl of this case in the
694  // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
695  // really, if a caller cares about speed, they wouldn't make different
696  // local permutations like this. So we punt on the best impl and go for
697  // a pretty good one: the permutation is done in place in
698  // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
699  // is the presumably worse memory access pattern of the input vector.
700  need_owned_permutation = true;
701  }
702  }
703 
704  if (separate_remotes) {
705  const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
706  const auto parsimonious_col_map
707  = Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()));
708  if (parsimonious_col_map->getGlobalNumElements() > 0) {
709  // make the importer only if needed.
710  local_ordinal_type_1d_view dm2cm;
711  if (need_owned_permutation) {
712  dm2cm = local_ordinal_type_1d_view("dm2cm", domain_map->getNodeNumElements());
713  const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
714  for (size_t i=0;i<domain_map->getNodeNumElements();++i)
715  dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
716  Kokkos::deep_copy(dm2cm, dm2cm_host);
717  }
718  return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
719  }
720  }
721  return Teuchos::null;
722  }
723 
724  template<typename MatrixType>
725  struct PartInterface {
726  using local_ordinal_type = typename ImplType<MatrixType>::local_ordinal_type;
727  using local_ordinal_type_1d_view = typename ImplType<MatrixType>::local_ordinal_type_1d_view;
728 
729  PartInterface() = default;
730  PartInterface(const PartInterface &b) = default;
731 
732  // Some terms:
733  // The matrix A is split as A = D + R, where D is the matrix of tridiag
734  // blocks and R is the remainder.
735  // A part is roughly a synonym for a tridiag. The distinction is that a part
736  // is the set of rows belonging to one tridiag and, equivalently, the off-diag
737  // rows in R associated with that tridiag. In contrast, the term tridiag is
738  // used to refer specifically to tridiag data, such as the pointer into the
739  // tridiag data array.
740  // Local (lcl) rows are the LIDs. lclrow lists the LIDs belonging to each
741  // tridiag, and partptr points to the beginning of each tridiag. This is the
742  // LID space.
743  // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
744  // by this ordinal. This is the 'index' space.
745  // A flat index is the mathematical index into an array. A pack index
746  // accounts for SIMD packing.
747 
748  // Local row LIDs. Permutation from caller's index space to tridiag index
749  // space.
750  local_ordinal_type_1d_view lclrow;
751  // partptr_ is the pointer array into lclrow_.
752  local_ordinal_type_1d_view partptr; // np+1
753  // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
754  // is the start of the i'th pack.
755  local_ordinal_type_1d_view packptr; // npack+1
756  // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
757  // an alias of partptr_ in the case of no overlap.
758  local_ordinal_type_1d_view part2rowidx0; // np+1
759  // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
760  // it's the same as part2rowidx0_; if it's > 1, then the value is combined
761  // with i % vector_length to get the location in the packed data.
762  local_ordinal_type_1d_view part2packrowidx0; // np+1
763  local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
764  // rowidx2part_ maps the row index to the part index.
765  local_ordinal_type_1d_view rowidx2part; // nr
766  // True if lcl{row|col} is at most a constant away from row{idx|col}. In
767  // practice, this knowledge is not particularly useful, as packing for batched
768  // processing is done at the same time as the permutation from LID to index
769  // space. But it's easy to detect, so it's recorded in case an optimization
770  // can be made based on it.
771  bool row_contiguous;
772  };
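// Illustrative example (not from the source): with vector_length = 2 and three parts of
// sizes {3, 3, 2} (already sorted into nonincreasing order), the arrays become
//   partptr          = {0, 3, 6, 8}   // row-index range of each part
//   part2rowidx0     = {0, 3, 6, 8}   // alias of partptr when there is no overlap
//   part2packrowidx0 = {0, 0, 3, 5}   // parts 0 and 1 share pack 0 (3 packed rows);
//                                     // part 2 alone forms pack 1 (2 packed rows)
//   packptr          = {0, 2, 3}      // pack 0 = parts [0,2), pack 1 = parts [2,3)
// so part2packrowidx0_back = 5 packed block rows in total.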
773 
777  template<typename MatrixType>
778  PartInterface<MatrixType>
779  createPartInterface(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
780  const Teuchos::Array<Teuchos::Array<typename ImplType<MatrixType>::local_ordinal_type> > &partitions) {
781  using impl_type = ImplType<MatrixType>;
782  using local_ordinal_type = typename impl_type::local_ordinal_type;
783  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
784 
785  constexpr int vector_length = impl_type::vector_length;
786 
787  const auto comm = A->getRowMap()->getComm();
788 
789  PartInterface<MatrixType> interf;
790 
791  const bool jacobi = partitions.size() == 0;
792  const local_ordinal_type A_n_lclrows = A->getNodeNumRows();
793  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
794 
795 #if defined(BLOCKTRIDICONTAINER_DEBUG)
796  local_ordinal_type nrows = 0;
797  if (jacobi)
798  nrows = nparts;
799  else
800  for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();
801 
802  TEUCHOS_TEST_FOR_EXCEPT_MSG
803  (nrows != A_n_lclrows, get_msg_prefix(comm) << "The #rows implied by the local partition is not "
804  << "the same as getNodeNumRows: " << nrows << " vs " << A_n_lclrows);
805 #endif
806 
807  // permutation vector
808  std::vector<local_ordinal_type> p;
809  if (!jacobi) {
810  // reorder parts to maximize simd packing efficiency
811  p.resize(nparts);
812 
813  typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
814  std::vector<size_idx_pair_type> partsz(nparts);
815  for (local_ordinal_type i=0;i<nparts;++i)
816  partsz[i] = size_idx_pair_type(partitions[i].size(), i);
817  std::sort(partsz.begin(), partsz.end(),
818  [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
819  return x.first > y.first;
820  });
821  for (local_ordinal_type i=0;i<nparts;++i)
822  p[i] = partsz[i].second;
823  }
824 
825  // allocate parts
826  interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
827  interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
828  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
829  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
830  interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
831 
832  // mirror to host and compute on host execution space
833  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
834  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
835  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
836  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
837  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
838 
839  // Determine parts.
840  interf.row_contiguous = true;
841  partptr(0) = 0;
842  part2rowidx0(0) = 0;
843  part2packrowidx0(0) = 0;
844  local_ordinal_type pack_nrows = 0;
845  for (local_ordinal_type ip=0;ip<nparts;++ip) {
846  const auto* part = jacobi ? NULL : &partitions[p[ip]];
847  const local_ordinal_type ipnrows = jacobi ? 1 : part->size();
848  TEUCHOS_ASSERT(ip == 0 || (jacobi || ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
849  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
850  get_msg_prefix(comm)
851  << "partition " << p[ip]
852  << " is empty, which is not allowed.");
853  // Assume no overlap.
854  part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
855  // Since parts are ordered in nonincreasing size, the size of the first
856  // part in a pack is the size for all parts in the pack.
857  if (ip % vector_length == 0) pack_nrows = ipnrows;
858  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
859  const local_ordinal_type os = partptr(ip);
860  for (local_ordinal_type i=0;i<ipnrows;++i) {
861  const auto lcl_row = jacobi ? ip : (*part)[i];
862  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
863  get_msg_prefix(comm)
864  << "partitions[" << p[ip] << "]["
865  << i << "] = " << lcl_row
866  << " but input matrix implies limits of [0, " << A_n_lclrows-1
867  << "].");
868  lclrow(os+i) = lcl_row;
869  rowidx2part(os+i) = ip;
870  if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
871  interf.row_contiguous = false;
872  }
873  partptr(ip+1) = os + ipnrows;
874  }
875 #if defined(BLOCKTRIDICONTAINER_DEBUG)
876  TEUCHOS_ASSERT(partptr(nparts) == nrows);
877 #endif
878  if (lclrow(0) != 0) interf.row_contiguous = false;
879 
880  Kokkos::deep_copy(interf.partptr, partptr);
881  Kokkos::deep_copy(interf.lclrow, lclrow);
882 
883  // Assume no overlap. Thus:
884  interf.part2rowidx0 = interf.partptr;
885  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
886 
887  interf.part2packrowidx0_back = part2packrowidx0(part2packrowidx0.extent(0) - 1);
888  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
889 
890  { // Fill packptr.
891  local_ordinal_type npacks = 0;
892  for (local_ordinal_type ip=1;ip<=nparts;++ip)
893  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
894  ++npacks;
895  interf.packptr = local_ordinal_type_1d_view("packptr", npacks + 1);
896  const auto packptr = Kokkos::create_mirror_view(interf.packptr);
897  packptr(0) = 0;
898  for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
899  if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
900  packptr(k++) = ip;
901  Kokkos::deep_copy(interf.packptr, packptr);
902  }
903 
904  return interf;
905  }
906 
910  template <typename MatrixType>
911  struct BlockTridiags {
912  using impl_type = ImplType<MatrixType>;
913  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
914  using size_type_1d_view = typename impl_type::size_type_1d_view;
915  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
916 
917  // flat_td_ptr(i) is the index into flat-array values of the start of the
918  // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
919  // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
920  // vector_length is the position in the pack.
921  size_type_1d_view flat_td_ptr, pack_td_ptr;
922  // List of local column indices into A from which to grab
923  // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
924  local_ordinal_type_1d_view A_colindsub;
925  // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
926  // tridiag's pack, and i % vector_length gives the position in the pack.
927  vector_type_3d_view values;
928 
929  bool is_diagonal_only;
930 
931  BlockTridiags() = default;
932  BlockTridiags(const BlockTridiags &b) = default;
933 
934  // Index into row-major block of a tridiag.
935  template <typename idx_type>
936  static KOKKOS_FORCEINLINE_FUNCTION
937  idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
938  // Given a row of a row-major tridiag, return the index of the first block
939  // in that row.
940  template <typename idx_type>
941  static KOKKOS_FORCEINLINE_FUNCTION
942  idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
943  // Number of blocks in a tridiag having a given number of rows.
944  template <typename idx_type>
945  static KOKKOS_FORCEINLINE_FUNCTION
946  idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
947  };
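// Illustrative example: a tridiag with nrows = 4 stores NumBlocks(4) = 10 blocks in
// row-major order, with diagonal (D), sub- (L), and super-diagonal (U) blocks:
//   index: 0  1 | 2  3  4 | 5  6  7 | 8  9
//   block: D0 U0| L1 D1 U1| L2 D2 U2| L3 D3
// RowToIndex gives {0, 2, 5, 8} for rows {0, 1, 2, 3}, and IndexToRow maps an index
// back to its row, e.g. IndexToRow(4) = (4+1)/3 = 1.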
948 
949 
953  template<typename MatrixType>
954  BlockTridiags<MatrixType>
955  createBlockTridiags(const PartInterface<MatrixType> &interf) {
956  using impl_type = ImplType<MatrixType>;
957  using execution_space = typename impl_type::execution_space;
958  using local_ordinal_type = typename impl_type::local_ordinal_type;
959  using size_type = typename impl_type::size_type;
960  using size_type_1d_view = typename impl_type::size_type_1d_view;
961 
962  constexpr int vector_length = impl_type::vector_length;
963 
963 
964  BlockTridiags<MatrixType> btdm;
965 
966  const local_ordinal_type ntridiags = interf.partptr.extent(0) - 1;
967 
968  { // construct the flat index pointers into the tridiag values array.
969  btdm.flat_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.flat_td_ptr"), ntridiags + 1);
970  const Kokkos::RangePolicy<execution_space> policy(0,ntridiags + 1);
971  Kokkos::parallel_scan
972  ("createBlockTridiags::RangePolicy::flat_td_ptr",
973  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
974  if (final)
975  btdm.flat_td_ptr(i) = update;
976  if (i < ntridiags) {
977  const local_ordinal_type nrows = interf.partptr(i+1) - interf.partptr(i);
978  update += btdm.NumBlocks(nrows);
979  }
980  });
981 
982  const auto nblocks = Kokkos::create_mirror_view_and_copy
983  (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, ntridiags));
984  btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
985  }
986 
987  // And the packed index pointers.
988  if (vector_length == 1) {
989  btdm.pack_td_ptr = btdm.flat_td_ptr;
990  } else {
991  const local_ordinal_type npacks = interf.packptr.extent(0) - 1;
992  btdm.pack_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.pack_td_ptr"), ntridiags + 1);
993  const Kokkos::RangePolicy<execution_space> policy(0,npacks);
994  Kokkos::parallel_scan
995  ("createBlockTridiags::RangePolicy::pack_td_ptr",
996  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
997  const local_ordinal_type parti = interf.packptr(i);
998  const local_ordinal_type parti_next = interf.packptr(i+1);
999  if (final) {
1000  const size_type nblks = update;
1001  for (local_ordinal_type pti=parti;pti<parti_next;++pti)
1002  btdm.pack_td_ptr(pti) = nblks;
1003  const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
1004  // last one
1005  if (i == npacks-1)
1006  btdm.pack_td_ptr(ntridiags) = nblks + btdm.NumBlocks(nrows);
1007  }
1008  {
1009  const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
1010  update += btdm.NumBlocks(nrows);
1011  }
1012  });
1013  }
1014 
1015  // values and A_colindsub are created in the symbolic phase
1016 
1017  return btdm;
1018  }
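// Illustrative example (continuing the PartInterface example above): parts of sizes
// {3, 3, 2} give flat_td_ptr = {0, 7, 14, 18} since NumBlocks(3) = 7 and
// NumBlocks(2) = 4. With vector_length = 2 and packptr = {0, 2, 3}, parts 0 and 1
// share one SIMD pack, so pack_td_ptr = {0, 0, 7, 11}: that pack occupies packed
// blocks [0, 7) and part 2's pack occupies [7, 11), i.e. 11 packed block entries
// instead of 18 flat ones.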
1019 
1020  // Set the tridiags to be I to the full pack block size. That way, if a
1021  // tridiag within a pack is shorter than the longest one, the extra blocks are
1022  // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1024  // in the packed multivector are 0, and the tridiag LU reflects the extra I
1024  // blocks, then the solve proceeds as though the extra blocks aren't
1025  // present. Since this extra work is part of the SIMD calls, it's not actually
1026  // extra work. Instead, it means we don't have to put checks or masks in, or
1027  // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1028  // since the numeric phase fills in only the used entries, leaving these I
1029  // blocks intact.
1030  template<typename MatrixType>
1031  void setTridiagsToIdentity(const BlockTridiags<MatrixType> &btdm,
1032  const typename ImplType<MatrixType>::local_ordinal_type_1d_view &packptr) {
1033  using impl_type = ImplType<MatrixType>;
1034  using execution_space = typename impl_type::execution_space;
1035  using local_ordinal_type = typename impl_type::local_ordinal_type;
1036  using size_type = typename impl_type::size_type;
1037 
1038  using size_type_1d_view = typename impl_type::size_type_1d_view;
1039  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1040 
1041  const ConstUnmanaged<size_type_1d_view> pack_td_ptr(btdm.pack_td_ptr);
1042  const local_ordinal_type blocksize = btdm.values.extent(1);
1043 
1044  if (is_cuda<execution_space>::value) {
1045 #if defined(KOKKOS_ENABLE_CUDA)
1046  constexpr int vector_length = impl_type::vector_length;
1047  using impl_scalar_type = typename impl_type::impl_scalar_type;
1048  using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
1049  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1050  const impl_scalar_type_4d_view values((impl_scalar_type*)btdm.values.data(),
1051  btdm.values.extent(0),
1052  btdm.values.extent(1),
1053  btdm.values.extent(2),
1054  vector_length);
1055  // vector_length (enum or constexpr) is sometimes not captured correctly by a device
1056  // lambda: some compilers treat it as a reference, which must not be used on the
1057  // device, so copy it into a local value first.
1058  const local_ordinal_type vector_length_value = vector_length;
1059  const team_policy_type policy(packptr.extent(0)-1, Kokkos::AUTO(), vector_length);
1060  Kokkos::parallel_for
1061  ("setTridiagsToIdentity::TeamPolicy",
1062  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1063  const local_ordinal_type k = member.league_rank();
1064  const local_ordinal_type ibeg = pack_td_ptr(packptr(k));
1065  const local_ordinal_type iend = pack_td_ptr(packptr(k+1));
1066  const local_ordinal_type diff = iend - ibeg;
1067  const local_ordinal_type icount = diff/3 + (diff%3 > 0);
1068  Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
1069  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_length_value),[&](const int &v) {
1070  const local_ordinal_type i = ibeg + ii*3;
1071  for (local_ordinal_type j=0;j<blocksize;++j)
1072  values(i,j,j,v) = 1;
1073  });
1074  });
1075  });
1076 #endif
1077  } else {
1078  // exclude from cuda to remove warning to compile host code on device
1079 #if defined(__CUDA_ARCH__)
1080  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
1081 #else
1082  const Unmanaged<vector_type_3d_view> values = btdm.values;
1083  const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
1084  Kokkos::parallel_for
1085  ("setTridiagsToIdentity::RangePolicy",
1086  policy, KOKKOS_LAMBDA(const local_ordinal_type k) {
1087  for (size_type i=pack_td_ptr(packptr(k)),iend=pack_td_ptr(packptr(k+1));i<iend;i+=3)
1088  for (local_ordinal_type j=0;j<blocksize;++j)
1089  values(i,j,j) = 1;
1090  });
1091 #endif
1092  }
1093  }
1094 
1098  template <typename MatrixType>
1099  struct AmD {
1100  using impl_type = ImplType<MatrixType>;
1101  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1102  using size_type_1d_view = typename impl_type::size_type_1d_view;
1103  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
1104 
1105  // rowptr points to the start of each row of A_colindsub.
1106  size_type_1d_view rowptr, rowptr_remote;
1107  // Indices into A's rows giving the blocks to extract. rowptr(i) points to
1108  // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
1109  // where g is A's graph, are the columns AmD uses. If seq_method_, then
1110  // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If !
1111  // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
1112  // contains the remote ones.
1113  local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
1114 
1115  // Currently always true.
1116  bool is_tpetra_block_crs;
1117 
1118  // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
1119  impl_scalar_type_1d_view tpetra_values;
1120 
1121  AmD() = default;
1122  AmD(const AmD &b) = default;
1123  };
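// Illustrative sketch (hypothetical names g_rowptr, g_entries, lr): applying the owned
// part of R to a vector walks rowptr / A_colindsub as column subindices into A's rows:
//
//   for (size_type k = amd.rowptr(lr); k < amd.rowptr(lr+1); ++k) {
//     const auto j  = g_rowptr(lr) + amd.A_colindsub(k); // entry within A's row lr
//     const auto lc = g_entries(j);                      // local block column id
//     // the block's values start at amd.tpetra_values.data() + j*blocksize*blocksize
//   }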
1124 
1128  template<typename MatrixType>
1129  void
1130  performSymbolicPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1131  const PartInterface<MatrixType> &interf,
1132  BlockTridiags<MatrixType> &btdm,
1133  AmD<MatrixType> &amd,
1134  const bool overlap_communication_and_computation) {
1135 
1136 #ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
1137  TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::SymbolicPhase");
1138 #endif
1139  using impl_type = ImplType<MatrixType>;
1140  using memory_space = typename impl_type::memory_space;
1141  using host_execution_space = typename impl_type::host_execution_space;
1142 
1143  using local_ordinal_type = typename impl_type::local_ordinal_type;
1144  using global_ordinal_type = typename impl_type::global_ordinal_type;
1145  using size_type = typename impl_type::size_type;
1146  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1147  using size_type_1d_view = typename impl_type::size_type_1d_view;
1148  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1149  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1150 
1151  constexpr int vector_length = impl_type::vector_length;
1152 
1153  const auto comm = A->getRowMap()->getComm();
1154  const auto& g = A->getCrsGraph();
1155  const auto blocksize = A->getBlockSize();
1156 
1157  // mirroring to host
1158  const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
1159  const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
1160  const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
1161  const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1162  const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);
1163 
1164  const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1165 
1166  // find column to row map on host
1167  Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getNodeNumCols());
1168  Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1169  {
1170  const auto rowmap = g.getRowMap();
1171  const auto colmap = g.getColMap();
1172  const auto dommap = g.getDomainMap();
1173  TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1174 
1175 #if !defined(__CUDA_ARCH__)
1176  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1177  Kokkos::parallel_for
1178  ("performSymbolicPhase::RangePolicy::col2row",
1179  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1180  const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1182  if (dommap->isNodeGlobalElement(gid)) {
1183  const local_ordinal_type lc = colmap->getLocalElement(gid);
1184 # if defined(BLOCKTRIDICONTAINER_DEBUG)
1185  TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
1186  get_msg_prefix(comm) << "GID " << gid
1187  << " gives an invalid local column.");
1188 # endif
1189  col2row(lc) = lr;
1190  }
1191  });
1192 #endif
1193  }
1194 
1195  // construct the D and R graphs in A = D + R.
1196  {
1197  const auto& local_graph = g.getLocalGraph();
1198  const auto& local_graph_rowptr = local_graph.row_map;
1199  TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
1200  const auto& local_graph_colidx = local_graph.entries;
1201 
1202  // Assume no overlap.
1203 
1204  Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
1205  {
1206  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1207  Kokkos::parallel_for
1208  ("performSymbolicPhase::RangePolicy::lclrow2idx",
1209  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1210  lclrow2idx[lclrow(i)] = i;
1211  });
1212  }
1213 
1214  // count (block) nnzs in D and R.
1215  typedef SumReducer<size_type,3,host_execution_space> sum_reducer_type;
1216  typename sum_reducer_type::value_type sum_reducer_value;
1217  {
1218  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1219  Kokkos::parallel_reduce
1220  // profiling interface does not work
1221  (//"performSymbolicPhase::RangePolicy::count_nnz",
1222  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
1223  // LID -> index.
1224  const local_ordinal_type ri0 = lclrow2idx[lr];
1225  const local_ordinal_type pi0 = rowidx2part(ri0);
1226  for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1227  const local_ordinal_type lc = local_graph_colidx(j);
1228  const local_ordinal_type lc2r = col2row[lc];
1229  bool incr_R = false;
1230  do { // breakable
1231  if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
1232  incr_R = true;
1233  break;
1234  }
1235  const local_ordinal_type ri = lclrow2idx[lc2r];
1236  const local_ordinal_type pi = rowidx2part(ri);
1237  if (pi != pi0) {
1238  incr_R = true;
1239  break;
1240  }
1241  // Test for being in the tridiag. This is done in index space. In
1242  // LID space, tridiag LIDs in a row are not necessarily related by
1243  // {-1, 0, 1}.
1244  if (ri0 + 1 >= ri && ri0 <= ri + 1)
1245  ++update.v[0]; // D_nnz
1246  else
1247  incr_R = true;
1248  } while (0);
1249  if (incr_R) {
1250  if (lc < nrows) ++update.v[1]; // R_nnz_owned
1251  else ++update.v[2]; // R_nnz_remote
1252  }
1253  }
1254  }, sum_reducer_type(sum_reducer_value));
1255  }
1256  size_type D_nnz = sum_reducer_value.v[0];
1257  size_type R_nnz_owned = sum_reducer_value.v[1];
1258  size_type R_nnz_remote = sum_reducer_value.v[2];
1259 
1260  if (!overlap_communication_and_computation) {
1261  R_nnz_owned += R_nnz_remote;
1262  R_nnz_remote = 0;
1263  }
1264 
1265  // construct the D graph.
1266  {
1267  const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1268 
1269  btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
1270  const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
1271 
1272 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1273  Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1274 #endif
1275 
1276  const local_ordinal_type nparts = partptr.extent(0) - 1;
1277  {
1278  const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
1279  Kokkos::parallel_for
1280  ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
1281  policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
1282  const local_ordinal_type part_ri0 = part2rowidx0(pi0);
1283  local_ordinal_type offset = 0;
1284  for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
1285  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
1286  offset = 1;
1287  const local_ordinal_type lr0 = lclrow(ri0);
1288  const size_type j0 = local_graph_rowptr(lr0);
1289  for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
1290  const local_ordinal_type lc = local_graph_colidx(j);
1291  const local_ordinal_type lc2r = col2row[lc];
1292  if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) continue;
1293  const local_ordinal_type ri = lclrow2idx[lc2r];
1294  const local_ordinal_type pi = rowidx2part(ri);
1295  if (pi != pi0) continue;
1296  if (ri + 1 < ri0 || ri > ri0 + 1) continue;
1297  const local_ordinal_type row_entry = j - j0;
1298  D_A_colindsub(flat_td_ptr(pi0) + ((td_row_os + ri) - ri0)) = row_entry;
1299  }
1300  }
1301  });
1302  }
1303 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1304  for (size_t i=0;i<D_A_colindsub.extent(0);++i)
1305  TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1306 #endif
1307  Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
1308 
1309  // Allocate values.
1310  {
1311  const local_ordinal_type npacks = packptr.extent(0) - 1;
1312  const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, nparts);
1313  const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
1314  btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
1315  if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
1316  }
1317  }
1318 
1319  // Construct the R graph.
1320  {
1321  amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
1322  amd.A_colindsub = local_ordinal_type_1d_view(Kokkos::ViewAllocateWithoutInitializing("amd.A_colindsub"), R_nnz_owned);
1323 
1324  const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
1325  const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
1326 
1327  amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
1328  amd.A_colindsub_remote = local_ordinal_type_1d_view(Kokkos::ViewAllocateWithoutInitializing("amd.A_colindsub_remote"), R_nnz_remote);
1329 
1330  const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
1331  const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
1332 
1333  {
1334  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1335  Kokkos::parallel_for
1336  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
1337  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1338  const local_ordinal_type ri0 = lclrow2idx[lr];
1339  const local_ordinal_type pi0 = rowidx2part(ri0);
1340  const size_type j0 = local_graph_rowptr(lr);
1341  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1342  const local_ordinal_type lc = local_graph_colidx(j);
1343  const local_ordinal_type lc2r = col2row[lc];
1344  if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
1345  const local_ordinal_type ri = lclrow2idx[lc2r];
1346  const local_ordinal_type pi = rowidx2part(ri);
1347  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
1348  continue;
1349  }
1350  }
1351  // exclusive scan will be performed later
1352  if (!overlap_communication_and_computation || lc < nrows) {
1353  ++R_rowptr(lr);
1354  } else {
1355  ++R_rowptr_remote(lr);
1356  }
1357  }
1358  });
1359  }
1360 
1361  // exclusive scan
1362  typedef ArrayValueType<size_type,2> update_type;
1363  {
1364  Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
1365  Kokkos::parallel_scan
1366  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
1367  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
1368  update_type &update,
1369  const bool &final) {
1370  update_type val;
1371  val.v[0] = R_rowptr(lr);
1372  if (overlap_communication_and_computation)
1373  val.v[1] = R_rowptr_remote(lr);
1374 
1375  if (final) {
1376  R_rowptr(lr) = update.v[0];
1377  if (overlap_communication_and_computation)
1378  R_rowptr_remote(lr) = update.v[1];
1379 
1380  if (lr < nrows) {
1381  const local_ordinal_type ri0 = lclrow2idx[lr];
1382  const local_ordinal_type pi0 = rowidx2part(ri0);
1383 
1384  size_type cnt_rowptr = R_rowptr(lr);
1385  size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
1386 
1387  const size_type j0 = local_graph_rowptr(lr);
1388  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1389  const local_ordinal_type lc = local_graph_colidx(j);
1390  const local_ordinal_type lc2r = col2row[lc];
1391  if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
1392  const local_ordinal_type ri = lclrow2idx[lc2r];
1393  const local_ordinal_type pi = rowidx2part(ri);
1394  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
1395  continue;
1396  }
1397  const local_ordinal_type row_entry = j - j0;
1398  if (!overlap_communication_and_computation || lc < nrows)
1399  R_A_colindsub(cnt_rowptr++) = row_entry;
1400  else
1401  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
1402  }
1403  }
1404  }
1405  update += val;
1406  });
1407  }
1408  TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
1409  Kokkos::deep_copy(amd.rowptr, R_rowptr);
1410  Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
1411  if (overlap_communication_and_computation) {
1412  TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
1413  Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
1414  Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
1415  }
1416 
1417  // Allocate or view values.
1418  amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->
1419  template getValues<memory_space>());
1420  }
1421  }
1422  }
1423 
1427  template<typename MatrixType>
1428  struct ExtractAndFactorizeTridiags {
1429  public:
1430  using impl_type = ImplType<MatrixType>;
1431  // a functor cannot define both device_type and execution_space; doing so triggers a specialization error in Kokkos
1432  using execution_space = typename impl_type::execution_space;
1433  using memory_space = typename impl_type::memory_space;
1434 
1435  using local_ordinal_type = typename impl_type::local_ordinal_type;
1436  using size_type = typename impl_type::size_type;
1437  using impl_scalar_type = typename impl_type::impl_scalar_type;
1438  using magnitude_type = typename impl_type::magnitude_type;
1440  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1442  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1443  using size_type_1d_view = typename impl_type::size_type_1d_view;
1444  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
1446  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1447  using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
1448  static constexpr int vector_length = impl_type::vector_length;
1449 
1451  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
1452 
1453  private:
1454  // part interface
1455  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr;
1456  // block crs matrix row pointer (the graph's size_type can differ from size_t, e.g. an int-sized type with UVM)
1457  using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,typename impl_type::device_type>::size_type;
1458  const ConstUnmanaged<Kokkos::View<a_rowptr_value_type*,typename impl_type::device_type> > A_rowptr;
1459  const ConstUnmanaged<impl_scalar_type_1d_view> A_values;
1460  // block tridiags
1461  const ConstUnmanaged<size_type_1d_view> pack_td_ptr, flat_td_ptr;
1462  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
1463  const Unmanaged<vector_type_3d_view> vector_values;
1464  const Unmanaged<impl_scalar_type_4d_view> scalar_values;
1465  // shared information
1466  const local_ordinal_type blocksize, blocksize_square;
1467  // diagonal safety
1468  const magnitude_type tiny;
1469  const local_ordinal_type vector_length_value;
1470 
1471  public:
1472  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
1473  const PartInterface<MatrixType> &interf_,
1474  const Teuchos::RCP<const block_crs_matrix_type> &A_,
1475  const magnitude_type& tiny_) :
1476  // interface
1477  partptr(interf_.partptr),
1478  lclrow(interf_.lclrow),
1479  packptr(interf_.packptr),
1480  // block crs matrix
1481  A_rowptr(A_->getCrsGraph().getLocalGraph().row_map),
1482  A_values(const_cast<block_crs_matrix_type*>(A_.get())->template getValues<memory_space>()),
1483  // block tridiags
1484  pack_td_ptr(btdm_.pack_td_ptr),
1485  flat_td_ptr(btdm_.flat_td_ptr),
1486  A_colindsub(btdm_.A_colindsub),
1487  vector_values(btdm_.values),
1488  scalar_values((impl_scalar_type*)vector_values.data(),
1489  vector_values.extent(0),
1490  vector_values.extent(1),
1491  vector_values.extent(2),
1492  vector_length),
1493  blocksize(btdm_.values.extent(1)),
1494  blocksize_square(blocksize*blocksize),
1495  // diagonal weight to avoid zero pivots
1496  tiny(tiny_),
1497  vector_length_value(vector_length) {}
1498 
1499  private:
1500 
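  // serial path: copy the tridiagonal blocks of every part in this pack from the
  // block crs values into the packed SIMD storage, interleaving up to
  // vector_length parts across the vector lanes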
1501  inline
1502  void
1503  extract(const local_ordinal_type &packidx) const {
1504  local_ordinal_type partidx = packptr(packidx);
1505  local_ordinal_type npacks = packptr(packidx+1) - partidx;
1506 
1507  const size_type kps = pack_td_ptr(partidx);
1508  local_ordinal_type kfs[vector_length] = {};
1509  local_ordinal_type ri0[vector_length] = {};
1510  local_ordinal_type nrows[vector_length] = {};
1511 
1512  for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
1513  kfs[vi] = flat_td_ptr(partidx);
1514  ri0[vi] = partptr(partidx);
1515  nrows[vi] = partptr(partidx+1) - ri0[vi];
1516  }
1517  for (local_ordinal_type tr=0,j=0;tr<nrows[0];++tr) {
1518  for (local_ordinal_type e=0;e<3;++e) {
1519  const impl_scalar_type* block[vector_length] = {};
1520  for (local_ordinal_type vi=0;vi<npacks;++vi) {
1521  const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
1522  block[vi] = &A_values(Aj*blocksize_square);
1523  }
1524  const size_type pi = kps + j;
1525  ++j;
1526  for (local_ordinal_type ii=0;ii<blocksize;++ii) {
1527  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
1528  const auto idx = ii*blocksize + jj;
1529  auto& v = vector_values(pi, ii, jj);
1530  for (local_ordinal_type vi=0;vi<npacks;++vi)
1531  v[vi] = block[vi][idx];
1532  }
1533  }
1534 
1535  if (nrows[0] == 1) break;
1536  if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
1537  for (local_ordinal_type vi=1;vi<npacks;++vi) {
1538  if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
1539  npacks = vi;
1540  break;
1541  }
1542  }
1543  }
1544  }
1545  }
1546 
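  // serial path: factorize the packed block tridiagonal in place; for each row,
  //   A_i = L_i U_i                  (LU with diagonal shift tiny)
  //   B   <- L_i^{-1} B              (left lower trsm, unit diagonal)
  //   C   <- C U_i^{-1}              (right upper trsm)
  //   A_{i+1} <- A_{i+1} - C B       (gemm update), then factorize A_{i+1}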
1547  inline
1548  void
1549  factorize (const local_ordinal_type& packidx) const {
1550  namespace KB = KokkosBatched::Experimental;
1551  using AlgoType = KB::Algo::Level3::Blocked;
1552 
1553  // constant
1554  const auto one = Kokkos::ArithTraits<magnitude_type>::one();
1555 
1556  auto i0 = pack_td_ptr(packptr(packidx));
1557  const local_ordinal_type nrows
1558  = BlockTridiags<MatrixType>::IndexToRow(pack_td_ptr(packptr(packidx+1)) - i0 - 1) + 1;
1559 
1560  // subview pattern
1561  auto A = Kokkos::subview(vector_values, i0, Kokkos::ALL(), Kokkos::ALL());
1562  KB::SerialLU<KB::Algo::LU::Unblocked>::invoke(A, tiny);
1563 
1564  if (nrows > 1) {
1565  auto B = A;
1566  auto C = A;
1567  for (local_ordinal_type i=1;i<nrows;++i,i0+=3) {
1568  B.assign_data( &vector_values(i0+1,0,0) );
1569  KB::SerialTrsm<KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,AlgoType>
1570  ::invoke(one, A, B);
1571  C.assign_data( &vector_values(i0+2,0,0) );
1572  KB::SerialTrsm<KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,AlgoType>
1573  ::invoke(one, A, C);
1574  A.assign_data( &vector_values(i0+3,0,0) );
1575  KB::SerialGemm<KB::Trans::NoTranspose,KB::Trans::NoTranspose,AlgoType>
1576  ::invoke(-one, C, B, one, A);
1577  KB::SerialLU<KB::Algo::LU::Unblocked>::invoke(A, tiny);
1578  }
1579  }
1580  }
1581 
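  // team path: one vector lane v handles one part of the pack; the team threads
  // cooperatively copy each blocksize x blocksize block into the scalar view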
1582  KOKKOS_INLINE_FUNCTION
1583  void
1584  extract(const member_type &member,
1585  const local_ordinal_type &partidx,
1586  const local_ordinal_type &v) const {
1587  const size_type kps = pack_td_ptr(partidx);
1588  const local_ordinal_type kfs = flat_td_ptr(partidx);
1589  const local_ordinal_type ri0 = partptr(partidx);
1590  const local_ordinal_type nrows = partptr(partidx+1) - ri0;
1591  for (local_ordinal_type tr=0,j=0;tr<nrows;++tr) {
1592  local_ordinal_type lbeg = (tr == 0 ? 1 : 0);
1593  local_ordinal_type lend = (tr == nrows - 1 ? 2 : 3);
1594  for (local_ordinal_type l=lbeg;l<lend;++l,++j) { // l == 1, diagonal
1595  const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
1596  const impl_scalar_type* block = &A_values(Aj*blocksize_square);
1597  const size_type pi = kps + j;
1598  Kokkos::parallel_for
1599  (Kokkos::TeamThreadRange(member,blocksize_square), [&](const local_ordinal_type &ij) {
1600  const local_ordinal_type ii = ij/blocksize;
1601  const local_ordinal_type jj = ij%blocksize;
1602  scalar_values(pi, ii, jj, v) = block[ii*blocksize + jj];
1603  });
1604  }
1605  }
1606  }
1607 
1608  KOKKOS_INLINE_FUNCTION
1609  void
1610  factorize(const member_type &member,
1611  const local_ordinal_type &i0,
1612  const local_ordinal_type &nrows,
1613  const local_ordinal_type &v) const {
1614  namespace KB = KokkosBatched::Experimental;
1615  using AlgoType = KB::Algo::Level3::Unblocked;
1616 
1617  // constant
1618  const auto one = Kokkos::ArithTraits<magnitude_type>::one();
1619 
1620  // subview pattern
1621  auto A = Kokkos::subview(scalar_values, i0, Kokkos::ALL(), Kokkos::ALL(), 0);
1622  A.assign_data( &scalar_values(i0,0,0,v) );
1623  KB::TeamLU<member_type,KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
1624  if (nrows > 1) {
1625  auto B = A;
1626  auto C = A;
1627  local_ordinal_type i = i0;
1628  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
1629  B.assign_data( &scalar_values(i+1,0,0,v) );
1630  KB::TeamTrsm<member_type,KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,AlgoType>
1631  ::invoke(member, one, A, B);
1632  C.assign_data( &scalar_values(i+2,0,0,v) );
1633  KB::TeamTrsm<member_type,KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,AlgoType>
1634  ::invoke(member, one, A, C);
1635  A.assign_data( &scalar_values(i+3,0,0,v) );
1636  KB::TeamGemm<member_type,KB::Trans::NoTranspose,KB::Trans::NoTranspose,AlgoType>
1637  ::invoke(member, -one, C, B, one, A);
1638  KB::TeamLU<member_type,KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
1639  }
1640  }
1641  }
1642 
1643  public:
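  // host entry point: extract and factorize one pack per loop index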
1647  inline
1648  void
1649  operator() (const local_ordinal_type& packidx) const {
1650  extract(packidx);
1651  factorize(packidx);
1652  }
1653 
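  // device entry point: one pack per league, one part per vector lane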
1657  KOKKOS_INLINE_FUNCTION
1658  void
1659  operator() (const member_type &member) const {
1660  // btdm is packed and sorted from the largest partition to the smallest
1661  const local_ordinal_type packidx = member.league_rank();
1662  const local_ordinal_type partidx = packptr(packidx);
1663  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
1664  const local_ordinal_type i0 = pack_td_ptr(partidx);
1665  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
1666 
1667  // every CUDA thread should participate in the computation (do not shrink the range by npacks)
1668  // note (Kyungjoo): NVCC does not handle "constexpr" here reliably;
1669  // with an enum, some compilers read the value as zero and create silent errors,
1670  // while others complain that the constexpr is undefined on the device;
1671  // better compiler support from NVIDIA is needed
1672  Kokkos::parallel_for
1673  (Kokkos::ThreadVectorRange(member, vector_length_value), [&](const local_ordinal_type &v) {
1674  // the extract phase must not contain a team barrier because it is masked by npacks
1675  if (v < npacks) extract(member, partidx + v, v);
1676  member.team_barrier();
1677  // factorize must be invoked by all threads because the team kernels call barriers internally
1678  factorize(member, i0, nrows, v);
1679  });
1680  }
1681 
1682  void run() {
1683 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
1684  cudaProfilerStart();
1685 #endif
1686 
1687  if (is_cuda<execution_space>::value) {
1688 #if defined(KOKKOS_ENABLE_CUDA)
1689  const local_ordinal_type vl = vector_length;
1690  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
1691  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", policy, *this);
1692 #endif
1693  } else {
1694 #if defined(__CUDA_ARCH__)
1695  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
1696 #else
1697  const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
1698  Kokkos::parallel_for("ExtractAndFactorize::RangePolicy::run", policy, *this);
1699 #endif
1700  }
1701 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
1702  cudaProfilerStop();
1703 #endif
1704  }
1705  };
1706 
1710  template<typename MatrixType>
1711  void
1712  performNumericPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1713  const PartInterface<MatrixType> &interf,
1714  BlockTridiags<MatrixType> &btdm,
1715  const typename ImplType<MatrixType>::magnitude_type tiny) {
1716 #ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
1717  TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::NumericPhase");
1718 #endif
1719  ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
1720  function.run();
1721  }
1722 
1726  template<typename MatrixType>
1727  struct MultiVectorConverter {
1728  public:
1729  using impl_type = ImplType<MatrixType>;
1730  using execution_space = typename impl_type::execution_space;
1731 
1732  using local_ordinal_type = typename impl_type::local_ordinal_type;
1733  using impl_scalar_type = typename impl_type::impl_scalar_type;
1734  using magnitude_type = typename impl_type::magnitude_type;
1735  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
1736  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1737  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1738  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
1739  static constexpr int vector_length = impl_type::vector_length;
1740 
1741  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
1742 
1743  private:
1744  // part interface
1745  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
1746  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
1747  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
1748  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
1749  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
1750  const local_ordinal_type blocksize;
1751  const local_ordinal_type num_vectors;
1752 
1753  // packed multivector output (or input)
1754  Unmanaged<vector_type_3d_view> packed_multivector;
1755  Unmanaged<impl_scalar_type_2d_view> scalar_multivector;
1756  impl_scalar_type damping_factor;
1757 
1758  template<typename TagType>
1759  KOKKOS_INLINE_FUNCTION
1760  void copy_multivectors(const local_ordinal_type &j,
1761  const local_ordinal_type &vi,
1762  const local_ordinal_type &pri,
1763  const local_ordinal_type &/* nrow */,
1764  const local_ordinal_type &ri0) const {
1765  if (TagType::id == 0) { // ToPackedMultiVectorTag
1766  for (local_ordinal_type col=0;col<num_vectors;++col)
1767  for (local_ordinal_type i=0;i<blocksize;++i)
1768  packed_multivector(pri, i, col)[vi] = scalar_multivector(blocksize*lclrow(ri0+j)+i,col);
1769  } else if (TagType::id > 0) { //ToScalarMultiVector
1770  const impl_scalar_type df = damping_factor;
1771  for (local_ordinal_type col=0;col<num_vectors;++col)
1772  for (local_ordinal_type i=0;i<blocksize;++i) {
1773  impl_scalar_type &y = scalar_multivector(blocksize*lclrow(ri0+j)+i,col);
1774  const impl_scalar_type yc = packed_multivector(pri, i, col)[vi];
1775  if (TagType::id == 1) y = df*yc;
1776  else y += df*(yc - y);
1777  }
1778  }
1779  }
1780 
1781  template<typename TagType>
1782  KOKKOS_INLINE_FUNCTION
1783  void copy_multivectors_with_norm(const local_ordinal_type &j,
1784  const local_ordinal_type &vi,
1785  const local_ordinal_type &pri,
1786  const local_ordinal_type &/* nrow */,
1787  const local_ordinal_type &ri0,
1788  /* */ magnitude_type *norm) const {
1789  if (TagType::id > 0) { //ToScalarMultiVector
1790  const impl_scalar_type df = damping_factor;
1791  for (local_ordinal_type col=0;col<num_vectors;++col) {
1792  const local_ordinal_type offset = col*blocksize;
1793  for (local_ordinal_type i=0;i<blocksize;++i) {
1794  impl_scalar_type &y = scalar_multivector(blocksize*lclrow(ri0+j)+i,col);
1795  const impl_scalar_type yc = packed_multivector(pri, i, col)[vi];
1796  const impl_scalar_type yd = TagType::id == 1 ? yc : yc - y;
1797  const magnitude_type abs_yd = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
1798  norm[offset+i] += abs_yd*abs_yd;
1799  if (TagType::id == 1) y = df*yc;
1800  else y += df*yd;
1801  }
1802  }
1803  }
1804  }
1805 
1806  public:
1807 
1808  // local reduction of norms with a runtime-sized array;
1809  // this value_type and value_count are required by Kokkos for array reductions
1810  typedef magnitude_type value_type[];
1811  int value_count;
1812 
1813  KOKKOS_INLINE_FUNCTION
1814  void
1815  init(magnitude_type *dst) const {
1816  for (int i=0;i<value_count;++i)
1817  dst[i] = Kokkos::reduction_identity<magnitude_type>::sum();
1818  }
1819 
1820  KOKKOS_INLINE_FUNCTION
1821  void
1822  join(volatile magnitude_type *dst, const volatile magnitude_type *src) const {
1823  for (int i=0;i<value_count;++i)
1824  dst[i] += src[i];
1825  }
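  // these init/join overloads let Kokkos::parallel_reduce accumulate a
  // runtime-sized array of norms (value_count = blocksize*num_vectors), as used
  // in to_scalar_multivector below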
1826 
1827  MultiVectorConverter() = delete;
1828  MultiVectorConverter(const MultiVectorConverter &b) = default;
1829  MultiVectorConverter(const PartInterface<MatrixType> &interf,
1830  const vector_type_3d_view &pmv)
1831  : partptr(interf.partptr),
1832  packptr(interf.packptr),
1833  part2packrowidx0(interf.part2packrowidx0),
1834  part2rowidx0(interf.part2rowidx0),
1835  lclrow(interf.lclrow),
1836  blocksize(pmv.extent(1)),
1837  num_vectors(pmv.extent(2)),
1838  packed_multivector(pmv) {}
1839 
1840  // TODO: modify this routine to mirror the team-level functions
1841  template<typename TagType>
1842  inline
1843  void
1844  operator() (const TagType&, const local_ordinal_type &packidx, magnitude_type* const norm = NULL) const {
1845  local_ordinal_type partidx = packptr(packidx);
1846  local_ordinal_type npacks = packptr(packidx+1) - partidx;
1847  const local_ordinal_type pri0 = part2packrowidx0(partidx);
1848 
1849  local_ordinal_type ri0[vector_length] = {};
1850  local_ordinal_type nrows[vector_length] = {};
1851  for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
1852  ri0[v] = part2rowidx0(partidx);
1853  nrows[v] = part2rowidx0(partidx+1) - ri0[v];
1854  }
1855  for (local_ordinal_type j=0;j<nrows[0];++j) {
1856  local_ordinal_type cnt = 1;
1857  for (;cnt<npacks && j!= nrows[cnt];++cnt);
1858  npacks = cnt;
1859  const local_ordinal_type pri = pri0 + j;
1860  for (local_ordinal_type v=0;v<npacks;++v) {
1861  if (norm == NULL) copy_multivectors<TagType>(j, v, pri, nrows[v], ri0[v]);
1862  else copy_multivectors_with_norm<TagType>(j, v, pri, nrows[v], ri0[v], norm);
1863  }
1864  }
1865  }
1866 
1867  template<typename TagType>
1868  KOKKOS_INLINE_FUNCTION
1869  void
1870  operator() (const TagType&, const member_type &member, magnitude_type* const norm = NULL) const {
1871  const local_ordinal_type packidx = member.league_rank();
1872  const local_ordinal_type partidx_begin = packptr(packidx);
1873  const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
1874  const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
1875  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
1876  const local_ordinal_type partidx = partidx_begin + v;
1877  const local_ordinal_type ri0 = part2rowidx0(partidx);
1878  const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
1879  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
1880  const local_ordinal_type pri = pri0 + j;
1881  if (norm == NULL) copy_multivectors<TagType>(j, v, pri, nrows, ri0);
1882  else copy_multivectors_with_norm<TagType>(j, v, pri, nrows, ri0, norm);
1883  });
1884  });
1885  }
1886 
1887  struct ToPackedMultiVectorTag { enum : int { id = 0 }; };
1888  struct ToScalarMultiVectorFirstTag { enum : int { id = 1 }; };
1889  struct ToScalarMultiVectorSecondTag { enum : int { id = 2 }; };
1890 
1891  template<typename TpetraLocalViewType>
1892  void to_packed_multivector(const TpetraLocalViewType &scalar_multivector_) {
1893 #ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
1894  TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::MultiVectorConverter::ToPackedMultiVector");
1895 #endif
1896  value_count = 0;
1897  scalar_multivector = scalar_multivector_;
1898  if (is_cuda<execution_space>::value) {
1899 #if defined(KOKKOS_ENABLE_CUDA)
1900  const local_ordinal_type vl = vector_length;
1901  const Kokkos::TeamPolicy<execution_space,ToPackedMultiVectorTag> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
1902  Kokkos::parallel_for
1903  ("MultiVectorConverter::TeamPolicy::to_packed_multivector",
1904  policy, *this);
1905 #endif
1906  } else {
1907 #if defined(__CUDA_ARCH__)
1908  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
1909 #else
1910  const Kokkos::RangePolicy<execution_space,ToPackedMultiVectorTag> policy(0, packptr.extent(0) - 1);
1911  Kokkos::parallel_for
1912  ("MultiVectorConverter::RangePolicy::to_packed_multivector",
1913  policy, *this);
1914 #endif
1915  }
1916  }
1917 
1918  template<typename TpetraLocalViewType>
1919  void to_scalar_multivector(const TpetraLocalViewType &scalar_multivector_,
1920  const impl_scalar_type &damping_factor_,
1921  const bool &is_vectors_zero,
1922  /* */ magnitude_type *norm = NULL) {
1923 #ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
1924  TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::MultiVectorConverter::ToScalarMultiVector");
1925 #endif
1926  scalar_multivector = scalar_multivector_;
1927  damping_factor = damping_factor_;
1928  if (norm != NULL) {
1929  value_count = blocksize*num_vectors;
1930  for (int i=0;i<value_count;++i)
1931  norm[i] = Kokkos::ArithTraits<magnitude_type>::zero();
1932  } else {
1933  value_count = 0;
1934  }
1935 
1936  if (is_cuda<execution_space>::value) {
1937 #if defined(KOKKOS_ENABLE_CUDA)
1938  // a runtime-sized array reduction does not support a vector length > 1
1939  const local_ordinal_type vl = norm == NULL ? vector_length : 1;
1940 
1941  if (is_vectors_zero) {
1942  const Kokkos::TeamPolicy
1943  <execution_space,ToScalarMultiVectorFirstTag> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
1944  if (norm == NULL)
1945  Kokkos::parallel_for
1946  ("MultiVectorConverter::TeamPolicy::to_scalar_multivector::first", policy, *this);
1947  else
1948  Kokkos::parallel_reduce
1949  ("MultiVectorConverter::TeamPolicy::to_scalar_multivector::first_w_norm", policy, *this, norm);
1950  } else {
1951  const Kokkos::TeamPolicy
1952  <execution_space,ToScalarMultiVectorSecondTag> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
1953  if (norm == NULL)
1954  Kokkos::parallel_for
1955  ("MultiVectorConverter::TeamPolicy::to_scalar_multivector::second", policy, *this);
1956  else
1957  Kokkos::parallel_reduce
1958  ("MultiVectorConverter::TeamPolicy::to_scalar_multivector::second_w_norm", policy, *this, norm);
1959  }
1960 #endif
1961  } else {
1962 #if defined(__CUDA_ARCH__)
1963  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
1964 #else
1965  if (is_vectors_zero) {
1966  const Kokkos::RangePolicy
1967  <execution_space,ToScalarMultiVectorFirstTag> policy(0, packptr.extent(0) - 1);
1968  if (norm == NULL)
1969  Kokkos::parallel_for
1970  ("MultiVectorConverter::RangePolicy::to_scalar_multivector::first", policy, *this);
1971  else
1972  Kokkos::parallel_reduce
1973  ("MultiVectorConverter::RangePolicy::to_scalar_multivector::first_w_norm", policy, *this, norm);
1974  } else {
1975  const Kokkos::RangePolicy
1976  <execution_space,ToScalarMultiVectorSecondTag> policy(0, packptr.extent(0) - 1);
1977  if (norm == NULL)
1978  Kokkos::parallel_for
1979  ("MultiVectorConverter::RangePolicy::to_scalar_multivector::second", policy, *this);
1980  else
1981  Kokkos::parallel_reduce
1982  ("MultiVectorConverter::RangePolicy::to_scalar_multivector::second_w_norm", policy, *this, norm);
1983  }
1984 #endif
1985  }
1986  }
1987  };
1988 
1992  template<typename MatrixType>
1993  struct SolveTridiags {
1994  public:
1995  using impl_type = ImplType<MatrixType>;
1996  using execution_space = typename impl_type::execution_space;
1997 
1998  using local_ordinal_type = typename impl_type::local_ordinal_type;
1999  using size_type = typename impl_type::size_type;
2000  using impl_scalar_type = typename impl_type::impl_scalar_type;
2001  using magnitude_type = typename impl_type::magnitude_type;
2003  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2004  using size_type_1d_view = typename impl_type::size_type_1d_view;
2006  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2007  using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
2008  static constexpr int vector_length = impl_type::vector_length;
2009 
2011  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
2012 
2013  private:
2014  // part interface
2015  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2016  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
2017  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2018  // block tridiags
2019  const ConstUnmanaged<size_type_1d_view> pack_td_ptr;
2020  // block tridiags values
2021  const ConstUnmanaged<vector_type_3d_view> D_vector_values;
2022  const Unmanaged<vector_type_3d_view> X_vector_values;
2023  // scalar tridiag values
2024  const ConstUnmanaged<impl_scalar_type_4d_view> D_scalar_values;
2025  const Unmanaged<impl_scalar_type_4d_view> X_scalar_values;
2026 
2027  const local_ordinal_type vector_length_value;
2028 
2029  public:
2030  SolveTridiags(const PartInterface<MatrixType> &interf,
2031  const BlockTridiags<MatrixType> &btdm,
2032  const vector_type_3d_view &pmv)
2033  :
2034  // interface
2035  partptr(interf.partptr),
2036  packptr(interf.packptr),
2037  part2packrowidx0(interf.part2packrowidx0),
2038  // block tridiags and multivector
2039  pack_td_ptr(btdm.pack_td_ptr),
2040  D_vector_values(btdm.values),
2041  X_vector_values(pmv),
2042  // scalar tridiags and multivector
2043  D_scalar_values((impl_scalar_type*)D_vector_values.data(),
2044  D_vector_values.extent(0),
2045  D_vector_values.extent(1),
2046  D_vector_values.extent(2),
2047  vector_length),
2048  X_scalar_values((impl_scalar_type*)X_vector_values.data(),
2049  X_vector_values.extent(0),
2050  X_vector_values.extent(1),
2051  X_vector_values.extent(2),
2052  vector_length),
2053  vector_length_value(vector_length)
2054  {}
2055 
2056  public:
2057 
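  // solve D x = b for one pack with a single right-hand side using the LU factors
  // computed in ExtractAndFactorizeTridiags: a forward substitution sweep down the
  // part followed by a backward substitution sweep back up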
2061  inline
2062  void
2063  serialSolveSingleVector(const local_ordinal_type &blocksize,
2064  const local_ordinal_type &packidx) const {
2065  namespace KB = KokkosBatched::Experimental;
2066  using AlgoType = KB::Algo::Level2::Unblocked;
2067 
2068  // base pointers
2069  auto A = D_vector_values.data();
2070  auto X = X_vector_values.data();
2071 
2072  // constant
2073  const auto one = Kokkos::ArithTraits<magnitude_type>::one();
2074  // const local_ordinal_type blocksize = D_vector_values.extent(1);
2075  const local_ordinal_type astep = D_vector_values.stride_0();
2076  const local_ordinal_type as0 = blocksize; //D_vector_values.stride_1();
2077  const local_ordinal_type as1 = 1; //D_vector_values.stride_2();
2078  const local_ordinal_type xstep = X_vector_values.stride_0();
2079  const local_ordinal_type xs0 = 1; //X_vector_values.stride_1();
2080 
2081  // index counting
2082  const local_ordinal_type partidx = packptr(packidx);
2083  const size_type i0 = pack_td_ptr(partidx);
2084  const local_ordinal_type r0 = part2packrowidx0(partidx);
2085  const local_ordinal_type nrows = part2packrowidx0(packptr(packidx+1)) - r0;
2086 
2087  // move to starting point
2088  A += i0*astep;
2089  X += r0*xstep;
2090 
2091  // solve Lx = x
2092  KOKKOSBATCHED_SERIAL_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2093  (AlgoType,KB::Diag::Unit,
2094  blocksize,blocksize,
2095  one,
2096  A, as0, as1,
2097  X, xs0);
2098 
2099  for (local_ordinal_type tr=1;tr<nrows;++tr) {
2100  KOKKOSBATCHED_SERIAL_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2101  (AlgoType,
2102  blocksize, blocksize,
2103  -one,
2104  A+2*astep, as0, as1,
2105  X, xs0,
2106  one,
2107  X+1*xstep, xs0);
2108 
2109  KOKKOSBATCHED_SERIAL_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2110  (AlgoType,KB::Diag::Unit,
2111  blocksize,blocksize,
2112  one,
2113  A+3*astep, as0, as1,
2114  X+1*xstep, xs0);
2115 
2116  A += 3*astep;
2117  X += 1*xstep;
2118  }
2119 
2120  // solve Ux = x
2121  KOKKOSBATCHED_SERIAL_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2122  (AlgoType,KB::Diag::NonUnit,
2123  blocksize, blocksize,
2124  one,
2125  A, as0, as1,
2126  X, xs0);
2127 
2128  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2129  A -= 3*astep;
2130  KOKKOSBATCHED_SERIAL_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2131  (AlgoType,
2132  blocksize, blocksize,
2133  -one,
2134  A+1*astep, as0, as1,
2135  X, xs0,
2136  one,
2137  X-1*xstep, xs0);
2138 
2139  KOKKOSBATCHED_SERIAL_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2140  (AlgoType,KB::Diag::NonUnit,
2141  blocksize, blocksize,
2142  one,
2143  A, as0, as1,
2144  X-1*xstep,xs0);
2145 
2146  X -= 1*xstep;
2147  }
2148  }
2149 
2150  inline
2151  void
2152  serialSolveMultiVector(const local_ordinal_type &/* blocksize */,
2153  const local_ordinal_type &packidx) const {
2154  namespace KB = KokkosBatched::Experimental;
2155  using AlgoType = KB::Algo::Level3::Blocked;
2156 
2157  // constant
2158  const auto one = Kokkos::ArithTraits<magnitude_type>::one();
2159  //const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
2160 
2161  // subview pattern
2162  auto A = Kokkos::subview(D_vector_values, 0, Kokkos::ALL(), Kokkos::ALL());
2163  auto X1 = Kokkos::subview(X_vector_values, 0, Kokkos::ALL(), Kokkos::ALL());
2164  auto X2 = X1;
2165 
2166  // index counting
2167  const local_ordinal_type partidx = packptr(packidx);
2168  size_type i = pack_td_ptr(partidx);
2169  local_ordinal_type r = part2packrowidx0(partidx);
2170  const local_ordinal_type nrows = part2packrowidx0(packptr(packidx+1)) - r;
2171 
2172  // solve Lx = x
2173  A.assign_data( &D_vector_values(i,0,0) );
2174  X1.assign_data( &X_vector_values(r,0,0) );
2175  KB::SerialTrsm<KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,AlgoType>
2176  ::invoke(one, A, X1);
2177  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2178  A.assign_data( &D_vector_values(i+2,0,0) );
2179  X2.assign_data( &X_vector_values(++r,0,0) );
2180  KB::SerialGemm<KB::Trans::NoTranspose,KB::Trans::NoTranspose,AlgoType>
2181  ::invoke(-one, A, X1, one, X2);
2182  A.assign_data( &D_vector_values(i+3,0,0) );
2183  KB::SerialTrsm<KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,AlgoType>
2184  ::invoke(one, A, X2);
2185  X1.assign_data( X2.data() );
2186  }
2187 
2188  // solve Ux = x
2189  KB::SerialTrsm<KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,AlgoType>
2190  ::invoke(one, A, X1);
2191  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2192  i -= 3;
2193  A.assign_data( &D_vector_values(i+1,0,0) );
2194  X2.assign_data( &X_vector_values(--r,0,0) );
2195  KB::SerialGemm<KB::Trans::NoTranspose,KB::Trans::NoTranspose,AlgoType>
2196  ::invoke(-one, A, X1, one, X2);
2197  A.assign_data( &D_vector_values(i,0,0) );
2198  KB::SerialTrsm<KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,AlgoType>
2199  ::invoke(one, A, X2);
2200  X1.assign_data( X2.data() );
2201  }
2202  }
2203 
2207  KOKKOS_INLINE_FUNCTION
2208  void
2209  teamSolveSingleVector(const member_type &member,
2210  const local_ordinal_type &blocksize,
2211  const local_ordinal_type &i0,
2212  const local_ordinal_type &r0,
2213  const local_ordinal_type &nrows,
2214  const local_ordinal_type &v) const {
2215  namespace KB = KokkosBatched::Experimental;
2216  using AlgoType = KB::Algo::Level2::Unblocked;
2217 
2218  // base pointers
2219  auto A = D_scalar_values.data();
2220  auto X = X_scalar_values.data();
2221 
2222  // constant
2223  const auto one = Kokkos::ArithTraits<magnitude_type>::one();
2224  //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2225 
2226  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2227  const local_ordinal_type astep = D_scalar_values.stride_0();
2228  const local_ordinal_type as0 = blocksize*vector_length; //D_scalar_values.stride_1();
2229  const local_ordinal_type as1 = vector_length; //D_scalar_values.stride_2();
2230  const local_ordinal_type xstep = X_scalar_values.stride_0();
2231  const local_ordinal_type xs0 = vector_length; //X_scalar_values.stride_1();
2232 
2233  // for multiple rhs
2234  //const local_ordinal_type xs0 = num_vectors*vector_length; //X_scalar_values.stride_1();
2235  //const local_ordinal_type xs1 = vector_length; //X_scalar_values.stride_2();
2236 
2237  // move to starting point
2238  A += i0*astep + v;
2239  X += r0*xstep + v;
2240 
2241  //for (local_ordinal_type col=0;col<num_vectors;++col)
2242  {
2243  // solve Lx = x
2244  KOKKOSBATCHED_TEAM_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2245  (AlgoType,member,KB::Diag::Unit,
2246  blocksize,blocksize,
2247  one,
2248  A, as0, as1,
2249  X, xs0);
2250 
2251  for (local_ordinal_type tr=1;tr<nrows;++tr) {
2252  KOKKOSBATCHED_TEAM_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2253  (AlgoType,member,
2254  blocksize, blocksize,
2255  -one,
2256  A+2*astep, as0, as1,
2257  X, xs0,
2258  one,
2259  X+1*xstep, xs0);
2260 
2261  KOKKOSBATCHED_TEAM_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2262  (AlgoType,member,KB::Diag::Unit,
2263  blocksize,blocksize,
2264  one,
2265  A+3*astep, as0, as1,
2266  X+1*xstep, xs0);
2267 
2268  A += 3*astep;
2269  X += 1*xstep;
2270  }
2271 
2272  // solve Ux = x
2273  KOKKOSBATCHED_TEAM_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2274  (AlgoType,member,KB::Diag::NonUnit,
2275  blocksize, blocksize,
2276  one,
2277  A, as0, as1,
2278  X, xs0);
2279 
2280  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2281  A -= 3*astep;
2282  KOKKOSBATCHED_TEAM_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2283  (AlgoType,member,
2284  blocksize, blocksize,
2285  -one,
2286  A+1*astep, as0, as1,
2287  X, xs0,
2288  one,
2289  X-1*xstep, xs0);
2290 
2291  KOKKOSBATCHED_TEAM_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2292  (AlgoType,member,KB::Diag::NonUnit,
2293  blocksize, blocksize,
2294  one,
2295  A, as0, as1,
2296  X-1*xstep,xs0);
2297 
2298  X -= 1*xstep;
2299  }
2300  // for multiple rhs
2301  //X += xs1;
2302  }
2303  }
2304 
2305  KOKKOS_INLINE_FUNCTION
2306  void
2307  teamSolveMultiVector(const member_type &member,
2308  const local_ordinal_type &blocksize,
2309  const local_ordinal_type &i0,
2310  const local_ordinal_type &r0,
2311  const local_ordinal_type &nrows,
2312  const local_ordinal_type &v) const {
2313  namespace KB = KokkosBatched::Experimental;
2314  using AlgoType = KB::Algo::Level3::Blocked;
2315 
2316  // constant
2317  const auto one = Kokkos::ArithTraits<magnitude_type>::one();
2318 
2319  // subview pattern
2320  auto A = Kokkos::subview(D_scalar_values, 0, Kokkos::ALL(), Kokkos::ALL(), 0);
2321  auto X1 = Kokkos::subview(X_scalar_values, 0, Kokkos::ALL(), Kokkos::ALL(), 0);
2322  auto X2 = X1;
2323 
2324  local_ordinal_type i = i0, r = r0;
2325 
2326  // solve Lx = x
2327  A.assign_data( &D_scalar_values(i,0,0,v) );
2328  X1.assign_data( &X_scalar_values(r,0,0,v) );
2329  KB::TeamTrsm<member_type,KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,AlgoType>
2330  ::invoke(member, one, A, X1);
2331  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2332  A.assign_data( &D_scalar_values(i+2,0,0,v) );
2333  X2.assign_data( &X_scalar_values(++r,0,0,v) );
2334  KB::TeamGemm<member_type,KB::Trans::NoTranspose,KB::Trans::NoTranspose,AlgoType>
2335  ::invoke(member, -one, A, X1, one, X2);
2336  A.assign_data( &D_scalar_values(i+3,0,0,v) );
2337  KB::TeamTrsm<member_type,KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,AlgoType>
2338  ::invoke(member, one, A, X2);
2339  X1.assign_data( X2.data() );
2340  }
2341 
2342  // solve Ux = x
2343  KB::TeamTrsm<member_type,KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,AlgoType>
2344  ::invoke(member, one, A, X1);
2345  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2346  i -= 3;
2347  A.assign_data( &D_scalar_values(i+1,0,0,v) );
2348  X2.assign_data( &X_scalar_values(--r,0,0,v) );
2349  KB::TeamGemm<member_type,KB::Trans::NoTranspose,KB::Trans::NoTranspose,AlgoType>
2350  ::invoke(member, -one, A, X1, one, X2);
2351  A.assign_data( &D_scalar_values(i,0,0,v) );
2352  KB::TeamTrsm<member_type,KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,AlgoType>
2353  ::invoke(member, one, A, X2);
2354  X1.assign_data( X2.data() );
2355  }
2356  }
2357 
2358 
2359  template<int B> struct SingleVectorTag {};
2360  template<int B> struct MultiVectorTag {};
2361 
2362  template<int B>
2363  KOKKOS_INLINE_FUNCTION
2364  void
2365  operator() (const SingleVectorTag<B> &, const local_ordinal_type& packidx) const {
2366  const local_ordinal_type blocksize = (B == 0 ? D_vector_values.extent(1) : B);
2367  serialSolveSingleVector(blocksize, packidx);
2368  }
2369 
2370  template<int B>
2371  KOKKOS_INLINE_FUNCTION
2372  void
2373  operator() (const MultiVectorTag<B> &, const local_ordinal_type& packidx) const {
2374  // this would need internal access to the level-3 batched operations (not available yet)
2375  const local_ordinal_type blocksize = (B == 0 ? D_vector_values.extent(1) : B);
2376  serialSolveMultiVector(blocksize, packidx);
2377  }
2378 
2379  template<int B>
2380  KOKKOS_INLINE_FUNCTION
2381  void
2382  operator() (const SingleVectorTag<B> &, const member_type &member) const {
2383  const local_ordinal_type packidx = member.league_rank();
2384  const local_ordinal_type partidx = packptr(packidx);
2385  const local_ordinal_type i0 = pack_td_ptr(partidx);
2386  const local_ordinal_type r0 = part2packrowidx0(partidx);
2387  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2388  const local_ordinal_type blocksize = (B == 0 ? D_scalar_values.extent(1) : B);
2389  Kokkos::parallel_for
2390  (Kokkos::ThreadVectorRange(member, vector_length_value),[&](const int &v) {
2391  teamSolveSingleVector(member, blocksize, i0, r0, nrows, v);
2392  });
2393  }
2394 
2395  template<int B>
2396  KOKKOS_INLINE_FUNCTION
2397  void
2398  operator() (const MultiVectorTag<B> &, const member_type &member) const {
2399  const local_ordinal_type packidx = member.league_rank();
2400  const local_ordinal_type partidx = packptr(packidx);
2401  const local_ordinal_type i0 = pack_td_ptr(partidx);
2402  const local_ordinal_type r0 = part2packrowidx0(partidx);
2403  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2404  const local_ordinal_type blocksize = (B == 0 ? D_scalar_values.extent(1) : B);
2405  Kokkos::parallel_for
2406  (Kokkos::ThreadVectorRange(member, vector_length_value),[&](const int &v) {
2407  teamSolveMultiVector(member, blocksize, i0, r0, nrows, v);
2408  });
2409  }
2410 
2411  void run() {
2412 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
2413  cudaProfilerStart();
2414 #endif
2415 
2416 #ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
2417  TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::SolveTridiags::Run");
2418 #endif
2419  if (is_cuda<execution_space>::value) {
2420 #if defined(KOKKOS_ENABLE_CUDA)
2421  const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2422  const local_ordinal_type vl = vector_length;
2423 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
2424  if (num_vectors == 1) { \
2425  const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl); \
2426  Kokkos::parallel_for \
2427  ("SolveTridiags::TeamPolicy::run<SingleVector>", policy, *this); \
2428  } else { \
2429  const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl); \
2430  Kokkos::parallel_for \
2431  ("SolveTridiags::TeamPolicy::run<MultiVector>", policy, *this); \
2432  } break
2433 
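  // dispatch on common block sizes so that the batched kernels see a
  // compile-time extent; the default case (0) falls back to the runtime blocksize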
2434  const local_ordinal_type blocksize = D_scalar_values.extent(1);
2435  switch (blocksize) {
2436  case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
2437  case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
2438  case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
2439  case 9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
2440  case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
2441  case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
2442  case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
2443  case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
2444  case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
2445  default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
2446  }
2447 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
2448 
2449 #endif
2450  } else {
2451 #if defined(__CUDA_ARCH__)
2452  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
2453 #else
2454  const local_ordinal_type num_vectors = X_vector_values.extent(2);
2455 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
2456  if (num_vectors == 1) { \
2457  const Kokkos::RangePolicy<execution_space,SingleVectorTag<B> > policy(0, packptr.extent(0) - 1); \
2458  Kokkos::parallel_for \
2459  ("SolveTridiags::RangePolicy::run<SingleVector>", policy, *this); \
2460  } else { \
2461  const Kokkos::RangePolicy<execution_space,MultiVectorTag<B> > policy(0, packptr.extent(0) - 1); \
2462  Kokkos::parallel_for \
2463  ("SolveTridiags::RangePolicy::run<MultiVector>", policy, *this); \
2464  } break
2465 
2466  const local_ordinal_type blocksize = D_vector_values.extent(1);
2467  switch (blocksize) {
2468  case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
2469  case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
2470  case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
2471  case 9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
2472  case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
2473  case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
2474  case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
2475  case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
2476  case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
2477  default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
2478  }
2479 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
2480 #endif
2481  }
2482 
2483 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
2484  cudaProfilerStop();
2485 #endif
2486  }
2487  };
2488 
2492  template<typename MatrixType>
2493  struct ComputeResidualVector {
2494  public:
2495  using impl_type = ImplType<MatrixType>;
2496  using execution_space = typename impl_type::execution_space;
2497  using memory_space = typename impl_type::memory_space;
2498 
2499  using local_ordinal_type = typename impl_type::local_ordinal_type;
2500  using size_type = typename impl_type::size_type;
2501  using impl_scalar_type = typename impl_type::impl_scalar_type;
2502  using magnitude_type = typename impl_type::magnitude_type;
2504  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2505  using size_type_1d_view = typename impl_type::size_type_1d_view;
2506  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
2507  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
2508  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view; // block multivector (layout left)
2509  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2510  using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
2511  static constexpr int vector_length = impl_type::vector_length;
2512 
2514  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
2515 
2516  // maximum blocksize supported by the stack buffers used below
2517  enum : int { max_blocksize = 32 };
2518 
2519  private:
2520  ConstUnmanaged<impl_scalar_type_2d_view> b;
2521  ConstUnmanaged<impl_scalar_type_2d_view> x; // x_owned
2522  ConstUnmanaged<impl_scalar_type_2d_view> x_remote;
2523  /* */Unmanaged<impl_scalar_type_2d_view> y;
2524  /* */Unmanaged<vector_type_3d_view> y_packed;
2525  /* */Unmanaged<impl_scalar_type_4d_view> y_packed_scalar;
2526 
2527  // AmD information
2528  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
2529  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
2530  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
2531 
2532  // block crs graph information
2533  // for cuda (kokkos crs graph uses a different size_type from size_t)
2534  using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,typename impl_type::device_type>::size_type;
2535  const ConstUnmanaged<Kokkos::View<a_rowptr_value_type*,typename impl_type::device_type> > A_rowptr;
2536  const ConstUnmanaged<local_ordinal_type_1d_view> A_colind;
2537 
2538  // blocksize
2539  const local_ordinal_type blocksize_requested;
2540 
2541  // part interface
2542  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2543  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
2544  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
2545  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2546  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2547  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
2548  const bool is_dm2cm_active;
2549 
2550  public:
2551  template<typename LocalCrsGraphType>
2552  ComputeResidualVector(const AmD<MatrixType> &amd,
2553  const LocalCrsGraphType &graph,
2554  const local_ordinal_type &blocksize_requested_,
2555  const PartInterface<MatrixType> &interf,
2556  const local_ordinal_type_1d_view &dm2cm_)
2557  : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
2558  colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
2559  tpetra_values(amd.tpetra_values),
2560  A_rowptr(graph.row_map),
2561  A_colind(graph.entries),
2562  blocksize_requested(blocksize_requested_),
2563  part2packrowidx0(interf.part2packrowidx0),
2564  part2rowidx0(interf.part2rowidx0),
2565  rowidx2part(interf.rowidx2part),
2566  partptr(interf.partptr),
2567  lclrow(interf.lclrow),
2568  dm2cm(dm2cm_),
2569  is_dm2cm_active(dm2cm_.span() > 0)
2570  {}
2571 
2572 
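  // yy -= AA * xx for a single dense blocksize x blocksize block (serial path)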
2573  inline
2574  void
2575  serialGemv(const local_ordinal_type &blocksize,
2576  const impl_scalar_type * const __restrict__ AA,
2577  const impl_scalar_type * const __restrict__ xx,
2578  /* */ impl_scalar_type * __restrict__ yy) const {
2579  for (local_ordinal_type k0=0;k0<blocksize;++k0) {
2580  impl_scalar_type val = 0;
2581  const local_ordinal_type offset = k0*blocksize;
2582 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
2583 # pragma ivdep
2584 #endif
2585 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
2586 # pragma unroll
2587 #endif
2588  for (local_ordinal_type k1=0;k1<blocksize;++k1)
2589  val += AA[offset+k1]*xx[k1];
2590  yy[k0] -= val;
2591  }
2592  }
2593 
2594  template<typename bbViewType, typename yyViewType>
2595  KOKKOS_INLINE_FUNCTION
2596  void
2597  vectorCopy(const member_type &member,
2598  const local_ordinal_type &blocksize,
2599  const bbViewType &bb,
2600  const yyViewType &yy) const {
2601  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
2602  yy(k0) = bb(k0);
2603  });
2604  }
2605 
2606  template<typename AAViewType, typename xxViewType, typename yyViewType>
2607  KOKKOS_INLINE_FUNCTION
2608  void
2609  teamGemv(const member_type &member,
2610  const local_ordinal_type &blocksize,
2611  const AAViewType &AA,
2612  const xxViewType &xx,
2613  const yyViewType &yy) const {
2614  Kokkos::parallel_for
2615  (Kokkos::TeamThreadRange(member, blocksize),
2616  [&](const local_ordinal_type &k0) {
2617  impl_scalar_type val = 0;
2618  Kokkos::parallel_reduce
2619  (Kokkos::ThreadVectorRange(member, blocksize),
2620  [&](const local_ordinal_type &k1, impl_scalar_type &update) {
2621  update += AA(k0,k1)*xx(k1);
2622  }, val);
2623  Kokkos::single(Kokkos::PerThread(member), [&]() {
2624  yy(k0) -= val;
2625  });
2626  });
2627  }
2628 
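  // SeqTag: compute y = b - R x directly on the unpacked multivectors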
2629  struct SeqTag {};
2630 
2631  inline
2632  void
2633  operator() (const SeqTag &, const local_ordinal_type& i) const {
2634  const local_ordinal_type blocksize = blocksize_requested;
2635  const local_ordinal_type blocksize_square = blocksize*blocksize;
2636 
2637  // constants
2638  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
2639  const local_ordinal_type num_vectors = y.extent(1);
2640  const local_ordinal_type row = i*blocksize;
2641  for (local_ordinal_type col=0;col<num_vectors;++col) {
2642  // y := b
2643  impl_scalar_type *yy = &y(row, col);
2644  const impl_scalar_type * const bb = &b(row, col);
2645  memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);
2646 
2647  // y -= Rx
2648  const size_type A_k0 = A_rowptr[i];
2649  for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
2650  const size_type j = A_k0 + colindsub[k];
2651  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
2652  const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
2653  serialGemv(blocksize,AA,xx,yy);
2654  }
2655  }
2656  }
2657 
2658  KOKKOS_INLINE_FUNCTION
2659  void
2660  operator() (const SeqTag &, const member_type &member) const {
2661  namespace KB = KokkosBatched::Experimental;
2662 
2663  // constants
2664  const local_ordinal_type blocksize = blocksize_requested;
2665  const local_ordinal_type blocksize_square = blocksize*blocksize;
2666 
2667  const local_ordinal_type i = member.league_rank();
2668  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
2669  const local_ordinal_type num_vectors = y.extent(1);
2670 
2671  // subview pattern
2672  auto bb = Kokkos::subview(b, block_range, 0);
2673  auto xx = bb;
2674  auto yy = Kokkos::subview(y, block_range, 0);
2675  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
2676 
2677  const local_ordinal_type row = i*blocksize;
2678  for (local_ordinal_type col=0;col<num_vectors;++col) {
2679  // y := b
2680  yy.assign_data(&y(row, col));
2681  bb.assign_data(&b(row, col));
2682  if (member.team_rank() == 0)
2683  vectorCopy(member, blocksize, bb, yy);
2684  member.team_barrier();
2685 
2686  // y -= Rx
2687  const size_type A_k0 = A_rowptr[i];
2688  for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
2689  const size_type j = A_k0 + colindsub[k];
2690  A_block.assign_data( &tpetra_values(j*blocksize_square) );
2691  xx.assign_data( &x(A_colind[j]*blocksize, col) );
2692  teamGemv(member, blocksize, A_block, xx, yy);
2693  }
2694  }
2695  }
2696 
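  // AsyncTag<B>: compute y = b - R x into the packed multivector; columns beyond
  // the locally owned rows are read from x_remote (B is the compile-time
  // blocksize, 0 means runtime)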
2697  template<int B>
2698  struct AsyncTag {};
2699 
2700  template<int B>
2701  inline
2702  void
2703  operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
2704  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
2705  const local_ordinal_type blocksize_square = blocksize*blocksize;
2706 
2707  // constants
2708  const local_ordinal_type partidx = rowidx2part(rowidx);
2709  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
2710  const local_ordinal_type v = partidx % vector_length;
2711 
2712  const local_ordinal_type num_vectors = y_packed.extent(2);
2713  const local_ordinal_type num_local_rows = lclrow.extent(0);
2714 
2715  // temporary buffer for y flat
2716  impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
2717 
2718  const local_ordinal_type lr = lclrow(rowidx);
2719  const local_ordinal_type row = lr*blocksize;
2720  for (local_ordinal_type col=0;col<num_vectors;++col) {
2721  // y := b
2722  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
2723 
2724  // y -= Rx
2725  const size_type A_k0 = A_rowptr[lr];
2726  for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
2727  const size_type j = A_k0 + colindsub[k];
2728  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
2729  const local_ordinal_type A_colind_at_j = A_colind[j];
2730  if (A_colind_at_j < num_local_rows) {
2731  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
2732  const impl_scalar_type * const xx = &x(loc*blocksize, col);
2733  serialGemv(blocksize, AA,xx,yy);
2734  } else {
2735  const auto loc = A_colind_at_j - num_local_rows;
2736  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
2737  serialGemv(blocksize, AA,xx_remote,yy);
2738  }
2739  }
2740  // move yy to y_packed
2741  for (local_ordinal_type k=0;k<blocksize;++k)
2742  y_packed(pri, k, col)[v] = yy[k];
2743  }
2744  }
2745 
2746  template<int B>
2747  KOKKOS_INLINE_FUNCTION
2748  void
2749  operator() (const AsyncTag<B> &, const member_type &member) const {
2750  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
2751  const local_ordinal_type blocksize_square = blocksize*blocksize;
2752 
2753  // constants
2754  const local_ordinal_type rowidx = member.league_rank();
2755  const local_ordinal_type partidx = rowidx2part(rowidx);
2756  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
2757  const local_ordinal_type v = partidx % vector_length;
2758 
2759  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
2760  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
2761  const local_ordinal_type num_local_rows = lclrow.extent(0);
2762 
2763  // subview pattern
2764  auto bb = Kokkos::subview(b, block_range, 0);
2765  auto xx = Kokkos::subview(x, block_range, 0);
2766  auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
2767  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
2768  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
2769 
2770  const local_ordinal_type lr = lclrow(rowidx);
2771  const local_ordinal_type row = lr*blocksize;
2772  for (local_ordinal_type col=0;col<num_vectors;++col) {
2773  // y := b
2774  bb.assign_data(&b(row, col));
2775  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
2776  if (member.team_rank() == 0)
2777  vectorCopy(member, blocksize, bb, yy);
2778  member.team_barrier();
2779 
2780  // y -= Rx
2781  const size_type A_k0 = A_rowptr[lr];
2782  for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
2783  const size_type j = A_k0 + colindsub[k];
2784  A_block.assign_data( &tpetra_values(j*blocksize_square) );
2785 
2786  const local_ordinal_type A_colind_at_j = A_colind[j];
2787  if (A_colind_at_j < num_local_rows) {
2788  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
2789  xx.assign_data( &x(loc*blocksize, col) );
2790  teamGemv(member, blocksize, A_block, xx, yy);
2791  } else {
2792  const auto loc = A_colind_at_j - num_local_rows;
2793  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
2794  teamGemv(member, blocksize, A_block, xx_remote, yy);
2795  }
2796  }
2797  }
2798  }
2799 
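  // OverlapTag<P,B>: like AsyncTag, but split into two passes so the owned
  // contribution (P == 0) can be computed while remote x entries are still in
  // flight; the remote contribution (P == 1) is accumulated afterwards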
2800  template <int P, int B> struct OverlapTag {};
2801 
2802  template<int P, int B>
2803  inline
2804  void
2805  operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
2806  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
2807  const local_ordinal_type blocksize_square = blocksize*blocksize;
2808 
2809  // constants
2810  const local_ordinal_type partidx = rowidx2part(rowidx);
2811  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
2812  const local_ordinal_type v = partidx % vector_length;
2813 
2814  const local_ordinal_type num_vectors = y_packed.extent(2);
2815  const local_ordinal_type num_local_rows = lclrow.extent(0);
2816 
2817  // temporary buffer for y flat
2818  impl_scalar_type yy[max_blocksize] = {};
2819 
2820  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
2821  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
2822 
2823  const local_ordinal_type lr = lclrow(rowidx);
2824  const local_ordinal_type row = lr*blocksize;
2825  for (local_ordinal_type col=0;col<num_vectors;++col) {
2826  if (P == 0) {
2827  // y := b
2828  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
2829  } else {
2830  // y (temporary) := 0
2831  memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
2832  }
2833 
2834  // y -= Rx
2835  const size_type A_k0 = A_rowptr[lr];
2836  for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
2837  const size_type j = A_k0 + colindsub_used[k];
2838  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
2839  const local_ordinal_type A_colind_at_j = A_colind[j];
2840  if (P == 0) {
2841  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
2842  const impl_scalar_type * const xx = &x(loc*blocksize, col);
2843  serialGemv(blocksize,AA,xx,yy);
2844  } else {
2845  const auto loc = A_colind_at_j - num_local_rows;
2846  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
2847  serialGemv(blocksize,AA,xx_remote,yy);
2848  }
2849  }
2850  // move yy to y_packed
2851  if (P == 0) {
2852  for (local_ordinal_type k=0;k<blocksize;++k)
2853  y_packed(pri, k, col)[v] = yy[k];
2854  } else {
2855  for (local_ordinal_type k=0;k<blocksize;++k)
2856  y_packed(pri, k, col)[v] += yy[k];
2857  }
2858  }
2859  }
2860 
2861  template<int P, int B>
2862  KOKKOS_INLINE_FUNCTION
2863  void
2864  operator() (const OverlapTag<P,B> &, const member_type &member) const {
2865  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
2866  const local_ordinal_type blocksize_square = blocksize*blocksize;
2867 
2868  // constants
2869  const local_ordinal_type rowidx = member.league_rank();
2870  const local_ordinal_type partidx = rowidx2part(rowidx);
2871  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
2872  const local_ordinal_type v = partidx % vector_length;
2873 
2874  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
2875  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
2876  const local_ordinal_type num_local_rows = lclrow.extent(0);
2877 
2878  // subview pattern
2879  auto bb = Kokkos::subview(b, block_range, 0);
2880  auto xx = Kokkos::subview(x, block_range, 0);
2881  auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
2882  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
2883  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
2884  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
2885  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
2886 
2887  const local_ordinal_type lr = lclrow(rowidx);
2888  const local_ordinal_type row = lr*blocksize;
2889  for (local_ordinal_type col=0;col<num_vectors;++col) {
2890  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
2891  if (P == 0) {
2892  // y := b
2893  bb.assign_data(&b(row, col));
2894  if (member.team_rank() == 0)
2895  vectorCopy(member, blocksize, bb, yy);
2896  member.team_barrier();
2897  }
2898 
2899  // y -= Rx
2900  const size_type A_k0 = A_rowptr[lr];
2901  for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
2902  const size_type j = A_k0 + colindsub_used[k];
2903  A_block.assign_data( &tpetra_values(j*blocksize_square) );
2904 
2905  const local_ordinal_type A_colind_at_j = A_colind[j];
2906  if (P == 0) {
2907  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
2908  xx.assign_data( &x(loc*blocksize, col) );
2909  teamGemv(member, blocksize, A_block, xx, yy);
2910  } else {
2911  const auto loc = A_colind_at_j - num_local_rows;
2912  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
2913  teamGemv(member, blocksize, A_block, xx_remote, yy);
2914  }
2915  }
2916  }
2917  }
2918 
2919  // y = b - Rx; seq method
2920  template<typename MultiVectorLocalViewTypeY,
2921  typename MultiVectorLocalViewTypeB,
2922  typename MultiVectorLocalViewTypeX>
2923  void run(const MultiVectorLocalViewTypeY &y_,
2924  const MultiVectorLocalViewTypeB &b_,
2925  const MultiVectorLocalViewTypeX &x_) {
2926 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
2927  cudaProfilerStart();
2928 #endif
2929 
2930 #ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
2931  TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::ComputeResidual::Run<SeqTag>");
2932 #endif
2933  y = y_; b = b_; x = x_;
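    // Added note: on a CUDA build the functor is launched through a TeamPolicy
    // (roughly one team per block row); otherwise a host RangePolicy is used. The
    // __CUDA_ARCH__ branch below is only a guard that errors out if this host path
    // were ever compiled as device code.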
2934  if (is_cuda<execution_space>::value) {
2935 #if defined(KOKKOS_ENABLE_CUDA)
2936  const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, Kokkos::AUTO());
2937  Kokkos::parallel_for
2938  ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
2939 #endif
2940  } else {
2941 #if defined(__CUDA_ARCH__)
2942  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
2943 #else
2944  const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
2945  Kokkos::parallel_for
2946  ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
2947 #endif
2948  }
2949 
2950 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
2951  cudaProfilerStop();
2952 #endif
2953  }
2954 
2955  // y = b - R (x , x_remote)
2956  template<typename MultiVectorLocalViewTypeB,
2957  typename MultiVectorLocalViewTypeX,
2958  typename MultiVectorLocalViewTypeX_Remote>
2959  void run(const vector_type_3d_view &y_packed_,
2960  const MultiVectorLocalViewTypeB &b_,
2961  const MultiVectorLocalViewTypeX &x_,
2962  const MultiVectorLocalViewTypeX_Remote &x_remote_) {
2963 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
2964  cudaProfilerStart();
2965 #endif
2966 
2967 #ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
2968  TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::ComputeResidual::Run<AsyncTag>");
2969 #endif
2970  b = b_; x = x_; x_remote = x_remote_;
2971  if (is_cuda<execution_space>::value) {
2972 #if defined(KOKKOS_ENABLE_CUDA)
2973  y_packed_scalar = impl_scalar_type_4d_view((impl_scalar_type*)y_packed_.data(),
2974  y_packed_.extent(0),
2975  y_packed_.extent(1),
2976  y_packed_.extent(2),
2977  vector_length);
2978 #endif
2979  } else {
2980  y_packed = y_packed_;
2981  }
2982 
2983  if (is_cuda<execution_space>::value) {
2984 #if defined(KOKKOS_ENABLE_CUDA)
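    // Added note: the loop below rounds the requested block size up to a power of
    // two (capped at vector_length) and uses it as the vector length of the
    // TeamPolicy launched with AsyncTag.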
2985  local_ordinal_type vl_power_of_two = 1;
2986  for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
2987  vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
2988  const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
2989 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
2990  const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > policy(rowidx2part.extent(0), Kokkos::AUTO(), vl); \
2991  Kokkos::parallel_for \
2992  ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
2993  policy, *this); } break
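    // Added note: the switch instantiates the kernel with a compile-time block size
    // for the common cases listed; any other size takes the default case (B = 0),
    // which reads blocksize_requested at run time.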
2994  switch (blocksize_requested) {
2995  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
2996  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
2997  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
2998  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
2999  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3000  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3001  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3002  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3003  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3004  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3005  }
3006 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3007 #endif
3008  } else {
3009 #if defined(__CUDA_ARCH__)
3010  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
3011 #else
3012 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3013  const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
3014  Kokkos::parallel_for \
3015  ("ComputeResidual::RangePolicy::run<AsyncTag>", \
3016  policy, *this); } break
3017  switch (blocksize_requested) {
3018  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3019  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3020  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3021  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3022  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3023  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3024  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3025  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3026  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3027  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3028  }
3029 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3030 #endif
3031  }
3032 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
3033  cudaProfilerStop();
3034 #endif
3035  }
3036 
3037  // y = b - R (x , x_remote); overlapped version, split into owned/remote parts via compute_owned
3038  template<typename MultiVectorLocalViewTypeB,
3039  typename MultiVectorLocalViewTypeX,
3040  typename MultiVectorLocalViewTypeX_Remote>
3041  void run(const vector_type_3d_view &y_packed_,
3042  const MultiVectorLocalViewTypeB &b_,
3043  const MultiVectorLocalViewTypeX &x_,
3044  const MultiVectorLocalViewTypeX_Remote &x_remote_,
3045  const bool compute_owned) {
3046 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
3047  cudaProfilerStart();
3048 #endif
3049 
3050 #ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
3051  TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::ComputeResidual::Run<OverlapTag>");
3052 #endif
3053  b = b_; x = x_; x_remote = x_remote_;
3054  if (is_cuda<execution_space>::value) {
3055 #if defined(KOKKOS_ENABLE_CUDA)
3056  y_packed_scalar = impl_scalar_type_4d_view((impl_scalar_type*)y_packed_.data(),
3057  y_packed_.extent(0),
3058  y_packed_.extent(1),
3059  y_packed_.extent(2),
3060  vector_length);
3061 #endif
3062  } else {
3063  y_packed = y_packed_;
3064  }
3065 
3066  if (is_cuda<execution_space>::value) {
3067 #if defined(KOKKOS_ENABLE_CUDA)
3068  local_ordinal_type vl_power_of_two = 1;
3069  for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3070  vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3071  const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3072 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3073  if (compute_owned) { \
3074  const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > policy(rowidx2part.extent(0), Kokkos::AUTO(), vl); \
3075  Kokkos::parallel_for \
3076  ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
3077  } else { \
3078  const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > policy(rowidx2part.extent(0), Kokkos::AUTO(), vl); \
3079  Kokkos::parallel_for \
3080  ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
3081  } break
3082  switch (blocksize_requested) {
3083  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3084  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3085  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3086  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3087  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3088  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3089  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3090  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3091  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3092  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3093  }
3094 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3095 #endif
3096  } else {
3097 #if defined(__CUDA_ARCH__)
3098  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
3099 #else
3100 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3101  if (compute_owned) { \
3102  const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > policy(0, rowidx2part.extent(0)); \
3103  Kokkos::parallel_for \
3104  ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
3105  } else { \
3106  const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > policy(0, rowidx2part.extent(0)); \
3107  Kokkos::parallel_for \
3108  ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
3109  } break
3110 
3111  switch (blocksize_requested) {
3112  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3113  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3114  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3115  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3116  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3117  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3118  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3119  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3120  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3121  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3122  }
3123 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3124 #endif
3125  }
3126 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
3127  cudaProfilerStop();
3128 #endif
3129  }
3130  };
3131 
3135  template<typename MatrixType>
3136  struct NormManager {
3137  public:
3139  using magnitude_type = typename impl_type::magnitude_type;
3140 
3141  private:
3142  bool collective_;
3143  int sweep_step_, blocksize_, num_vectors_;
3144 #ifdef HAVE_IFPACK2_MPI
3145  MPI_Request mpi_request_;
3146  MPI_Comm comm_;
3147 #endif
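    // Added note on the layout of work_ (with n = blocksize_ * num_vectors_):
    //   [0, n)                  rank-local squared norms; also the reduction result
    //   [n, 2n)                 copy of the local norms used as the send buffer
    //   [2n, 2n + num_vectors_) per-vector reference norms captured at sweep 1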
3148  std::vector<magnitude_type> work_;
3149 
3150  public:
3151  NormManager() = default;
3152  NormManager(const NormManager &b) = default;
3153  NormManager(const Teuchos::RCP<const Teuchos::Comm<int> >& comm) {
3154  sweep_step_ = 1;
3155  blocksize_ = 0;
3156  num_vectors_ = 0;
3157 
3158  collective_ = comm->getSize() > 1;
3159  if (collective_) {
3160 #ifdef HAVE_IFPACK2_MPI
3161  const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
3162  TEUCHOS_ASSERT( ! mpi_comm.is_null());
3163  comm_ = *mpi_comm->getRawMpiComm();
3164 #endif
3165  }
3166  }
3167 
3168  int getBlocksize() const { return blocksize_; }
3169  int getNumVectors() const { return num_vectors_; }
3170 
3171  // Resize the buffer to accommodate num_vectors vectors in the multivector, for a
3172  // matrix having block size blocksize.
3173  void resize(const int& blocksize, const int& num_vectors) {
3174  blocksize_ = blocksize;
3175  num_vectors_ = num_vectors;
3176  work_.resize((2*blocksize_ + 1)*num_vectors_);
3177  }
3178 
3179  // Check the norm every sweep_step sweeps.
3180  void setCheckFrequency(const int& sweep_step) {
3181  TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1");
3182  sweep_step_ = sweep_step;
3183  }
3184 
3185  // Get the buffer into which to store rank-local squared norms.
3186  magnitude_type* getBuffer() { return work_.data(); }
3187 
3188  // Call MPI_Iallreduce to find the global squared norms.
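    // Added note: the local norms are first copied into the send half of work_ and
    // the reduction (nonblocking when MPI >= 3) writes back into the front of work_;
    // the matching MPI_Wait happens later in checkDone.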
3189  void ireduce(const int& sweep, const bool force = false) {
3190  if ( ! force && sweep % sweep_step_) return;
3191  const int n = blocksize_*num_vectors_;
3192  if (collective_) {
3193  std::copy(work_.begin(), work_.begin() + n, work_.begin() + n);
3194 #ifdef HAVE_IFPACK2_MPI
3195 # if MPI_VERSION >= 3
3196  MPI_Iallreduce(work_.data() + n, work_.data(), n,
3197  Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3198  MPI_SUM, comm_, &mpi_request_);
3199 # else
3200  MPI_Allreduce (work_.data() + n, work_.data(), n,
3201  Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3202  MPI_SUM, comm_);
3203 # endif
3204 #endif
3205  }
3206  }
3207 
3208  // Check if the norm-based termination criterion is met. tol2 is the
3209  // tolerance squared. Sweep is the sweep index. If not every iteration is
3210  // being checked, this function immediately returns false. If a check must
3211  // be done at this iteration, it waits for the reduction triggered by
3212  // ireduce to complete, then checks the global norm against the tolerance.
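    // Added note: at sweep 1 the per-vector maximum of the blockwise squared norms
    // is saved as a reference value n0; later sweeps report convergence only when
    // every vector satisfies max_i ||r_i||^2 <= tol2 * n0, i.e. a relative test
    // against the first-sweep residual.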
3213  bool checkDone (const int& sweep, const magnitude_type& tol2, const bool force = false) {
3214  // early return
3215  if (sweep <= 0) return false;
3216 
3217 #ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
3218  TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::NormManager::CheckDone");
3219 #endif
3220  TEUCHOS_ASSERT(sweep >= 1);
3221  if ( ! force && (sweep - 1) % sweep_step_) return false;
3222  if (collective_) {
3223 #ifdef HAVE_IFPACK2_MPI
3224 # if MPI_VERSION >= 3
3225  MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
3226 # else
3227  // Do nothing.
3228 # endif
3229 #endif
3230  }
3231  const auto n = blocksize_*num_vectors_;
3232  if (sweep == 1) {
3233  magnitude_type* const n0 = work_.data() + 2*n;
3234  for (int v = 0; v < num_vectors_; ++v) {
3235  const magnitude_type* const dn0 = work_.data() + v*blocksize_;
3236  magnitude_type mdn0 = 0;
3237  for (int i = 0; i < blocksize_; ++i)
3238  mdn0 = std::max(mdn0, dn0[i]);
3239  n0[v] = mdn0;
3240  }
3241  return false;
3242  } else {
3243  const auto n0 = work_.data() + 2*n;
3244  bool done = true;
3245  for (int v = 0; v < num_vectors_; ++v) {
3246  const magnitude_type* const dnf = work_.data() + v*blocksize_;
3247  magnitude_type mdnf = 0;
3248  for (int i = 0; i < blocksize_; ++i)
3249  mdnf = std::max(mdnf, dnf[i]);
3250  if (mdnf > tol2*n0[v]) {
3251  done = false;
3252  break;
3253  }
3254  }
3255  return done;
3256  }
3257  }
3258 
3259  // After termination has occurred, finalize the norms for use in
3260  // getNorms0 and getNormsFinal.
3261  void finalize () {
3262  for (int v = 0; v < num_vectors_; ++v) {
3263  const magnitude_type* const dnf = work_.data() + v*blocksize_;
3264  magnitude_type mdnf = 0;
3265  for (int i = 0; i < blocksize_; ++i)
3266  mdnf = std::max(mdnf, dnf[i]);
3267  // This overwrites the receive buffer, but that's OK; at the time of
3268  // this write, we no longer need the data in this slot.
3269  work_[v] = mdnf;
3270  }
3271  for (int i = 0; i < num_vectors_; ++i)
3272  work_[i] = std::sqrt(work_[i]);
3273  magnitude_type* const nf = work_.data() + 2*blocksize_*num_vectors_;
3274  for (int v = 0; v < num_vectors_; ++v)
3275  nf[v] = std::sqrt(nf[v]);
3276  }
3277 
3278  // Report norms to the caller.
3279  const magnitude_type* getNorms0 () const { return work_.data() + 2*blocksize_*num_vectors_; }
3280  const magnitude_type* getNormsFinal () const { return work_.data(); }
3281  };
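  // Added usage sketch (following applyInverseJacobi below): call resize() and
  // setCheckFrequency() once; each sweep the solve kernels write local squared norms
  // into getBuffer(), ireduce(sweep) posts the reduction, checkDone(sweep, tol*tol)
  // tests convergence, and finalize() is called once before reading
  // getNorms0()/getNormsFinal().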
3282 
3286  template<typename MatrixType>
3287  int
3288  applyInverseJacobi(// importer
3289  const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
3290  const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
3291  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
3292  const bool overlap_communication_and_computation,
3293  // tpetra interface
3294  const typename ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
3295  /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
3296  /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface
3297  // local object interface
3298  const PartInterface<MatrixType> &interf, // mesh interface
3299  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
3300  const AmD<MatrixType> &amd, // R = A - D
3301  /* */ typename ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
3302  /* */ NormManager<MatrixType> &norm_manager,
3303  // preconditioner parameters
3304  const typename ImplType<MatrixType>::impl_scalar_type &damping_factor,
3305  /* */ bool is_y_zero,
3306  const int max_num_sweeps,
3307  const typename ImplType<MatrixType>::magnitude_type tol,
3308  const int check_tol_every) {
3309 
3310 #ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
3311  TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::ApplyInverseJacobi");
3312 #endif
3313  using impl_type = ImplType<MatrixType>;
3314  using memory_space = typename impl_type::memory_space;
3315 
3316  using local_ordinal_type = typename impl_type::local_ordinal_type;
3317  using size_type = typename impl_type::size_type;
3318  using magnitude_type = typename impl_type::magnitude_type;
3319  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3320  using vector_type_1d_view = typename impl_type::vector_type_1d_view;
3321  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3322  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3323 
3324  // the tpetra importer and the async importer cannot both be active
3325  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
3326  "The Tpetra importer and the async importer cannot both be non-null; use only one.");
3327  // the maximum number of sweeps must be a positive number
3328  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
3329  "Maximum number of sweeps must be >= 1.");
3330 
3331  // const parameters
3332  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
3333  const bool is_seq_method_requested = !tpetra_importer.is_null();
3334  const bool is_async_importer_active = !async_importer.is_null();
3335  const magnitude_type tolerance = tol*tol;
3336  const local_ordinal_type blocksize = btdm.values.extent(1);
3337  const local_ordinal_type num_vectors = Y.getNumVectors();
3338  const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
3339 
3340  TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
3341  "The seq method for applyInverseJacobi, " <<
3342  "which in any case is for developer use only, " <<
3343  "does not support norm-based termination.");
3344 
3345  // if more workspace is needed, resize it
3346  const size_type work_span_required = num_blockrows*num_vectors*blocksize;
3347  if (work.span() < work_span_required)
3348  work = vector_type_1d_view("vector workspace 1d view", work_span_required);
3349 
3350  typename AsyncableImport<MatrixType>::impl_scalar_type_2d_view remote_multivector;
3351  if (is_seq_method_requested) {
3352  // reconstruct the copy of Y if the number of vectors differs
3353  if (static_cast<local_ordinal_type>(Z.getNumVectors()) != num_vectors)
3354  Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
3355  } else {
3356  if (is_async_importer_active) {
3357  // create comm data buffer and keep it here
3358  async_importer->createDataBuffer(num_vectors);
3359  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
3360  }
3361  }
3362 
3363  // wrap the workspace in a 3d view
3364  vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
3365  const auto XX = X.template getLocalView<memory_space>();
3366  const auto YY = Y.template getLocalView<memory_space>();
3367  const auto ZZ = Z.template getLocalView<memory_space>();
3368 
3369  MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
3370  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv);
3371 
3372  const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
3373  ComputeResidualVector<MatrixType>
3374  compute_residual_vector(amd, A->getCrsGraph().getLocalGraph(), blocksize, interf,
3375  is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
3376 
3377  // norm manager workspace resize
3378  if (is_norm_manager_active) {
3379  norm_manager.resize(blocksize, num_vectors);
3380  norm_manager.setCheckFrequency(check_tol_every);
3381  }
3382 
3383  // iterate
3384  int sweep = 0;
3385  for (;sweep<max_num_sweeps;++sweep) {
3386  if (is_y_zero) {
3387  // pmv := x(lclrow)
3388  multivector_converter.to_packed_multivector(XX);
3389  } else {
3390  if (is_seq_method_requested) {
3391  // y := x - R y
3392  Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
3393  compute_residual_vector.run(YY, XX, ZZ);
3394 
3395  // pmv := y(lclrow).
3396  multivector_converter.to_packed_multivector(YY);
3397  } else {
3398  // fused y := x - R y and pmv := y(lclrow);
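    // Added note: with overlap enabled, each sweep posts the halo exchange of Y,
    // computes the owned contribution, checks the previous sweep's norm for an early
    // exit (cancelling the outstanding exchange), and only then waits for the remote
    // data and applies the remote contribution.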
3399  if (overlap_communication_and_computation || !is_async_importer_active) {
3400  if (is_async_importer_active) async_importer->asyncSendRecv(YY);
3401  compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
3402  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
3403  if (is_async_importer_active) async_importer->cancel();
3404  break;
3405  }
3406  if (is_async_importer_active) {
3407  async_importer->syncRecv();
3408  compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
3409  }
3410  } else {
3411  if (is_async_importer_active)
3412  async_importer->syncExchange(YY);
3413  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
3414  compute_residual_vector.run(pmv, XX, YY, remote_multivector);
3415  }
3416  }
3417  }
3418 
3419  // pmv := inv(D) pmv.
3420  solve_tridiags.run();
3421 
3422  // damped Jacobi update: y(lclrow) := (1 - a) y(lclrow) + a pmv, where a is the damping factor
3423  multivector_converter.to_scalar_multivector(YY, damping_factor, is_y_zero,
3424  is_norm_manager_active ? norm_manager.getBuffer() : NULL);
3425 
3426  if (is_norm_manager_active) {
3427  if (sweep + 1 == max_num_sweeps) {
3428  norm_manager.ireduce(sweep, true);
3429  norm_manager.checkDone(sweep + 1, tolerance, true);
3430  } else {
3431  norm_manager.ireduce(sweep);
3432  }
3433  }
3434 
3435  is_y_zero = false;
3436  }
3437 
3438  // sqrt the norms for the caller's use.
3439  if (is_norm_manager_active) norm_manager.finalize();
3440 
3441  return sweep;
3442  }
3443 
3444 
3445  template<typename MatrixType>
3446  struct ImplObject {
3447  using impl_type = ImplType<MatrixType>;
3448  using part_interface_type = PartInterface<MatrixType>;
3449  using block_tridiags_type = BlockTridiags<MatrixType>;
3450  using amd_type = AmD<MatrixType>;
3451  using norm_manager_type = NormManager<MatrixType>;
3452  using async_import_type = AsyncableImport<MatrixType>;
3453 
3454  // distributed objects
3455  Teuchos::RCP<const typename impl_type::tpetra_block_crs_matrix_type> A;
3456  Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
3457  Teuchos::RCP<async_import_type> async_importer;
3458  bool overlap_communication_and_computation;
3459 
3460  // copy of Y (mutable to penetrate const)
3461  mutable typename impl_type::tpetra_multivector_type Z;
3462 
3463  // local objects
3464  part_interface_type part_interface;
3465  block_tridiags_type block_tridiags; // D
3466  amd_type a_minus_d; // R = A - D
3467  mutable typename impl_type::vector_type_1d_view work; // right hand side workspace
3468  mutable norm_manager_type norm_manager;
3469  };
3470 
3471  } // namespace BlockTriDiContainerDetails
3472 
3473 } // namespace Ifpack2
3474 
3475 #endif