LibRPA: matrix_m_parallel_utils.h Source File

#pragma once
#include "matrix_m.h"
#include "constants.h"
#include "parallel_mpi.h"
#include <omp.h>
#include "scalapack_connector.h"
#include "utils_io.h"
// standard headers used directly in this header
#include <cassert>
#include <cmath>
#include <complex>
#include <cstring>
#include <fstream>
#include <functional>
#include <map>
#include <ostream>
#include <stdexcept>
#include <utility>
#include <vector>
#ifdef LIBRPA_USE_LIBRI
#include <RI/global/Tensor.h>
using RI::Tensor;
#else
template <typename T>
class Tensor;
#endif

template <typename T>
matrix_m<T> init_local_mat(const LIBRPA::Array_Desc &ad, MAJOR major)
{
    matrix_m<T> mat_lo(ad.m_loc(), ad.n_loc(), major);
    if (mat_lo.size() == 0)
        mat_lo.resize(1, 1);
    return mat_lo;
}

template <typename T>
matrix_m<T> get_local_mat(const matrix_m<T> &mat_go, const LIBRPA::Array_Desc &ad, MAJOR major)
{
    // assert the shape of global matrix conforms with the array descriptor
    assert(mat_go.nr() == ad.m() && mat_go.nc() == ad.n());

    matrix_m<T> mat_lo(ad.m_loc(), ad.n_loc(), major);
    for (int i = 0; i != mat_go.nr(); i++)
    {
        auto i_lo = ad.indx_g2l_r(i);
        if (i_lo < 0) continue;
        for (int j = 0; j != mat_go.nc(); j++)
        {
            auto j_lo = ad.indx_g2l_c(j);
            if (j_lo < 0) continue;
            mat_lo(i_lo, j_lo) = mat_go(i, j);
        }
    }
    if (mat_lo.size() == 0)
        mat_lo.resize(1, 1);
    return mat_lo;
}
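
// Usage sketch (illustrative, not part of the library): scatter a replicated
// global matrix into the 2D block-cyclic layout described by an Array_Desc.
// The names ovlp_glo, ad and n_basis are hypothetical, and MAJOR::COL is
// assumed to be the column-major tag of the MAJOR enum.
//
//     // ad: an Array_Desc already initialized on the global BLACS context
//     matrix_m<double> ovlp_glo(n_basis, n_basis, MAJOR::COL);  // identical on every process
//     // ... fill ovlp_glo ...
//     auto ovlp_loc = get_local_mat(ovlp_glo, ad, MAJOR::COL);
//     // processes owning no element get a dummy 1x1 matrix, cf. the resize above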

template <typename T>
matrix_m<T> get_local_mat(const T *pv, MAJOR major_pv, const LIBRPA::Array_Desc &ad, MAJOR major)
{
    const int nr = ad.m(), nc = ad.n();
    matrix_m<T> mat_lo(ad.m_loc(), ad.n_loc(), major);
    const auto picker = Indx_pickers_2d[major_pv];
    for (int i = 0; i != nr; i++)
    {
        auto i_lo = ad.indx_g2l_r(i);
        if (i_lo < 0) continue;
        for (int j = 0; j != nc; j++)
        {
            auto j_lo = ad.indx_g2l_c(j);
            if (j_lo < 0) continue;
            mat_lo(i_lo, j_lo) = pv[picker(nr, i, nc, j)];
        }
    }
    if (mat_lo.size() == 0)
        mat_lo.resize(1, 1);
    return mat_lo;
}

template <typename Tdst, typename Tsrc>
void collect_block_from_IJ_storage(
    matrix_m<Tdst> &mat_lo,
    const LIBRPA::Array_Desc &ad,
    const LIBRPA::AtomicBasis &atbasis_row,
    const LIBRPA::AtomicBasis &atbasis_col,
    const int &I, const int &J,
    Tdst alpha, const Tsrc *pvIJ, MAJOR major_pv)
{
    // assert(mat_lo.nr() == ad.m_loc() && mat_lo.nc() == ad.n_loc());
    assert(ad.m() == atbasis_row.nb_total && ad.n() == atbasis_col.nb_total);
    const int row_start_id = atbasis_row.get_part_range()[I];
    const int col_start_id = atbasis_col.get_part_range()[J];
    const int row_nb = atbasis_row.get_atom_nb(I);
    const int col_nb = atbasis_col.get_atom_nb(J);
    const auto picker = Indx_pickers_2d[major_pv];
    // cout << str(mat_lo);
    for (int iI = 0; iI != row_nb; iI++)
    {
        int ilo = ad.indx_g2l_r(row_start_id + iI);
        // LIBRPA::utils::lib_printf("row igo %d ilo %d\n", row_start_id + iI, ilo);
        if (ilo < 0) continue;
        for (int jJ = 0; jJ != col_nb; jJ++)
        {
            int jlo = ad.indx_g2l_c(col_start_id + jJ);
            // LIBRPA::utils::lib_printf("col jgo %d jlo %d\n", col_start_id + jJ, jlo);
            if (jlo < 0) continue;
            // cout << "pickerID " << picker(row_nb, iI, col_nb, jJ) << " " << pvIJ[picker(row_nb, iI, col_nb, jJ)] << " matloc befor " << mat_lo(ilo, jlo);
            mat_lo(ilo, jlo) += alpha * pvIJ[picker(row_nb, iI, col_nb, jJ)];
            // cout << " after " << mat_lo(ilo, jlo) << endl;
        }
    }
}
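
// Usage sketch (illustrative, hypothetical names): accumulate one atom-pair
// block (I, J), stored as a row-major buffer pvIJ of shape
// atbasis_row.get_atom_nb(I) x atbasis_col.get_atom_nb(J), into the
// block-cyclic local matrix with unit weight.
//
//     auto chi_loc = init_local_mat<std::complex<double>>(ad, MAJOR::COL);
//     collect_block_from_IJ_storage(chi_loc, ad, atbasis_row, atbasis_col, I, J,
//                                   std::complex<double>{1.0, 0.0}, pvIJ, MAJOR::ROW);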

template <typename Tdst, typename Tsrc, typename TA, typename TC, typename TAC = std::pair<TA, TC>>
void collect_block_from_IJ_storage_tensor(
    matrix_m<Tdst> &mat_lo,
    const LIBRPA::Array_Desc &ad,
    const LIBRPA::AtomicBasis &atbasis_row,
    const LIBRPA::AtomicBasis &atbasis_col,
    const TC &cell,
    Tdst alpha, const std::map<TA, std::map<TAC, Tensor<Tsrc>>> &TMAP)
{
    // assert(mat_lo.nr() == ad.m_loc() && mat_lo.nc() == ad.n_loc());
    assert(ad.m() == atbasis_row.nb_total && ad.n() == atbasis_col.nb_total);

    matrix_m<Tdst> tmp_loc(mat_lo.nr(), mat_lo.nc(), MAJOR::ROW);
    size_t cp_size = ad.n_loc() * sizeof(Tdst);

    omp_lock_t mat_lock;
    omp_init_lock(&mat_lock);

    #pragma omp parallel for
    for (int ilo = 0; ilo != ad.m_loc(); ilo++)
    {
        int I_loc, J_loc, i_ab, j_ab;
        int i_gl = ad.indx_l2g_r(ilo);
        atbasis_row.get_local_index(i_gl, I_loc, i_ab);
        vector<Tdst> tmp_loc_row(ad.n_loc());
        for (int jlo = 0; jlo != ad.n_loc(); jlo++)
        {
            int j_gl = ad.indx_l2g_c(jlo);
            atbasis_col.get_local_index(j_gl, J_loc, j_ab);
            //Tdst temp;
            // LIBRPA::utils::lib_printf("i_gl I_loc i_ab %d %d %d j_gl J_loc j_ab %d %d %d\n", i_gl, I_loc, i_ab, j_gl, J_loc, j_ab);
            tmp_loc_row[jlo] = TMAP.at(I_loc).at({J_loc, cell})(i_ab, j_ab);
        }
        Tdst *row_ptr = tmp_loc.ptr() + ilo * ad.n_loc();
        omp_set_lock(&mat_lock);
        memcpy(row_ptr, &tmp_loc_row[0], cp_size);
        omp_unset_lock(&mat_lock);
    }
    #pragma omp barrier
    omp_destroy_lock(&mat_lock);
    if (mat_lo.is_col_major())
    {
        tmp_loc.swap_to_col_major();
    }
    mat_lo = tmp_loc;
}
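
// Usage sketch (illustrative, assumed names; requires the LibRI Tensor type,
// i.e. LIBRPA_USE_LIBRI): fill the local matrix from a nested map indexed as
// atom I -> (atom J, cell R) -> Tensor block. Every atom pair touched by the
// locally owned rows and columns must be present in the map, otherwise .at()
// throws. The cell type is assumed to be std::array<int, 3> for this example.
//
//     std::array<int, 3> R0{0, 0, 0};
//     collect_block_from_IJ_storage_tensor(mat_loc, ad, atbasis_row, atbasis_col,
//                                          R0, std::complex<double>{1.0, 0.0}, Vq_libri);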

template <typename Tdst, typename Tsrc>
void collect_block_from_IJ_storage_syhe(
    matrix_m<Tdst> &mat_lo,
    const LIBRPA::Array_Desc &ad,
    const LIBRPA::AtomicBasis &atbasis,
    const int &I, const int &J, bool conjugate,
    Tdst alpha, const Tsrc *pvIJ, MAJOR major_pv)
{
    // assert(mat_lo.nr() == ad.m_loc() && mat_lo.nc() == ad.n_loc());
    assert(ad.m() == atbasis.nb_total && ad.n() == atbasis.nb_total);
    // blocks on the diagonal are trivial
    if (I == J)
    {
        collect_block_from_IJ_storage(mat_lo, ad, atbasis, atbasis, I, J, alpha, pvIJ, major_pv);
        return;
    }
    const std::function<Tdst(Tdst)> filter = conjugate ? [](Tdst a) { return get_conj(a); }
                                                       : [](Tdst a) { return a; };
    const int I_nb = atbasis.get_atom_nb(I);
    const int J_nb = atbasis.get_atom_nb(J);
    const auto picker = Indx_pickers_2d[major_pv];
    int I_loc, J_loc, i_ab, j_ab;
    for (int ilo = 0; ilo != ad.m_loc(); ilo++)
        for (int jlo = 0; jlo != ad.n_loc(); jlo++)
        {
            int i_gl = ad.indx_l2g_r(ilo);
            int j_gl = ad.indx_l2g_c(jlo);
            atbasis.get_local_index(i_gl, I_loc, i_ab);
            atbasis.get_local_index(j_gl, J_loc, j_ab);
            Tdst temp;
            // LIBRPA::utils::lib_printf("i_gl I_loc i_ab %d %d %d j_gl J_loc j_ab %d %d %d\n", i_gl, I_loc, i_ab, j_gl, J_loc, j_ab);
            if ((I_loc == I) && (J_loc == J))
            {
                // in the same block
                temp = pvIJ[picker(I_nb, i_ab, J_nb, j_ab)];
                // cout << "same block pickerID " << picker(I_nb, i_ab, J_nb, j_ab) << " " << temp << " matloc befor " << mat_lo(ilo, jlo);
            }
            else if ((I_loc == J) && (J_loc == I))
            {
                // in the opposite block
                temp = filter(pvIJ[picker(I_nb, j_ab, J_nb, i_ab)]);
                // cout << "opposite block pickerID " << picker(I_nb, j_ab, J_nb, i_ab) << " " << temp << " matloc befor " << mat_lo(ilo, jlo);
            }
            else
                continue;
            mat_lo(ilo, jlo) += alpha * temp;
            // cout << " after " << mat_lo(ilo, jlo) << endl;
        }
}
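
// Usage sketch (illustrative, hypothetical names): when only one triangle of
// atom-pair blocks of a symmetric or Hermitian matrix is stored, a single
// stored block (I, J) fills both the (I, J) and the (J, I) regions of the
// local matrix; pass conjugate = true so that the mirrored block is complex
// conjugated (Hermitian case).
//
//     collect_block_from_IJ_storage_syhe(mat_loc, ad, atbasis, I, J,
//                                        /*conjugate=*/true, alpha,
//                                        block_IJ.data(), MAJOR::ROW);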

template <typename Tdst, typename TA, typename TC, typename TAC = std::pair<TA, TC>>
void collect_block_from_ALL_IJ_Tensor(
    matrix_m<Tdst> &mat_lo,
    const LIBRPA::Array_Desc &ad,
    const LIBRPA::AtomicBasis &atbasis,
    const TC &cell,
    bool conjugate,
    Tdst alpha, const std::map<TA, std::map<TAC, Tensor<Tdst>>> &TMAP, MAJOR major_pv)
{
    // assert(mat_lo.nr() == ad.m_loc() && mat_lo.nc() == ad.n_loc());
    assert(ad.m() == atbasis.nb_total && ad.n() == atbasis.nb_total);
    matrix_m<Tdst> tmp_loc(mat_lo.nr(), mat_lo.nc(), MAJOR::ROW);
    size_t cp_size = ad.n_loc() * sizeof(Tdst);

    omp_lock_t mat_lock;
    omp_init_lock(&mat_lock);

    #pragma omp parallel for
    for (int ilo = 0; ilo != ad.m_loc(); ilo++)
    {
        int I_loc, J_loc, i_ab, j_ab;
        int i_gl = ad.indx_l2g_r(ilo);
        atbasis.get_local_index(i_gl, I_loc, i_ab);
        vector<Tdst> tmp_loc_row(ad.n_loc());
        for (int jlo = 0; jlo != ad.n_loc(); jlo++)
        {
            int j_gl = ad.indx_l2g_c(jlo);
            atbasis.get_local_index(j_gl, J_loc, j_ab);
            //Tdst temp;
            // LIBRPA::utils::lib_printf("i_gl I_loc i_ab %d %d %d j_gl J_loc j_ab %d %d %d\n", i_gl, I_loc, i_ab, j_gl, J_loc, j_ab);
            if (I_loc <= J_loc)
            {
                tmp_loc_row[jlo] = TMAP.at(I_loc).at({J_loc, cell})(i_ab, j_ab);
            }
            else
            {
                Tdst tmp_ele = TMAP.at(J_loc).at({I_loc, cell})(j_ab, i_ab);
                if (conjugate)
                    tmp_ele = get_conj(tmp_ele);
                tmp_loc_row[jlo] = tmp_ele;
            }
        }
        Tdst *row_ptr = tmp_loc.ptr() + ilo * ad.n_loc();
        omp_set_lock(&mat_lock);
        memcpy(row_ptr, &tmp_loc_row[0], cp_size);
        omp_unset_lock(&mat_lock);
    }
    #pragma omp barrier
    omp_destroy_lock(&mat_lock);
    if (mat_lo.is_col_major())
    {
        tmp_loc.swap_to_col_major();
    }
    mat_lo = tmp_loc;
}
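
// Usage sketch (illustrative, assumed names): fill the local matrix of a
// symmetric/Hermitian quantity from a nested map that stores only atom pairs
// with I <= J; elements of the lower triangle are taken from the transposed
// stored block and conjugated when conjugate is true. The cell type is again
// assumed to be std::array<int, 3>.
//
//     std::array<int, 3> R0{0, 0, 0};
//     collect_block_from_ALL_IJ_Tensor(mat_loc, ad, atbasis, R0, /*conjugate=*/true,
//                                      std::complex<double>{1.0, 0.0}, coulomb_libri, MAJOR::ROW);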

template <typename T>
void map_block_to_IJ_storage(map<int, map<int, matrix_m<T>>> &IJmap,
                             const LIBRPA::AtomicBasis &atbasis_row,
                             const LIBRPA::AtomicBasis &atbasis_col,
                             const matrix_m<T> &mat_lo,
                             const LIBRPA::Array_Desc &desc, MAJOR major_map)
{
    assert(desc.m() == atbasis_row.nb_total && desc.n() == atbasis_col.nb_total);
    int I, J, iI, jJ;
    for (int i_lo = 0; i_lo != desc.m_loc(); i_lo++)
        for (int j_lo = 0; j_lo != desc.n_loc(); j_lo++)
        {
            int i_glo = desc.indx_l2g_r(i_lo);
            int j_glo = desc.indx_l2g_c(j_lo);
            atbasis_row.get_local_index(i_glo, I, iI);
            atbasis_col.get_local_index(j_glo, J, jJ);
            if (IJmap.count(I) == 0 || IJmap.at(I).count(J) == 0 || IJmap.at(I).at(J).size() == 0)
                IJmap[I][J] = matrix_m<T>{static_cast<int>(atbasis_row.get_atom_nb(I)), static_cast<int>(atbasis_col.get_atom_nb(J)), major_map};
            IJmap[I][J](iI, jJ) = mat_lo(i_lo, j_lo);
        }
}
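
// Usage sketch (illustrative): the inverse mapping, copying the locally owned
// elements of a distributed matrix back into per-atom-pair blocks. Only the
// elements owned by this process are written, so the resulting IJmap generally
// still has to be collected/reduced across processes.
//
//     map<int, map<int, matrix_m<double>>> IJmap;
//     map_block_to_IJ_storage(IJmap, atbasis_row, atbasis_col, mat_loc, desc, MAJOR::ROW);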

template <typename T>
matrix_m<std::complex<T>> power_hemat_blacs(matrix_m<std::complex<T>> &A_local,
                                            const LIBRPA::Array_Desc &ad_A,
                                            matrix_m<std::complex<T>> &Z_local,
                                            const LIBRPA::Array_Desc &ad_Z,
                                            size_t &n_filtered, T *W, T power,
                                            const T &threshold = -1.e5)
{
    assert(A_local.is_col_major() && Z_local.is_col_major());
    const bool is_int_power = fabs(power - int(power)) < 1e-4;
    const int n = ad_A.m();
    const char jobz = 'V';
    const char uplo = 'U';

    int lwork = -1, lrwork = -1, info = 0;
    std::complex<T> *work;
    T *rwork;
    {
        work = new std::complex<T>[1];
        rwork = new T[1];
        // query the optimal lwork and lrwork
        T *Wquery = new T[1];
        // LIBRPA::utils::lib_printf("power_hemat_blacs descA %s\n", ad_A.info_desc().c_str());
        ScalapackConnector::pheev_f(jobz, uplo,
                                    n, A_local.ptr(), 1, 1, ad_A.desc,
                                    Wquery, Z_local.ptr(), 1, 1, ad_Z.desc, work, lwork, rwork, lrwork, info);
        lwork = int(work[0].real());
        lrwork = int(rwork[0]);
        delete [] work;
        delete [] Wquery;
        delete [] rwork;
    }
    // LIBRPA::utils::lib_printf("lwork %d lrwork %d\n", lwork, lrwork); // debug

    work = new std::complex<T>[lwork];
    rwork = new T[lrwork];
    ScalapackConnector::pheev_f(jobz, uplo,
                                n, A_local.ptr(), 1, 1, ad_A.desc,
                                W, Z_local.ptr(), 1, 1, ad_Z.desc, work, lwork, rwork, lrwork, info);
    delete [] work;
    delete [] rwork;

    // determine the number of filtered (below-threshold) eigenvalues,
    // using the fact that W is in ascending order
    n_filtered = n;
    for (int i = 0; i != n; i++)
        if (W[i] >= threshold)
        {
            n_filtered = i;
            break;
        }

    // filter and scale the eigenvalues, store in a temporary array
    std::vector<T> W_temp(n);
    for (int i = 0; i != n_filtered; i++)
        W_temp[i] = 0.0;
    for (int i = n_filtered; i != n; i++)
    {
        if (W[i] < 0 && !is_int_power)
            LIBRPA::utils::lib_printf("Warning! unfiltered negative eigenvalue with non-integer power: # %d ev = %f , pow = %f\n", i, W[i], power);
        if (fabs(W[i]) < 1e-10 && power < 0)
            LIBRPA::utils::lib_printf("Warning! unfiltered nearly-singular eigenvalue with negative power: # %d ev = %f , pow = %f\n", i, W[i], power);
        W_temp[i] = std::pow(W[i], power);
    }
    // debug print
    // for (int i = 0; i != n; i++)
    // {
    //     LIBRPA::utils::lib_printf("%d %f %f\n", i, W[i], W_temp[i]);
    // }

    // create scaled eigenvectors and rebuild A = Z * diag(W^power) * Z^H
    auto scaled_Z_local = Z_local.copy();
    for (int i = 0; i != n; i++)
        ScalapackConnector::pscal_f(n, W_temp[i], scaled_Z_local.ptr(), 1, 1 + i, ad_Z.desc, 1);
    ScalapackConnector::pgemm_f('N', 'C', n, n, n, 1.0, Z_local.ptr(), 1, 1, ad_Z.desc, scaled_Z_local.ptr(), 1, 1, ad_Z.desc, 0.0, A_local.ptr(), 1, 1, ad_A.desc);
    return scaled_Z_local;
}
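
// Usage sketch (illustrative, hypothetical names): form the inverse square root
// of a distributed Hermitian matrix, discarding eigenvalues below a small
// positive threshold. On return A_loc is overwritten by Z * diag(w^power) * Z^H,
// W holds all eigenvalues in ascending order, n_filtered is the number of
// discarded (below-threshold) eigenvalues, and the returned matrix contains the
// eigenvectors scaled by w^power. Both A_loc and Z_loc must be column-major.
//
//     size_t n_filtered = 0;
//     std::vector<double> W(ad_A.m());
//     auto Z_loc = init_local_mat<std::complex<double>>(ad_Z, MAJOR::COL);
//     auto scaled_Z = power_hemat_blacs(A_loc, ad_A, Z_loc, ad_Z,
//                                       n_filtered, W.data(), -0.5, 1e-8);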

template <typename T, typename Treal = typename to_real<T>::type>
void print_matrix_mm_parallel(ostream &os, const matrix_m<T> &mat_loc, const LIBRPA::Array_Desc &ad, Treal threshold = 1e-15, bool row_first = true)
{
    LIBRPA::Array_Desc ad_fb(ad.ictxt());
    const int nr = ad.m(), nc = ad.n();
    const int irsrc = ad.irsrc(), icsrc = ad.icsrc();
    ad_fb.init(nr, nc, nr, nc, irsrc, icsrc);
    matrix_m<T> mat_glo = init_local_mat<T>(ad_fb, mat_loc.major());

    ScalapackConnector::pgemr2d_f(nr, nc, mat_loc.ptr(), 1, 1, ad.desc, mat_glo.ptr(), 1, 1, ad_fb.desc, ad.ictxt());
    if (ad_fb.is_src() && os.good())
    {
        print_matrix_mm(mat_glo, os, threshold, row_first);
    }
}

template <typename T, typename Treal = typename to_real<T>::type>
void print_matrix_mm_file_parallel(const char *fn, const matrix_m<T> &mat_loc, const LIBRPA::Array_Desc &ad, Treal threshold = 1e-15, bool row_first = true)
{
    ofstream fs;
    if (ad.is_src())
        fs.open(fn);
    print_matrix_mm_parallel(fs, mat_loc, ad, threshold, row_first);
}
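
// Usage sketch (illustrative, hypothetical file name): gather a distributed
// matrix to the source process of the grid and write it there with
// print_matrix_mm; only the source process opens the file.
//
//     print_matrix_mm_file_parallel("chi0_q0_freq0.mtx", chi_loc, ad, 1e-12);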

template <typename T>
matrix_m<T> multiply_scalapack(const matrix_m<T> &m1_loc, const LIBRPA::Array_Desc &desc_m1,
                               const matrix_m<T> &m2_loc, const LIBRPA::Array_Desc &desc_m2,
                               const LIBRPA::Array_Desc &desc_prod)
{
    if (m1_loc.major() != m2_loc.major())
        throw std::invalid_argument("m1 and m2 in different major");
    if (desc_m1.n() != desc_m2.m())
        throw std::invalid_argument("m1.col != m2.row");
    const int m = desc_m1.m();
    const int n = desc_m2.n();
    const int k = desc_m1.n();
    matrix_m<T> prod_loc = init_local_mat<T>(desc_prod, m1_loc.major());
    if (m1_loc.is_col_major())
    {
        ScalapackConnector::pgemm_f('N', 'N', m, n, k, 1.0,
                                    m1_loc.ptr(), 1, 1, desc_m1.desc,
                                    m2_loc.ptr(), 1, 1, desc_m2.desc,
                                    0.0,
                                    prod_loc.ptr(), 1, 1, desc_prod.desc);
    }
    else
    {
        throw std::logic_error("row-major parallel multiply is not implemented");
    }
    return prod_loc;
}
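
// Usage sketch (illustrative): C = A * B for distributed column-major matrices.
// All three descriptors must live on the same BLACS context, with desc_C
// describing a desc_A.m() x desc_B.n() matrix.
//
//     auto C_loc = multiply_scalapack(A_loc, desc_A, B_loc, desc_B, desc_C);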

template <typename T>
void invert_scalapack(matrix_m<T> &m_loc, const LIBRPA::Array_Desc &desc_m)
{
    assert(m_loc.is_col_major());
    int info = 0;
    if (m_loc.is_col_major())
    {
        // NOTE: using the local leading dimension desc_m.lld() will lead to an invalid pointer error
        int *ipiv = new int [desc_m.m()];
        // LIBRPA::utils::lib_printf("invert_scalpack desc %s\n", desc_m.info_desc().c_str());
        ScalapackConnector::pgetrf_f(desc_m.m(), desc_m.n(), m_loc.ptr(), 1, 1, desc_m.desc, ipiv, info);
        // get optimized work sizes
        int lwork = -1, liwork = -1;
        {
            T *work = new T [1];
            int *iwork = new int [1];
            ScalapackConnector::pgetri_f(desc_m.m(), m_loc.ptr(), 1, 1, desc_m.desc, ipiv, work, lwork, iwork, liwork, info);
            lwork = int(get_real(work[0]));
            liwork = iwork[0];
            // LIBRPA::utils::lib_printf("lwork %d liwork %d\n", lwork, liwork);
            delete [] work;
            delete [] iwork;
        }

        T *work = new T[lwork];
        int *iwork = new int[liwork];
        ScalapackConnector::pgetri_f(desc_m.m(), m_loc.ptr(), 1, 1, desc_m.desc, ipiv, work, lwork, iwork, liwork, info);
        delete [] iwork;
        delete [] work;
        delete [] ipiv;
        // LIBRPA::utils::lib_printf("done free\n");
    }
    else
    {
        throw std::logic_error("row-major invert is not implemented");
    }
}
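
// Usage sketch (illustrative): in-place inversion of a distributed column-major
// matrix via the ScaLAPACK LU routines (p?getrf + p?getri).
//
//     invert_scalapack(eps_loc, desc_eps);  // eps_loc is overwritten by its inverse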