4 #include "parallel_mpi.h"
6 #include "scalapack_connector.h"
8 #ifdef LIBRPA_USE_LIBRI
9 #include <RI/global/Tensor.h>
20 if (mat_lo.size() == 0)
29 assert(mat_go.nr() == ad.m() && mat_go.nc() == ad.n());
32 for (
int i = 0; i != mat_go.nr(); i++)
34 auto i_lo = ad.indx_g2l_r(i);
35 if (i_lo < 0)
continue;
36 for (
int j = 0; j != mat_go.nc(); j++)
38 auto j_lo = ad.indx_g2l_c(j);
39 if (j_lo < 0)
continue;
40 mat_lo(i_lo, j_lo) = mat_go(i, j);
43 if (mat_lo.size() == 0)
51 const int nr = ad.m(), nc = ad.n();
53 const auto picker = Indx_pickers_2d[major_pv];
54 for (
int i = 0; i != nr; i++)
56 auto i_lo = ad.indx_g2l_r(i);
57 if (i_lo < 0)
continue;
58 for (
int j = 0; j != nc; j++)
60 auto j_lo = ad.indx_g2l_c(j);
61 if (j_lo < 0)
continue;
62 mat_lo(i_lo, j_lo) = pv[picker(nr, i, nc, j)];
65 if (mat_lo.size() == 0)
70 template <
typename Tdst,
typename Tsrc>
71 void collect_block_from_IJ_storage(
76 const int &I,
const int &J,
77 Tdst alpha,
const Tsrc *pvIJ, MAJOR major_pv)
80 assert(ad.m() == atbasis_row.nb_total && ad.n() == atbasis_col.nb_total );
81 const int row_start_id = atbasis_row.get_part_range()[I];
82 const int col_start_id = atbasis_col.get_part_range()[J];
83 const int row_nb = atbasis_row.get_atom_nb(I);
84 const int col_nb = atbasis_col.get_atom_nb(J);
85 const auto picker = Indx_pickers_2d[major_pv];
87 for (
int iI = 0; iI != row_nb; iI++)
89 int ilo = ad.indx_g2l_r(row_start_id + iI);
91 if (ilo < 0)
continue;
92 for (
int jJ = 0; jJ != col_nb; jJ++)
94 int jlo = ad.indx_g2l_c(col_start_id + jJ);
96 if (jlo < 0)
continue;
98 mat_lo(ilo, jlo) += alpha * pvIJ[picker(row_nb, iI, col_nb, jJ)];
104 template <
typename Tdst,
typename Tsrc,
typename TA,
typename TC,
typename TAC = std::pair<TC, TA>>
105 void collect_block_from_IJ_storage_tensor(
111 Tdst alpha,
const std::map<TA,std::map<TAC,
Tensor<Tsrc>>> &TMAP)
114 assert(ad.m() == atbasis_row.nb_total && ad.n() == atbasis_col.nb_total );
117 size_t cp_size= ad.n_loc()*
sizeof(Tdst);
120 omp_init_lock(&mat_lock);
122 #pragma omp parallel for
123 for (
int ilo = 0; ilo != ad.m_loc(); ilo++)
125 int I_loc, J_loc, i_ab, j_ab;
126 int i_gl = ad.indx_l2g_r(ilo);
127 atbasis_row.get_local_index(i_gl, I_loc, i_ab);
128 vector<Tdst> tmp_loc_row(ad.n_loc());
129 for (
int jlo = 0; jlo != ad.n_loc(); jlo++)
131 int j_gl = ad.indx_l2g_c(jlo);
132 atbasis_col.get_local_index(j_gl, J_loc, j_ab);
135 tmp_loc_row[jlo]= TMAP.at(I_loc).at({J_loc, cell})(i_ab,j_ab);
137 Tdst *row_ptr=tmp_loc.ptr() + ilo* ad.n_loc();
138 omp_set_lock(&mat_lock);
139 memcpy(row_ptr,&tmp_loc_row[0],cp_size);
140 omp_unset_lock(&mat_lock);
143 omp_destroy_lock(&mat_lock);
144 if(mat_lo.is_col_major())
146 tmp_loc.swap_to_col_major();
152 template <
typename Tdst,
typename Tsrc>
153 void collect_block_from_IJ_storage_syhe(
157 const int &I,
const int &J,
bool conjugate,
158 Tdst alpha,
const Tsrc *pvIJ, MAJOR major_pv)
161 assert(ad.m() == atbasis.nb_total && ad.n() == atbasis.nb_total);
165 collect_block_from_IJ_storage(mat_lo, ad, atbasis, atbasis, I, J, alpha, pvIJ, major_pv);
168 const std::function<Tdst(Tdst)> filter = conjugate ? [](Tdst a) {
return get_conj(a); }
169 : [](Tdst a) {
return a; };
170 const int I_nb = atbasis.get_atom_nb(I);
171 const int J_nb = atbasis.get_atom_nb(J);
172 const auto picker = Indx_pickers_2d[major_pv];
173 int I_loc, J_loc, i_ab, j_ab;
174 for (
int ilo = 0; ilo != ad.m_loc(); ilo++)
175 for (
int jlo = 0; jlo != ad.n_loc(); jlo++)
177 int i_gl = ad.indx_l2g_r(ilo);
178 int j_gl = ad.indx_l2g_c(jlo);
179 atbasis.get_local_index(i_gl, I_loc, i_ab);
180 atbasis.get_local_index(j_gl, J_loc, j_ab);
183 if ((I_loc == I) && (J_loc == J))
186 temp = pvIJ[picker(I_nb, i_ab, J_nb, j_ab)];
189 else if ((I_loc == J) && (J_loc == I))
192 temp = filter(pvIJ[picker(I_nb, j_ab, J_nb, i_ab)]);
197 mat_lo(ilo, jlo) += alpha * temp;
202 template <
typename Tdst,
typename TA,
typename TC,
typename TAC = std::pair<TA, TC>>
203 void collect_block_from_ALL_IJ_Tensor(
209 Tdst alpha,
const std::map<TA,std::map<TAC,
Tensor<Tdst>>> TMAP, MAJOR major_pv)
212 assert(ad.m() == atbasis.nb_total && ad.n() == atbasis.nb_total);
214 size_t cp_size= ad.n_loc()*
sizeof(Tdst);
217 omp_init_lock(&mat_lock);
219 #pragma omp parallel for
220 for (
int ilo = 0; ilo != ad.m_loc(); ilo++)
222 int I_loc, J_loc, i_ab, j_ab;
223 int i_gl = ad.indx_l2g_r(ilo);
224 atbasis.get_local_index(i_gl, I_loc, i_ab);
225 vector<Tdst> tmp_loc_row(ad.n_loc());
226 for (
int jlo = 0; jlo != ad.n_loc(); jlo++)
229 int j_gl = ad.indx_l2g_c(jlo);
231 atbasis.get_local_index(j_gl, J_loc, j_ab);
236 tmp_loc_row[jlo]= TMAP.at(I_loc).at({J_loc, cell})(i_ab,j_ab);
241 tmp_ele= TMAP.at(J_loc).at({I_loc, cell})(j_ab,i_ab);
243 tmp_ele=get_conj(tmp_ele);
244 tmp_loc_row[jlo]=tmp_ele;
247 Tdst *row_ptr=tmp_loc.ptr()+ilo* ad.n_loc();
248 omp_set_lock(&mat_lock);
249 memcpy(row_ptr,&tmp_loc_row[0],cp_size);
250 omp_unset_lock(&mat_lock);
253 omp_destroy_lock(&mat_lock);
254 if(mat_lo.is_col_major())
256 tmp_loc.swap_to_col_major();
263 template <
typename T>
264 void map_block_to_IJ_storage(map<
int, map<
int,
matrix_m<T>>> &IJmap,
270 assert(desc.m() == atbasis_row.nb_total && desc.n() == atbasis_col.nb_total);
272 for (
int i_lo = 0; i_lo != desc.m_loc(); i_lo++)
273 for (
int j_lo = 0; j_lo != desc.n_loc(); j_lo++)
275 int i_glo = desc.indx_l2g_r(i_lo);
276 int j_glo = desc.indx_l2g_c(j_lo);
277 atbasis_row.get_local_index(i_glo, I, iI);
278 atbasis_col.get_local_index(j_glo, J, jJ);
279 if (IJmap.count(I) == 0 || IJmap.at(I).count(J) == 0 || IJmap.at(I).at(J).size() == 0)
280 IJmap[I][J] =
matrix_m<T>{
static_cast<int>(atbasis_row.get_atom_nb(I)),
static_cast<int>(atbasis_col.get_atom_nb(J)), major_map};
281 IJmap[I][J](iI, jJ) = mat_lo(i_lo, j_lo);
285 template <
typename T>
290 size_t &n_filtered, T *W, T power,
291 const T &threshold = -1.e5)
293 assert (A_local.is_col_major() && Z_local.is_col_major());
294 const bool is_int_power = fabs(power -
int(power)) < 1e-4;
295 const int n = ad_A.m();
296 const char jobz =
'V';
297 const char uplo =
'U';
299 int lwork = -1, lrwork = -1, info = 0;
300 std::complex<T> *work;
303 work =
new std::complex<T>[1];
306 T *Wquery =
new T[1];
308 ScalapackConnector::pheev_f(jobz, uplo,
309 n, A_local.ptr(), 1, 1, ad_A.desc,
310 Wquery, Z_local.ptr(), 1, 1, ad_Z.desc, work, lwork, rwork, lrwork, info);
311 lwork = int(work[0].real());
312 lrwork = int(rwork[0]);
313 delete [] work, Wquery, rwork;
317 work =
new std::complex<T> [lwork];
318 rwork =
new T [lrwork];
319 ScalapackConnector::pheev_f(jobz, uplo,
320 n, A_local.ptr(), 1, 1, ad_A.desc,
321 W, Z_local.ptr(), 1, 1, ad_Z.desc, work, lwork, rwork, lrwork, info);
322 delete [] work, rwork;
327 for (
int i = 0; i != n; i++)
328 if (W[i] >= threshold)
336 for (
int i = 0; i != n_filtered; i++)
338 for (
int i = n_filtered; i != n; i++)
340 if (W[i] < 0 && !is_int_power)
341 LIBRPA::utils::lib_printf(
"Warning! unfiltered negative eigenvalue with non-integer power: # %d ev = %f , pow = %f\n", i, W[i], power);
342 if (fabs(W[i]) < 1e-10 && power < 0)
343 LIBRPA::utils::lib_printf(
"Warning! unfiltered nearly-singular eigenvalue with negative power: # %d ev = %f , pow = %f\n", i, W[i], power);
344 W_temp[i] = std::pow(W[i], power);
353 auto scaled_Z_local = Z_local.copy();
354 for (
int i = 0; i != n; i++)
355 ScalapackConnector::pscal_f(n, W_temp[i], scaled_Z_local.ptr(), 1, 1+i, ad_Z.desc, 1);
356 ScalapackConnector::pgemm_f(
'N',
'C', n, n, n, 1.0, Z_local.ptr(), 1, 1, ad_Z.desc, scaled_Z_local.ptr(), 1, 1, ad_Z.desc, 0.0, A_local.ptr(), 1, 1, ad_A.desc);
357 return scaled_Z_local;
360 template <typename T, typename Treal = typename to_real<T>::type>
361 void print_matrix_mm_parallel(ostream &os,
const matrix_m<T> &mat_loc,
const LIBRPA::Array_Desc &ad, Treal threshold = 1e-15,
bool row_first =
true)
364 const int nr = ad.m(), nc = ad.n();
365 const int irsrc = ad.irsrc(), icsrc = ad.icsrc();
366 ad_fb.init(nr, nc, nr, nc, irsrc, icsrc);
367 matrix_m<T> mat_glo = init_local_mat<T>(ad_fb, mat_loc.major());
369 ScalapackConnector::pgemr2d_f(nr, nc, mat_loc.ptr(), 1, 1, ad.desc, mat_glo.ptr(), 1, 1, ad_fb.desc, ad.ictxt());
370 if (ad_fb.is_src() && os.good())
372 print_matrix_mm(mat_glo, os, threshold, row_first);
376 template <typename T, typename Treal = typename to_real<T>::type>
377 void print_matrix_mm_file_parallel(
const char *fn,
const matrix_m<T> &mat_loc,
const LIBRPA::Array_Desc &ad, Treal threshold = 1e-15,
bool row_first =
true)
382 print_matrix_mm_parallel(fs, mat_loc, ad, threshold, row_first);
385 template <
typename T>
390 if (m1_loc.major() != m2_loc.major())
391 throw std::invalid_argument(
"m1 and m2 in different major");
392 if (desc_m1.n() != desc_m2.m())
393 throw std::invalid_argument(
"m1.col != m2.row");
394 const int m = desc_m1.m();
395 const int n = desc_m2.n();
396 const int k = desc_m1.n();
397 matrix_m<T> prod_loc = init_local_mat<T>(desc_prod, m1_loc.major());
398 if (m1_loc.is_col_major())
400 ScalapackConnector::pgemm_f(
'N',
'N', m, n, k, 1.0,
401 m1_loc.ptr(), 1, 1, desc_m1.desc,
402 m2_loc.ptr(), 1, 1, desc_m2.desc,
404 prod_loc.ptr(), 1, 1, desc_prod.desc);
408 throw std::logic_error(
"row-major parallel multiply is not implemented");
413 template <
typename T>
416 assert(m_loc.is_col_major());
418 if (m_loc.is_col_major())
421 int *ipiv =
new int [desc_m.m()];
423 ScalapackConnector::pgetrf_f(desc_m.m(), desc_m.n(), m_loc.ptr(), 1, 1, desc_m.desc, ipiv, info);
425 int lwork = -1, liwork = -1;
428 int *iwork =
new int [1];
429 ScalapackConnector::pgetri_f(desc_m.m(), m_loc.ptr(), 1, 1, desc_m.desc, ipiv, work, lwork, iwork, liwork, info);
430 lwork = int(get_real(work[0]));
437 T *work =
new T[lwork];
438 int *iwork =
new int[liwork];
439 ScalapackConnector::pgetri_f(desc_m.m(), m_loc.ptr(), 1, 1, desc_m.desc, ipiv, work, lwork, iwork, liwork, info);
447 throw std::logic_error(
"row-major invert is not implemented");
Definition: parallel_mpi.h:111
Definition: atomic_basis.h:12
Definition: libri_stub.h:19
Definition: matrix_m_parallel_utils.h:13
Definition: matrix_m.h:213
void lib_printf(const char *format, Args &&... args)
printf that handles the stdout redirect
Definition: utils_io.h:13