diff --git a/examples/scf/pw_Si2/INPUT b/examples/scf/pw_Si2/INPUT
index 9dd1930909..141c104100 100644
--- a/examples/scf/pw_Si2/INPUT
+++ b/examples/scf/pw_Si2/INPUT
@@ -8,5 +8,5 @@ ecutwfc 60
 scf_thr 1e-7
 scf_nmax 100
 device cpu
-ks_solver cg
+ks_solver dav_subspace
 precision double
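[Review note] The example input now exercises the Davidson subspace solver instead of CG; the switch is driven entirely by `ks_solver`, and the solver takes its working parameters from the existing plane-wave diagonalization keywords. A minimal sketch of the relevant INPUT keys — the values shown are illustrative defaults, not part of this diff:

    ks_solver     dav_subspace   # iterative Davidson with subspace diagonalization
    pw_diag_ndim  4              # reduced-basis factor: nbase_x = pw_diag_ndim * nband
    pw_diag_thr   1e-2           # initial diagonalization threshold
    pw_diag_nmax  50             # maximum Davidson iterations per attempt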
diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
index 2e14855e19..8c1f8818e6 100644
--- a/source/module_hsolver/diago_dav_subspace.cpp
+++ b/source/module_hsolver/diago_dav_subspace.cpp
@@ -1,32 +1,68 @@
 #include "diago_dav_subspace.h"
 #include "diago_iter_assist.h"
-
 #include "module_base/memory.h"
-#include "module_base/timer.h"
-#include "module_base/parallel_global.h"
 #include "module_base/module_device/device.h"
-
+#include "module_base/timer.h"
 #include "module_hsolver/kernels/dngvd_op.h"
 #include "module_hsolver/kernels/math_kernel_op.h"
+#include <vector>
+
 using namespace hsolver;
 
 template <typename T, typename Device>
-Diago_DavSubspace<T, Device>::Diago_DavSubspace(const Real* precondition_in)
+Diago_DavSubspace<T, Device>::Diago_DavSubspace(const std::vector<Real>& precondition_in,
+                                                const int& nband_in,
+                                                const int& nbasis_in,
+                                                const int& david_ndim_in,
+                                                const double& diag_thr_in,
+                                                const int& diag_nmax_in,
+                                                const bool& need_subspace_in,
+                                                const diag_comm_info& diag_comm_in)
+    : precondition(precondition_in), n_band(nband_in), dim(nbasis_in), nbase_x(nband_in * david_ndim_in),
+      diag_thr(diag_thr_in), iter_nmax(diag_nmax_in), is_subspace(need_subspace_in), diag_comm(diag_comm_in)
 {
     this->device = base_device::get_device_type<Device>(this->ctx);
-    this->precondition = precondition_in;
-
-    test_david = 2;
-    // 1: check which function is called and which step is executed
-    // 2: check the eigenvalues of the result of each iteration
-    // 3: check the eigenvalues and errors of the last result
-    // default: no check
 
     this->one = &this->cs.one;
     this->zero = &this->cs.zero;
     this->neg_one = &this->cs.neg_one;
+
+    assert(david_ndim_in > 1);
+    assert(david_ndim_in * nband_in < nbasis_in * this->diag_comm.nproc);
+
+    // TODO: add memory usage statistics
+
+    //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+    // the product of H and psi in the reduced basis set
+    resmem_complex_op()(this->ctx, this->hphi, this->nbase_x * this->dim, "DAV::hphi");
+    setmem_complex_op()(this->ctx, this->hphi, 0, this->nbase_x * this->dim);
+
+    // Hamiltonian on the reduced basis set
+    resmem_complex_op()(this->ctx, this->hcc, this->nbase_x * this->nbase_x, "DAV::hcc");
+    setmem_complex_op()(this->ctx, this->hcc, 0, this->nbase_x * this->nbase_x);
+
+    // Overlap on the reduced basis set
+    resmem_complex_op()(this->ctx, this->scc, this->nbase_x * this->nbase_x, "DAV::scc");
+    setmem_complex_op()(this->ctx, this->scc, 0, this->nbase_x * this->nbase_x);
+
+    // Eigenvectors
+    resmem_complex_op()(this->ctx, this->vcc, this->nbase_x * this->nbase_x, "DAV::vcc");
+    setmem_complex_op()(this->ctx, this->vcc, 0, this->nbase_x * this->nbase_x);
+    //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == base_device::GpuDevice)
+    {
+        resmem_real_op()(this->ctx, this->d_precondition, nbasis_in);
+        syncmem_var_h2d_op()(this->ctx,
+                             this->cpu_ctx,
+                             this->d_precondition,
+                             this->precondition.data(),
+                             nbasis_in);
+    }
+#endif
 }
 
 template <typename T, typename Device>
@@ -37,63 +73,28 @@ Diago_DavSubspace<T, Device>::~Diago_DavSubspace()
     delmem_complex_op()(this->ctx, this->scc);
     delmem_complex_op()(this->ctx, this->vcc);
 
-    delmem_real_h_op()(this->cpu_ctx, this->eigenvalue_in_dav);
-
+#if defined(__CUDA) || defined(__ROCM)
     if (this->device == base_device::GpuDevice)
     {
        delmem_real_op()(this->ctx, this->d_precondition);
    }
+#endif
 }
 
 template <typename T, typename Device>
-void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
-                                             psi::Psi<T, Device>& psi,
-                                             Real* eigenvalue_in_hsolver,
-                                             std::vector<bool>& is_occupied)
+int Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
+                                            psi::Psi<T, Device>& psi,
+                                            Real* eigenvalue_in_hsolver,
+                                            const std::vector<bool>& is_occupied)
 {
-    if (test_david == 1)
-    {
-        ModuleBase::TITLE("Diago_DavSubspace", "diag_once");
-    }
     ModuleBase::timer::tick("Diago_DavSubspace", "diag_once");
 
-    assert(Diago_DavSubspace::PW_DIAG_NDIM > 1);
-    assert(Diago_DavSubspace::PW_DIAG_NDIM * psi.get_nbands() < psi.get_current_nbas() * GlobalV::NPROC_IN_POOL);
-
-    // qianrui change it 2021-7-25.
-    // In strictly speaking, it shoule be PW_DIAG_NDIM*nband < npw sum of all pools. We roughly estimate it here.
-    // However, in most cases, total number of plane waves should be much larger than nband * PW_DIAG_NDIM
-
-    this->dim = psi.get_k_first() ? psi.get_current_nbas() : psi.get_nk() * psi.get_nbasis();
-    this->n_band = psi.get_nbands();
-
-    // maximum dimension of the reduced basis set
-    this->nbase_x = Diago_DavSubspace::PW_DIAG_NDIM * this->n_band;
-
+    // TODO: allocate memory in the constructor
     psi::Psi<T, Device> basis(1, this->nbase_x, this->dim, &(psi.get_ngk(0)));
     ModuleBase::Memory::record("DAV::basis", this->nbase_x * this->dim * sizeof(T));
 
     // the eigenvalues in dav iter
-    resmem_real_h_op()(this->cpu_ctx, this->eigenvalue_in_dav, this->nbase_x, "DAV::eig");
-    setmem_real_h_op()(this->cpu_ctx, this->eigenvalue_in_dav, 0, this->nbase_x);
-
-    //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-    // the product of H and psi in the reduced basis set
-    resmem_complex_op()(this->ctx, this->hphi, this->nbase_x * this->dim, "DAV::hphi");
-    setmem_complex_op()(this->ctx, this->hphi, 0, this->nbase_x * this->dim);
-
-    // Hamiltonian on the reduced basis set
-    resmem_complex_op()(this->ctx, this->hcc, this->nbase_x * this->nbase_x, "DAV::hcc");
-    setmem_complex_op()(this->ctx, this->hcc, 0, this->nbase_x * this->nbase_x);
-
-    // Overlap on the reduced basis set
-    resmem_complex_op()(this->ctx, this->scc, this->nbase_x * this->nbase_x, "DAV::scc");
-    setmem_complex_op()(this->ctx, this->scc, 0, this->nbase_x * this->nbase_x);
-
-    // Eigenvectors
-    resmem_complex_op()(this->ctx, this->vcc, this->nbase_x * this->nbase_x, "DAV::vcc");
-    setmem_complex_op()(this->ctx, this->vcc, 0, this->nbase_x * this->nbase_x);
-    //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+    std::vector<Real> eigenvalue_iter(this->nbase_x, 0.0);
 
     // convflag[m] = true if the m th band is convergent
     std::vector<bool> convflag(this->n_band, false);
@@ -124,32 +125,27 @@ void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
     hpsi_info dav_hpsi_in(&basis, psi::Range(1, 0, 0, this->n_band - 1), this->hphi);
     phm_in->ops->hPsi(dav_hpsi_in);
 
-    this->cal_elem(this->dim,
-                   nbase,
-                   this->notconv,
-                   basis,
-                   this->hphi,
-                   this->hcc,
-                   this->scc);
+    this->cal_elem(this->dim, nbase, this->notconv, basis, this->hphi, this->hcc, this->scc);
 
     this->diag_zhegvx(nbase,
                       this->n_band,
                       this->hcc,
                       this->scc,
                       this->nbase_x,
-                      this->eigenvalue_in_dav,
+                      &eigenvalue_iter,
                       this->vcc,
                       true,
                       this->is_subspace);
 
     for (size_t m = 0; m < this->n_band; m++)
     {
-        eigenvalue_in_hsolver[m] = this->eigenvalue_in_dav[m];
+        eigenvalue_in_hsolver[m] = eigenvalue_iter[m];
     }
 
     ModuleBase::timer::tick("Diago_DavSubspace", "first");
 
     int dav_iter = 0;
+
     do
     {
         dav_iter++;
@@ -162,22 +158,16 @@ void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
                        this->hphi,
                        this->vcc,
                        unconv.data(),
-                       this->eigenvalue_in_dav);
+                       &eigenvalue_iter);
 
-        this->cal_elem(this->dim,
-                       nbase,
-                       this->notconv,
-                       basis,
-                       this->hphi,
-                       this->hcc,
-                       this->scc);
+        this->cal_elem(this->dim, nbase, this->notconv, basis, this->hphi, this->hcc, this->scc);
 
         this->diag_zhegvx(nbase,
                           this->n_band,
                           this->hcc,
                           this->scc,
                           this->nbase_x,
-                          this->eigenvalue_in_dav,
+                          &eigenvalue_iter,
                           this->vcc,
                           false,
                           false);
@@ -190,13 +180,12 @@ void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
         {
             if (is_occupied[m])
             {
-                convflag[m] = (std::abs(this->eigenvalue_in_dav[m] - eigenvalue_in_hsolver[m])
-                               < DiagoIterAssist<T, Device>::PW_DIAG_THR);
+                convflag[m] = (std::abs(eigenvalue_iter[m] - eigenvalue_in_hsolver[m]) < this->diag_thr);
             }
            else
             {
-                const double empty_ethr = std::max(DiagoIterAssist<T, Device>::PW_DIAG_THR * 5.0, Diago_DavSubspace::dav_large_thr);
-                convflag[m] = (std::abs(this->eigenvalue_in_dav[m] - eigenvalue_in_hsolver[m]) < empty_ethr);
+                const double empty_ethr = std::max(this->diag_thr * 5.0, 1e-5);
+                convflag[m] = (std::abs(eigenvalue_iter[m] - eigenvalue_in_hsolver[m]) < empty_ethr);
             }
 
             if (!convflag[m])
@@ -205,14 +194,12 @@ void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
                 this->notconv++;
             }
 
-            eigenvalue_in_hsolver[m] = this->eigenvalue_in_dav[m];
+            eigenvalue_in_hsolver[m] = eigenvalue_iter[m];
         }
 
         ModuleBase::timer::tick("Diago_DavSubspace", "check_update");
 
-        if ((this->notconv == 0) ||
-            (nbase + this->notconv + 1 > this->nbase_x) ||
-            (dav_iter == DiagoIterAssist<T, Device>::PW_DIAG_NMAX))
+        if ((this->notconv == 0) || (nbase + this->notconv + 1 > this->nbase_x) || (dav_iter == this->iter_nmax))
         {
             ModuleBase::timer::tick("Diago_DavSubspace", "last");
 
@@ -235,7 +222,7 @@ void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
                       psi.get_pointer(), // C dim * n_band
                       psi.get_nbasis());
 
-            if (!this->notconv || (dav_iter == DiagoIterAssist<T, Device>::PW_DIAG_NMAX))
+            if (!this->notconv || (dav_iter == this->iter_nmax))
            {
                 // overall convergence or last iteration: exit the iteration
 
@@ -264,25 +251,21 @@ void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
     } while (1);
 
-    // std::cout << "dav_iter == " << dav_iter << std::endl;
-
-    DiagoIterAssist<T, Device>::avg_iter += static_cast<double>(dav_iter);
-
     ModuleBase::timer::tick("Diago_DavSubspace", "diag_once");
 
-    return;
+    return dav_iter;
 }
 
 template <typename T, typename Device>
 void Diago_DavSubspace<T, Device>::cal_grad(hamilt::Hamilt<T, Device>* phm_in,
-                                 const int& dim,
-                                 const int& nbase,
-                                 const int& notconv,
-                                 psi::Psi<T, Device>& basis,
-                                 T* hphi,
-                                 T* vcc,
-                                 const int* unconv,
-                                 Real* eigenvalue_in_dav)
+                                            const int& dim,
+                                            const int& nbase,
+                                            const int& notconv,
+                                            psi::Psi<T, Device>& basis,
+                                            T* hphi,
+                                            T* vcc,
+                                            const int* unconv,
+                                            std::vector<Real>* eigenvalue_iter)
 {
     ModuleBase::timer::tick("Diago_DavSubspace", "cal_grad");
 
@@ -290,14 +273,8 @@ void Diago_DavSubspace<T, Device>::cal_grad(hamilt::Hamilt<T, Device>* phm_in,
     {
         if (unconv[i] != i)
         {
-            syncmem_complex_op()(
-                this->ctx,
-                this->ctx,
-                vcc + i * this->nbase_x,
-                vcc + unconv[i] * this->nbase_x,
-                nbase
-            );
-            this->eigenvalue_in_dav[i] = this->eigenvalue_in_dav[unconv[i]];
+            syncmem_complex_op()(this->ctx, this->ctx, vcc + i * this->nbase_x, vcc + unconv[i] * this->nbase_x, nbase);
+            (*eigenvalue_iter)[i] = (*eigenvalue_iter)[unconv[i]];
         }
     }
 
@@ -320,7 +297,7 @@ void Diago_DavSubspace<T, Device>::cal_grad(hamilt::Hamilt<T, Device>* phm_in,
 
     for (int m = 0; m < notconv; m++)
     {
-        std::vector<Real> e_temp_cpu(this->dim, (-this->eigenvalue_in_dav[m]));
+        std::vector<Real> e_temp_cpu(this->dim, ((-1) * (*eigenvalue_iter)[m]));
 
         vector_mul_vector_op<T, Device>()(this->ctx,
                                           this->dim,
@@ -351,16 +328,10 @@ void Diago_DavSubspace<T, Device>::cal_grad(hamilt::Hamilt<T, Device>* phm_in,
         {
             for (size_t i = 0; i < this->dim; i++)
             {
-                double x = this->precondition[i] - this->eigenvalue_in_dav[m];
+                double x = this->precondition[i] - (*eigenvalue_iter)[m];
                 pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
             }
-            vector_div_vector_op<T, Device>()(
-                this->ctx,
-                this->dim,
-                &basis(nbase + m, 0),
-                &basis(nbase + m, 0),
-                pre.data()
-            );
+            vector_div_vector_op<T, Device>()(this->ctx, this->dim, &basis(nbase + m, 0), &basis(nbase + m, 0), pre.data());
         }
 
         // "normalize!!!" in order to improve numerical stability of subspace diagonalization
@@ -388,47 +359,47 @@ void Diago_DavSubspace<T, Device>::cal_grad(hamilt::Hamilt<T, Device>* phm_in,
 
 template <typename T, typename Device>
 void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
-                                 int& nbase,
-                                 const int& notconv,
-                                 const psi::Psi<T, Device>& basis,
-                                 const T* hphi,
-                                 T* hcc,
-                                 T* scc)
+                                            int& nbase,
+                                            const int& notconv,
+                                            const psi::Psi<T, Device>& basis,
+                                            const T* hphi,
+                                            T* hcc,
+                                            T* scc)
 {
     ModuleBase::timer::tick("Diago_DavSubspace", "cal_elem");
 
     gemm_op<T, Device>()(this->ctx,
-              'C',
-              'N',
-              nbase + notconv,
-              notconv,
-              this->dim,
-              this->one,
-              &basis(0, 0),
-              this->dim,
-              &hphi[nbase * this->dim],
-              this->dim,
-              this->zero,
-              &hcc[nbase * this->nbase_x],
-              this->nbase_x);
+                         'C',
+                         'N',
+                         nbase + notconv,
+                         notconv,
+                         this->dim,
+                         this->one,
+                         &basis(0, 0),
+                         this->dim,
+                         &hphi[nbase * this->dim],
+                         this->dim,
+                         this->zero,
+                         &hcc[nbase * this->nbase_x],
+                         this->nbase_x);
 
     gemm_op<T, Device>()(this->ctx,
-              'C',
-              'N',
-              nbase + notconv,
-              notconv,
-              this->dim,
-              this->one,
-              &basis(0, 0),
-              this->dim,
-              &basis(nbase, 0),
-              this->dim,
-              this->zero,
-              &scc[nbase * this->nbase_x],
-              this->nbase_x);
+                         'C',
+                         'N',
+                         nbase + notconv,
+                         notconv,
+                         this->dim,
+                         this->one,
+                         &basis(0, 0),
+                         this->dim,
+                         &basis(nbase, 0),
+                         this->dim,
+                         this->zero,
+                         &scc[nbase * this->nbase_x],
+                         this->nbase_x);
 
 #ifdef __MPI
-    if (GlobalV::NPROC_IN_POOL > 1)
+    if (this->diag_comm.nproc > 1)
     {
         auto* swap = new T[notconv * this->nbase_x];
         syncmem_complex_op()(this->ctx, this->ctx, swap, hcc + nbase * this->nbase_x, notconv * this->nbase_x);
@@ -448,7 +419,7 @@ void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
                        MPI_COMPLEX,
                        MPI_SUM,
                        0,
-                       POOL_WORLD);
+                       this->diag_comm.comm);
         }
         else
         {
@@ -458,7 +429,7 @@ void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
                        MPI_DOUBLE_COMPLEX,
                        MPI_SUM,
                        0,
-                       POOL_WORLD);
+                       this->diag_comm.comm);
         }
 
         syncmem_complex_op()(this->ctx, this->ctx, swap, scc + nbase * this->nbase_x, notconv * this->nbase_x);
@@ -471,7 +442,7 @@ void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
                        MPI_COMPLEX,
                        MPI_SUM,
                        0,
-                       POOL_WORLD);
+                       this->diag_comm.comm);
         }
         else
         {
@@ -481,7 +452,7 @@ void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
                        MPI_DOUBLE_COMPLEX,
                        MPI_SUM,
                        0,
-                       POOL_WORLD);
+                       this->diag_comm.comm);
         }
     }
     delete[] swap;
@@ -522,20 +493,21 @@ void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
 
 template <typename T, typename Device>
 void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
-                                    const int& nband,
-                                    T* hcc,
-                                    T* scc,
-                                    const int& nbase_x,
-                                    Real* eigenvalue_in_dav,
-                                    T* vcc,
-                                    bool init,
-                                    bool is_subspace)
+                                               const int& nband,
+                                               T* hcc,
+                                               T* scc,
+                                               const int& nbase_x,
+                                               std::vector<Real>* eigenvalue_iter,
+                                               // Real* eigenvalue_iter,
+                                               T* vcc,
+                                               bool init,
+                                               bool is_subspace)
 {
     ModuleBase::timer::tick("Diago_DavSubspace", "diag_zhegvx");
 
     if (is_subspace == false)
     {
-        if (GlobalV::RANK_IN_POOL == 0)
+        if (this->diag_comm.rank == 0)
         {
             assert(nbase_x >= std::max(1, nbase));
 
@@ -556,25 +528,21 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
 #if defined(__CUDA) || defined(__ROCM)
             Real* eigenvalue_gpu = nullptr;
             resmem_real_op()(this->ctx, eigenvalue_gpu, this->nbase_x);
-
-            syncmem_var_h2d_op()(
-                this->ctx,
-                this->cpu_ctx,
-                eigenvalue_gpu,
-                this->eigenvalue_in_dav,
-                this->nbase_x
-            );
+
+            syncmem_var_h2d_op()(this->ctx,
+                                 this->cpu_ctx,
+                                 eigenvalue_gpu,
+                                 (*eigenvalue_iter).data(),
+                                 this->nbase_x);
 
             dnevx_op<T, Device>()(this->ctx, nbase, this->nbase_x, this->hcc, nband, eigenvalue_gpu, this->vcc);
 
-            syncmem_var_d2h_op()(
-                this->cpu_ctx,
-                this->ctx,
-                this->eigenvalue_in_dav,
-                eigenvalue_gpu,
-                this->nbase_x
-            );
-
+            syncmem_var_d2h_op()(this->cpu_ctx,
+                                 this->ctx,
+                                 (*eigenvalue_iter).data(),
+                                 eigenvalue_gpu,
+                                 this->nbase_x);
+
             delmem_real_op()(this->ctx, eigenvalue_gpu);
 #endif
         }
@@ -587,7 +555,7 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
                                   this->nbase_x,
                                   this->hcc,
                                   nband,
-                                  this->eigenvalue_in_dav,
+                                  (*eigenvalue_iter).data(),
                                   this->vcc);
             }
             else
@@ -599,7 +567,7 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
                                   this->hcc,
                                   this->scc,
                                   nband,
-                                  this->eigenvalue_in_dav,
+                                  (*eigenvalue_iter).data(),
                                   this->vcc);
             }
         }
@@ -624,34 +592,32 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
         }
 
 #ifdef __MPI
-        if (GlobalV::NPROC_IN_POOL > 1)
+        if (this->diag_comm.nproc > 1)
         {
             // vcc: nbase * nband
             for (int i = 0; i < nband; i++)
             {
-                MPI_Bcast(&vcc[i * this->nbase_x], nbase, MPI_DOUBLE_COMPLEX, 0, POOL_WORLD);
+                MPI_Bcast(&vcc[i * this->nbase_x], nbase, MPI_DOUBLE_COMPLEX, 0, this->diag_comm.comm);
             }
-            MPI_Bcast(this->eigenvalue_in_dav, nband, MPI_DOUBLE, 0, POOL_WORLD);
+            MPI_Bcast((*eigenvalue_iter).data(), nband, MPI_DOUBLE, 0, this->diag_comm.comm);
         }
 #endif
-
     }
     else if (is_subspace == true)
     {
         for (size_t m = 0; m < nband; m++)
         {
-            this->eigenvalue_in_dav[m] = get_real(hcc[m * this->nbase_x + m]);
+            (*eigenvalue_iter)[m] = get_real(hcc[m * this->nbase_x + m]);
 
             vcc[m * this->nbase_x + m] = set_real_tocomplex(1.0);
         }
-
+
 #ifdef __MPI
-        if (GlobalV::NPROC_IN_POOL > 1)
+        if (this->diag_comm.nproc > 1)
         {
-            MPI_Bcast(this->eigenvalue_in_dav, this->n_band, MPI_DOUBLE, 0, POOL_WORLD);
+            MPI_Bcast((*eigenvalue_iter).data(), this->n_band, MPI_DOUBLE, 0, this->diag_comm.comm);
         }
 #endif
-
     }
 
     ModuleBase::timer::tick("Diago_DavSubspace", "diag_zhegvx");
@@ -660,15 +626,15 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
 
 template <typename T, typename Device>
 void Diago_DavSubspace<T, Device>::refresh(const int& dim,
-                                const int& nband,
-                                int& nbase,
-                                const Real* eigenvalue_in_hsolver,
-                                const psi::Psi<T, Device>& psi,
-                                psi::Psi<T, Device>& basis,
-                                T* hp,
-                                T* sp,
-                                T* hc,
-                                T* vc)
+                                           const int& nband,
+                                           int& nbase,
+                                           const Real* eigenvalue_in_hsolver,
+                                           const psi::Psi<T, Device>& psi,
+                                           psi::Psi<T, Device>& basis,
+                                           T* hp,
+                                           T* sp,
+                                           T* hc,
+                                           T* vc)
 {
     ModuleBase::timer::tick("Diago_DavSubspace", "refresh");
 
@@ -759,44 +725,20 @@ void Diago_DavSubspace<T, Device>::refresh(const int& dim,
 }
 
 template <typename T, typename Device>
-void Diago_DavSubspace<T, Device>::diag(hamilt::Hamilt<T, Device>* phm_in,
-                                        psi::Psi<T, Device>& psi,
-                                        Real* eigenvalue_in_hsolver,
-                                        std::vector<bool>& is_occupied)
+int Diago_DavSubspace<T, Device>::diag(hamilt::Hamilt<T, Device>* phm_in,
+                                       psi::Psi<T, Device>& psi,
+                                       Real* eigenvalue_in_hsolver,
+                                       const std::vector<bool>& is_occupied,
+                                       const bool& scf_type)
 {
     /// record the times of trying iterative diagonalization
-    int ntry = 0;
     this->notconv = 0;
 
-#if defined(__CUDA) || defined(__ROCM)
-    if (this->device == base_device::GpuDevice)
-    {
-        resmem_real_op()(this->ctx, this->d_precondition, psi.get_nbasis());
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition, psi.get_nbasis());
-    }
-#endif
-
-    if (DiagoIterAssist<T, Device>::SCF_ITER == 1)
-    {
-        // std::cout << "diagH_subspace" << std::endl;
-        DiagoIterAssist<T, Device>::diagH_subspace(phm_in, psi, psi, eigenvalue_in_hsolver, psi.get_nbands());
-
-        this->is_subspace = true;
-    }
-    else
-    {
-        this->is_subspace = false;
-    }
-
     bool outputscc = false;
     bool outputeigenvalue = false;
 
     if (outputscc)
     {
-        this->nbase_x = 2 * psi.get_nbands();
-        resmem_complex_op()(this->ctx, this->scc, this->nbase_x * this->nbase_x, "DAV::scc");
-        setmem_complex_op()(this->ctx, this->scc, 0, this->nbase_x * this->nbase_x);
-
         std::cout << "before dav 111" << std::endl;
 
         gemm_op<T, Device>()(this->ctx,
                              'C',
@@ -835,14 +777,20 @@ int Diago_DavSubspace<T, Device>::diag(hamilt::Hamilt<T, Device>* phm_in,
         std::cout << std::endl;
     }
 
+    int sum_iter = 0;
+    int ntry = 0;
+
     do
     {
+        if (this->is_subspace || ntry > 0)
+        {
+            DiagoIterAssist<T, Device>::diagH_subspace(phm_in, psi, psi, eigenvalue_in_hsolver, psi.get_nbands());
+        }
 
-        this->diag_once(phm_in, psi, eigenvalue_in_hsolver, is_occupied);
+        sum_iter += this->diag_once(phm_in, psi, eigenvalue_in_hsolver, is_occupied);
 
         ++ntry;
 
-    } while (DiagoIterAssist<T, Device>::test_exit_cond(ntry, this->notconv));
+    } while (this->test_exit_cond(ntry, this->notconv, scf_type));
 
     if (notconv > std::max(5, psi.get_nbands() / 4))
     {
@@ -890,7 +838,26 @@ int Diago_DavSubspace<T, Device>::diag(hamilt::Hamilt<T, Device>* phm_in,
         std::cout << std::endl;
     }
 
-    return;
+    return sum_iter;
+}
+
+template <typename T, typename Device>
+bool Diago_DavSubspace<T, Device>::test_exit_cond(const int& ntry, const int& notconv, const bool& scf)
+{
+    // scf = true; // scf
+    // scf = false; // nscf
+
+    // If ntry <= 5, keep trying; otherwise exit.
+    const bool f1 = (ntry <= 5);
+
+    // In a non-self-consistent calculation, iterate until fully converged.
+    const bool f2 = ((!scf && (notconv > 0)));
+
+    // In a self-consistent calculation, if more than 5 bands are unconverged,
+    // run diagH_subspace and the iterative diagonalization again (ntry++).
+    const bool f3 = ((scf && (notconv > 5)));
+
+    return (f1 && (f2 || f3));
 }
 
 namespace hsolver
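[Review note] With the workspace allocation moved into the constructor, the solver is fully configured at construction time and diag() now reports its iteration count instead of writing to a global average. A minimal usage sketch under stated assumptions — hamilt_obj, psi_obj, eigenvalues, is_occupied, and precondition are placeholders for the caller's objects; the signatures are the ones introduced in this diff:

    // Hedged sketch, not ABACUS code: CPU device, double-precision wavefunctions.
    #ifdef __MPI
        const hsolver::diag_comm_info comm_info = {POOL_WORLD, GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
    #else
        const hsolver::diag_comm_info comm_info = {GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
    #endif

    using T = std::complex<double>;
    using Device = base_device::DEVICE_CPU;

    hsolver::Diago_DavSubspace<T, Device> solver(
        precondition,               // std::vector<double>, one entry per basis function
        psi_obj.get_nbands(),       // nband_in
        psi_obj.get_current_nbas(), // nbasis_in
        GlobalV::PW_DIAG_NDIM,      // david_ndim_in; ctor asserts ndim > 1 and ndim * nband < nbasis * nproc
        hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,  // diag_thr_in
        hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX, // diag_nmax_in
        false,                      // need_subspace_in: run diagH_subspace before each Davidson pass?
        comm_info);

    // diag() returns the accumulated Davidson iteration count (used for avg_iter bookkeeping).
    const int iters = solver.diag(hamilt_obj, psi_obj, eigenvalues, is_occupied, /*scf_type=*/true);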
diff --git a/source/module_hsolver/diago_dav_subspace.h b/source/module_hsolver/diago_dav_subspace.h
index 58bfdafcf3..d2c5a3ab41 100644
--- a/source/module_hsolver/diago_dav_subspace.h
+++ b/source/module_hsolver/diago_dav_subspace.h
@@ -16,47 +16,63 @@ class Diago_DavSubspace : public DiagH<T, Device>
     using Real = typename GetTypeReal<T>::type;
 
   public:
-    Diago_DavSubspace(const Real* precondition_in);
-    ~Diago_DavSubspace();
+    Diago_DavSubspace(const std::vector<Real>& precondition_in,
+                      const int& nband_in,
+                      const int& nbasis_in,
+                      const int& david_ndim_in,
+                      const double& diag_thr_in,
+                      const int& diag_nmax_in,
+                      const bool& need_subspace_in,
+                      const diag_comm_info& diag_comm_in);
+
+    virtual ~Diago_DavSubspace() override;
+
+    int diag(hamilt::Hamilt<T, Device>* phm_in,
+             psi::Psi<T, Device>& phi,
+             Real* eigenvalue_in,
+             const std::vector<bool>& is_occupied,
+             const bool& scf_type);
 
-    // this is the override function diag() for CG method
-    void diag(hamilt::Hamilt<T, Device>* phm_in,
-              psi::Psi<T, Device>& phi,
-              Real* eigenvalue_in,
-              std::vector<bool>& is_occupied);
+  private:
+    /// for MPI communication
+    const diag_comm_info diag_comm;
 
-    static int PW_DIAG_NDIM;
+    /// the threshold for this electronic iteration
+    const double diag_thr;
 
-    static double dav_large_thr;
+    /// maximal iteration number
+    const int iter_nmax;
 
-  private:
-    bool is_subspace = false;
+    /// is diagH_subspace needed?
+    const bool is_subspace;
 
-    int test_david = 0;
+    /// the first dimension of the matrix to be diagonalized
+    const int n_band = 0;
 
-    /// record for how many bands not have convergence eigenvalues
-    int notconv = 0;
+    /// the second dimension of the matrix to be diagonalized
+    const int dim = 0;
+
+    /// the maximum dimension of the reduced basis set
+    const int nbase_x = 0;
 
-    /// row size for input psi matrix
-    int n_band = 0;
-    /// non-zero col size for inputted psi matrix
-    int dim = 0;
-    // maximum dimension of the reduced basis set
-    int nbase_x = 0;
-    /// precondition for cg diag
-    const Real* precondition = nullptr;
+    /// precondition for diag
+    const std::vector<Real>& precondition;
     Real* d_precondition = nullptr;
 
-    /// eigenvalue results
-    Real* eigenvalue_in_dav = nullptr;
+    /// number of bands whose eigenvalues have not converged
+    int notconv = 0;
 
-    T* hphi = nullptr; // the product of H and psi in the reduced basis set
+    /// the product of H and psi in the reduced basis set
+    T* hphi = nullptr;
 
-    T* hcc = nullptr; // Hamiltonian on the reduced basis
+    /// Hamiltonian on the reduced basis
+    T* hcc = nullptr;
 
-    T* scc = nullptr; // Overlap on the reduced basis
+    /// Overlap on the reduced basis
+    T* scc = nullptr;
 
-    T* vcc = nullptr; // Eigenvectors on the reduced basis
+    /// Eigenvectors on the reduced basis
+    T* vcc = nullptr;
 
     /// device type of psi
     Device* ctx = {};
@@ -71,7 +87,7 @@ class Diago_DavSubspace : public DiagH<T, Device>
                   T* hphi,
                   T* vcc,
                   const int* unconv,
-                  Real* eigenvalue);
+                  std::vector<Real>* eigenvalue_iter);
 
     void cal_elem(const int& dim,
                   int& nbase,
@@ -97,15 +113,17 @@ class Diago_DavSubspace : public DiagH<T, Device>
                      T* hcc,
                      T* scc,
                      const int& nbase_x,
-                     Real* eigenvalue,
+                     std::vector<Real>* eigenvalue_iter,
                      T* vcc,
                      bool init,
                      bool is_subspace);
 
-    void diag_once(hamilt::Hamilt<T, Device>* phm_in,
-                   psi::Psi<T, Device>& psi,
-                   Real* eigenvalue_in,
-                   std::vector<bool>& is_occupied);
+    int diag_once(hamilt::Hamilt<T, Device>* phm_in,
+                  psi::Psi<T, Device>& psi,
+                  Real* eigenvalue_in,
+                  const std::vector<bool>& is_occupied);
+
+    bool test_exit_cond(const int& ntry, const int& notconv, const bool& scf);
 
     using resmem_complex_op = base_device::memory::resize_memory_op<T, Device>;
     using delmem_complex_op = base_device::memory::delete_memory_op<T, Device>;
@@ -132,12 +150,6 @@ class Diago_DavSubspace : public DiagH<T, Device>
     const T *one = nullptr, *zero = nullptr, *neg_one = nullptr;
 };
 
-template <typename T, typename Device>
-int Diago_DavSubspace<T, Device>::PW_DIAG_NDIM = 4;
-
-template <typename T, typename Device>
-double Diago_DavSubspace<T, Device>::dav_large_thr = 1e-5;
-
 } // namespace hsolver
 
 #endif
\ No newline at end of file
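[Review note] All GlobalV/POOL_WORLD references inside the solver are gone; the MPI context now arrives through the diag_comm_info member. The shape below is inferred only from the usage sites in this diff (diag_comm.comm, diag_comm.rank, diag_comm.nproc, and the brace-init order at the call sites); the actual definition lives elsewhere in module_hsolver and may differ:

    // Assumed sketch of diag_comm_info, reconstructed from how this diff uses it.
    struct diag_comm_info
    {
    #ifdef __MPI
        MPI_Comm comm; // k-point pool communicator (POOL_WORLD at the call sites)
    #endif
        int rank;      // GlobalV::RANK_IN_POOL at the call sites
        int nproc;     // GlobalV::NPROC_IN_POOL at the call sites
    };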
diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp
index 3567bbe5f4..29d4b762c4 100644
--- a/source/module_hsolver/hsolver_pw.cpp
+++ b/source/module_hsolver/hsolver_pw.cpp
@@ -1,11 +1,9 @@
 #include "hsolver_pw.h"
 
-#include <algorithm>
-
 #include "diago_bpcg.h"
 #include "diago_cg.h"
-#include "diago_david.h"
 #include "diago_dav_subspace.h"
+#include "diago_david.h"
 #include "module_base/global_variable.h"
 #include "module_base/parallel_global.h" // for MPI
 #include "module_base/timer.h"
@@ -16,6 +14,8 @@
 #include "module_hamilt_pw/hamilt_pwdft/wavefunc.h"
 #include "module_hsolver/diagh.h"
 #include "module_hsolver/diago_iter_assist.h"
+
+#include <algorithm>
 #ifdef USE_PAW
 #include "module_cell/module_paw/paw_cell.h"
 #endif
@@ -84,7 +84,7 @@ void HSolverPW<T, Device>::initDiagh(const psi::Psi<T, Device>& psi)
     }
     else if (this->method == "dav")
     {
-#ifdef __MPI 
+#ifdef __MPI
         const diag_comm_info comm_info = {POOL_WORLD, GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
 #else
         const diag_comm_info comm_info = {GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
@@ -96,45 +96,55 @@ void HSolverPW<T, Device>::initDiagh(const psi::Psi<T, Device>& psi)
             {
                 delete (DiagoDavid<T, Device>*)this->pdiagh;
 
-                this->pdiagh = new DiagoDavid<T, Device>(
-                    precondition.data(),
-                    GlobalV::PW_DIAG_NDIM,
-                    GlobalV::use_paw,
-                    comm_info
-                );
+                this->pdiagh = new DiagoDavid<T, Device>(precondition.data(),
+                                                         GlobalV::PW_DIAG_NDIM,
+                                                         GlobalV::use_paw,
+                                                         comm_info);
                 this->pdiagh->method = this->method;
             }
         }
         else
        {
-            this->pdiagh = new DiagoDavid<T, Device>(
-                precondition.data(),
-                GlobalV::PW_DIAG_NDIM,
-                GlobalV::use_paw,
-                comm_info
-            );
+            this->pdiagh
+                = new DiagoDavid<T, Device>(precondition.data(), GlobalV::PW_DIAG_NDIM, GlobalV::use_paw, comm_info);
             this->pdiagh->method = this->method;
         }
     }
     else if (this->method == "dav_subspace")
     {
-        Diago_DavSubspace<T, Device>::PW_DIAG_NDIM = GlobalV::PW_DIAG_NDIM;
-        if (this->pdiagh != nullptr)
-        {
-            if (this->pdiagh->method != this->method)
-            {
-                delete (Diago_DavSubspace<T, Device>*)this->pdiagh;
-                this->pdiagh = new Diago_DavSubspace<T, Device>(precondition.data());
-                this->pdiagh->method = this->method;
-            }
-        }
-        else
-        {
-            this->pdiagh = new Diago_DavSubspace<T, Device>(precondition.data());
-            this->pdiagh->method = this->method;
-        }
+// #ifdef __MPI
+//         const diag_comm_info comm_info = {POOL_WORLD, GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
+// #else
+//         const diag_comm_info comm_info = {GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
+// #endif
+//         if (this->pdiagh != nullptr)
+//         {
+//             if (this->pdiagh->method != this->method)
+//             {
+//                 delete (Diago_DavSubspace<T, Device>*)this->pdiagh;
+
+//                 this->pdiagh = new Diago_DavSubspace<T, Device>(precondition.data(),
+//                                                                 GlobalV::PW_DIAG_NDIM,
+//                                                                 DiagoIterAssist<T, Device>::PW_DIAG_THR,
+//                                                                 DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
+//                                                                 DiagoIterAssist<T, Device>::need_subspace,
+//                                                                 comm_info);
+
+//                 this->pdiagh->method = this->method;
+//             }
+//         }
+//         else
+//         {
+//             this->pdiagh = new Diago_DavSubspace<T, Device>(precondition.data(),
+//                                                             GlobalV::PW_DIAG_NDIM,
+//                                                             DiagoIterAssist<T, Device>::PW_DIAG_THR,
+//                                                             DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
+//                                                             DiagoIterAssist<T, Device>::need_subspace,
+//                                                             comm_info);
+//             this->pdiagh->method = this->method;
+//         }
     }
     else if (this->method == "bpcg")
     {
@@ -202,7 +212,7 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
                         is_occupied[i * psi.get_nbands() + j] = false;
                     }
                 }
-            } 
+            }
         }
     }
 }
@@ -223,7 +233,7 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
                 _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig);
             }
 
-            std::vector<double> kpt(3,0);
+            std::vector<double> kpt(3, 0);
             kpt[0] = this->wfc_basis->kvec_c[ik].x;
             kpt[1] = this->wfc_basis->kvec_c[ik].y;
             kpt[2] = this->wfc_basis->kvec_c[ik].z;
@@ -323,7 +333,7 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
                 _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig);
             }
 
-            std::vector<double> kpt(3,0);
+            std::vector<double> kpt(3, 0);
             kpt[0] = this->wfc_basis->kvec_c[ik].x;
             kpt[1] = this->wfc_basis->kvec_c[ik].y;
             kpt[2] = this->wfc_basis->kvec_c[ik].z;
@@ -442,7 +452,7 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_
                 _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig);
             }
 
-            std::vector<double> kpt(3,0);
+            std::vector<double> kpt(3, 0);
             kpt[0] = this->wfc_basis->kvec_c[ik].x;
             kpt[1] = this->wfc_basis->kvec_c[ik].y;
             kpt[2] = this->wfc_basis->kvec_c[ik].z;
@@ -546,7 +556,7 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_
                 _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig);
             }
 
-            std::vector<double> kpt(3,0);
+            std::vector<double> kpt(3, 0);
             kpt[0] = this->wfc_basis->kvec_c[ik].x;
             kpt[1] = this->wfc_basis->kvec_c[ik].y;
             kpt[2] = this->wfc_basis->kvec_c[ik].z;
@@ -653,11 +663,11 @@ void HSolverPW<T, Device>::endDiagh()
         delete reinterpret_cast<DiagoDavid<T, Device>*>(this->pdiagh);
         this->pdiagh = nullptr;
     }
-    if (this->method == "dav_subspace")
-    {
-        delete reinterpret_cast<Diago_DavSubspace<T, Device>*>(this->pdiagh);
-        this->pdiagh = nullptr;
-    }
+    // if (this->method == "dav_subspace")
+    // {
+    //     delete reinterpret_cast<Diago_DavSubspace<T, Device>*>(this->pdiagh);
+    //     this->pdiagh = nullptr;
+    // }
     if (this->method == "bpcg")
     {
         delete reinterpret_cast<DiagoBPCG<T, Device>*>(this->pdiagh);
@@ -710,8 +720,46 @@ void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm, psi::P
 {
     if (this->method == "dav_subspace")
     {
+#ifdef __MPI
+        const diag_comm_info comm_info = {POOL_WORLD, GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
+#else
+        const diag_comm_info comm_info = {GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
+#endif
+
+        this->pdiagh = new Diago_DavSubspace<T, Device>(this->precondition,
+                                                        psi.get_nbands(),
+                                                        psi.get_k_first() ? psi.get_current_nbas()
+                                                                          : psi.get_nk() * psi.get_nbasis(),
-        ((Diago_DavSubspace<T, Device>*)this->pdiagh)->diag(hm, psi, eigenvalue, is_occupied);
+                                                        GlobalV::PW_DIAG_NDIM,
+                                                        DiagoIterAssist<T, Device>::PW_DIAG_THR,
+                                                        DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
+                                                        DiagoIterAssist<T, Device>::need_subspace,
+                                                        comm_info);
+
+        this->pdiagh->method = this->method;
+
+        bool scf;
+        if (GlobalV::CALCULATION == "nscf")
+        {
+            scf = false;
+        }
+        else
+        {
+            scf = true;
+        }
+
+        DiagoIterAssist<T, Device>::avg_iter
+            += static_cast<double>((reinterpret_cast<Diago_DavSubspace<T, Device>*>(this->pdiagh))
+                                       ->diag(hm,
+                                              psi,
+                                              eigenvalue,
+                                              is_occupied,
+                                              scf));
+
+        delete reinterpret_cast<Diago_DavSubspace<T, Device>*>(this->pdiagh);
+        this->pdiagh = nullptr;
     }
     else
     {
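[Review note] The dav_subspace solver is now created and torn down inside every hamiltSolvePsiK() call, so hsolver_pw carries no solver state between k-points; the commented-out initDiagh/endDiagh branches above can presumably be removed once this settles. One small simplification for the scf flag — equivalent to the if/else in the hunk above, offered only as a suggestion:

    const bool scf = (GlobalV::CALCULATION != "nscf");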
diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
index 7c927c932a..1dc1c81cdc 100644
--- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu
+++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
@@ -10,9 +10,12 @@
 #include <thrust/complex.h>
 #include <thrust/execution_policy.h>
 
-#define WARP_SIZE 32
-#define FULL_MASK 0xffffffff
-#define THREAD_PER_BLOCK 256
+namespace hsolver
+{
+const int warp_size = 32;
+const unsigned int full_mask = 0xffffffff;
+const int thread_per_block = 256;
+}
 
 template <>
 struct GetTypeReal<thrust::complex<float>> {
@@ -66,7 +69,7 @@ void destoryBLAShandle(){
 template <typename FPTYPE>
 __forceinline__ __device__ void warp_reduce(FPTYPE& val) {
     for (int offset = 16; offset > 0; offset >>= 1)
-        val += __shfl_down_sync(FULL_MASK, val, offset);
+        val += __shfl_down_sync(full_mask, val, offset);
 }
 
 template <typename Real>
@@ -83,56 +86,91 @@ __global__ void line_minimize_with_block(
     int item = 0;
     Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0;
     Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0;
-    __shared__ Real data[THREAD_PER_BLOCK * 3];
+    __shared__ Real data[thread_per_block * 3];
 
     data[tid] = 0;
-    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
         item = band_idx * n_basis_max + basis_idx;
         data[tid] += (grad[item] * thrust::conj(grad[item])).real();
     }
     __syncthreads();
 
     // just do some parallel reduction in shared memory
-    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+    for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) {
         if (tid < ii) {
             data[tid] += data[tid + ii];
         }
         __syncthreads();
     }
+    // Threads within a warp execute in lockstep, so the tail of the reduction is
+    // unrolled and synchronized with __syncwarp() instead of the costlier __syncthreads().
+    if (tid < warp_size) {
+        data[tid] += data[tid + 32]; __syncwarp();
+        data[tid] += data[tid + 16]; __syncwarp();
+        data[tid] += data[tid + 8]; __syncwarp();
+        data[tid] += data[tid + 4]; __syncwarp();
+        data[tid] += data[tid + 2]; __syncwarp();
+        data[tid] += data[tid + 1]; __syncwarp();
+    }
+
+    __syncthreads();
+
     Real norm = 1.0 / sqrt(data[0]);
     __syncthreads();
 
     data[tid] = 0;
-    data[THREAD_PER_BLOCK + tid] = 0;
-    data[2 * THREAD_PER_BLOCK + tid] = 0;
-    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+    data[thread_per_block + tid] = 0;
+    data[2 * thread_per_block + tid] = 0;
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
         item = band_idx * n_basis_max + basis_idx;
         grad[item] *= norm;
         hgrad[item] *= norm;
         data[tid] += (hpsi[item] * thrust::conj(psi[item])).real();
-        data[THREAD_PER_BLOCK + tid] += (grad[item] * thrust::conj(hpsi[item])).real();
-        data[2 * THREAD_PER_BLOCK + tid] += (grad[item] * thrust::conj(hgrad[item])).real();
+        data[thread_per_block + tid] += (grad[item] * thrust::conj(hpsi[item])).real();
+        data[2 * thread_per_block + tid] += (grad[item] * thrust::conj(hgrad[item])).real();
     }
     __syncthreads();
 
     // just do some parallel reduction in shared memory
-    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+    for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) {
         if (tid < ii) {
             data[tid] += data[tid + ii];
-            data[THREAD_PER_BLOCK + tid] += data[THREAD_PER_BLOCK + tid + ii];
-            data[2 * THREAD_PER_BLOCK + tid] += data[2 * THREAD_PER_BLOCK + tid + ii];
+            data[thread_per_block + tid] += data[thread_per_block + tid + ii];
+            data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + ii];
         }
         __syncthreads();
     }
+    if (tid < warp_size) {
+        data[tid] += data[tid + 32]; __syncwarp();
+        data[tid] += data[tid + 16]; __syncwarp();
+        data[tid] += data[tid + 8]; __syncwarp();
+        data[tid] += data[tid + 4]; __syncwarp();
+        data[tid] += data[tid + 2]; __syncwarp();
+        data[tid] += data[tid + 1]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 32]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 16]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 8]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 4]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 2]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 1]; __syncwarp();
+        data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 32]; __syncwarp();
+        data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 16]; __syncwarp();
+        data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 8]; __syncwarp();
+        data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 4]; __syncwarp();
+        data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 2]; __syncwarp();
+        data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 1]; __syncwarp();
+    }
+
+    __syncthreads();
+
     epsilo_0 = data[0];
-    epsilo_1 = data[THREAD_PER_BLOCK];
-    epsilo_2 = data[2 * THREAD_PER_BLOCK];
+    epsilo_1 = data[thread_per_block];
+    epsilo_2 = data[2 * thread_per_block];
     theta = 0.5 * abs(atan(2 * epsilo_1/(epsilo_0 - epsilo_2)));
     cos_theta = cos(theta);
     sin_theta = sin(theta);
-    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
         item = band_idx * n_basis_max + basis_idx;
         psi [item] = psi [item] * cos_theta + grad [item] * sin_theta;
         hpsi[item] = hpsi[item] * cos_theta + hgrad[item] * sin_theta;
@@ -159,29 +197,40 @@ __global__ void calc_grad_with_block(
     Real epsilo = 0.0;
     Real grad_2 = 0.0;
     thrust::complex<Real> grad_1 = {0, 0};
-    __shared__ Real data[THREAD_PER_BLOCK * 2];
+    __shared__ Real data[thread_per_block * 2];
 
     // Init shared memory
     data[tid] = 0;
-    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
         item = band_idx * n_basis_max + basis_idx;
         data[tid] += (psi[item] * thrust::conj(psi[item])).real();
     }
     __syncthreads();
 
     // just do some parallel reduction in shared memory
-    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+    for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) {
         if (tid < ii) {
             data[tid] += data[tid + ii];
        }
         __syncthreads();
    }
+    if (tid < warp_size) {
+        data[tid] += data[tid + 32]; __syncwarp();
+        data[tid] += data[tid + 16]; __syncwarp();
+        data[tid] += data[tid + 8]; __syncwarp();
+        data[tid] += data[tid + 4]; __syncwarp();
+        data[tid] += data[tid + 2]; __syncwarp();
+        data[tid] += data[tid + 1]; __syncwarp();
+    }
+
+    __syncthreads();
+
     Real norm = 1.0 / sqrt(data[0]);
     __syncthreads();
 
     data[tid] = 0;
-    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
         item = band_idx * n_basis_max + basis_idx;
         psi[item] *= norm;
         hpsi[item] *= norm;
@@ -190,37 +239,65 @@ __global__ void calc_grad_with_block(
     __syncthreads();
 
     // just do some parallel reduction in shared memory
-    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+    for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) {
         if (tid < ii) {
             data[tid] += data[tid + ii];
         }
         __syncthreads();
     }
+
+    if (tid < warp_size) {
+        data[tid] += data[tid + 32]; __syncwarp();
+        data[tid] += data[tid + 16]; __syncwarp();
+        data[tid] += data[tid + 8]; __syncwarp();
+        data[tid] += data[tid + 4]; __syncwarp();
+        data[tid] += data[tid + 2]; __syncwarp();
+        data[tid] += data[tid + 1]; __syncwarp();
+    }
+
+    __syncthreads();
+
     epsilo = data[0];
     __syncthreads();
 
     data[tid] = 0;
-    data[THREAD_PER_BLOCK + tid] = 0;
-    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+    data[thread_per_block + tid] = 0;
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
         item = band_idx * n_basis_max + basis_idx;
         grad_1 = hpsi[item] - epsilo * psi[item];
         grad_2 = thrust::norm(grad_1);
         data[tid] += grad_2;
-        data[THREAD_PER_BLOCK + tid] += grad_2 / prec[basis_idx];
+        data[thread_per_block + tid] += grad_2 / prec[basis_idx];
     }
     __syncthreads();
 
     // just do some parallel reduction in shared memory
-    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+    for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) {
         if (tid < ii) {
             data[tid] += data[tid + ii];
-            data[THREAD_PER_BLOCK + tid] += data[THREAD_PER_BLOCK + tid + ii];
+            data[thread_per_block + tid] += data[thread_per_block + tid + ii];
         }
         __syncthreads();
     }
+
+    if (tid < warp_size) {
+        data[tid] += data[tid + 32]; __syncwarp();
+        data[tid] += data[tid + 16]; __syncwarp();
+        data[tid] += data[tid + 8]; __syncwarp();
+        data[tid] += data[tid + 4]; __syncwarp();
+        data[tid] += data[tid + 2]; __syncwarp();
+        data[tid] += data[tid + 1]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 32]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 16]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 8]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 4]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 2]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 1]; __syncwarp();
+    }
+
+    __syncthreads();
+
     err_st = data[0];
-    beta_st = data[THREAD_PER_BLOCK];
-    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+    beta_st = data[thread_per_block];
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
         item = band_idx * n_basis_max + basis_idx;
         grad_1 = hpsi[item] - epsilo * psi[item];
         grad[item] = -grad_1 / prec[basis_idx] + beta_st / beta[band_idx] * grad_old[item];
@@ -342,7 +419,7 @@ void line_minimize_with_block_op<T, base_device::DEVICE_GPU>::operator()(T* grad
     auto C = reinterpret_cast<thrust::complex<Real>*>(psi_out);
     auto D = reinterpret_cast<thrust::complex<Real>*>(hpsi_out);
 
-    line_minimize_with_block<Real><<<n_band, THREAD_PER_BLOCK>>>(
+    line_minimize_with_block<Real><<<n_band, thread_per_block>>>(
         A, B, C, D,
         n_basis, n_basis_max);
@@ -367,7 +444,7 @@ void calc_grad_with_block_op<T, base_device::DEVICE_GPU>::operator()(const Real*
     auto C = reinterpret_cast<thrust::complex<Real>*>(grad_out);
     auto D = reinterpret_cast<thrust::complex<Real>*>(grad_old_out);
 
-    calc_grad_with_block<Real><<<n_band, THREAD_PER_BLOCK>>>(
+    calc_grad_with_block<Real><<<n_band, thread_per_block>>>(
         prec_in, err_out, beta_out,
         A, B, C, D,
         n_basis, n_basis_max);
@@ -440,7 +517,8 @@ void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const b
                                                                          const double* vector,
                                                                          const double constant)
 {
-    int thread = 1024;
+    // In small cases, 1024 threads per block would utilize only 17 blocks, far fewer than 40
+    int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
     vector_div_constant_kernel<double><<<block, thread>>>(dim, result, vector, constant);
@@ -459,7 +537,7 @@ inline void vector_div_constant_complex_wrapper(const base_device::DEVICE_GPU* d,
     thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
     const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
 
-    int thread = 1024;
+    int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
     vector_div_constant_kernel<thrust::complex<FPTYPE>><<<block, thread>>>(dim, result_tmp, vector_tmp, constant);
@@ -493,7 +571,7 @@ void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const bas
                                                                        const double* vector1,
                                                                        const double* vector2)
 {
-    int thread = 1024;
+    int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
     vector_mul_vector_kernel<double><<<block, thread>>>(dim, result, vector1, vector2);
@@ -510,7 +588,7 @@ inline void vector_mul_vector_complex_wrapper(const base_device::DEVICE_GPU* d,
 {
     thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
     const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
-    int thread = 1024;
+    int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
     vector_mul_vector_kernel<thrust::complex<FPTYPE>><<<block, thread>>>(dim, result_tmp, vector1_tmp, vector2);
@@ -545,7 +623,7 @@ void vector_div_vector_op<double, base_device::DEVICE_GPU>::operator()(const bas
                                                                        const double* vector1,
                                                                        const double* vector2)
 {
-    int thread = 1024;
+    int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
     vector_div_vector_kernel<double><<<block, thread>>>(dim, result, vector1, vector2);
@@ -562,7 +640,7 @@ inline void vector_div_vector_complex_wrapper(const base_device::DEVICE_GPU* d,
 {
     thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
     const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
-    int thread = 1024;
+    int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
     vector_div_vector_kernel<thrust::complex<FPTYPE>><<<block, thread>>>(dim, result_tmp, vector1_tmp, vector2);
@@ -605,7 +683,7 @@ void constantvector_addORsub_constantVector_op<T, base_device::DEVICE_GPU>::oper
     auto vector1_tmp = reinterpret_cast<const Type*>(vector1);
     auto vector2_tmp = reinterpret_cast<const Type*>(vector2);
 
-    int thread = 1024;
+    int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
     constantvector_addORsub_constantVector_kernel<Type>
         <<<block, thread>>>(dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2);
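[Review note] The CUDA changes replace the THREAD_PER_BLOCK/FULL_MASK macros with namespaced constants and unroll the last warp of each shared-memory tree reduction. A self-contained sketch of the same reduction pattern, for reviewers who want to test it in isolation — the kernel name, the example namespace, and the launch configuration are mine, not part of this diff, and atomicAdd on double requires compute capability >= 6.0:

    #include <cuda_runtime.h>

    namespace example
    {
    const int warp_size = 32;
    const int thread_per_block = 256;
    }

    // Block-level sum with the warp-unrolled tail used in this diff: the tree
    // reduction stops once one warp of partial sums remains; the final steps run
    // in lockstep within a warp and synchronize with the cheaper __syncwarp().
    __global__ void block_reduce_sum(const double* in, double* out, const int n)
    {
        __shared__ double data[example::thread_per_block];
        const int tid = threadIdx.x;

        // Grid-stride accumulation: each thread folds its slice into one shared slot.
        double acc = 0.0;
        for (int i = blockIdx.x * example::thread_per_block + tid; i < n;
             i += gridDim.x * example::thread_per_block)
        {
            acc += in[i];
        }
        data[tid] = acc;
        __syncthreads();

        // Tree reduction down to warp_size partial sums.
        for (int ii = example::thread_per_block >> 1; ii > example::warp_size; ii >>= 1)
        {
            if (tid < ii)
            {
                data[tid] += data[tid + ii];
            }
            __syncthreads();
        }

        // Unrolled tail, mirroring the pattern introduced in math_kernel_op.cu.
        if (tid < example::warp_size)
        {
            data[tid] += data[tid + 32]; __syncwarp();
            data[tid] += data[tid + 16]; __syncwarp();
            data[tid] += data[tid + 8];  __syncwarp();
            data[tid] += data[tid + 4];  __syncwarp();
            data[tid] += data[tid + 2];  __syncwarp();
            data[tid] += data[tid + 1];  __syncwarp();
        }

        if (tid == 0)
        {
            atomicAdd(out, data[0]);
        }
    }

    // Launch sketch, using the same ceiling division as the element-wise ops in this diff:
    // block_reduce_sum<<<(n + example::thread_per_block - 1) / example::thread_per_block,
    //                    example::thread_per_block>>>(d_in, d_out, n);

The drop from 1024 to 256 threads per block in the element-wise kernels is an occupancy choice for small problems: more, smaller blocks spread the same work across more multiprocessors, which is what the "17 blocks, far fewer than 40" comment in the diff refers to.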