diff --git a/examples/scf/pw_Si2/INPUT b/examples/scf/pw_Si2/INPUT
index 9dd1930909..141c104100 100644
--- a/examples/scf/pw_Si2/INPUT
+++ b/examples/scf/pw_Si2/INPUT
@@ -8,5 +8,5 @@ ecutwfc 60
 scf_thr 1e-7
 scf_nmax 100
 device cpu
-ks_solver cg
+ks_solver dav_subspace
 precision double
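[Review note] The example input now exercises the Davidson subspace solver instead of CG; the switch is driven entirely by `ks_solver`, and the solver takes its working parameters from the existing plane-wave diagonalization keywords. A minimal sketch of the relevant INPUT keys — the values shown are illustrative defaults, not part of this diff:

    ks_solver     dav_subspace   # iterative Davidson with subspace diagonalization
    pw_diag_ndim  4              # reduced-basis factor: nbase_x = pw_diag_ndim * nband
    pw_diag_thr   1e-2           # initial diagonalization threshold
    pw_diag_nmax  50             # maximum Davidson iterations per attempt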
diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
index 2e14855e19..8c1f8818e6 100644
--- a/source/module_hsolver/diago_dav_subspace.cpp
+++ b/source/module_hsolver/diago_dav_subspace.cpp
@@ -1,32 +1,68 @@
 #include "diago_dav_subspace.h"
 #include "diago_iter_assist.h"
-
 #include "module_base/memory.h"
-#include "module_base/timer.h"
-#include "module_base/parallel_global.h"
 #include "module_base/module_device/device.h"
-
+#include "module_base/timer.h"
 #include "module_hsolver/kernels/dngvd_op.h"
 #include "module_hsolver/kernels/math_kernel_op.h"
+#include <vector>
+
 using namespace hsolver;
 
 template <typename T, typename Device>
-Diago_DavSubspace<T, Device>::Diago_DavSubspace(const Real* precondition_in)
+Diago_DavSubspace<T, Device>::Diago_DavSubspace(const std::vector<Real>& precondition_in,
+                                                const int& nband_in,
+                                                const int& nbasis_in,
+                                                const int& david_ndim_in,
+                                                const double& diag_thr_in,
+                                                const int& diag_nmax_in,
+                                                const bool& need_subspace_in,
+                                                const diag_comm_info& diag_comm_in)
+    : precondition(precondition_in), n_band(nband_in), dim(nbasis_in), nbase_x(nband_in * david_ndim_in),
+      diag_thr(diag_thr_in), iter_nmax(diag_nmax_in), is_subspace(need_subspace_in), diag_comm(diag_comm_in)
 {
     this->device = base_device::get_device_type<Device>(this->ctx);
-    this->precondition = precondition_in;
-
-    test_david = 2;
-    // 1: check which function is called and which step is executed
-    // 2: check the eigenvalues of the result of each iteration
-    // 3: check the eigenvalues and errors of the last result
-    // default: no check
 
     this->one = &this->cs.one;
     this->zero = &this->cs.zero;
     this->neg_one = &this->cs.neg_one;
+
+    assert(david_ndim_in > 1);
+    assert(david_ndim_in * nband_in < nbasis_in * this->diag_comm.nproc);
+
+    // TODO: add memory usage statistics
+
+    //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+    // the product of H and psi in the reduced basis set
+    resmem_complex_op()(this->ctx, this->hphi, this->nbase_x * this->dim, "DAV::hphi");
+    setmem_complex_op()(this->ctx, this->hphi, 0, this->nbase_x * this->dim);
+
+    // Hamiltonian on the reduced basis set
+    resmem_complex_op()(this->ctx, this->hcc, this->nbase_x * this->nbase_x, "DAV::hcc");
+    setmem_complex_op()(this->ctx, this->hcc, 0, this->nbase_x * this->nbase_x);
+
+    // Overlap on the reduced basis set
+    resmem_complex_op()(this->ctx, this->scc, this->nbase_x * this->nbase_x, "DAV::scc");
+    setmem_complex_op()(this->ctx, this->scc, 0, this->nbase_x * this->nbase_x);
+
+    // Eigenvectors
+    resmem_complex_op()(this->ctx, this->vcc, this->nbase_x * this->nbase_x, "DAV::vcc");
+    setmem_complex_op()(this->ctx, this->vcc, 0, this->nbase_x * this->nbase_x);
+    //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == base_device::GpuDevice)
+    {
+        resmem_real_op()(this->ctx, this->d_precondition, nbasis_in);
+        syncmem_var_h2d_op()(this->ctx,
+                             this->cpu_ctx,
+                             this->d_precondition,
+                             this->precondition.data(),
+                             nbasis_in);
+    }
+#endif
 }
 
 template <typename T, typename Device>
@@ -37,63 +73,28 @@ Diago_DavSubspace<T, Device>::~Diago_DavSubspace()
     delmem_complex_op()(this->ctx, this->scc);
     delmem_complex_op()(this->ctx, this->vcc);
 
-    delmem_real_h_op()(this->cpu_ctx, this->eigenvalue_in_dav);
-
+#if defined(__CUDA) || defined(__ROCM)
     if (this->device == base_device::GpuDevice)
     {
        delmem_real_op()(this->ctx, this->d_precondition);
    }
+#endif
 }
 
 template <typename T, typename Device>
-void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
-                                             psi::Psi<T, Device>& psi,
-                                             Real* eigenvalue_in_hsolver,
-                                             std::vector<bool>& is_occupied)
+int Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
+                                            psi::Psi<T, Device>& psi,
+                                            Real* eigenvalue_in_hsolver,
+                                            const std::vector<bool>& is_occupied)
 {
-    if (test_david == 1)
-    {
-        ModuleBase::TITLE("Diago_DavSubspace", "diag_once");
-    }
     ModuleBase::timer::tick("Diago_DavSubspace", "diag_once");
 
-    assert(Diago_DavSubspace::PW_DIAG_NDIM > 1);
-    assert(Diago_DavSubspace::PW_DIAG_NDIM * psi.get_nbands() < psi.get_current_nbas() * GlobalV::NPROC_IN_POOL);
-
-    // qianrui change it 2021-7-25.
-    // In strictly speaking, it shoule be PW_DIAG_NDIM*nband < npw sum of all pools. We roughly estimate it here.
-    // However, in most cases, total number of plane waves should be much larger than nband * PW_DIAG_NDIM
-
-    this->dim = psi.get_k_first() ? psi.get_current_nbas() : psi.get_nk() * psi.get_nbasis();
-    this->n_band = psi.get_nbands();
-
-    // maximum dimension of the reduced basis set
-    this->nbase_x = Diago_DavSubspace::PW_DIAG_NDIM * this->n_band;
-
+    // TODO: allocate memory in the constructor
     psi::Psi<T, Device> basis(1, this->nbase_x, this->dim, &(psi.get_ngk(0)));
     ModuleBase::Memory::record("DAV::basis", this->nbase_x * this->dim * sizeof(T));
 
     // the eigenvalues in dav iter
-    resmem_real_h_op()(this->cpu_ctx, this->eigenvalue_in_dav, this->nbase_x, "DAV::eig");
-    setmem_real_h_op()(this->cpu_ctx, this->eigenvalue_in_dav, 0, this->nbase_x);
-
-    //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-    // the product of H and psi in the reduced basis set
-    resmem_complex_op()(this->ctx, this->hphi, this->nbase_x * this->dim, "DAV::hphi");
-    setmem_complex_op()(this->ctx, this->hphi, 0, this->nbase_x * this->dim);
-
-    // Hamiltonian on the reduced basis set
-    resmem_complex_op()(this->ctx, this->hcc, this->nbase_x * this->nbase_x, "DAV::hcc");
-    setmem_complex_op()(this->ctx, this->hcc, 0, this->nbase_x * this->nbase_x);
-
-    // Overlap on the reduced basis set
-    resmem_complex_op()(this->ctx, this->scc, this->nbase_x * this->nbase_x, "DAV::scc");
-    setmem_complex_op()(this->ctx, this->scc, 0, this->nbase_x * this->nbase_x);
-
-    // Eigenvectors
-    resmem_complex_op()(this->ctx, this->vcc, this->nbase_x * this->nbase_x, "DAV::vcc");
-    setmem_complex_op()(this->ctx, this->vcc, 0, this->nbase_x * this->nbase_x);
-    //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+    std::vector<Real> eigenvalue_iter(this->nbase_x, 0.0);
 
     // convflag[m] = true if the m th band is convergent
     std::vector<bool> convflag(this->n_band, false);
@@ -124,32 +125,27 @@ void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
     hpsi_info dav_hpsi_in(&basis, psi::Range(1, 0, 0, this->n_band - 1), this->hphi);
     phm_in->ops->hPsi(dav_hpsi_in);
 
-    this->cal_elem(this->dim,
-                   nbase,
-                   this->notconv,
-                   basis,
-                   this->hphi,
-                   this->hcc,
-                   this->scc);
+    this->cal_elem(this->dim, nbase, this->notconv, basis, this->hphi, this->hcc, this->scc);
 
     this->diag_zhegvx(nbase,
                       this->n_band,
                       this->hcc,
                       this->scc,
                       this->nbase_x,
-                      this->eigenvalue_in_dav,
+                      &eigenvalue_iter,
                       this->vcc,
                       true,
                       this->is_subspace);
 
     for (size_t m = 0; m < this->n_band; m++)
     {
-        eigenvalue_in_hsolver[m] = this->eigenvalue_in_dav[m];
+        eigenvalue_in_hsolver[m] = eigenvalue_iter[m];
     }
 
     ModuleBase::timer::tick("Diago_DavSubspace", "first");
 
     int dav_iter = 0;
+
     do
     {
         dav_iter++;
@@ -162,22 +158,16 @@ void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
                        this->hphi,
                        this->vcc,
                        unconv.data(),
-                       this->eigenvalue_in_dav);
+                       &eigenvalue_iter);
 
-        this->cal_elem(this->dim,
-                       nbase,
-                       this->notconv,
-                       basis,
-                       this->hphi,
-                       this->hcc,
-                       this->scc);
+        this->cal_elem(this->dim, nbase, this->notconv, basis, this->hphi, this->hcc, this->scc);
 
         this->diag_zhegvx(nbase,
                           this->n_band,
                           this->hcc,
                           this->scc,
                           this->nbase_x,
-                          this->eigenvalue_in_dav,
+                          &eigenvalue_iter,
                           this->vcc,
                           false,
                           false);
@@ -190,13 +180,12 @@ void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
         {
             if (is_occupied[m])
             {
-                convflag[m] = (std::abs(this->eigenvalue_in_dav[m] - eigenvalue_in_hsolver[m])
-                               < DiagoIterAssist<T, Device>::PW_DIAG_THR);
+                convflag[m] = (std::abs(eigenvalue_iter[m] - eigenvalue_in_hsolver[m]) < this->diag_thr);
             }
            else
             {
-                const double empty_ethr = std::max(DiagoIterAssist<T, Device>::PW_DIAG_THR * 5.0, Diago_DavSubspace::dav_large_thr);
-                convflag[m] = (std::abs(this->eigenvalue_in_dav[m] - eigenvalue_in_hsolver[m]) < empty_ethr);
+                const double empty_ethr = std::max(this->diag_thr * 5.0, 1e-5);
+                convflag[m] = (std::abs(eigenvalue_iter[m] - eigenvalue_in_hsolver[m]) < empty_ethr);
             }
 
             if (!convflag[m])
@@ -205,14 +194,12 @@ void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
                 this->notconv++;
             }
 
-            eigenvalue_in_hsolver[m] = this->eigenvalue_in_dav[m];
+            eigenvalue_in_hsolver[m] = eigenvalue_iter[m];
         }
 
         ModuleBase::timer::tick("Diago_DavSubspace", "check_update");
 
-        if ((this->notconv == 0) ||
-            (nbase + this->notconv + 1 > this->nbase_x) ||
-            (dav_iter == DiagoIterAssist<T, Device>::PW_DIAG_NMAX))
+        if ((this->notconv == 0) || (nbase + this->notconv + 1 > this->nbase_x) || (dav_iter == this->iter_nmax))
         {
             ModuleBase::timer::tick("Diago_DavSubspace", "last");
 
@@ -235,7 +222,7 @@ void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
                       psi.get_pointer(), // C dim * n_band
                       psi.get_nbasis());
 
-            if (!this->notconv || (dav_iter == DiagoIterAssist<T, Device>::PW_DIAG_NMAX))
+            if (!this->notconv || (dav_iter == this->iter_nmax))
            {
                 // overall convergence or last iteration: exit the iteration
 
@@ -264,25 +251,21 @@ void Diago_DavSubspace<T, Device>::diag_once(hamilt::Hamilt<T, Device>* phm_in,
     } while (1);
 
-    // std::cout << "dav_iter == " << dav_iter << std::endl;
-
-    DiagoIterAssist<T, Device>::avg_iter += static_cast<double>(dav_iter);
-
     ModuleBase::timer::tick("Diago_DavSubspace", "diag_once");
 
-    return;
+    return dav_iter;
 }
 
 template <typename T, typename Device>
 void Diago_DavSubspace<T, Device>::cal_grad(hamilt::Hamilt<T, Device>* phm_in,
-                                 const int& dim,
-                                 const int& nbase,
-                                 const int& notconv,
-                                 psi::Psi<T, Device>& basis,
-                                 T* hphi,
-                                 T* vcc,
-                                 const int* unconv,
-                                 Real* eigenvalue_in_dav)
+                                            const int& dim,
+                                            const int& nbase,
+                                            const int& notconv,
+                                            psi::Psi<T, Device>& basis,
+                                            T* hphi,
+                                            T* vcc,
+                                            const int* unconv,
+                                            std::vector<Real>* eigenvalue_iter)
 {
     ModuleBase::timer::tick("Diago_DavSubspace", "cal_grad");
 
@@ -290,14 +273,8 @@ void Diago_DavSubspace<T, Device>::cal_grad(hamilt::Hamilt<T, Device>* phm_in,
     {
         if (unconv[i] != i)
         {
-            syncmem_complex_op()(
-                this->ctx,
-                this->ctx,
-                vcc + i * this->nbase_x,
-                vcc + unconv[i] * this->nbase_x,
-                nbase
-            );
-            this->eigenvalue_in_dav[i] = this->eigenvalue_in_dav[unconv[i]];
+            syncmem_complex_op()(this->ctx, this->ctx, vcc + i * this->nbase_x, vcc + unconv[i] * this->nbase_x, nbase);
+            (*eigenvalue_iter)[i] = (*eigenvalue_iter)[unconv[i]];
         }
     }
 
@@ -320,7 +297,7 @@ void Diago_DavSubspace<T, Device>::cal_grad(hamilt::Hamilt<T, Device>* phm_in,
 
     for (int m = 0; m < notconv; m++)
     {
-        std::vector<Real> e_temp_cpu(this->dim, (-this->eigenvalue_in_dav[m]));
+        std::vector<Real> e_temp_cpu(this->dim, ((-1) * (*eigenvalue_iter)[m]));
 
         vector_mul_vector_op<T, Device>()(this->ctx,
                                           this->dim,
@@ -351,16 +328,10 @@ void Diago_DavSubspace<T, Device>::cal_grad(hamilt::Hamilt<T, Device>* phm_in,
         {
             for (size_t i = 0; i < this->dim; i++)
             {
-                double x = this->precondition[i] - this->eigenvalue_in_dav[m];
+                double x = this->precondition[i] - (*eigenvalue_iter)[m];
                 pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
             }
-            vector_div_vector_op<T, Device>()(
-                this->ctx,
-                this->dim,
-                &basis(nbase + m, 0),
-                &basis(nbase + m, 0),
-                pre.data()
-            );
+            vector_div_vector_op<T, Device>()(this->ctx, this->dim, &basis(nbase + m, 0), &basis(nbase + m, 0), pre.data());
         }
 
         // "normalize!!!" in order to improve numerical stability of subspace diagonalization
@@ -388,47 +359,47 @@ void Diago_DavSubspace<T, Device>::cal_grad(hamilt::Hamilt<T, Device>* phm_in,
 
 template <typename T, typename Device>
 void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
-                                 int& nbase,
-                                 const int& notconv,
-                                 const psi::Psi<T, Device>& basis,
-                                 const T* hphi,
-                                 T* hcc,
-                                 T* scc)
+                                            int& nbase,
+                                            const int& notconv,
+                                            const psi::Psi<T, Device>& basis,
+                                            const T* hphi,
+                                            T* hcc,
+                                            T* scc)
 {
     ModuleBase::timer::tick("Diago_DavSubspace", "cal_elem");
 
     gemm_op<T, Device>()(this->ctx,
-              'C',
-              'N',
-              nbase + notconv,
-              notconv,
-              this->dim,
-              this->one,
-              &basis(0, 0),
-              this->dim,
-              &hphi[nbase * this->dim],
-              this->dim,
-              this->zero,
-              &hcc[nbase * this->nbase_x],
-              this->nbase_x);
+                         'C',
+                         'N',
+                         nbase + notconv,
+                         notconv,
+                         this->dim,
+                         this->one,
+                         &basis(0, 0),
+                         this->dim,
+                         &hphi[nbase * this->dim],
+                         this->dim,
+                         this->zero,
+                         &hcc[nbase * this->nbase_x],
+                         this->nbase_x);
 
     gemm_op<T, Device>()(this->ctx,
-              'C',
-              'N',
-              nbase + notconv,
-              notconv,
-              this->dim,
-              this->one,
-              &basis(0, 0),
-              this->dim,
-              &basis(nbase, 0),
-              this->dim,
-              this->zero,
-              &scc[nbase * this->nbase_x],
-              this->nbase_x);
+                         'C',
+                         'N',
+                         nbase + notconv,
+                         notconv,
+                         this->dim,
+                         this->one,
+                         &basis(0, 0),
+                         this->dim,
+                         &basis(nbase, 0),
+                         this->dim,
+                         this->zero,
+                         &scc[nbase * this->nbase_x],
+                         this->nbase_x);
 
 #ifdef __MPI
-    if (GlobalV::NPROC_IN_POOL > 1)
+    if (this->diag_comm.nproc > 1)
     {
         auto* swap = new T[notconv * this->nbase_x];
         syncmem_complex_op()(this->ctx, this->ctx, swap, hcc + nbase * this->nbase_x, notconv * this->nbase_x);
@@ -448,7 +419,7 @@ void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
                        MPI_COMPLEX,
                        MPI_SUM,
                        0,
-                       POOL_WORLD);
+                       this->diag_comm.comm);
         }
         else
         {
@@ -458,7 +429,7 @@ void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
                        MPI_DOUBLE_COMPLEX,
                        MPI_SUM,
                        0,
-                       POOL_WORLD);
+                       this->diag_comm.comm);
         }
 
         syncmem_complex_op()(this->ctx, this->ctx, swap, scc + nbase * this->nbase_x, notconv * this->nbase_x);
@@ -471,7 +442,7 @@ void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
                        MPI_COMPLEX,
                        MPI_SUM,
                        0,
-                       POOL_WORLD);
+                       this->diag_comm.comm);
         }
         else
         {
@@ -481,7 +452,7 @@ void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
                        MPI_DOUBLE_COMPLEX,
                        MPI_SUM,
                        0,
-                       POOL_WORLD);
+                       this->diag_comm.comm);
         }
     }
     delete[] swap;
@@ -522,20 +493,21 @@ void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
 
 template <typename T, typename Device>
 void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
-                                    const int& nband,
-                                    T* hcc,
-                                    T* scc,
-                                    const int& nbase_x,
-                                    Real* eigenvalue_in_dav,
-                                    T* vcc,
-                                    bool init,
-                                    bool is_subspace)
+                                               const int& nband,
+                                               T* hcc,
+                                               T* scc,
+                                               const int& nbase_x,
+                                               std::vector<Real>* eigenvalue_iter,
+                                               // Real* eigenvalue_iter,
+                                               T* vcc,
+                                               bool init,
+                                               bool is_subspace)
 {
     ModuleBase::timer::tick("Diago_DavSubspace", "diag_zhegvx");
 
     if (is_subspace == false)
     {
-        if (GlobalV::RANK_IN_POOL == 0)
+        if (this->diag_comm.rank == 0)
         {
             assert(nbase_x >= std::max(1, nbase));
 
@@ -556,25 +528,21 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
 #if defined(__CUDA) || defined(__ROCM)
             Real* eigenvalue_gpu = nullptr;
             resmem_real_op()(this->ctx, eigenvalue_gpu, this->nbase_x);
-
-            syncmem_var_h2d_op()(
-                this->ctx,
-                this->cpu_ctx,
-                eigenvalue_gpu,
-                this->eigenvalue_in_dav,
-                this->nbase_x
-            );
+
+            syncmem_var_h2d_op()(this->ctx,
+                                 this->cpu_ctx,
+                                 eigenvalue_gpu,
+                                 (*eigenvalue_iter).data(),
+                                 this->nbase_x);
 
             dnevx_op<T, Device>()(this->ctx, nbase, this->nbase_x, this->hcc, nband, eigenvalue_gpu, this->vcc);
 
-            syncmem_var_d2h_op()(
-                this->cpu_ctx,
-                this->ctx,
-                this->eigenvalue_in_dav,
-                eigenvalue_gpu,
-                this->nbase_x
-            );
-
+            syncmem_var_d2h_op()(this->cpu_ctx,
+                                 this->ctx,
+                                 (*eigenvalue_iter).data(),
+                                 eigenvalue_gpu,
+                                 this->nbase_x);
+
             delmem_real_op()(this->ctx, eigenvalue_gpu);
 #endif
         }
@@ -587,7 +555,7 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
                                   this->nbase_x,
                                   this->hcc,
                                   nband,
-                                  this->eigenvalue_in_dav,
+                                  (*eigenvalue_iter).data(),
                                   this->vcc);
             }
             else
@@ -599,7 +567,7 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
                                   this->hcc,
                                   this->scc,
                                   nband,
-                                  this->eigenvalue_in_dav,
+                                  (*eigenvalue_iter).data(),
                                   this->vcc);
             }
         }
@@ -624,34 +592,32 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
         }
 
 #ifdef __MPI
-        if (GlobalV::NPROC_IN_POOL > 1)
+        if (this->diag_comm.nproc > 1)
         {
             // vcc: nbase * nband
             for (int i = 0; i < nband; i++)
             {
-                MPI_Bcast(&vcc[i * this->nbase_x], nbase, MPI_DOUBLE_COMPLEX, 0, POOL_WORLD);
+                MPI_Bcast(&vcc[i * this->nbase_x], nbase, MPI_DOUBLE_COMPLEX, 0, this->diag_comm.comm);
             }
-            MPI_Bcast(this->eigenvalue_in_dav, nband, MPI_DOUBLE, 0, POOL_WORLD);
+            MPI_Bcast((*eigenvalue_iter).data(), nband, MPI_DOUBLE, 0, this->diag_comm.comm);
         }
 #endif
-
     }
     else if (is_subspace == true)
     {
         for (size_t m = 0; m < nband; m++)
         {
-            this->eigenvalue_in_dav[m] = get_real(hcc[m * this->nbase_x + m]);
+            (*eigenvalue_iter)[m] = get_real(hcc[m * this->nbase_x + m]);
 
             vcc[m * this->nbase_x + m] = set_real_tocomplex(1.0);
         }
-
+
 #ifdef __MPI
-        if (GlobalV::NPROC_IN_POOL > 1)
+        if (this->diag_comm.nproc > 1)
         {
-            MPI_Bcast(this->eigenvalue_in_dav, this->n_band, MPI_DOUBLE, 0, POOL_WORLD);
+            MPI_Bcast((*eigenvalue_iter).data(), this->n_band, MPI_DOUBLE, 0, this->diag_comm.comm);
         }
 #endif
-
     }
 
     ModuleBase::timer::tick("Diago_DavSubspace", "diag_zhegvx");
@@ -660,15 +626,15 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
 
 template <typename T, typename Device>
 void Diago_DavSubspace<T, Device>::refresh(const int& dim,
-                                const int& nband,
-                                int& nbase,
-                                const Real* eigenvalue_in_hsolver,
-                                const psi::Psi<T, Device>& psi,
-                                psi::Psi<T, Device>& basis,
-                                T* hp,
-                                T* sp,
-                                T* hc,
-                                T* vc)
+                                           const int& nband,
+                                           int& nbase,
+                                           const Real* eigenvalue_in_hsolver,
+                                           const psi::Psi<T, Device>& psi,
+                                           psi::Psi<T, Device>& basis,
+                                           T* hp,
+                                           T* sp,
+                                           T* hc,
+                                           T* vc)
 {
     ModuleBase::timer::tick("Diago_DavSubspace", "refresh");
 
@@ -759,44 +725,20 @@ void Diago_DavSubspace<T, Device>::refresh(const int& dim,
 }
 
 template <typename T, typename Device>
-void Diago_DavSubspace<T, Device>::diag(hamilt::Hamilt<T, Device>* phm_in,
-                                        psi::Psi<T, Device>& psi,
-                                        Real* eigenvalue_in_hsolver,
-                                        std::vector<bool>& is_occupied)
+int Diago_DavSubspace<T, Device>::diag(hamilt::Hamilt<T, Device>* phm_in,
+                                       psi::Psi<T, Device>& psi,
+                                       Real* eigenvalue_in_hsolver,
+                                       const std::vector<bool>& is_occupied,
+                                       const bool& scf_type)
 {
     /// record the times of trying iterative diagonalization
-    int ntry = 0;
     this->notconv = 0;
 
-#if defined(__CUDA) || defined(__ROCM)
-    if (this->device == base_device::GpuDevice)
-    {
-        resmem_real_op()(this->ctx, this->d_precondition, psi.get_nbasis());
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition, psi.get_nbasis());
-    }
-#endif
-
-    if (DiagoIterAssist<T, Device>::SCF_ITER == 1)
-    {
-        // std::cout << "diagH_subspace" << std::endl;
-        DiagoIterAssist<T, Device>::diagH_subspace(phm_in, psi, psi, eigenvalue_in_hsolver, psi.get_nbands());
-
-        this->is_subspace = true;
-    }
-    else
-    {
-        this->is_subspace = false;
-    }
-
     bool outputscc = false;
     bool outputeigenvalue = false;
 
     if (outputscc)
     {
-        this->nbase_x = 2 * psi.get_nbands();
-        resmem_complex_op()(this->ctx, this->scc, this->nbase_x * this->nbase_x, "DAV::scc");
-        setmem_complex_op()(this->ctx, this->scc, 0, this->nbase_x * this->nbase_x);
-
         std::cout << "before dav 111" << std::endl;
 
         gemm_op<T, Device>()(this->ctx,
                              'C',
@@ -835,14 +777,20 @@ int Diago_DavSubspace<T, Device>::diag(hamilt::Hamilt<T, Device>* phm_in,
         std::cout << std::endl;
     }
 
+    int sum_iter = 0;
+    int ntry = 0;
+
     do
     {
+        if (this->is_subspace || ntry > 0)
+        {
+            DiagoIterAssist<T, Device>::diagH_subspace(phm_in, psi, psi, eigenvalue_in_hsolver, psi.get_nbands());
+        }
 
-        this->diag_once(phm_in, psi, eigenvalue_in_hsolver, is_occupied);
+        sum_iter += this->diag_once(phm_in, psi, eigenvalue_in_hsolver, is_occupied);
 
         ++ntry;
 
-    } while (DiagoIterAssist<T, Device>::test_exit_cond(ntry, this->notconv));
+    } while (this->test_exit_cond(ntry, this->notconv, scf_type));
 
     if (notconv > std::max(5, psi.get_nbands() / 4))
     {
@@ -890,7 +838,26 @@ int Diago_DavSubspace<T, Device>::diag(hamilt::Hamilt<T, Device>* phm_in,
         std::cout << std::endl;
     }
 
-    return;
+    return sum_iter;
+}
+
+template <typename T, typename Device>
+bool Diago_DavSubspace<T, Device>::test_exit_cond(const int& ntry, const int& notconv, const bool& scf)
+{
+    // scf = true; // scf
+    // scf = false; // nscf
+
+    // If ntry <= 5, keep trying; otherwise exit.
+    const bool f1 = (ntry <= 5);
+
+    // In a non-self-consistent calculation, iterate until fully converged.
+    const bool f2 = ((!scf && (notconv > 0)));
+
+    // In a self-consistent calculation, if more than 5 bands are unconverged,
+    // run diagH_subspace and the iterative diagonalization again (ntry++).
+    const bool f3 = ((scf && (notconv > 5)));
+
+    return (f1 && (f2 || f3));
 }
 
 namespace hsolver
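[Review note] With the workspace allocation moved into the constructor, the solver is fully configured at construction time and diag() now reports its iteration count instead of writing to a global average. A minimal usage sketch under stated assumptions — hamilt_obj, psi_obj, eigenvalues, is_occupied, and precondition are placeholders for the caller's objects; the signatures are the ones introduced in this diff:

    // Hedged sketch, not ABACUS code: CPU device, double-precision wavefunctions.
    #ifdef __MPI
        const hsolver::diag_comm_info comm_info = {POOL_WORLD, GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
    #else
        const hsolver::diag_comm_info comm_info = {GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
    #endif

    using T = std::complex<double>;
    using Device = base_device::DEVICE_CPU;

    hsolver::Diago_DavSubspace<T, Device> solver(
        precondition,               // std::vector<double>, one entry per basis function
        psi_obj.get_nbands(),       // nband_in
        psi_obj.get_current_nbas(), // nbasis_in
        GlobalV::PW_DIAG_NDIM,      // david_ndim_in; ctor asserts ndim > 1 and ndim * nband < nbasis * nproc
        hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,  // diag_thr_in
        hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX, // diag_nmax_in
        false,                      // need_subspace_in: run diagH_subspace before each Davidson pass?
        comm_info);

    // diag() returns the accumulated Davidson iteration count (used for avg_iter bookkeeping).
    const int iters = solver.diag(hamilt_obj, psi_obj, eigenvalues, is_occupied, /*scf_type=*/true);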
diff --git a/source/module_hsolver/diago_dav_subspace.h b/source/module_hsolver/diago_dav_subspace.h
index 58bfdafcf3..d2c5a3ab41 100644
--- a/source/module_hsolver/diago_dav_subspace.h
+++ b/source/module_hsolver/diago_dav_subspace.h
@@ -16,47 +16,63 @@ class Diago_DavSubspace : public DiagH<T, Device>
     using Real = typename GetTypeReal<T>::type;
 
   public:
-    Diago_DavSubspace(const Real* precondition_in);
-    ~Diago_DavSubspace();
+    Diago_DavSubspace(const std::vector<Real>& precondition_in,
+                      const int& nband_in,
+                      const int& nbasis_in,
+                      const int& david_ndim_in,
+                      const double& diag_thr_in,
+                      const int& diag_nmax_in,
+                      const bool& need_subspace_in,
+                      const diag_comm_info& diag_comm_in);
+
+    virtual ~Diago_DavSubspace() override;
+
+    int diag(hamilt::Hamilt<T, Device>* phm_in,
+             psi::Psi<T, Device>& phi,
+             Real* eigenvalue_in,
+             const std::vector<bool>& is_occupied,
+             const bool& scf_type);
 
-    // this is the override function diag() for CG method
-    void diag(hamilt::Hamilt<T, Device>* phm_in,
-              psi::Psi<T, Device>& phi,
-              Real* eigenvalue_in,
-              std::vector<bool>& is_occupied);
+  private:
+    /// for MPI communication
+    const diag_comm_info diag_comm;
 
-    static int PW_DIAG_NDIM;
+    /// the threshold for this electronic iteration
+    const double diag_thr;
 
-    static double dav_large_thr;
+    /// maximal iteration number
+    const int iter_nmax;
 
-  private:
-    bool is_subspace = false;
+    /// is diagH_subspace needed?
+    const bool is_subspace;
 
-    int test_david = 0;
+    /// the first dimension of the matrix to be diagonalized
+    const int n_band = 0;
 
-    /// record for how many bands not have convergence eigenvalues
-    int notconv = 0;
+    /// the second dimension of the matrix to be diagonalized
+    const int dim = 0;
+
+    /// the maximum dimension of the reduced basis set
+    const int nbase_x = 0;
 
-    /// row size for input psi matrix
-    int n_band = 0;
-    /// non-zero col size for inputted psi matrix
-    int dim = 0;
-    // maximum dimension of the reduced basis set
-    int nbase_x = 0;
-    /// precondition for cg diag
-    const Real* precondition = nullptr;
+    /// precondition for diag
+    const std::vector<Real>& precondition;
     Real* d_precondition = nullptr;
 
-    /// eigenvalue results
-    Real* eigenvalue_in_dav = nullptr;
+    /// number of bands whose eigenvalues have not converged
+    int notconv = 0;
 
-    T* hphi = nullptr; // the product of H and psi in the reduced basis set
+    /// the product of H and psi in the reduced basis set
+    T* hphi = nullptr;
 
-    T* hcc = nullptr; // Hamiltonian on the reduced basis
+    /// Hamiltonian on the reduced basis
+    T* hcc = nullptr;
 
-    T* scc = nullptr; // Overlap on the reduced basis
+    /// Overlap on the reduced basis
+    T* scc = nullptr;
 
-    T* vcc = nullptr; // Eigenvectors on the reduced basis
+    /// Eigenvectors on the reduced basis
+    T* vcc = nullptr;
 
     /// device type of psi
     Device* ctx = {};
@@ -71,7 +87,7 @@ class Diago_DavSubspace : public DiagH<T, Device>
                   T* hphi,
                   T* vcc,
                   const int* unconv,
-                  Real* eigenvalue);
+                  std::vector<Real>* eigenvalue_iter);
 
     void cal_elem(const int& dim,
                   int& nbase,
@@ -97,15 +113,17 @@ class Diago_DavSubspace : public DiagH<T, Device>
                      T* hcc,
                      T* scc,
                      const int& nbase_x,
-                     Real* eigenvalue,
+                     std::vector<Real>* eigenvalue_iter,
                      T* vcc,
                      bool init,
                      bool is_subspace);
 
-    void diag_once(hamilt::Hamilt<T, Device>* phm_in,
-                   psi::Psi<T, Device>& psi,
-                   Real* eigenvalue_in,
-                   std::vector<bool>& is_occupied);
+    int diag_once(hamilt::Hamilt<T, Device>* phm_in,
+                  psi::Psi<T, Device>& psi,
+                  Real* eigenvalue_in,
+                  const std::vector<bool>& is_occupied);
+
+    bool test_exit_cond(const int& ntry, const int& notconv, const bool& scf);
 
     using resmem_complex_op = base_device::memory::resize_memory_op<T, Device>;
     using delmem_complex_op = base_device::memory::delete_memory_op<T, Device>;
@@ -132,12 +150,6 @@ class Diago_DavSubspace : public DiagH<T, Device>
     const T *one = nullptr, *zero = nullptr, *neg_one = nullptr;
 };
 
-template <typename T, typename Device>
-int Diago_DavSubspace<T, Device>::PW_DIAG_NDIM = 4;
-
-template <typename T, typename Device>
-double Diago_DavSubspace<T, Device>::dav_large_thr = 1e-5;
-
 } // namespace hsolver
 
 #endif
\ No newline at end of file
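[Review note] All GlobalV/POOL_WORLD references inside the solver are gone; the MPI context now arrives through the diag_comm_info member. The shape below is inferred only from the usage sites in this diff (diag_comm.comm, diag_comm.rank, diag_comm.nproc, and the brace-init order at the call sites); the actual definition lives elsewhere in module_hsolver and may differ:

    // Assumed sketch of diag_comm_info, reconstructed from how this diff uses it.
    struct diag_comm_info
    {
    #ifdef __MPI
        MPI_Comm comm; // k-point pool communicator (POOL_WORLD at the call sites)
    #endif
        int rank;      // GlobalV::RANK_IN_POOL at the call sites
        int nproc;     // GlobalV::NPROC_IN_POOL at the call sites
    };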
diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp
index 3567bbe5f4..29d4b762c4 100644
--- a/source/module_hsolver/hsolver_pw.cpp
+++ b/source/module_hsolver/hsolver_pw.cpp
@@ -1,11 +1,9 @@
 #include "hsolver_pw.h"
 
-#include <algorithm>
-
 #include "diago_bpcg.h"
 #include "diago_cg.h"
-#include "diago_david.h"
 #include "diago_dav_subspace.h"
+#include "diago_david.h"
 #include "module_base/global_variable.h"
 #include "module_base/parallel_global.h" // for MPI
 #include "module_base/timer.h"
@@ -16,6 +14,8 @@
 #include "module_hamilt_pw/hamilt_pwdft/wavefunc.h"
 #include "module_hsolver/diagh.h"
 #include "module_hsolver/diago_iter_assist.h"
+
+#include <algorithm>
 #ifdef USE_PAW
 #include "module_cell/module_paw/paw_cell.h"
 #endif
@@ -84,7 +84,7 @@ void HSolverPW<T, Device>::initDiagh(const psi::Psi<T, Device>& psi)
     }
     else if (this->method == "dav")
     {
-#ifdef __MPI 
+#ifdef __MPI
         const diag_comm_info comm_info = {POOL_WORLD, GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
 #else
         const diag_comm_info comm_info = {GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
@@ -96,45 +96,55 @@ void HSolverPW<T, Device>::initDiagh(const psi::Psi<T, Device>& psi)
             {
                 delete (DiagoDavid<T, Device>*)this->pdiagh;
 
-                this->pdiagh = new DiagoDavid<T, Device>(
-                    precondition.data(),
-                    GlobalV::PW_DIAG_NDIM,
-                    GlobalV::use_paw,
-                    comm_info
-                );
+                this->pdiagh = new DiagoDavid<T, Device>(precondition.data(),
+                                                         GlobalV::PW_DIAG_NDIM,
+                                                         GlobalV::use_paw,
+                                                         comm_info);
                 this->pdiagh->method = this->method;
             }
         }
         else
        {
-            this->pdiagh = new DiagoDavid<T, Device>(
-                precondition.data(),
-                GlobalV::PW_DIAG_NDIM,
-                GlobalV::use_paw,
-                comm_info
-            );
+            this->pdiagh
+                = new DiagoDavid<T, Device>(precondition.data(), GlobalV::PW_DIAG_NDIM, GlobalV::use_paw, comm_info);
             this->pdiagh->method = this->method;
         }
     }
     else if (this->method == "dav_subspace")
     {
-        Diago_DavSubspace<T, Device>::PW_DIAG_NDIM = GlobalV::PW_DIAG_NDIM;
-        if (this->pdiagh != nullptr)
-        {
-            if (this->pdiagh->method != this->method)
-            {
-                delete (Diago_DavSubspace<T, Device>*)this->pdiagh;
-                this->pdiagh = new Diago_DavSubspace<T, Device>(precondition.data());
-                this->pdiagh->method = this->method;
-            }
-        }
-        else
-        {
-            this->pdiagh = new Diago_DavSubspace<T, Device>(precondition.data());
-            this->pdiagh->method = this->method;
-        }
+// #ifdef __MPI
+//         const diag_comm_info comm_info = {POOL_WORLD, GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
+// #else
+//         const diag_comm_info comm_info = {GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
+// #endif
+//         if (this->pdiagh != nullptr)
+//         {
+//             if (this->pdiagh->method != this->method)
+//             {
+//                 delete (Diago_DavSubspace<T, Device>*)this->pdiagh;
+
+//                 this->pdiagh = new Diago_DavSubspace<T, Device>(precondition.data(),
+//                                                                 GlobalV::PW_DIAG_NDIM,
+//                                                                 DiagoIterAssist<T, Device>::PW_DIAG_THR,
+//                                                                 DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
+//                                                                 DiagoIterAssist<T, Device>::need_subspace,
+//                                                                 comm_info);
+
+//                 this->pdiagh->method = this->method;
+//             }
+//         }
+//         else
+//         {
+//             this->pdiagh = new Diago_DavSubspace<T, Device>(precondition.data(),
+//                                                             GlobalV::PW_DIAG_NDIM,
+//                                                             DiagoIterAssist<T, Device>::PW_DIAG_THR,
+//                                                             DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
+//                                                             DiagoIterAssist<T, Device>::need_subspace,
+//                                                             comm_info);
+//             this->pdiagh->method = this->method;
+//         }
     }
     else if (this->method == "bpcg")
     {
@@ -202,7 +212,7 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
                         is_occupied[i * psi.get_nbands() + j] = false;
                     }
                 }
-            } 
+            }
         }
     }
 }
@@ -223,7 +233,7 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
                 _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig);
             }
 
-            std::vector<double> kpt(3,0);
+            std::vector<double> kpt(3, 0);
             kpt[0] = this->wfc_basis->kvec_c[ik].x;
             kpt[1] = this->wfc_basis->kvec_c[ik].y;
             kpt[2] = this->wfc_basis->kvec_c[ik].z;
@@ -323,7 +333,7 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
                 _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig);
             }
 
-            std::vector<double> kpt(3,0);
+            std::vector<double> kpt(3, 0);
             kpt[0] = this->wfc_basis->kvec_c[ik].x;
             kpt[1] = this->wfc_basis->kvec_c[ik].y;
             kpt[2] = this->wfc_basis->kvec_c[ik].z;
@@ -442,7 +452,7 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_
                 _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig);
             }
 
-            std::vector<double> kpt(3,0);
+            std::vector<double> kpt(3, 0);
             kpt[0] = this->wfc_basis->kvec_c[ik].x;
             kpt[1] = this->wfc_basis->kvec_c[ik].y;
             kpt[2] = this->wfc_basis->kvec_c[ik].z;
@@ -546,7 +556,7 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt, // ESolver_
                 _gk[ig] = this->wfc_basis->getgpluskcar(ik, ig);
             }
 
-            std::vector<double> kpt(3,0);
+            std::vector<double> kpt(3, 0);
             kpt[0] = this->wfc_basis->kvec_c[ik].x;
             kpt[1] = this->wfc_basis->kvec_c[ik].y;
             kpt[2] = this->wfc_basis->kvec_c[ik].z;
@@ -653,11 +663,11 @@ void HSolverPW<T, Device>::endDiagh()
         delete reinterpret_cast<DiagoDavid<T, Device>*>(this->pdiagh);
         this->pdiagh = nullptr;
     }
-    if (this->method == "dav_subspace")
-    {
-        delete reinterpret_cast<Diago_DavSubspace<T, Device>*>(this->pdiagh);
-        this->pdiagh = nullptr;
-    }
+    // if (this->method == "dav_subspace")
+    // {
+    //     delete reinterpret_cast<Diago_DavSubspace<T, Device>*>(this->pdiagh);
+    //     this->pdiagh = nullptr;
+    // }
     if (this->method == "bpcg")
     {
         delete reinterpret_cast<DiagoBPCG<T, Device>*>(this->pdiagh);
@@ -710,8 +720,46 @@ void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm, psi::P
 {
     if (this->method == "dav_subspace")
     {
+#ifdef __MPI
+        const diag_comm_info comm_info = {POOL_WORLD, GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
+#else
+        const diag_comm_info comm_info = {GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL};
+#endif
+
+        this->pdiagh = new Diago_DavSubspace<T, Device>(this->precondition,
+                                                        psi.get_nbands(),
+                                                        psi.get_k_first() ? psi.get_current_nbas()
+                                                                          : psi.get_nk() * psi.get_nbasis(),
-        ((Diago_DavSubspace<T, Device>*)this->pdiagh)->diag(hm, psi, eigenvalue, is_occupied);
+                                                        GlobalV::PW_DIAG_NDIM,
+                                                        DiagoIterAssist<T, Device>::PW_DIAG_THR,
+                                                        DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
+                                                        DiagoIterAssist<T, Device>::need_subspace,
+                                                        comm_info);
+
+        this->pdiagh->method = this->method;
+
+        bool scf;
+        if (GlobalV::CALCULATION == "nscf")
+        {
+            scf = false;
+        }
+        else
+        {
+            scf = true;
+        }
+
+        DiagoIterAssist<T, Device>::avg_iter
+            += static_cast<double>((reinterpret_cast<Diago_DavSubspace<T, Device>*>(this->pdiagh))
+                                       ->diag(hm,
+                                              psi,
+                                              eigenvalue,
+                                              is_occupied,
+                                              scf));
+
+        delete reinterpret_cast<Diago_DavSubspace<T, Device>*>(this->pdiagh);
+        this->pdiagh = nullptr;
     }
     else
     {
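[Review note] The dav_subspace solver is now created and torn down inside every hamiltSolvePsiK() call, so hsolver_pw carries no solver state between k-points; the commented-out initDiagh/endDiagh branches above can presumably be removed once this settles. One small simplification for the scf flag — equivalent to the if/else in the hunk above, offered only as a suggestion:

    const bool scf = (GlobalV::CALCULATION != "nscf");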
diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
index 7c927c932a..1dc1c81cdc 100644
--- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu
+++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
@@ -10,9 +10,12 @@
 #include <thrust/complex.h>
 #include <thrust/execution_policy.h>
 
-#define WARP_SIZE 32
-#define FULL_MASK 0xffffffff
-#define THREAD_PER_BLOCK 256
+namespace hsolver
+{
+const int warp_size = 32;
+const unsigned int full_mask = 0xffffffff;
+const int thread_per_block = 256;
+}
 
 template <>
 struct GetTypeReal<thrust::complex<float>> {
@@ -66,7 +69,7 @@ void destoryBLAShandle(){
 template <typename FPTYPE>
 __forceinline__ __device__ void warp_reduce(FPTYPE& val) {
     for (int offset = 16; offset > 0; offset >>= 1)
-        val += __shfl_down_sync(FULL_MASK, val, offset);
+        val += __shfl_down_sync(full_mask, val, offset);
 }
 
 template <typename Real>
@@ -83,56 +86,91 @@ __global__ void line_minimize_with_block(
     int item = 0;
     Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0;
     Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0;
-    __shared__ Real data[THREAD_PER_BLOCK * 3];
+    __shared__ Real data[thread_per_block * 3];
 
     data[tid] = 0;
-    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
         item = band_idx * n_basis_max + basis_idx;
         data[tid] += (grad[item] * thrust::conj(grad[item])).real();
     }
     __syncthreads();
 
     // just do some parallel reduction in shared memory
-    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+    for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) {
         if (tid < ii) {
             data[tid] += data[tid + ii];
         }
         __syncthreads();
     }
+    // Threads within a warp execute in lockstep, so the tail of the reduction is
+    // unrolled and synchronized with __syncwarp() instead of the costlier __syncthreads().
+    if (tid < warp_size) {
+        data[tid] += data[tid + 32]; __syncwarp();
+        data[tid] += data[tid + 16]; __syncwarp();
+        data[tid] += data[tid + 8]; __syncwarp();
+        data[tid] += data[tid + 4]; __syncwarp();
+        data[tid] += data[tid + 2]; __syncwarp();
+        data[tid] += data[tid + 1]; __syncwarp();
+    }
+
+    __syncthreads();
+
     Real norm = 1.0 / sqrt(data[0]);
     __syncthreads();
 
     data[tid] = 0;
-    data[THREAD_PER_BLOCK + tid] = 0;
-    data[2 * THREAD_PER_BLOCK + tid] = 0;
-    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+    data[thread_per_block + tid] = 0;
+    data[2 * thread_per_block + tid] = 0;
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
         item = band_idx * n_basis_max + basis_idx;
         grad[item] *= norm;
         hgrad[item] *= norm;
         data[tid] += (hpsi[item] * thrust::conj(psi[item])).real();
-        data[THREAD_PER_BLOCK + tid] += (grad[item] * thrust::conj(hpsi[item])).real();
-        data[2 * THREAD_PER_BLOCK + tid] += (grad[item] * thrust::conj(hgrad[item])).real();
+        data[thread_per_block + tid] += (grad[item] * thrust::conj(hpsi[item])).real();
+        data[2 * thread_per_block + tid] += (grad[item] * thrust::conj(hgrad[item])).real();
     }
     __syncthreads();
 
     // just do some parallel reduction in shared memory
-    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+    for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) {
         if (tid < ii) {
             data[tid] += data[tid + ii];
-            data[THREAD_PER_BLOCK + tid] += data[THREAD_PER_BLOCK + tid + ii];
-            data[2 * THREAD_PER_BLOCK + tid] += data[2 * THREAD_PER_BLOCK + tid + ii];
+            data[thread_per_block + tid] += data[thread_per_block + tid + ii];
+            data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + ii];
         }
         __syncthreads();
     }
+    if (tid < warp_size) {
+        data[tid] += data[tid + 32]; __syncwarp();
+        data[tid] += data[tid + 16]; __syncwarp();
+        data[tid] += data[tid + 8]; __syncwarp();
+        data[tid] += data[tid + 4]; __syncwarp();
+        data[tid] += data[tid + 2]; __syncwarp();
+        data[tid] += data[tid + 1]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 32]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 16]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 8]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 4]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 2]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 1]; __syncwarp();
+        data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 32]; __syncwarp();
+        data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 16]; __syncwarp();
+        data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 8]; __syncwarp();
+        data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 4]; __syncwarp();
+        data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 2]; __syncwarp();
+        data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 1]; __syncwarp();
+    }
+
+    __syncthreads();
+
     epsilo_0 = data[0];
-    epsilo_1 = data[THREAD_PER_BLOCK];
-    epsilo_2 = data[2 * THREAD_PER_BLOCK];
+    epsilo_1 = data[thread_per_block];
+    epsilo_2 = data[2 * thread_per_block];
     theta = 0.5 * abs(atan(2 * epsilo_1/(epsilo_0 - epsilo_2)));
     cos_theta = cos(theta);
     sin_theta = sin(theta);
-    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
         item = band_idx * n_basis_max + basis_idx;
         psi [item] = psi [item] * cos_theta + grad [item] * sin_theta;
         hpsi[item] = hpsi[item] * cos_theta + hgrad[item] * sin_theta;
@@ -159,29 +197,40 @@ __global__ void calc_grad_with_block(
     Real epsilo = 0.0;
     Real grad_2 = 0.0;
     thrust::complex<Real> grad_1 = {0, 0};
-    __shared__ Real data[THREAD_PER_BLOCK * 2];
+    __shared__ Real data[thread_per_block * 2];
 
     // Init shared memory
     data[tid] = 0;
-    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
         item = band_idx * n_basis_max + basis_idx;
         data[tid] += (psi[item] * thrust::conj(psi[item])).real();
     }
     __syncthreads();
 
     // just do some parallel reduction in shared memory
-    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+    for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) {
         if (tid < ii) {
             data[tid] += data[tid + ii];
        }
         __syncthreads();
    }
+    if (tid < warp_size) {
+        data[tid] += data[tid + 32]; __syncwarp();
+        data[tid] += data[tid + 16]; __syncwarp();
+        data[tid] += data[tid + 8]; __syncwarp();
+        data[tid] += data[tid + 4]; __syncwarp();
+        data[tid] += data[tid + 2]; __syncwarp();
+        data[tid] += data[tid + 1]; __syncwarp();
+    }
+
+    __syncthreads();
+
     Real norm = 1.0 / sqrt(data[0]);
     __syncthreads();
 
     data[tid] = 0;
-    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
         item = band_idx * n_basis_max + basis_idx;
         psi[item] *= norm;
         hpsi[item] *= norm;
@@ -190,37 +239,65 @@ __global__ void calc_grad_with_block(
     __syncthreads();
 
     // just do some parallel reduction in shared memory
-    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+    for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) {
         if (tid < ii) {
             data[tid] += data[tid + ii];
         }
         __syncthreads();
     }
+
+    if (tid < warp_size) {
+        data[tid] += data[tid + 32]; __syncwarp();
+        data[tid] += data[tid + 16]; __syncwarp();
+        data[tid] += data[tid + 8]; __syncwarp();
+        data[tid] += data[tid + 4]; __syncwarp();
+        data[tid] += data[tid + 2]; __syncwarp();
+        data[tid] += data[tid + 1]; __syncwarp();
+    }
+
+    __syncthreads();
+
     epsilo = data[0];
     __syncthreads();
 
     data[tid] = 0;
-    data[THREAD_PER_BLOCK + tid] = 0;
-    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+    data[thread_per_block + tid] = 0;
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
         item = band_idx * n_basis_max + basis_idx;
         grad_1 = hpsi[item] - epsilo * psi[item];
         grad_2 = thrust::norm(grad_1);
         data[tid] += grad_2;
-        data[THREAD_PER_BLOCK + tid] += grad_2 / prec[basis_idx];
+        data[thread_per_block + tid] += grad_2 / prec[basis_idx];
     }
     __syncthreads();
 
     // just do some parallel reduction in shared memory
-    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+    for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) {
         if (tid < ii) {
             data[tid] += data[tid + ii];
-            data[THREAD_PER_BLOCK + tid] += data[THREAD_PER_BLOCK + tid + ii];
+            data[thread_per_block + tid] += data[thread_per_block + tid + ii];
         }
         __syncthreads();
     }
+
+    if (tid < warp_size) {
+        data[tid] += data[tid + 32]; __syncwarp();
+        data[tid] += data[tid + 16]; __syncwarp();
+        data[tid] += data[tid + 8]; __syncwarp();
+        data[tid] += data[tid + 4]; __syncwarp();
+        data[tid] += data[tid + 2]; __syncwarp();
+        data[tid] += data[tid + 1]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 32]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 16]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 8]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 4]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 2]; __syncwarp();
+        data[thread_per_block + tid] += data[thread_per_block + tid + 1]; __syncwarp();
+    }
+
+    __syncthreads();
+
     err_st = data[0];
-    beta_st = data[THREAD_PER_BLOCK];
-    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+    beta_st = data[thread_per_block];
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
         item = band_idx * n_basis_max + basis_idx;
         grad_1 = hpsi[item] - epsilo * psi[item];
         grad[item] = -grad_1 / prec[basis_idx] + beta_st / beta[band_idx] * grad_old[item];
@@ -342,7 +419,7 @@ void line_minimize_with_block_op<T, base_device::DEVICE_GPU>::operator()(T* grad
     auto C = reinterpret_cast<thrust::complex<Real>*>(psi_out);
     auto D = reinterpret_cast<thrust::complex<Real>*>(hpsi_out);
 
-    line_minimize_with_block<Real><<<n_band, THREAD_PER_BLOCK>>>(
+    line_minimize_with_block<Real><<<n_band, thread_per_block>>>(
         A, B, C, D,
         n_basis, n_basis_max);
@@ -367,7 +444,7 @@ void calc_grad_with_block_op<T, base_device::DEVICE_GPU>::operator()(const Real*
     auto C = reinterpret_cast<thrust::complex<Real>*>(grad_out);
     auto D = reinterpret_cast<thrust::complex<Real>*>(grad_old_out);
 
-    calc_grad_with_block<Real><<<n_band, THREAD_PER_BLOCK>>>(
+    calc_grad_with_block<Real><<<n_band, thread_per_block>>>(
         prec_in, err_out, beta_out,
         A, B, C, D,
         n_basis, n_basis_max);
@@ -440,7 +517,8 @@ void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const b
                                                                          const double* vector,
                                                                          const double constant)
 {
-    int thread = 1024;
+    // In small cases, 1024 threads per block would utilize only 17 blocks, far fewer than 40
+    int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
     vector_div_constant_kernel<double><<<block, thread>>>(dim, result, vector, constant);
@@ -459,7 +537,7 @@ inline void vector_div_constant_complex_wrapper(const base_device::DEVICE_GPU* d,
     thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
     const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
 
-    int thread = 1024;
+    int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
     vector_div_constant_kernel<thrust::complex<FPTYPE>><<<block, thread>>>(dim, result_tmp, vector_tmp, constant);
@@ -493,7 +571,7 @@ void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const bas
                                                                        const double* vector1,
                                                                        const double* vector2)
 {
-    int thread = 1024;
+    int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
     vector_mul_vector_kernel<double><<<block, thread>>>(dim, result, vector1, vector2);
@@ -510,7 +588,7 @@ inline void vector_mul_vector_complex_wrapper(const base_device::DEVICE_GPU* d,
 {
     thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
     const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
-    int thread = 1024;
+    int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
     vector_mul_vector_kernel<thrust::complex<FPTYPE>><<<block, thread>>>(dim, result_tmp, vector1_tmp, vector2);
@@ -545,7 +623,7 @@ void vector_div_vector_op<double, base_device::DEVICE_GPU>::operator()(const bas
                                                                        const double* vector1,
                                                                        const double* vector2)
 {
-    int thread = 1024;
+    int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
     vector_div_vector_kernel<double><<<block, thread>>>(dim, result, vector1, vector2);
@@ -562,7 +640,7 @@ inline void vector_div_vector_complex_wrapper(const base_device::DEVICE_GPU* d,
 {
     thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
     const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
-    int thread = 1024;
+    int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
     vector_div_vector_kernel<thrust::complex<FPTYPE>><<<block, thread>>>(dim, result_tmp, vector1_tmp, vector2);
@@ -605,7 +683,7 @@ void constantvector_addORsub_constantVector_op<T, base_device::DEVICE_GPU>::oper
     auto vector1_tmp = reinterpret_cast<const Type*>(vector1);
     auto vector2_tmp = reinterpret_cast<const Type*>(vector2);
 
-    int thread = 1024;
+    int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
     constantvector_addORsub_constantVector_kernel<Type>
         <<<block, thread>>>(dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2);
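[Review note] The CUDA changes replace the THREAD_PER_BLOCK/FULL_MASK macros with namespaced constants and unroll the last warp of each shared-memory tree reduction. A self-contained sketch of the same reduction pattern, for reviewers who want to test it in isolation — the kernel name, the example namespace, and the launch configuration are mine, not part of this diff, and atomicAdd on double requires compute capability >= 6.0:

    #include <cuda_runtime.h>

    namespace example
    {
    const int warp_size = 32;
    const int thread_per_block = 256;
    }

    // Block-level sum with the warp-unrolled tail used in this diff: the tree
    // reduction stops once one warp of partial sums remains; the final steps run
    // in lockstep within a warp and synchronize with the cheaper __syncwarp().
    __global__ void block_reduce_sum(const double* in, double* out, const int n)
    {
        __shared__ double data[example::thread_per_block];
        const int tid = threadIdx.x;

        // Grid-stride accumulation: each thread folds its slice into one shared slot.
        double acc = 0.0;
        for (int i = blockIdx.x * example::thread_per_block + tid; i < n;
             i += gridDim.x * example::thread_per_block)
        {
            acc += in[i];
        }
        data[tid] = acc;
        __syncthreads();

        // Tree reduction down to warp_size partial sums.
        for (int ii = example::thread_per_block >> 1; ii > example::warp_size; ii >>= 1)
        {
            if (tid < ii)
            {
                data[tid] += data[tid + ii];
            }
            __syncthreads();
        }

        // Unrolled tail, mirroring the pattern introduced in math_kernel_op.cu.
        if (tid < example::warp_size)
        {
            data[tid] += data[tid + 32]; __syncwarp();
            data[tid] += data[tid + 16]; __syncwarp();
            data[tid] += data[tid + 8];  __syncwarp();
            data[tid] += data[tid + 4];  __syncwarp();
            data[tid] += data[tid + 2];  __syncwarp();
            data[tid] += data[tid + 1];  __syncwarp();
        }

        if (tid == 0)
        {
            atomicAdd(out, data[0]);
        }
    }

    // Launch sketch, using the same ceiling division as the element-wise ops in this diff:
    // block_reduce_sum<<<(n + example::thread_per_block - 1) / example::thread_per_block,
    //                    example::thread_per_block>>>(d_in, d_out, n);

The drop from 1024 to 256 threads per block in the element-wise kernels is an occupancy choice for small problems: more, smaller blocks spread the same work across more multiprocessors, which is what the "17 blocks, far fewer than 40" comment in the diff refers to.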