diff --git a/src/hamiltonian/projector_all.hpp b/src/hamiltonian/projector_all.hpp index 607a1431..2aaa0069 100644 --- a/src/hamiltonian/projector_all.hpp +++ b/src/hamiltonian/projector_all.hpp @@ -96,8 +96,8 @@ class projector_all { //////////////////////////////////////////////////////////////////////////////////////////// template - gpu::array project(states::orbital_set const & phi, KpointType const & kpoint) const { - + gpu::array calculate_projections(states::orbital_set const & phi, KpointType const & kpoint) const { + gpu::array sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()}); gpu::array projections_all({nprojs_, max_nlm_, phi.local_set_size()}, 0.0); @@ -155,6 +155,21 @@ class projector_all { } #endif + if(phi.basis().comm().size() > 1) { + CALI_CXX_MARK_SCOPE("projector_all::project::reduce"); + phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{}); + } + + return projections_all; + } + + //////////////////////////////////////////////////////////////////////////////////////////// + + template + gpu::array project(states::orbital_set const & phi, KpointType const & kpoint) const { + + auto projections_all = calculate_projections(phi, kpoint); + { CALI_CXX_MARK_SCOPE("projector_scal"); gpu::run(phi.local_set_size(), max_nlm_, nprojs_, @@ -164,11 +179,8 @@ class projector_all { }); } - if(phi.basis().comm().size() > 1) { - CALI_CXX_MARK_SCOPE("projector_all::project::reduce"); - phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{}); - } - + gpu::array sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()}); + #ifndef ENABLE_CUDA for(auto iproj = 0; iproj < nprojs_; iproj++) { CALI_CXX_MARK_SCOPE("projector_gemm_2"); @@ -235,68 +247,8 @@ class projector_all { template double energy(states::orbital_set const & phi, KpointType const & kpoint, Occupations const & occupations, bool const reduce_states = true) const { - - gpu::array sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()}); - gpu::array projections_all({nprojs_, max_nlm_, phi.local_set_size()}, 0.0); - - { CALI_CXX_MARK_SCOPE("projector::gather"); - - gpu::run(phi.local_set_size(), max_sphere_size_, nprojs_, - [sgr = begin(sphere_phi_all), gr = begin(phi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint] GPU_LAMBDA (auto ist, auto ipoint, auto iproj){ - if(poi[iproj][ipoint][0] >= 0){ - auto phase = polar(1.0, dot(kpoint, pos[iproj][ipoint])); - sgr[iproj][ipoint][ist] = phase*gr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist]; - } else { - sgr[iproj][ipoint][ist] = complex(0.0, 0.0); - } - }); - } - -#ifndef ENABLE_CUDA - for(auto iproj = 0; iproj < nprojs_; iproj++){ - CALI_CXX_MARK_SCOPE("projector_gemm_1"); - - if(locally_empty_[iproj]) continue; - - namespace blas = boost::multi::blas; - blas::real_doubled(projections_all[iproj]) = blas::gemm(phi.basis().volume_element(), matrices_[iproj], blas::real_doubled(sphere_phi_all[iproj])); - } -#else - if(max_sphere_size_ > 0) { - CALI_CXX_MARK_SCOPE("projector_gemm_1"); - const double zero = 0.0; - const double vol = phi.basis().volume_element(); - - auto status = cublasDgemmStridedBatched(/*cublasHandle_t handle = */ boost::multi::cuda::cublas::context::get_instance().get(), - /*cublasOperation_t transa = */ CUBLAS_OP_N, - /*cublasOperation_t transb = */ CUBLAS_OP_N, - /*int m = */ 2*phi.local_set_size(), - /*int n = */ max_nlm_, - /*int k = */ max_sphere_size_, - /*const double *alpha = */ &vol, - /*const double *A = */ reinterpret_cast(raw_pointer_cast(sphere_phi_all.data_elements())), - /*int lda = */ 2*phi.local_set_size(), - /*long long int strideA = */ 2*max_sphere_size_*phi.local_set_size(), - /*const double *B = */ raw_pointer_cast(matrices_.data_elements()), - /*int ldb = */ max_sphere_size_, - /*long long int strideB =*/ max_nlm_*max_sphere_size_, - /*const double *beta = */ &zero, - /*double *C = */ reinterpret_cast(raw_pointer_cast(projections_all.data_elements())), - /*int ldc = */ 2*phi.local_set_size(), - /*long long int strideC = */ 2*max_nlm_*phi.local_set_size(), - /*int batchCount = */ nprojs_); - gpu::sync(); - - assert(status == CUBLAS_STATUS_SUCCESS); - - } -#endif - - if(phi.basis().comm().size() > 1) { - CALI_CXX_MARK_SCOPE("projector_all::energy::reduce_basis"); - phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{}); - } + auto projections_all = calculate_projections(phi, kpoint); auto en = gpu::run(gpu::reduce(phi.local_set_size()), gpu::reduce(max_nlm_), gpu::reduce(nprojs_), 0.0, [proj = begin(projections_all), coe = begin(coeff_), occ = begin(occupations), spinor_size = phi.local_spinor_set_size()] @@ -312,62 +264,36 @@ class projector_all { } return en; - } //////////////////////////////////////////////////////////////////////////////////////////// - template + template void force(PhiType & phi, GPhiType const & gphi, MetricType const & metric, - OccsType const & occs, vector3 const & vector_potential, gpu::array, 1> & forces_non_local) const { + OccsType const & occs, vector3 const & vector_potential, KPoint const & kpoint, gpu::array, 1> & forces_non_local) const { CALI_CXX_MARK_FUNCTION; namespace blas = boost::multi::blas; - gpu::array sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()}); gpu::array sphere_gphi_all({nprojs_, max_sphere_size_, phi.local_set_size()}); - gpu::array projections_all({nprojs_, max_nlm_, phi.local_set_size()}, 0.0); { CALI_CXX_MARK_SCOPE("projector_all::force::gather"); gpu::run(phi.local_set_size(), max_sphere_size_, nprojs_, - [sgr = begin(sphere_phi_all), gsgr = begin(sphere_gphi_all), gr = begin(phi.hypercubic()), ggr = begin(gphi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint = phi.kpoint() + vector_potential] + [gsgr = begin(sphere_gphi_all), gr = begin(phi.hypercubic()), ggr = begin(gphi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint] GPU_LAMBDA (auto ist, auto ipoint, auto iproj){ if(poi[iproj][ipoint][0] >= 0){ auto phase = polar(1.0, dot(kpoint, pos[iproj][ipoint])); - sgr[iproj][ipoint][ist] = phase*gr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist]; gsgr[iproj][ipoint][ist] = phase*ggr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist]; } else { - sgr[iproj][ipoint][ist] = 0.0; gsgr[iproj][ipoint][ist] = {complex(0.0), complex(0.0), complex(0.0)}; } }); } - for(auto iproj = 0; iproj < nprojs_; iproj++){ - if(locally_empty_[iproj]) continue; - - blas::real_doubled(projections_all[iproj]) = blas::gemm(phi.basis().volume_element(), matrices_[iproj], blas::real_doubled(sphere_phi_all[iproj])); - } - - { CALI_CXX_MARK_SCOPE("projector_force_scal"); - - gpu::run(phi.local_set_size(), max_nlm_, nprojs_, - [proj = begin(projections_all), coeff = begin(coeff_)] GPU_LAMBDA (auto ist, auto ipj, auto iproj){ - proj[iproj][ipj][ist] *= coeff[iproj][ipj]; - }); - } - - if(phi.basis().comm().size() > 1) { - phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{}); - } - - for(auto iproj = 0; iproj < nprojs_; iproj++){ - blas::real_doubled(sphere_phi_all[iproj]) = blas::gemm(1.0, blas::T(matrices_[iproj]), blas::real_doubled(projections_all[iproj])); - } - + auto sphere_phi_all = project(phi, kpoint); gpu::array, 1> force(nprojs_, {0.0, 0.0, 0.0}); for(auto iproj = 0; iproj < nprojs_; iproj++) { @@ -393,24 +319,19 @@ class projector_all { //////////////////////////////////////////////////////////////////////////////////////////// template void position_commutator(states::orbital_set const & phi, states::orbital_set> & cphi, KpointType const & kpoint) const { - - gpu::array sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()}); - gpu::array, 3> sphere_rphi_all({nprojs_, max_sphere_size_, phi.local_set_size()}); - gpu::array projections_all({nprojs_, max_nlm_, phi.local_set_size()}, 0.0); + gpu::array, 3> sphere_rphi_all({nprojs_, max_sphere_size_, phi.local_set_size()}); gpu::array, 3> rprojections_all({nprojs_, max_nlm_, phi.local_set_size()}, zero>()); { CALI_CXX_MARK_SCOPE("position_commutator::gather"); gpu::run(phi.local_set_size(), max_sphere_size_, nprojs_, - [sphi = begin(sphere_phi_all), srphi = begin(sphere_rphi_all), gr = begin(phi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint] GPU_LAMBDA (auto ist, auto ipoint, auto iproj){ + [srphi = begin(sphere_rphi_all), gr = begin(phi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint] GPU_LAMBDA (auto ist, auto ipoint, auto iproj){ if(poi[iproj][ipoint][0] >= 0){ auto rr = static_cast>(pos[iproj][ipoint]); auto phase = polar(1.0, dot(kpoint, rr)); - sphi[iproj][ipoint][ist] = phase*gr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist]; - srphi[iproj][ipoint][ist] = rr*sphi[iproj][ipoint][ist]; + srphi[iproj][ipoint][ist] = rr*phase*gr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist]; } else { - sphi[iproj][ipoint][ist] = complex(0.0, 0.0); srphi[iproj][ipoint][ist][0] = complex(0.0, 0.0); srphi[iproj][ipoint][ist][1] = complex(0.0, 0.0); srphi[iproj][ipoint][ist][2] = complex(0.0, 0.0); @@ -424,8 +345,6 @@ class projector_all { if(locally_empty_[iproj]) continue; namespace blas = boost::multi::blas; - blas::real_doubled(projections_all[iproj]) = blas::gemm(phi.basis().volume_element(), matrices_[iproj], blas::real_doubled(sphere_phi_all[iproj])); - auto rpa = rprojections_all[iproj].template reinterpret_array_cast(3).rotated().flatted().unrotated(); auto sra = sphere_rphi_all[iproj].template reinterpret_array_cast(3).rotated().flatted().unrotated(); @@ -435,15 +354,13 @@ class projector_all { { CALI_CXX_MARK_SCOPE("position_commutator_scal"); gpu::run(phi.local_set_size(), max_nlm_, nprojs_, - [proj = begin(projections_all), rproj = begin(rprojections_all), coe = begin(coeff_)] + [rproj = begin(rprojections_all), coe = begin(coeff_)] GPU_LAMBDA (auto ist, auto ilm, auto iproj){ - proj[iproj][ilm][ist] *= coe[iproj][ilm]; rproj[iproj][ilm][ist] *= coe[iproj][ilm]; }); } if(phi.basis().comm().size() > 1) { - phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{}); phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(rprojections_all.data_elements()), rprojections_all.num_elements(), std::plus<>{}); } @@ -453,13 +370,13 @@ class projector_all { if(locally_empty_[iproj]) continue; namespace blas = boost::multi::blas; - blas::real_doubled(sphere_phi_all[iproj]) = blas::gemm(1., blas::T(matrices_[iproj]), blas::real_doubled(projections_all[iproj])); - auto rpa = rprojections_all[iproj].template reinterpret_array_cast(3).rotated().flatted().unrotated(); auto sra = sphere_rphi_all[iproj].template reinterpret_array_cast(3).rotated().flatted().unrotated(); blas::real_doubled(sra) = blas::gemm(1., blas::T(matrices_[iproj]), blas::real_doubled(rpa)); } + auto sphere_phi_all = project(phi, kpoint); + for(auto iproj = 0; iproj < nprojs_; iproj++){ if(locally_empty_[iproj]) continue; diff --git a/src/observables/forces_stress.hpp b/src/observables/forces_stress.hpp index c4bf8eea..33fb5545 100644 --- a/src/observables/forces_stress.hpp +++ b/src/observables/forces_stress.hpp @@ -126,10 +126,10 @@ struct forces_stress { auto iphi = 0; for(auto & phi : electrons.kpin()){ - auto gphi = operations::gradient(phi, /* factor = */ 1.0, /*shift = */ phi.kpoint()); + auto gphi = operations::gradient(phi, /* factor = */ 1.0, /*shift = */ phi.kpoint() + ham.uniform_vector_potential()); observables::density::calculate_gradient_add(electrons.occupations()[iphi], phi, gphi, gdensity); - ham.projectors_all().force(phi, gphi, ions.cell().metric(), electrons.occupations()[iphi], ham.uniform_vector_potential(), forces_non_local); + ham.projectors_all().force(phi, gphi, ions.cell().metric(), electrons.occupations()[iphi], ham.uniform_vector_potential(), phi.kpoint() + ham.uniform_vector_potential(), forces_non_local); stress += stress_kinetic(gphi, electrons.occupations()[iphi]);