diff --git a/exachem/cc/cc2/cd_cc2_cs.cpp b/exachem/cc/cc2/cd_cc2_cs.cpp index 3b64158..b67b440 100644 --- a/exachem/cc/cc2/cd_cc2_cs.cpp +++ b/exachem/cc/cc2/cd_cc2_cs.cpp @@ -14,7 +14,6 @@ namespace cc2_cs { using CCEType = double; CCSE_Tensors _a021; TiledIndexSpace o_alpha, v_alpha, o_beta, v_beta; -bool has_gpu_tmp; Tensor _a01V, _a02V, _a007V; CCSE_Tensors _a01, _a02, _a03, _a04, _a05, _a06, _a001, _a004, _a006, _a008, _a009, _a017, @@ -303,7 +302,6 @@ std::tuple cc2_cs::cd_cc2_cs_driver( const TiledIndexSpace& O = MO("occ"); const TiledIndexSpace& V = MO("virt"); auto [cind] = CI.labels<1>("all"); - has_gpu_tmp = ec.has_gpu(); const int otiles = O.num_tiles(); const int vtiles = V.num_tiles(); diff --git a/exachem/cc/cc2/cd_cc2_os.cpp b/exachem/cc/cc2/cd_cc2_os.cpp index 9f34326..1c7b179 100644 --- a/exachem/cc/cc2/cd_cc2_os.cpp +++ b/exachem/cc/cc2/cd_cc2_os.cpp @@ -13,7 +13,6 @@ namespace cc2_os { using CCEType = double; CCSE_Tensors _a021_os; TiledIndexSpace o_alpha_os, v_alpha_os, o_beta_os, v_beta_os; -bool has_gpu_tmp_os; Tensor _a01V_os, _a02V_os, _a007V_os; CCSE_Tensors _a01_os, _a02_os, _a03_os, _a04_os, _a05_os, _a06_os, _a001_os, _a004_os, diff --git a/exachem/cc/ccsd/cd_ccsd_cs_ann.cpp b/exachem/cc/ccsd/cd_ccsd_cs_ann.cpp index 4ccd883..cd85e3f 100644 --- a/exachem/cc/ccsd/cd_ccsd_cs_ann.cpp +++ b/exachem/cc/ccsd/cd_ccsd_cs_ann.cpp @@ -13,7 +13,6 @@ using CCEType = double; CCSE_Tensors _a021; Tensor a22_abab, a22_aaaa, a22_bbbb; TiledIndexSpace o_alpha, v_alpha, o_beta, v_beta; -bool has_gpu_tmp; Tensor _a01V, _a02V, _a007V; CCSE_Tensors _a01, _a02, _a03, _a04, _a05, _a06, _a001, _a004, _a006, _a008, _a009, _a017, @@ -41,24 +40,24 @@ void ccsd_e_cs(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace& sch (t2_aaaa_temp()=0) .exact_copy(t2_aaaa(p1_va, p2_va, h1_oa, h2_oa), t2_abab(p1_va, p2_va, h1_oa, h2_oa)) - (t2_aaaa_temp() = t2_aaaa(), + (t2_aaaa_temp() = t2_aaaa(), "t2_aaaa_temp() = t2_aaaa()") - (t2_aaaa(p1_va,p2_va,h1_oa,h2_oa) += -1.0 * t2_aaaa_temp(p2_va,p1_va,h1_oa,h2_oa), + (t2_aaaa(p1_va,p2_va,h1_oa,h2_oa) += -1.0 * t2_aaaa_temp(p2_va,p1_va,h1_oa,h2_oa), "t2_aaaa(p1_va,p2_va,h1_oa,h2_oa) += -1.0 * t2_aaaa_temp(p2_va,p1_va,h1_oa,h2_oa)") - (t2_aaaa_temp(p1_va,p2_va,h1_oa,h2_oa) += 1.0 * t2_aaaa(p2_va,p1_va,h2_oa,h1_oa), + (t2_aaaa_temp(p1_va,p2_va,h1_oa,h2_oa) += 1.0 * t2_aaaa(p2_va,p1_va,h2_oa,h1_oa), "t2_aaaa_temp(p1_va,p2_va,h1_oa,h2_oa) += 1.0 * t2_aaaa(p2_va,p1_va,h2_oa,h1_oa)") - (_a01V(cind) = t1_aa(p1_va, h1_oa) * chol3d_ov("aa")(h1_oa, p1_va, cind), + (_a01V(cind) = t1_aa(p1_va, h1_oa) * chol3d_ov("aa")(h1_oa, p1_va, cind), "_a01V(cind) = t1_aa(p1_va, h1_oa) * chol3d_ov( aa )(h1_oa, p1_va, cind)") - (_a02("aa")(h1_oa, h2_oa, cind) = t1_aa(p1_va, h1_oa) * chol3d_ov("aa")(h2_oa, p1_va, cind), + (_a02("aa")(h1_oa, h2_oa, cind) = t1_aa(p1_va, h1_oa) * chol3d_ov("aa")(h2_oa, p1_va, cind), "_a02( aa )(h1_oa, h2_oa, cind) = t1_aa(p1_va, h1_oa) * chol3d_ov( aa )(h2_oa, p1_va, cind)") - (_a03("aa")(h2_oa, p2_va, cind) = t2_aaaa_temp(p2_va, p1_va, h2_oa, h1_oa) * chol3d_ov("aa")(h1_oa, p1_va, cind), + (_a03("aa")(h2_oa, p2_va, cind) = t2_aaaa_temp(p2_va, p1_va, h2_oa, h1_oa) * chol3d_ov("aa")(h1_oa, p1_va, cind), "_a03( aa )(h2_oa, p2_va, cind) = t2_aaaa_temp(p2_va, p1_va, h2_oa, h1_oa) * chol3d_ov( aa )(h1_oa, p1_va, cind)") - (de() = 2.0 * _a01V() * _a01V(), + (de() = 2.0 * _a01V() * _a01V(), "de() = 2.0 * _a01V() * _a01V()") - (de() += -1.0 * _a02("aa")(h1_oa, h2_oa, cind) * _a02("aa")(h2_oa, h1_oa, cind), + (de() += -1.0 * _a02("aa")(h1_oa, 
h2_oa, cind) * _a02("aa")(h2_oa, h1_oa, cind), "de() += -1.0 * _a02( aa )(h1_oa, h2_oa, cind) * _a02( aa )(h2_oa, h1_oa, cind)") - (de() += 1.0 * _a03("aa")(h1_oa, p1_va, cind) * chol3d_ov("aa")(h1_oa, p1_va, cind), + (de() += 1.0 * _a03("aa")(h1_oa, p1_va, cind) * chol3d_ov("aa")(h1_oa, p1_va, cind), "de() += 1.0 * _a03( aa )(h1_oa, p1_va, cind) * chol3d_ov( aa )(h1_oa, p1_va, cind)") (de() += 2.0 * t1_aa(p1_va, h1_oa) * f1_ov("aa")(h1_oa, p1_va), "de() += 2.0 * t1_aa(p1_va, h1_oa) * f1_ov( aa )(h1_oa, p1_va)") // NEW TERM @@ -90,50 +89,50 @@ void ccsd_t1_cs(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace // clang-format off sch - (i0_aa(p2_va, h1_oa) = 1.0 * f1_ov("aa")(h1_oa, p2_va), + (i0_aa(p2_va, h1_oa) = 1.0 * f1_ov("aa")(h1_oa, p2_va), "i0_aa(p2_va, h1_oa) = 1.0 * f1_ov( aa )(h1_oa, p2_va)") - (_a01("aa")(h2_oa, h1_oa, cind) = 1.0 * t1_aa(p1_va, h1_oa) * chol3d_ov("aa")(h2_oa, p1_va, cind), + (_a01("aa")(h2_oa, h1_oa, cind) = 1.0 * t1_aa(p1_va, h1_oa) * chol3d_ov("aa")(h2_oa, p1_va, cind), "_a01( aa )(h2_oa, h1_oa, cind) = 1.0 * t1_aa(p1_va, h1_oa) * chol3d_ov( aa )(h2_oa, p1_va, cind)") // ovm - (_a02V(cind) = 2.0 * t1_aa(p1_va, h1_oa) * chol3d_ov("aa")(h1_oa, p1_va, cind), + (_a02V(cind) = 2.0 * t1_aa(p1_va, h1_oa) * chol3d_ov("aa")(h1_oa, p1_va, cind), "_a02V(cind) = 2.0 * t1_aa(p1_va, h1_oa) * chol3d_ov( aa )(h1_oa, p1_va, cind)") // ovm // (_a02V(cind) = 2.0 * _a01("aa")(h1_oa, h1_oa, cind)) - (_a05("aa")(h2_oa, p1_va) = -1.0 * chol3d_ov("aa")(h1_oa, p1_va, cind) * _a01("aa")(h2_oa, h1_oa, cind), + (_a05("aa")(h2_oa, p1_va) = -1.0 * chol3d_ov("aa")(h1_oa, p1_va, cind) * _a01("aa")(h2_oa, h1_oa, cind), "_a05( aa )(h2_oa, p1_va) = -1.0 * chol3d_ov( aa )(h1_oa, p1_va, cind) * _a01( aa )(h2_oa, h1_oa, cind)") // o2vm (_a05("aa")(h2_oa, p1_va) += 1.0 * f1_ov("aa")(h2_oa, p1_va), "_a05( aa )(h2_oa, p1_va) += 1.0 * f1_ov( aa )(h2_oa, p1_va)") // NEW TERM // .exact_copy(_a05_bb(h1_ob,p1_vb),_a05_aa(h1_ob,p1_vb)) - (_a06("aa")(p1_va, h1_oa, cind) = -1.0 * t2_aaaa_temp(p1_va, p2_va, h1_oa, h2_oa) * chol3d_ov("aa")(h2_oa, p2_va, cind), + (_a06("aa")(p1_va, h1_oa, cind) = -1.0 * t2_aaaa_temp(p1_va, p2_va, h1_oa, h2_oa) * chol3d_ov("aa")(h2_oa, p2_va, cind), "_a06( aa )(p1_va, h1_oa, cind) = -1.0 * t2_aaaa_temp(p1_va, p2_va, h1_oa, h2_oa) * chol3d_ov( aa )(h2_oa, p2_va, cind)") // o2v2m - (_a04("aa")(h2_oa, h1_oa) = -1.0 * f1_oo("aa")(h2_oa, h1_oa), + (_a04("aa")(h2_oa, h1_oa) = -1.0 * f1_oo("aa")(h2_oa, h1_oa), "_a04( aa )(h2_oa, h1_oa) = -1.0 * f1_oo( aa )(h2_oa, h1_oa)") // MOVED TERM - (_a04("aa")(h2_oa, h1_oa) += 1.0 * chol3d_ov("aa")(h2_oa, p1_va, cind) * _a06("aa")(p1_va, h1_oa, cind), + (_a04("aa")(h2_oa, h1_oa) += 1.0 * chol3d_ov("aa")(h2_oa, p1_va, cind) * _a06("aa")(p1_va, h1_oa, cind), "_a04( aa )(h2_oa, h1_oa) += 1.0 * chol3d_ov( aa )(h2_oa, p1_va, cind) * _a06( aa )(p1_va, h1_oa, cind)") // o2vm (_a04("aa")(h2_oa, h1_oa) += -1.0 * t1_aa(p1_va, h1_oa) * f1_ov("aa")(h2_oa, p1_va), "_a04( aa )(h2_oa, h1_oa) += -1.0 * t1_aa(p1_va, h1_oa) * f1_ov( aa )(h2_oa, p1_va)") // NEW TERM - (i0_aa(p2_va, h1_oa) += 1.0 * t1_aa(p2_va, h2_oa) * _a04("aa")(h2_oa, h1_oa), + (i0_aa(p2_va, h1_oa) += 1.0 * t1_aa(p2_va, h2_oa) * _a04("aa")(h2_oa, h1_oa), "i0_aa(p2_va, h1_oa) += 1.0 * t1_aa(p2_va, h2_oa) * _a04( aa )(h2_oa, h1_oa)") // o2v - (i0_aa(p1_va, h2_oa) += 1.0 * chol3d_ov("aa")(h2_oa, p1_va, cind) * _a02V(cind), + (i0_aa(p1_va, h2_oa) += 1.0 * chol3d_ov("aa")(h2_oa, p1_va, cind) * _a02V(cind), "i0_aa(p1_va, h2_oa) += 1.0 * chol3d_ov( aa )(h2_oa, p1_va, cind) * _a02V(cind)") 
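
A minimal sketch of the operation form these hunks converge on: each TAMM scheduler statement carries a second string argument spelling out the contraction it performs (presumably used for logging or tracing). The names below (sch, de, _a01V, t1_aa, chol3d_ov, cind, execute) are taken from the surrounding hunks; treat this as an illustrative outline under those assumptions, not code from the patch itself.

    // each op is written as (tensor-expression, "human-readable description")
    sch
      (_a01V(cind) = t1_aa(p1_va, h1_oa) * chol3d_ov("aa")(h1_oa, p1_va, cind),
       "_a01V(cind) = t1_aa(p1_va, h1_oa) * chol3d_ov( aa )(h1_oa, p1_va, cind)")
      (de() = 2.0 * _a01V() * _a01V(),
       "de() = 2.0 * _a01V() * _a01V()")
      .execute();
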
// ovm - (i0_aa(p1_va, h2_oa) += 1.0 * t2_aaaa_temp(p1_va, p2_va, h2_oa, h1_oa) * _a05("aa")(h1_oa, p2_va), + (i0_aa(p1_va, h2_oa) += 1.0 * t2_aaaa_temp(p1_va, p2_va, h2_oa, h1_oa) * _a05("aa")(h1_oa, p2_va), "i0_aa(p1_va, h2_oa) += 1.0 * t2_aaaa_temp(p1_va, p2_va, h2_oa, h1_oa) * _a05( aa )(h1_oa, p2_va)") - (i0_aa(p2_va, h1_oa) += -1.0 * chol3d_vv("aa")(p2_va, p1_va, cind) * _a06("aa")(p1_va, h1_oa, cind), + (i0_aa(p2_va, h1_oa) += -1.0 * chol3d_vv("aa")(p2_va, p1_va, cind) * _a06("aa")(p1_va, h1_oa, cind), "i0_aa(p2_va, h1_oa) += -1.0 * chol3d_vv( aa )(p2_va, p1_va, cind) * _a06( aa )(p1_va, h1_oa, cind)") // ov2m - (_a06("aa")(p2_va, h2_oa, cind) += -1.0 * t1_aa(p1_va, h2_oa) * chol3d_vv("aa")(p2_va, p1_va, cind), + (_a06("aa")(p2_va, h2_oa, cind) += -1.0 * t1_aa(p1_va, h2_oa) * chol3d_vv("aa")(p2_va, p1_va, cind), "_a06( aa )(p2_va, h2_oa, cind) += -1.0 * t1_aa(p1_va, h2_oa) * chol3d_vv( aa )(p2_va, p1_va, cind)") // ov2m - (i0_aa(p1_va, h2_oa) += -1.0 * _a06("aa")(p1_va, h2_oa, cind) * _a02V(cind), + (i0_aa(p1_va, h2_oa) += -1.0 * _a06("aa")(p1_va, h2_oa, cind) * _a02V(cind), "i0_aa(p1_va, h2_oa) += -1.0 * _a06( aa )(p1_va, h2_oa, cind) * _a02V(cind)") // ovm - (_a06("aa")(p2_va, h1_oa, cind) += -1.0 * t1_aa(p2_va, h1_oa) * _a02V(cind), + (_a06("aa")(p2_va, h1_oa, cind) += -1.0 * t1_aa(p2_va, h1_oa) * _a02V(cind), "_a06( aa )(p2_va, h1_oa, cind) += -1.0 * t1_aa(p2_va, h1_oa) * _a02V(cind)") // ovm - (_a06("aa")(p2_va, h1_oa, cind) += 1.0 * t1_aa(p2_va, h2_oa) * _a01("aa")(h2_oa, h1_oa, cind), + (_a06("aa")(p2_va, h1_oa, cind) += 1.0 * t1_aa(p2_va, h2_oa) * _a01("aa")(h2_oa, h1_oa, cind), "_a06( aa )(p2_va, h1_oa, cind) += 1.0 * t1_aa(p2_va, h2_oa) * _a01( aa )(h2_oa, h1_oa, cind)") // o2vm - (_a01("aa")(h2_oa, h1_oa, cind) += 1.0 * chol3d_oo("aa")(h2_oa, h1_oa, cind), + (_a01("aa")(h2_oa, h1_oa, cind) += 1.0 * chol3d_oo("aa")(h2_oa, h1_oa, cind), "_a01( aa )(h2_oa, h1_oa, cind) += 1.0 * chol3d_oo( aa )(h2_oa, h1_oa, cind)") // o2m - (i0_aa(p2_va, h1_oa) += 1.0 * _a01("aa")(h2_oa, h1_oa, cind) * _a06("aa")(p2_va, h2_oa, cind), + (i0_aa(p2_va, h1_oa) += 1.0 * _a01("aa")(h2_oa, h1_oa, cind) * _a06("aa")(p2_va, h2_oa, cind), "i0_aa(p2_va, h1_oa) += 1.0 * _a01( aa )(h2_oa, h1_oa, cind) * _a06( aa )(p2_va, h2_oa, cind)") // o2vm // (i0_aa(p2_va, h1_oa) += -1.0 * t1_aa(p2_va, h2_oa) * f1_oo("aa")(h2_oa, h1_oa), // MOVED ABOVE // "i0_aa(p2_va, h1_oa) += -1.0 * t1_aa(p2_va, h2_oa) * f1_oo( aa )(h2_oa, h1_oa)") // o2v - (i0_aa(p2_va, h1_oa) += 1.0 * t1_aa(p1_va, h1_oa) * f1_vv("aa")(p2_va, p1_va), + (i0_aa(p2_va, h1_oa) += 1.0 * t1_aa(p1_va, h1_oa) * f1_vv("aa")(p2_va, p1_va), "i0_aa(p2_va, h1_oa) += 1.0 * t1_aa(p1_va, h1_oa) * f1_vv( aa )(p2_va, p1_va)") // ov2 ; // clang-format on @@ -271,7 +270,7 @@ void ccsd_t2_cs(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace // for(auto itval=loop_nest.begin(); itval!=loop_nest.end(); ++itval) {} auto compute_v4_term = [=](const IndexVector& cblkid, span cbuf) { - std::vector*> add_bufs; + auto& memHostPool = tamm::RMMMemoryManager::getInstance().getHostMemoryPool(); // compute blockids from the loop indices. 
itval is the loop index // execute_bufacc(ec, hw); @@ -297,37 +296,23 @@ void ccsd_t2_cs(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace // } const size_t csize = ctensor.block_size(translated_cblockid); - // std::vector cbuf(csize, 0); - memset(cbuf.data(), 0x00, csize * sizeof(TensorElType1)); + std::memset(cbuf.data(), 0, csize * sizeof(TensorElType1)); const auto& cdims = ctensor.block_dims(translated_cblockid); SizeVec cdims_sz; for(const auto v: cdims) { cdims_sz.push_back(v); } - bool isgpu = false; - + AddBuf* ab{nullptr}; #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) TensorElType2* th_a{nullptr}; TensorElType3* th_b{nullptr}; - TensorElType1* th_c{nullptr}; - auto& thandle = tamm::GPUStreamPool::getInstance().getStream(); + auto& thandle = GPUStreamPool::getInstance().getStream(); - AddBuf* ab{nullptr}; - if(hw == ExecutionHW::GPU) { - ab = new AddBuf{ - isgpu, &thandle, th_a, th_b, th_c, {}, translated_cblockid}; - } - else { - ab = new AddBuf{ - isgpu, &thandle, th_a, th_b, th_c, {}, translated_cblockid}; - } - add_bufs.push_back(ab); + ab = + new AddBuf{th_a, th_b, {}, translated_cblockid}; #else - gpuStream_t thandle{}; - AddBuf* ab = - new AddBuf{ - isgpu, ctensor, {}, translated_cblockid}; - add_bufs.push_back(ab); + gpuStream_t thandle{}; + ab = new AddBuf{ctensor, {}, translated_cblockid}; #endif // LabelLoopNest inner_loop{reduction_lbls}; @@ -338,19 +323,21 @@ void ccsd_t2_cs(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace TensorElType1* cbuf_dev_ptr{nullptr}; TensorElType1* cbuf_tmp_dev_ptr{nullptr}; #if(defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)) - auto& memPool = tamm::GPUPooledStorageManager::getInstance(); + auto& memDevicePool = tamm::RMMMemoryManager::getInstance().getDeviceMemoryPool(); if(hw == ExecutionHW::GPU) { - cbuf_dev_ptr = static_cast(memPool.allocate(csize * sizeof(TensorElType1))); + cbuf_dev_ptr = + static_cast(memDevicePool.allocate(csize * sizeof(TensorElType1))); cbuf_tmp_dev_ptr = - static_cast(memPool.allocate(csize * sizeof(TensorElType1))); + static_cast(memDevicePool.allocate(csize * sizeof(TensorElType1))); - memPool.gpuMemset(reinterpret_cast(cbuf_dev_ptr), csize * sizeof(TensorElType1)); - memPool.gpuMemset(reinterpret_cast(cbuf_tmp_dev_ptr), csize * sizeof(TensorElType1)); + gpuMemsetAsync(reinterpret_cast(cbuf_dev_ptr), csize * sizeof(TensorElType1), + thandle); + gpuMemsetAsync(reinterpret_cast(cbuf_tmp_dev_ptr), csize * sizeof(TensorElType1), + thandle); } #endif - int slc = 0; for(const auto& inner_it_val: inner_loop) { // k IndexVector a_block_id(rhs1_.labels().size()); @@ -383,11 +370,13 @@ void ccsd_t2_cs(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace const size_t asize = atensor.block_size(translated_ablockid); const size_t bsize = btensor.block_size(translated_bblockid); - std::vector abuf(asize); - std::vector bbuf(bsize); + TensorElType2* abuf{nullptr}; + TensorElType3* bbuf{nullptr}; + abuf = static_cast(memHostPool.allocate(asize * sizeof(TensorElType2))); + bbuf = static_cast(memHostPool.allocate(bsize * sizeof(TensorElType3))); - atensor.get(translated_ablockid, abuf); - btensor.get(translated_bblockid, bbuf); + atensor.get(translated_ablockid, {abuf, asize}); + btensor.get(translated_bblockid, {bbuf, bsize}); const auto& adims = atensor.block_dims(translated_ablockid); const auto& bdims = btensor.block_dims(translated_bblockid); @@ -400,38 +389,39 @@ void ccsd_t2_cs(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace 
for(const auto v: bdims) { bdims_sz.push_back(v); } // A*B - { - AddBuf* abptr{nullptr}; + TensorElType2* abuf_dev{nullptr}; + TensorElType3* bbuf_dev{nullptr}; + #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) if(hw == ExecutionHW::GPU) { - abptr = ab; - ab->ta_ = static_cast(memPool.allocate(asize * sizeof(TensorElType2))); - ab->tb_ = static_cast(memPool.allocate(bsize * sizeof(TensorElType3))); + abuf_dev = + static_cast(memDevicePool.allocate(asize * sizeof(TensorElType2))); + bbuf_dev = + static_cast(memDevicePool.allocate(bsize * sizeof(TensorElType3))); + + gpuMemcpyAsync(abuf_dev, abuf, asize, gpuMemcpyHostToDevice, thandle); + gpuMemcpyAsync(bbuf_dev, bbuf, bsize, gpuMemcpyHostToDevice, thandle); } - else abptr = add_bufs[0]; -#else - abptr = add_bufs[0]; #endif - abptr->abuf_ = std::move(abuf); - abptr->bbuf_ = std::move(bbuf); kernels::block_multiply( - abptr->isgpu_, #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) - abptr->ta_, abptr->tb_, + abuf_dev, bbuf_dev, #endif - thandle, 1.0, (abptr->abuf_).data(), adims_sz, rhs1_int_labels_, (abptr->bbuf_).data(), - bdims_sz, rhs2_int_labels_, cscale, cbuf.data(), cdims_sz, lhs_int_labels_, hw, - has_gpu_tmp, false, cbuf_dev_ptr, cbuf_tmp_dev_ptr); - } + thandle, 1.0, abuf, adims_sz, rhs1_int_labels_, bbuf, bdims_sz, rhs2_int_labels_, cscale, + cbuf.data(), cdims_sz, lhs_int_labels_, hw, false, cbuf_dev_ptr, cbuf_tmp_dev_ptr); + #if(defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)) - if(hw == ExecutionHW::GPU) { - memPool.deallocate(static_cast(ab->ta_), (ab->abuf_).size() * sizeof(TensorElType2)); - memPool.deallocate(static_cast(ab->tb_), (ab->bbuf_).size() * sizeof(TensorElType3)); - } + if(hw == ExecutionHW::GPU) { + memDevicePool.deallocate(abuf_dev, asize * sizeof(TensorElType2)); + memDevicePool.deallocate(bbuf_dev, bsize * sizeof(TensorElType3)); + } #endif - slc++; + } // A * B + + memHostPool.deallocate(abuf, asize * sizeof(TensorElType2)); + memHostPool.deallocate(bbuf, bsize * sizeof(TensorElType3)); } // end of reduction loop // add the computed update to the tensor @@ -439,117 +429,121 @@ void ccsd_t2_cs(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace #if(defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)) // copy to host if(hw == ExecutionHW::GPU) { - std::vector cbuf_tmp(csize, 0); - kernels::copy_result_to_host(hw, thandle, cbuf_tmp, cbuf_dev_ptr); + TensorElType1* cbuf_tmp{nullptr}; + cbuf_tmp = static_cast(memHostPool.allocate(csize * sizeof(TensorElType1))); + std::memset(cbuf_tmp, 0, csize * sizeof(TensorElType1)); + gpuMemcpyAsync(cbuf_tmp, cbuf_dev_ptr, csize, gpuMemcpyDeviceToHost, + thandle); // cbuf+=cbuf_tmp - kernels::stream_synchronize(thandle); - blas::axpy(csize, TensorElType1{1}, cbuf_tmp.data(), 1, cbuf.data(), 1); + gpuStreamSynchronize(thandle); + blas::axpy(csize, TensorElType1{1}, cbuf_tmp, 1, cbuf.data(), 1); // free cbuf_dev_ptr - auto& memPool = tamm::GPUPooledStorageManager::getInstance(); - memPool.deallocate(static_cast(cbuf_dev_ptr), csize * sizeof(TensorElType1)); - memPool.deallocate(static_cast(cbuf_tmp_dev_ptr), csize * sizeof(TensorElType1)); + memDevicePool.deallocate(static_cast(cbuf_dev_ptr), csize * sizeof(TensorElType1)); + memDevicePool.deallocate(static_cast(cbuf_tmp_dev_ptr), + csize * sizeof(TensorElType1)); + + memHostPool.deallocate(cbuf_tmp, csize * sizeof(TensorElType1)); } #endif // ctensor.add(translated_cblockid, cbuf); // for (size_t i=0;i{{v_alpha, v_beta, v_alpha, v_beta}, compute_v4_term}; // 
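
The compute_v4_term changes above drop the std::vector staging buffers and the GPUPooledStorageManager in favor of pooled allocations from tamm::RMMMemoryManager, and replace the copy_result_to_host helper with explicit async copies on the stream handle. A condensed sketch of the resulting flow, using only calls visible in this hunk (buffer sizes, element types, and the exact TAMM signatures are assumed from the surrounding code, so read it as a hedged outline rather than the definitive implementation):

    auto& memHostPool   = tamm::RMMMemoryManager::getInstance().getHostMemoryPool();
    auto& memDevicePool = tamm::RMMMemoryManager::getInstance().getDeviceMemoryPool();
    auto& thandle       = GPUStreamPool::getInstance().getStream();

    // host staging buffers now come from the RMM host pool instead of std::vector
    auto* abuf = static_cast<TensorElType2*>(memHostPool.allocate(asize * sizeof(TensorElType2)));
    atensor.get(translated_ablockid, {abuf, asize});

    // per-contraction device copies, freed right after kernels::block_multiply
    auto* abuf_dev = static_cast<TensorElType2*>(memDevicePool.allocate(asize * sizeof(TensorElType2)));
    gpuMemcpyAsync(abuf_dev, abuf, asize, gpuMemcpyHostToDevice, thandle);
    // ... kernels::block_multiply(...) consumes abuf/abuf_dev (and bbuf/bbuf_dev) ...
    memDevicePool.deallocate(abuf_dev, asize * sizeof(TensorElType2));
    memHostPool.deallocate(abuf, asize * sizeof(TensorElType2));

    // accumulate the device-side result into the host block buffer
    auto* cbuf_tmp = static_cast<TensorElType1*>(memHostPool.allocate(csize * sizeof(TensorElType1)));
    gpuMemcpyAsync(cbuf_tmp, cbuf_dev_ptr, csize, gpuMemcpyDeviceToHost, thandle);
    gpuStreamSynchronize(thandle);
    blas::axpy(csize, TensorElType1{1}, cbuf_tmp, 1, cbuf.data(), 1);
    memHostPool.deallocate(cbuf_tmp, csize * sizeof(TensorElType1));
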
clang-format off sch - (_a017("aa")(p1_va, h2_oa, cind) = -1.0 * t2_aaaa_temp(p1_va, p2_va, h2_oa, h1_oa) * chol3d_ov("aa")(h1_oa, p2_va, cind), + (_a017("aa")(p1_va, h2_oa, cind) = -1.0 * t2_aaaa_temp(p1_va, p2_va, h2_oa, h1_oa) * chol3d_ov("aa")(h1_oa, p2_va, cind), "_a017( aa )(p1_va, h2_oa, cind) = -1.0 * t2_aaaa_temp(p1_va, p2_va, h2_oa, h1_oa) * chol3d_ov( aa )(h1_oa, p2_va, cind)") - (_a006("aa")(h2_oa, h1_oa) = -1.0 * chol3d_ov("aa")(h2_oa, p2_va, cind) * _a017("aa")(p2_va, h1_oa, cind), + (_a006("aa")(h2_oa, h1_oa) = -1.0 * chol3d_ov("aa")(h2_oa, p2_va, cind) * _a017("aa")(p2_va, h1_oa, cind), "_a006( aa )(h2_oa, h1_oa) = -1.0 * chol3d_ov( aa )(h2_oa, p2_va, cind) * _a017( aa )(p2_va, h1_oa, cind)") - (_a007V(cind) = 2.0 * chol3d_ov("aa")(h1_oa, p1_va, cind) * t1_aa(p1_va, h1_oa), + (_a007V(cind) = 2.0 * chol3d_ov("aa")(h1_oa, p1_va, cind) * t1_aa(p1_va, h1_oa), "_a007V(cind) = 2.0 * chol3d_ov( aa )(h1_oa, p1_va, cind) * t1_aa(p1_va, h1_oa)") - (_a009("aa")(h1_oa, h2_oa, cind) = 1.0 * chol3d_ov("aa")(h1_oa, p1_va, cind) * t1_aa(p1_va, h2_oa), + (_a009("aa")(h1_oa, h2_oa, cind) = 1.0 * chol3d_ov("aa")(h1_oa, p1_va, cind) * t1_aa(p1_va, h2_oa), "_a009( aa )(h1_oa, h2_oa, cind) = 1.0 * chol3d_ov( aa )(h1_oa, p1_va, cind) * t1_aa(p1_va, h2_oa)") - (_a021("aa")(p2_va, p1_va, cind) = -0.5 * chol3d_ov("aa")(h1_oa, p1_va, cind) * t1_aa(p2_va, h1_oa), + (_a021("aa")(p2_va, p1_va, cind) = -0.5 * chol3d_ov("aa")(h1_oa, p1_va, cind) * t1_aa(p2_va, h1_oa), "_a021( aa )(p2_va, p1_va, cind) = -0.5 * chol3d_ov( aa )(h1_oa, p1_va, cind) * t1_aa(p2_va, h1_oa)") - (_a021("aa")(p2_va, p1_va, cind) += 0.5 * chol3d_vv("aa")(p2_va, p1_va, cind), + (_a021("aa")(p2_va, p1_va, cind) += 0.5 * chol3d_vv("aa")(p2_va, p1_va, cind), "_a021( aa )(p2_va, p1_va, cind) += 0.5 * chol3d_vv( aa )(p2_va, p1_va, cind)") - (_a017("aa")(p1_va, h2_oa, cind) += -2.0 * t1_aa(p2_va, h2_oa) * _a021("aa")(p1_va, p2_va, cind), + (_a017("aa")(p1_va, h2_oa, cind) += -2.0 * t1_aa(p2_va, h2_oa) * _a021("aa")(p1_va, p2_va, cind), "_a017( aa )(p1_va, h2_oa, cind) += -2.0 * t1_aa(p2_va, h2_oa) * _a021( aa )(p1_va, p2_va, cind)") - (_a008("aa")(h2_oa, h1_oa, cind) = 1.0 * _a009("aa")(h2_oa, h1_oa, cind), + (_a008("aa")(h2_oa, h1_oa, cind) = 1.0 * _a009("aa")(h2_oa, h1_oa, cind), "_a008( aa )(h2_oa, h1_oa, cind) = 1.0 * _a009( aa )(h2_oa, h1_oa, cind)") - (_a009("aa")(h2_oa, h1_oa, cind) += 1.0 * chol3d_oo("aa")(h2_oa, h1_oa, cind), + (_a009("aa")(h2_oa, h1_oa, cind) += 1.0 * chol3d_oo("aa")(h2_oa, h1_oa, cind), "_a009( aa )(h2_oa, h1_oa, cind) += 1.0 * chol3d_oo( aa )(h2_oa, h1_oa, cind)") .exact_copy(_a009("bb")(h2_ob,h1_ob,cind),_a009("aa")(h2_ob,h1_ob,cind)) .exact_copy(_a021("bb")(p2_vb,p1_vb,cind),_a021("aa")(p2_vb,p1_vb,cind)) - (_a001("aa")(p1_va, p2_va) = -2.0 * _a021("aa")(p1_va, p2_va, cind) * _a007V(cind), + (_a001("aa")(p1_va, p2_va) = -2.0 * _a021("aa")(p1_va, p2_va, cind) * _a007V(cind), "_a001( aa )(p1_va, p2_va) = -2.0 * _a021( aa )(p1_va, p2_va, cind) * _a007V(cind)") - (_a001("aa")(p1_va, p2_va) += -1.0 * _a017("aa")(p1_va, h2_oa, cind) * chol3d_ov("aa")(h2_oa, p2_va, cind), + (_a001("aa")(p1_va, p2_va) += -1.0 * _a017("aa")(p1_va, h2_oa, cind) * chol3d_ov("aa")(h2_oa, p2_va, cind), "_a001( aa )(p1_va, p2_va) += -1.0 * _a017( aa )(p1_va, h2_oa, cind) * chol3d_ov( aa )(h2_oa, p2_va, cind)") - (_a006("aa")(h2_oa, h1_oa) += 1.0 * _a009("aa")(h2_oa, h1_oa, cind) * _a007V(cind), + (_a006("aa")(h2_oa, h1_oa) += 1.0 * _a009("aa")(h2_oa, h1_oa, cind) * _a007V(cind), "_a006( aa )(h2_oa, h1_oa) += 1.0 * _a009( aa )(h2_oa, h1_oa, 
cind) * _a007V(cind)") - (_a006("aa")(h3_oa, h1_oa) += -1.0 * _a009("aa")(h2_oa, h1_oa, cind) * _a008("aa")(h3_oa, h2_oa, cind), + (_a006("aa")(h3_oa, h1_oa) += -1.0 * _a009("aa")(h2_oa, h1_oa, cind) * _a008("aa")(h3_oa, h2_oa, cind), "_a006( aa )(h3_oa, h1_oa) += -1.0 * _a009( aa )(h2_oa, h1_oa, cind) * _a008( aa )(h3_oa, h2_oa, cind)") - (_a019("abab")(h2_oa, h1_ob, h1_oa, h2_ob) = 0.25 * _a009("aa")(h2_oa, h1_oa, cind) * _a009("bb")(h1_ob, h2_ob, cind), + (_a019("abab")(h2_oa, h1_ob, h1_oa, h2_ob) = 0.25 * _a009("aa")(h2_oa, h1_oa, cind) * _a009("bb")(h1_ob, h2_ob, cind), "_a019( abab )(h2_oa, h1_ob, h1_oa, h2_ob) = 0.25 * _a009( aa )(h2_oa, h1_oa, cind) * _a009( bb )(h1_ob, h2_ob, cind)") - (_a020("aaaa")(p2_va, h2_oa, p1_va, h1_oa) = -2.0 * _a009("aa")(h2_oa, h1_oa, cind) * _a021("aa")(p2_va, p1_va, cind), + (_a020("aaaa")(p2_va, h2_oa, p1_va, h1_oa) = -2.0 * _a009("aa")(h2_oa, h1_oa, cind) * _a021("aa")(p2_va, p1_va, cind), "_a020( aaaa )(p2_va, h2_oa, p1_va, h1_oa) = -2.0 * _a009( aa )(h2_oa, h1_oa, cind) * _a021( aa )(p2_va, p1_va, cind)") .exact_copy(_a020("baba")(p2_vb, h2_oa, p1_vb, h1_oa),_a020("aaaa")(p2_vb, h2_oa, p1_vb, h1_oa)) - (_a020("aaaa")(p1_va, h3_oa, p3_va, h2_oa) += 0.5 * _a004("aaaa")(p2_va, p3_va, h3_oa, h1_oa) * t2_aaaa(p1_va,p2_va,h1_oa,h2_oa), - "_a020( aaaa )(p1_va, h3_oa, p3_va, h2_oa) += 0.5 * _a004( aaaa )(p2_va, p3_va, h3_oa, h1_oa) * t2_aaaa(p1_va,p2_va,h1_oa,h2_oa)") - (_a020("baab")(p1_vb, h2_oa, p1_va, h2_ob) = -0.5 * _a004("aaaa")(p2_va, p1_va, h2_oa, h1_oa) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob), - "_a020( baab )(p1_vb, h2_oa, p1_va, h2_ob) = -0.5 * _a004( aaaa )(p2_va, p1_va, h2_oa, h1_oa) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob)") - (_a020("baba")(p1_vb, h1_oa, p2_vb, h2_oa) += 0.5 * _a004("abab")(p1_va, p2_vb, h1_oa, h1_ob) * t2_abab(p1_va,p1_vb,h2_oa,h1_ob), + (_a020("aaaa")(p1_va, h3_oa, p3_va, h2_oa) += 0.5 * _a004("aaaa")(p2_va, p3_va, h3_oa, h1_oa) * t2_aaaa(p1_va,p2_va,h1_oa,h2_oa), + "_a020( aaaa )(p1_va, h3_oa, p3_va, h2_oa) += 0.5 * _a004( aaaa )(p2_va, p3_va, h3_oa, h1_oa) * t2_aaaa(p1_va,p2_va,h1_oa,h2_oa)") + (_a020("baab")(p1_vb, h2_oa, p1_va, h2_ob) = -0.5 * _a004("aaaa")(p2_va, p1_va, h2_oa, h1_oa) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob), + "_a020( baab )(p1_vb, h2_oa, p1_va, h2_ob) = -0.5 * _a004( aaaa )(p2_va, p1_va, h2_oa, h1_oa) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob)") + (_a020("baba")(p1_vb, h1_oa, p2_vb, h2_oa) += 0.5 * _a004("abab")(p1_va, p2_vb, h1_oa, h1_ob) * t2_abab(p1_va,p1_vb,h2_oa,h1_ob), "_a020( baba )(p1_vb, h1_oa, p2_vb, h2_oa) += 0.5 * _a004( abab )(p1_va, p2_vb, h1_oa, h1_ob) * t2_abab(p1_va,p1_vb,h2_oa,h1_ob)") - (_a017("aa")(p1_va, h2_oa, cind) += 1.0 * t1_aa(p1_va, h1_oa) * chol3d_oo("aa")(h1_oa, h2_oa, cind), + (_a017("aa")(p1_va, h2_oa, cind) += 1.0 * t1_aa(p1_va, h1_oa) * chol3d_oo("aa")(h1_oa, h2_oa, cind), "_a017( aa )(p1_va, h2_oa, cind) += 1.0 * t1_aa(p1_va, h1_oa) * chol3d_oo( aa )(h1_oa, h2_oa, cind)") - (_a017("aa")(p1_va, h2_oa, cind) += -1.0 * chol3d_ov("aa")(h2_oa, p1_va, cind), + (_a017("aa")(p1_va, h2_oa, cind) += -1.0 * chol3d_ov("aa")(h2_oa, p1_va, cind), "_a017( aa )(p1_va, h2_oa, cind) += -1.0 * chol3d_ov( aa )(h2_oa, p1_va, cind)") - (_a001("aa")(p2_va, p1_va) += -1.0 * f1_vv("aa")(p2_va, p1_va), + (_a001("aa")(p2_va, p1_va) += -1.0 * f1_vv("aa")(p2_va, p1_va), "_a001( aa )(p2_va, p1_va) += -1.0 * f1_vv( aa )(p2_va, p1_va)") (_a001("aa")(p2_va, p1_va) += 1.0 * t1_aa(p2_va, h1_oa) * f1_ov("aa")(h1_oa, p1_va), "_a001( aa )(p2_va, p1_va) += 1.0 * t1_aa(p2_va, h1_oa) * f1_ov( aa )(h1_oa, p1_va)") // NEW TERM - 
(_a006("aa")(h2_oa, h1_oa) += 1.0 * f1_oo("aa")(h2_oa, h1_oa), + (_a006("aa")(h2_oa, h1_oa) += 1.0 * f1_oo("aa")(h2_oa, h1_oa), "_a006( aa )(h2_oa, h1_oa) += 1.0 * f1_oo( aa )(h2_oa, h1_oa)") - (_a006("aa")(h2_oa, h1_oa) += 1.0 * t1_aa(p1_va, h1_oa) * f1_ov("aa")(h2_oa, p1_va), + (_a006("aa")(h2_oa, h1_oa) += 1.0 * t1_aa(p1_va, h1_oa) * f1_ov("aa")(h2_oa, p1_va), "_a006( aa )(h2_oa, h1_oa) += 1.0 * t1_aa(p1_va, h1_oa) * f1_ov( aa )(h2_oa, p1_va)") .exact_copy(_a017("bb")(p1_vb, h1_ob, cind), _a017("aa")(p1_vb, h1_ob, cind)) .exact_copy(_a006("bb")(h1_ob, h2_ob), _a006("aa")(h1_ob, h2_ob)) .exact_copy(_a001("bb")(p1_vb, p2_vb), _a001("aa")(p1_vb, p2_vb)) .exact_copy(_a021("bb")(p1_vb, p2_vb, cind), _a021("aa")(p1_vb, p2_vb, cind)) .exact_copy(_a020("bbbb")(p1_vb, h1_ob, p2_vb, h2_ob), _a020("aaaa")(p1_vb, h1_ob, p2_vb, h2_ob)) - - (i0_abab(p1_va, p2_vb, h2_oa, h1_ob) = 1.0 * _a020("bbbb")(p2_vb, h2_ob, p1_vb, h1_ob) * t2_abab(p1_va, p1_vb, h2_oa, h2_ob), + + (i0_abab(p1_va, p2_vb, h2_oa, h1_ob) = 1.0 * _a020("bbbb")(p2_vb, h2_ob, p1_vb, h1_ob) * t2_abab(p1_va, p1_vb, h2_oa, h2_ob), "i0_abab(p1_va, p2_vb, h2_oa, h1_ob) = 1.0 * _a020(bbbb)(p2_vb, h2_ob, p1_vb, h1_ob) * t2_abab(p1_va, p1_vb, h2_oa, h2_ob)") - (i0_abab(p2_va, p1_vb, h2_oa, h1_ob) += 1.0 * _a020("baab")(p1_vb, h1_oa, p1_va, h1_ob) * t2_aaaa(p2_va, p1_va, h2_oa, h1_oa), + (i0_abab(p2_va, p1_vb, h2_oa, h1_ob) += 1.0 * _a020("baab")(p1_vb, h1_oa, p1_va, h1_ob) * t2_aaaa(p2_va, p1_va, h2_oa, h1_oa), "i0_abab(p2_va, p1_vb, h2_oa, h1_ob) += 1.0 * _a020(baab)(p1_vb, h1_oa, p1_va, h1_ob) * t2_aaaa(p2_va, p1_va, h2_oa, h1_oa)") - (i0_abab(p1_va, p1_vb, h2_oa, h1_ob) += 1.0 * _a020("baba")(p1_vb, h1_oa, p2_vb, h2_oa) * t2_abab(p1_va, p2_vb, h1_oa, h1_ob), + (i0_abab(p1_va, p1_vb, h2_oa, h1_ob) += 1.0 * _a020("baba")(p1_vb, h1_oa, p2_vb, h2_oa) * t2_abab(p1_va, p2_vb, h1_oa, h1_ob), "i0_abab(p1_va, p1_vb, h2_oa, h1_ob) += 1.0 * _a020(baba)(p1_vb, h1_oa, p2_vb, h2_oa) * t2_abab(p1_va, p2_vb, h1_oa, h1_ob)") .exact_copy(i0_temp(p1_vb,p1_va,h2_ob,h1_oa),i0_abab(p1_vb,p1_va,h2_ob,h1_oa)) - (i0_abab(p1_va, p1_vb, h2_oa, h1_ob) += 1.0 * i0_temp(p1_vb, p1_va, h1_ob, h2_oa), + (i0_abab(p1_va, p1_vb, h2_oa, h1_ob) += 1.0 * i0_temp(p1_vb, p1_va, h1_ob, h2_oa), "i0_abab(p1_va, p1_vb, h2_oa, h1_ob) += 1.0 * i0_temp(p1_vb, p1_va, h1_ob, h2_oa)") - (i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += 1.0 * _a017("aa")(p1_va, h1_oa, cind) * _a017("bb")(p1_vb, h2_ob, cind), + (i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += 1.0 * _a017("aa")(p1_va, h1_oa, cind) * _a017("bb")(p1_vb, h2_ob, cind), "i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += 1.0 * _a017( aa )(p1_va, h1_oa, cind) * _a017( bb )(p1_vb, h2_ob, cind)"); sch - // (_a022("abab")(p1_va,p2_vb,p2_va,p1_vb) = 1.0 * _a021("aa")(p1_va,p2_va,cind) * _a021("bb")(p2_vb,p1_vb,cind), + // (_a022("abab")(p1_va,p2_vb,p2_va,p1_vb) = 1.0 * _a021("aa")(p1_va,p2_va,cind) * _a021("bb")(p2_vb,p1_vb,cind), // "_a022( abab )(p1_va,p2_vb,p2_va,p1_vb) = 1.0 * _a021( aa )(p1_va,p2_va,cind) * _a021( bb )(p2_vb,p1_vb,cind)") - (i0_abab(p1_va, p2_vb, h1_oa, h2_ob) += 4.0 * a22_abab(p1_va, p2_vb, p2_va, p1_vb) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob), + (i0_abab(p1_va, p2_vb, h1_oa, h2_ob) += 4.0 * a22_abab(p1_va, p2_vb, p2_va, p1_vb) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob), "i0_abab(p1_va, p2_vb, h1_oa, h2_ob) += 4.0 * a22_abab(p1_va, p2_vb, p2_va, p1_vb) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob)"); - - - sch(_a019("abab")(h2_oa, h1_ob, h1_oa, h2_ob) += 0.25 * _a004("abab")(p1_va, p2_vb, h2_oa, h1_ob) * t2_abab(p1_va,p2_vb,h1_oa,h2_ob), - "_a019( abab )(h2_oa, h1_ob, 
h1_oa, h2_ob) += 0.25 * _a004( abab )(p1_va, p2_vb, h2_oa, h1_ob) * t2_abab(p1_va,p2_vb,h1_oa,h2_ob)") - (i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += 4.0 * _a019("abab")(h2_oa, h1_ob, h1_oa, h2_ob) * t2_abab(p1_va, p1_vb, h2_oa, h1_ob), + + + sch(_a019("abab")(h2_oa, h1_ob, h1_oa, h2_ob) += 0.25 * _a004("abab")(p1_va, p2_vb, h2_oa, h1_ob) * t2_abab(p1_va,p2_vb,h1_oa,h2_ob), + "_a019( abab )(h2_oa, h1_ob, h1_oa, h2_ob) += 0.25 * _a004( abab )(p1_va, p2_vb, h2_oa, h1_ob) * t2_abab(p1_va,p2_vb,h1_oa,h2_ob)") + (i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += 4.0 * _a019("abab")(h2_oa, h1_ob, h1_oa, h2_ob) * t2_abab(p1_va, p1_vb, h2_oa, h1_ob), "i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += 4.0 * _a019( abab )(h2_oa, h1_ob, h1_oa, h2_ob) * t2_abab(p1_va, p1_vb, h2_oa, h1_ob)") - (i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p1_va, p2_vb, h1_oa, h2_ob) * _a001("bb")(p1_vb, p2_vb), + (i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p1_va, p2_vb, h1_oa, h2_ob) * _a001("bb")(p1_vb, p2_vb), "i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p1_va, p2_vb, h1_oa, h2_ob) * _a001( bb )(p1_vb, p2_vb)") - (i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p2_va, p1_vb, h1_oa, h2_ob) * _a001("aa")(p1_va, p2_va), + (i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p2_va, p1_vb, h1_oa, h2_ob) * _a001("aa")(p1_va, p2_va), "i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p2_va, p1_vb, h1_oa, h2_ob) * _a001( aa )(p1_va, p2_va)") - (i0_abab(p1_va, p1_vb, h2_oa, h1_ob) += -1.0 * t2_abab(p1_va, p1_vb, h1_oa, h1_ob) * _a006("aa")(h1_oa, h2_oa), + (i0_abab(p1_va, p1_vb, h2_oa, h1_ob) += -1.0 * t2_abab(p1_va, p1_vb, h1_oa, h1_ob) * _a006("aa")(h1_oa, h2_oa), "i0_abab(p1_va, p1_vb, h2_oa, h1_ob) += -1.0 * t2_abab(p1_va, p1_vb, h1_oa, h1_ob) * _a006( aa )(h1_oa, h2_oa)") - (i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p1_va, p1_vb, h1_oa, h1_ob) * _a006("bb")(h1_ob, h2_ob), + (i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p1_va, p1_vb, h1_oa, h1_ob) * _a006("bb")(h1_ob, h2_ob), "i0_abab(p1_va, p1_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p1_va, p1_vb, h1_oa, h1_ob) * _a006( bb )(h1_ob, h2_ob)") ; // clang-format on @@ -584,7 +578,6 @@ std::tuple cd_ccsd_cs_driver( const TiledIndexSpace& O = MO("occ"); const TiledIndexSpace& V = MO("virt"); auto [cind] = CI.labels<1>("all"); - has_gpu_tmp = ec.has_gpu(); const int otiles = O.num_tiles(); const int vtiles = V.num_tiles(); @@ -844,7 +837,7 @@ std::tuple cd_ccsd_cs_driver( .exact_copy(t1_bb(p1_vb,h3_ob), t1_aa(p1_vb,h3_ob)) .exact_copy(t2_bbbb(p1_vb,p2_vb,h3_ob,h4_ob), t2_aaaa(p1_vb,p2_vb,h3_ob,h4_ob)).execute(); - // .exact_copy(t2_baba(p1_vb,p2_va,h3_ob,h4_oa), t2_abab(p1_vb,p2_va,h3_ob,h4_oa),true,1.0,perm) + // .exact_copy(t2_baba(p1_vb,p2_va,h3_ob,h4_oa), t2_abab(p1_vb,p2_va,h3_ob,h4_oa),true,1.0,perm) // .exact_copy(t2_abba(p1_va,p2_vb,h3_ob,h4_oa), t2_abab(p1_va,p2_vb,h3_ob,h4_oa),true,-1.0) // .exact_copy(t2_baab(p1_vb,p2_va,h3_oa,h4_ob), t2_abab(p1_vb,p2_va,h3_oa,h4_ob),true,-1.0) @@ -858,7 +851,7 @@ std::tuple cd_ccsd_cs_driver( (t2_baab(p2_vb,p1_va,h3_oa,h4_ob) = -1.0 * t2_abab(p1_va,p2_vb,h3_oa,h4_ob)) (d_t1(p1_va,h3_oa) = t1_aa(p1_va,h3_oa)) - (d_t1(p1_vb,h3_ob) = t1_bb(p1_vb,h3_ob)) + (d_t1(p1_vb,h3_ob) = t1_bb(p1_vb,h3_ob)) (d_t2(p1_va,p2_va,h3_oa,h4_oa) = t2_aaaa(p1_va,p2_va,h3_oa,h4_oa)) (d_t2(p1_va,p2_vb,h3_oa,h4_ob) = t2_abab(p1_va,p2_vb,h3_oa,h4_ob)) (d_t2(p1_vb,p2_vb,h3_ob,h4_ob) = t2_bbbb(p1_vb,p2_vb,h3_ob,h4_ob)) diff --git a/exachem/cc/ccsd/cd_ccsd_os_ann.cpp b/exachem/cc/ccsd/cd_ccsd_os_ann.cpp index 258f941..81ec90b 100644 
--- a/exachem/cc/ccsd/cd_ccsd_os_ann.cpp +++ b/exachem/cc/ccsd/cd_ccsd_os_ann.cpp @@ -12,7 +12,6 @@ using CCEType = double; CCSE_Tensors _a021_os; Tensor a22_abab_os, a22_aaaa_os, a22_bbbb_os; TiledIndexSpace o_alpha_os, v_alpha_os, o_beta_os, v_beta_os; -bool has_gpu_tmp_os; Tensor _a01V_os, _a02V_os, _a007V_os; CCSE_Tensors _a01_os, _a02_os, _a03_os, _a04_os, _a05_os, _a06_os, _a001_os, _a004_os, @@ -41,31 +40,31 @@ void ccsd_e_os(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace& // clang-format off sch - (_a01V_os(cind) = t1_aa(p3_va, h4_oa) * chol3d_ov("aa")(h4_oa, p3_va, cind), + (_a01V_os(cind) = t1_aa(p3_va, h4_oa) * chol3d_ov("aa")(h4_oa, p3_va, cind), "_a01V_os(cind) = t1_aa(p3_va, h4_oa) * chol3d_ov( aa )(h4_oa, p3_va, cind)") - (_a02_os("aa")(h4_oa, h6_oa, cind) = t1_aa(p3_va, h4_oa) * chol3d_ov("aa")(h6_oa, p3_va, cind), + (_a02_os("aa")(h4_oa, h6_oa, cind) = t1_aa(p3_va, h4_oa) * chol3d_ov("aa")(h6_oa, p3_va, cind), "_a02_os( aa )(h4_oa, h6_oa, cind) = t1_aa(p3_va, h4_oa) * chol3d_ov( aa )(h6_oa, p3_va, cind)") - (_a03_os("aa")(h4_oa, p2_va, cind) = t2_aaaa(p1_va, p2_va, h3_oa, h4_oa) * chol3d_ov("aa")(h3_oa, p1_va, cind), + (_a03_os("aa")(h4_oa, p2_va, cind) = t2_aaaa(p1_va, p2_va, h3_oa, h4_oa) * chol3d_ov("aa")(h3_oa, p1_va, cind), "_a03_os( aa )(h4_oa, p2_va, cind) = t2_aaaa(p1_va, p2_va, h3_oa, h4_oa) * chol3d_ov( aa )(h3_oa, p1_va, cind)") - (_a03_os("aa")(h4_oa, p2_va, cind) += t2_abab(p2_va, p1_vb, h4_oa, h3_ob) * chol3d_ov("bb")(h3_ob, p1_vb, cind), + (_a03_os("aa")(h4_oa, p2_va, cind) += t2_abab(p2_va, p1_vb, h4_oa, h3_ob) * chol3d_ov("bb")(h3_ob, p1_vb, cind), "_a03_os( aa )(h4_oa, p2_va, cind) += t2_abab(p2_va, p1_vb, h4_oa, h3_ob) * chol3d_ov( bb )(h3_ob, p1_vb, cind)") - (_a01V_os(cind) += t1_bb(p3_vb, h4_ob) * chol3d_ov("bb")(h4_ob, p3_vb, cind), + (_a01V_os(cind) += t1_bb(p3_vb, h4_ob) * chol3d_ov("bb")(h4_ob, p3_vb, cind), "_a01V_os(cind) += t1_bb(p3_vb, h4_ob) * chol3d_ov( bb )(h4_ob, p3_vb, cind)") - (_a02_os("bb")(h4_ob, h6_ob, cind) = t1_bb(p3_vb, h4_ob) * chol3d_ov("bb")(h6_ob, p3_vb, cind), + (_a02_os("bb")(h4_ob, h6_ob, cind) = t1_bb(p3_vb, h4_ob) * chol3d_ov("bb")(h6_ob, p3_vb, cind), "_a02_os( bb )(h4_ob, h6_ob, cind) = t1_bb(p3_vb, h4_ob) * chol3d_ov( bb )(h6_ob, p3_vb, cind)") - (_a03_os("bb")(h4_ob, p2_vb, cind) = t2_bbbb(p1_vb, p2_vb, h3_ob, h4_ob) * chol3d_ov("bb")(h3_ob, p1_vb, cind), + (_a03_os("bb")(h4_ob, p2_vb, cind) = t2_bbbb(p1_vb, p2_vb, h3_ob, h4_ob) * chol3d_ov("bb")(h3_ob, p1_vb, cind), "_a03_os( bb )(h4_ob, p2_vb, cind) = t2_bbbb(p1_vb, p2_vb, h3_ob, h4_ob) * chol3d_ov( bb )(h3_ob, p1_vb, cind)") - (_a03_os("bb")(h4_ob, p2_vb, cind) += t2_abab(p1_va, p2_vb, h3_oa, h4_ob) * chol3d_ov("aa")(h3_oa, p1_va, cind), + (_a03_os("bb")(h4_ob, p2_vb, cind) += t2_abab(p1_va, p2_vb, h3_oa, h4_ob) * chol3d_ov("aa")(h3_oa, p1_va, cind), "_a03_os( bb )(h4_ob, p2_vb, cind) += t2_abab(p1_va, p2_vb, h3_oa, h4_ob) * chol3d_ov( aa )(h3_oa, p1_va, cind)") - (de() = 0.5 * _a01V_os() * _a01V_os(), + (de() = 0.5 * _a01V_os() * _a01V_os(), "de() = 0.5 * _a01V_os() * _a01V_os()") - (de() += -0.5 * _a02_os("aa")(h4_oa, h6_oa, cind) * _a02_os("aa")(h6_oa, h4_oa, cind), + (de() += -0.5 * _a02_os("aa")(h4_oa, h6_oa, cind) * _a02_os("aa")(h6_oa, h4_oa, cind), "de() += -0.5 * _a02_os( aa )(h4_oa, h6_oa, cind) * _a02_os( aa )(h6_oa, h4_oa, cind)") - (de() += -0.5 * _a02_os("bb")(h4_ob, h6_ob, cind) * _a02_os("bb")(h6_ob, h4_ob, cind), + (de() += -0.5 * _a02_os("bb")(h4_ob, h6_ob, cind) * _a02_os("bb")(h6_ob, h4_ob, cind), "de() += -0.5 * _a02_os( bb 
)(h4_ob, h6_ob, cind) * _a02_os( bb )(h6_ob, h4_ob, cind)") - (de() += 0.5 * _a03_os("aa")(h4_oa, p1_va, cind) * chol3d_ov("aa")(h4_oa, p1_va, cind), + (de() += 0.5 * _a03_os("aa")(h4_oa, p1_va, cind) * chol3d_ov("aa")(h4_oa, p1_va, cind), "de() += 0.5 * _a03_os( aa )(h4_oa, p1_va, cind) * chol3d_ov( aa )(h4_oa, p1_va, cind)") - (de() += 0.5 * _a03_os("bb")(h4_ob, p1_vb, cind) * chol3d_ov("bb")(h4_ob, p1_vb, cind), + (de() += 0.5 * _a03_os("bb")(h4_ob, p1_vb, cind) * chol3d_ov("bb")(h4_ob, p1_vb, cind), "de() += 0.5 * _a03_os( bb )(h4_ob, p1_vb, cind) * chol3d_ov( bb )(h4_ob, p1_vb, cind)") (de() += 1.0 * t1_aa(p1_va, h3_oa) * f1_ov("aa")(h3_oa, p1_va), "de() += 1.0 * t1_aa(p1_va, h3_oa) * f1_ov( aa )(h3_oa, p1_va)") // NEW TERM @@ -109,97 +108,97 @@ void ccsd_t1_os(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace // clang-format off sch - (i0_aa(p2_va, h1_oa) = 1.0 * f1_vo("aa")(p2_va, h1_oa), + (i0_aa(p2_va, h1_oa) = 1.0 * f1_vo("aa")(p2_va, h1_oa), "i0_aa(p2_va, h1_oa) = 1.0 * f1_vo( aa )(p2_va, h1_oa)") - (i0_bb(p2_vb, h1_ob) = 1.0 * f1_vo("bb")(p2_vb, h1_ob), + (i0_bb(p2_vb, h1_ob) = 1.0 * f1_vo("bb")(p2_vb, h1_ob), "i0_bb(p2_vb, h1_ob) = 1.0 * f1_vo( bb )(p2_vb, h1_ob)") - (_a01_os("aa")(h2_oa, h1_oa, cind) = 1.0 * t1_aa(p1_va, h1_oa) * chol3d_ov("aa")(h2_oa, p1_va, cind), + (_a01_os("aa")(h2_oa, h1_oa, cind) = 1.0 * t1_aa(p1_va, h1_oa) * chol3d_ov("aa")(h2_oa, p1_va, cind), "_a01_os( aa )(h2_oa, h1_oa, cind) = 1.0 * t1_aa(p1_va, h1_oa) * chol3d_ov( aa )(h2_oa, p1_va, cind)") // ovm - (_a01_os("bb")(h2_ob, h1_ob, cind) = 1.0 * t1_bb(p1_vb, h1_ob) * chol3d_ov("bb")(h2_ob, p1_vb, cind), + (_a01_os("bb")(h2_ob, h1_ob, cind) = 1.0 * t1_bb(p1_vb, h1_ob) * chol3d_ov("bb")(h2_ob, p1_vb, cind), "_a01_os( bb )(h2_ob, h1_ob, cind) = 1.0 * t1_bb(p1_vb, h1_ob) * chol3d_ov( bb )(h2_ob, p1_vb, cind)") // ovm - (_a02V_os(cind) = 1.0 * t1_aa(p3_va, h3_oa) * chol3d_ov("aa")(h3_oa, p3_va, cind), + (_a02V_os(cind) = 1.0 * t1_aa(p3_va, h3_oa) * chol3d_ov("aa")(h3_oa, p3_va, cind), "_a02V_os(cind) = 1.0 * t1_aa(p3_va, h3_oa) * chol3d_ov( aa )(h3_oa, p3_va, cind)") // ovm - (_a02V_os(cind) += 1.0 * t1_bb(p3_vb, h3_ob) * chol3d_ov("bb")(h3_ob, p3_vb, cind), + (_a02V_os(cind) += 1.0 * t1_bb(p3_vb, h3_ob) * chol3d_ov("bb")(h3_ob, p3_vb, cind), "_a02V_os(cind) += 1.0 * t1_bb(p3_vb, h3_ob) * chol3d_ov( bb )(h3_ob, p3_vb, cind)") // ovm - (_a06_os("aa")(p1_va, h1_oa, cind) = 1.0 * t2_aaaa(p1_va, p3_va, h2_oa, h1_oa) * chol3d_ov("aa")(h2_oa, p3_va, cind), + (_a06_os("aa")(p1_va, h1_oa, cind) = 1.0 * t2_aaaa(p1_va, p3_va, h2_oa, h1_oa) * chol3d_ov("aa")(h2_oa, p3_va, cind), "_a06_os( aa )(p1_va, h1_oa, cind) = 1.0 * t2_aaaa(p1_va, p3_va, h2_oa, h1_oa) * chol3d_ov( aa )(h2_oa, p3_va, cind)") // o2v2m - (_a06_os("aa")(p1_va, h1_oa, cind) += -1.0 * t2_abab(p1_va, p3_vb, h1_oa, h2_ob) * chol3d_ov("bb")(h2_ob, p3_vb, cind), + (_a06_os("aa")(p1_va, h1_oa, cind) += -1.0 * t2_abab(p1_va, p3_vb, h1_oa, h2_ob) * chol3d_ov("bb")(h2_ob, p3_vb, cind), "_a06_os( aa )(p1_va, h1_oa, cind) += -1.0 * t2_abab(p1_va, p3_vb, h1_oa, h2_ob) * chol3d_ov( bb )(h2_ob, p3_vb, cind)") // o2v2m - (_a06_os("bb")(p1_vb, h1_ob, cind) = -1.0 * t2_abab(p3_va, p1_vb, h2_oa, h1_ob) * chol3d_ov("aa")(h2_oa, p3_va, cind), + (_a06_os("bb")(p1_vb, h1_ob, cind) = -1.0 * t2_abab(p3_va, p1_vb, h2_oa, h1_ob) * chol3d_ov("aa")(h2_oa, p3_va, cind), "_a06_os( bb )(p1_vb, h1_ob, cind) = -1.0 * t2_abab(p3_va, p1_vb, h2_oa, h1_ob) * chol3d_ov( aa )(h2_oa, p3_va, cind)") // o2v2m - (_a06_os("bb")(p1_vb, h1_ob, cind) += 1.0 * t2_bbbb(p1_vb, p3_vb, 
h2_ob, h1_ob) * chol3d_ov("bb")(h2_ob, p3_vb, cind), + (_a06_os("bb")(p1_vb, h1_ob, cind) += 1.0 * t2_bbbb(p1_vb, p3_vb, h2_ob, h1_ob) * chol3d_ov("bb")(h2_ob, p3_vb, cind), "_a06_os( bb )(p1_vb, h1_ob, cind) += 1.0 * t2_bbbb(p1_vb, p3_vb, h2_ob, h1_ob) * chol3d_ov( bb )(h2_ob, p3_vb, cind)") // o2v2m - (_a04_os("aa")(h2_oa, h1_oa) = -1.0 * f1_oo("aa")(h2_oa, h1_oa), + (_a04_os("aa")(h2_oa, h1_oa) = -1.0 * f1_oo("aa")(h2_oa, h1_oa), "_a04_os( aa )(h2_oa, h1_oa) = -1.0 * f1_oo( aa )(h2_oa, h1_oa)") // MOVED TERM - (_a04_os("bb")(h2_ob, h1_ob) = -1.0 * f1_oo("bb")(h2_ob, h1_ob), + (_a04_os("bb")(h2_ob, h1_ob) = -1.0 * f1_oo("bb")(h2_ob, h1_ob), "_a04_os( bb )(h2_ob, h1_ob) = -1.0 * f1_oo( bb )(h2_ob, h1_ob)") // MOVED TERM - (_a04_os("aa")(h2_oa, h1_oa) += 1.0 * chol3d_ov("aa")(h2_oa, p1_va, cind) * _a06_os("aa")(p1_va, h1_oa, cind), + (_a04_os("aa")(h2_oa, h1_oa) += 1.0 * chol3d_ov("aa")(h2_oa, p1_va, cind) * _a06_os("aa")(p1_va, h1_oa, cind), "_a04_os( aa )(h2_oa, h1_oa) += 1.0 * chol3d_ov( aa )(h2_oa, p1_va, cind) * _a06_os( aa )(p1_va, h1_oa, cind)") // o2vm - (_a04_os("bb")(h2_ob, h1_ob) += 1.0 * chol3d_ov("bb")(h2_ob, p1_vb, cind) * _a06_os("bb")(p1_vb, h1_ob, cind), + (_a04_os("bb")(h2_ob, h1_ob) += 1.0 * chol3d_ov("bb")(h2_ob, p1_vb, cind) * _a06_os("bb")(p1_vb, h1_ob, cind), "_a04_os( bb )(h2_ob, h1_ob) += 1.0 * chol3d_ov( bb )(h2_ob, p1_vb, cind) * _a06_os( bb )(p1_vb, h1_ob, cind)") // o2vm - (_a04_os("aa")(h2_oa, h1_oa) += -1.0 * t1_aa(p1_va, h1_oa) * f1_ov("aa")(h2_oa, p1_va), + (_a04_os("aa")(h2_oa, h1_oa) += -1.0 * t1_aa(p1_va, h1_oa) * f1_ov("aa")(h2_oa, p1_va), "_a04_os( aa )(h2_oa, h1_oa) += -1.0 * t1_aa(p1_va, h1_oa) * f1_ov( aa )(h2_oa, p1_va)") // NEW TERM - (_a04_os("bb")(h2_ob, h1_ob) += -1.0 * t1_bb(p1_vb, h1_ob) * f1_ov("bb")(h2_ob, p1_vb), + (_a04_os("bb")(h2_ob, h1_ob) += -1.0 * t1_bb(p1_vb, h1_ob) * f1_ov("bb")(h2_ob, p1_vb), "_a04_os( bb )(h2_ob, h1_ob) += -1.0 * t1_bb(p1_vb, h1_ob) * f1_ov( bb )(h2_ob, p1_vb)") // NEW TERM - (i0_aa(p2_va, h1_oa) += 1.0 * t1_aa(p2_va, h2_oa) * _a04_os("aa")(h2_oa, h1_oa), + (i0_aa(p2_va, h1_oa) += 1.0 * t1_aa(p2_va, h2_oa) * _a04_os("aa")(h2_oa, h1_oa), "i0_aa(p2_va, h1_oa) += 1.0 * t1_aa(p2_va, h2_oa) * _a04_os( aa )(h2_oa, h1_oa)") // o2v - (i0_bb(p2_vb, h1_ob) += 1.0 * t1_bb(p2_vb, h2_ob) * _a04_os("bb")(h2_ob, h1_ob), + (i0_bb(p2_vb, h1_ob) += 1.0 * t1_bb(p2_vb, h2_ob) * _a04_os("bb")(h2_ob, h1_ob), "i0_bb(p2_vb, h1_ob) += 1.0 * t1_bb(p2_vb, h2_ob) * _a04_os( bb )(h2_ob, h1_ob)") // o2v - (i0_aa(p1_va, h2_oa) += 1.0 * chol3d_vo("aa")(p1_va, h2_oa, cind) * _a02V_os(cind), + (i0_aa(p1_va, h2_oa) += 1.0 * chol3d_vo("aa")(p1_va, h2_oa, cind) * _a02V_os(cind), "i0_aa(p1_va, h2_oa) += 1.0 * chol3d_vo( aa )(p1_va, h2_oa, cind) * _a02V_os(cind)") // ovm - (i0_bb(p1_vb, h2_ob) += 1.0 * chol3d_vo("bb")(p1_vb, h2_ob, cind) * _a02V_os(cind), + (i0_bb(p1_vb, h2_ob) += 1.0 * chol3d_vo("bb")(p1_vb, h2_ob, cind) * _a02V_os(cind), "i0_bb(p1_vb, h2_ob) += 1.0 * chol3d_vo( bb )(p1_vb, h2_ob, cind) * _a02V_os(cind)") // ovm - (_a05_os("aa")(h2_oa, p1_va) = -1.0 * chol3d_ov("aa")(h3_oa, p1_va, cind) * _a01_os("aa")(h2_oa, h3_oa, cind), + (_a05_os("aa")(h2_oa, p1_va) = -1.0 * chol3d_ov("aa")(h3_oa, p1_va, cind) * _a01_os("aa")(h2_oa, h3_oa, cind), "_a05_os( aa )(h2_oa, p1_va) = -1.0 * chol3d_ov( aa )(h3_oa, p1_va, cind) * _a01_os( aa )(h2_oa, h3_oa, cind)") // o2vm - (_a05_os("bb")(h2_ob, p1_vb) = -1.0 * chol3d_ov("bb")(h3_ob, p1_vb, cind) * _a01_os("bb")(h2_ob, h3_ob, cind), + (_a05_os("bb")(h2_ob, p1_vb) = -1.0 * chol3d_ov("bb")(h3_ob, p1_vb, 
cind) * _a01_os("bb")(h2_ob, h3_ob, cind), "_a05_os( bb )(h2_ob, p1_vb) = -1.0 * chol3d_ov( bb )(h3_ob, p1_vb, cind) * _a01_os( bb )(h2_ob, h3_ob, cind)") // o2vm - (_a05_os("aa")(h2_oa, p1_va) += 1.0 * f1_ov("aa")(h2_oa, p1_va), + (_a05_os("aa")(h2_oa, p1_va) += 1.0 * f1_ov("aa")(h2_oa, p1_va), "_a05_os( aa )(h2_oa, p1_va) += 1.0 * f1_ov( aa )(h2_oa, p1_va)") // NEW TERM - (_a05_os("bb")(h2_ob, p1_vb) += 1.0 * f1_ov("bb")(h2_ob, p1_vb), + (_a05_os("bb")(h2_ob, p1_vb) += 1.0 * f1_ov("bb")(h2_ob, p1_vb), "_a05_os( bb )(h2_ob, p1_vb) += 1.0 * f1_ov( bb )(h2_ob, p1_vb)") // NEW TERM - (i0_aa(p2_va, h1_oa) += 1.0 * t2_aaaa(p1_va, p2_va, h2_oa, h1_oa) * _a05_os("aa")(h2_oa, p1_va), + (i0_aa(p2_va, h1_oa) += 1.0 * t2_aaaa(p1_va, p2_va, h2_oa, h1_oa) * _a05_os("aa")(h2_oa, p1_va), "i0_aa(p2_va, h1_oa) += 1.0 * t2_aaaa(p1_va, p2_va, h2_oa, h1_oa) * _a05_os( aa )(h2_oa, p1_va)") // o2v - (i0_bb(p2_vb, h1_ob) += 1.0 * t2_abab(p1_va, p2_vb, h2_oa, h1_ob) * _a05_os("aa")(h2_oa, p1_va), + (i0_bb(p2_vb, h1_ob) += 1.0 * t2_abab(p1_va, p2_vb, h2_oa, h1_ob) * _a05_os("aa")(h2_oa, p1_va), "i0_bb(p2_vb, h1_ob) += 1.0 * t2_abab(p1_va, p2_vb, h2_oa, h1_ob) * _a05_os( aa )(h2_oa, p1_va)") // o2v - (i0_aa(p2_va, h1_oa) += 1.0 * t2_abab(p2_va, p1_vb, h1_oa, h2_ob) * _a05_os("bb")(h2_ob, p1_vb), + (i0_aa(p2_va, h1_oa) += 1.0 * t2_abab(p2_va, p1_vb, h1_oa, h2_ob) * _a05_os("bb")(h2_ob, p1_vb), "i0_aa(p2_va, h1_oa) += 1.0 * t2_abab(p2_va, p1_vb, h1_oa, h2_ob) * _a05_os( bb )(h2_ob, p1_vb)") // o2v - (i0_bb(p2_vb, h1_ob) += 1.0 * t2_bbbb(p1_vb, p2_vb, h2_ob, h1_ob) * _a05_os("bb")(h2_ob, p1_vb), + (i0_bb(p2_vb, h1_ob) += 1.0 * t2_bbbb(p1_vb, p2_vb, h2_ob, h1_ob) * _a05_os("bb")(h2_ob, p1_vb), "i0_bb(p2_vb, h1_ob) += 1.0 * t2_bbbb(p1_vb, p2_vb, h2_ob, h1_ob) * _a05_os( bb )(h2_ob, p1_vb)") // o2v - (i0_aa(p2_va, h1_oa) += -1.0 * chol3d_vv("aa")(p2_va, p1_va, cind) * _a06_os("aa")(p1_va, h1_oa, cind), + (i0_aa(p2_va, h1_oa) += -1.0 * chol3d_vv("aa")(p2_va, p1_va, cind) * _a06_os("aa")(p1_va, h1_oa, cind), "i0_aa(p2_va, h1_oa) += -1.0 * chol3d_vv( aa )(p2_va, p1_va, cind) * _a06_os( aa )(p1_va, h1_oa, cind)") // ov2m - (i0_bb(p2_vb, h1_ob) += -1.0 * chol3d_vv("bb")(p2_vb, p1_vb, cind) * _a06_os("bb")(p1_vb, h1_ob, cind), + (i0_bb(p2_vb, h1_ob) += -1.0 * chol3d_vv("bb")(p2_vb, p1_vb, cind) * _a06_os("bb")(p1_vb, h1_ob, cind), "i0_bb(p2_vb, h1_ob) += -1.0 * chol3d_vv( bb )(p2_vb, p1_vb, cind) * _a06_os( bb )(p1_vb, h1_ob, cind)") // ov2m - (_a06_os("aa")(p2_va, h2_oa, cind) += -1.0 * t1_aa(p1_va, h2_oa) * chol3d_vv("aa")(p2_va, p1_va, cind), + (_a06_os("aa")(p2_va, h2_oa, cind) += -1.0 * t1_aa(p1_va, h2_oa) * chol3d_vv("aa")(p2_va, p1_va, cind), "_a06_os( aa )(p2_va, h2_oa, cind) += -1.0 * t1_aa(p1_va, h2_oa) * chol3d_vv( aa )(p2_va, p1_va, cind)") // ov2m - (_a06_os("bb")(p2_vb, h2_ob, cind) += -1.0 * t1_bb(p1_vb, h2_ob) * chol3d_vv("bb")(p2_vb, p1_vb, cind), + (_a06_os("bb")(p2_vb, h2_ob, cind) += -1.0 * t1_bb(p1_vb, h2_ob) * chol3d_vv("bb")(p2_vb, p1_vb, cind), "_a06_os( bb )(p2_vb, h2_ob, cind) += -1.0 * t1_bb(p1_vb, h2_ob) * chol3d_vv( bb )(p2_vb, p1_vb, cind)") // ov2m - (i0_aa(p1_va, h2_oa) += -1.0 * _a06_os("aa")(p1_va, h2_oa, cind) * _a02V_os(cind), + (i0_aa(p1_va, h2_oa) += -1.0 * _a06_os("aa")(p1_va, h2_oa, cind) * _a02V_os(cind), "i0_aa(p1_va, h2_oa) += -1.0 * _a06_os( aa )(p1_va, h2_oa, cind) * _a02V_os(cind)") // ovm - (i0_bb(p1_vb, h2_ob) += -1.0 * _a06_os("bb")(p1_vb, h2_ob, cind) * _a02V_os(cind), + (i0_bb(p1_vb, h2_ob) += -1.0 * _a06_os("bb")(p1_vb, h2_ob, cind) * _a02V_os(cind), "i0_bb(p1_vb, h2_ob) += 
-1.0 * _a06_os( bb )(p1_vb, h2_ob, cind) * _a02V_os(cind)") // ovm - (_a06_os("aa")(p2_va, h3_oa, cind) += -1.0 * t1_aa(p2_va, h3_oa) * _a02V_os(cind), + (_a06_os("aa")(p2_va, h3_oa, cind) += -1.0 * t1_aa(p2_va, h3_oa) * _a02V_os(cind), "_a06_os( aa )(p2_va, h3_oa, cind) += -1.0 * t1_aa(p2_va, h3_oa) * _a02V_os(cind)") // ovm - (_a06_os("bb")(p2_vb, h3_ob, cind) += -1.0 * t1_bb(p2_vb, h3_ob) * _a02V_os(cind), + (_a06_os("bb")(p2_vb, h3_ob, cind) += -1.0 * t1_bb(p2_vb, h3_ob) * _a02V_os(cind), "_a06_os( bb )(p2_vb, h3_ob, cind) += -1.0 * t1_bb(p2_vb, h3_ob) * _a02V_os(cind)") // ovm - (_a06_os("aa")(p2_va, h3_oa, cind) += 1.0 * t1_aa(p2_va, h2_oa) * _a01_os("aa")(h2_oa, h3_oa, cind), + (_a06_os("aa")(p2_va, h3_oa, cind) += 1.0 * t1_aa(p2_va, h2_oa) * _a01_os("aa")(h2_oa, h3_oa, cind), "_a06_os( aa )(p2_va, h3_oa, cind) += 1.0 * t1_aa(p2_va, h2_oa) * _a01_os( aa )(h2_oa, h3_oa, cind)") // o2vm - (_a06_os("bb")(p2_vb, h3_ob, cind) += 1.0 * t1_bb(p2_vb, h2_ob) * _a01_os("bb")(h2_ob, h3_ob, cind), + (_a06_os("bb")(p2_vb, h3_ob, cind) += 1.0 * t1_bb(p2_vb, h2_ob) * _a01_os("bb")(h2_ob, h3_ob, cind), "_a06_os( bb )(p2_vb, h3_ob, cind) += 1.0 * t1_bb(p2_vb, h2_ob) * _a01_os( bb )(h2_ob, h3_ob, cind)") // o2vm - (_a01_os("aa")(h3_oa, h1_oa, cind) += 1.0 * chol3d_oo("aa")(h3_oa, h1_oa, cind), + (_a01_os("aa")(h3_oa, h1_oa, cind) += 1.0 * chol3d_oo("aa")(h3_oa, h1_oa, cind), "_a01_os( aa )(h3_oa, h1_oa, cind) += 1.0 * chol3d_oo( aa )(h3_oa, h1_oa, cind)") // o2m - (_a01_os("bb")(h3_ob, h1_ob, cind) += 1.0 * chol3d_oo("bb")(h3_ob, h1_ob, cind), - "_a01_os( bb )(h3_ob, h1_ob, cind) += 1.0 * chol3d_oo( bb )(h3_ob, h1_ob, cind)") // o2m - (i0_aa(p2_va, h1_oa) += 1.0 * _a01_os("aa")(h3_oa, h1_oa, cind) * _a06_os("aa")(p2_va, h3_oa, cind), + (_a01_os("bb")(h3_ob, h1_ob, cind) += 1.0 * chol3d_oo("bb")(h3_ob, h1_ob, cind), + "_a01_os( bb )(h3_ob, h1_ob, cind) += 1.0 * chol3d_oo( bb )(h3_ob, h1_ob, cind)") // o2m + (i0_aa(p2_va, h1_oa) += 1.0 * _a01_os("aa")(h3_oa, h1_oa, cind) * _a06_os("aa")(p2_va, h3_oa, cind), "i0_aa(p2_va, h1_oa) += 1.0 * _a01_os( aa )(h3_oa, h1_oa, cind) * _a06_os( aa )(p2_va, h3_oa, cind)") // o2vm - // (i0_aa(p2_va, h1_oa) += -1.0 * t1_aa(p2_va, h7_oa) * f1_oo("aa")(h7_oa, h1_oa), + // (i0_aa(p2_va, h1_oa) += -1.0 * t1_aa(p2_va, h7_oa) * f1_oo("aa")(h7_oa, h1_oa), // "i0_aa(p2_va, h1_oa) += -1.0 * t1_aa(p2_va, h7_oa) * f1_oo( aa )(h7_oa, h1_oa)") // MOVED ABOVE // o2v - (i0_aa(p2_va, h1_oa) += 1.0 * t1_aa(p3_va, h1_oa) * f1_vv("aa")(p2_va, p3_va), + (i0_aa(p2_va, h1_oa) += 1.0 * t1_aa(p3_va, h1_oa) * f1_vv("aa")(p2_va, p3_va), "i0_aa(p2_va, h1_oa) += 1.0 * t1_aa(p3_va, h1_oa) * f1_vv( aa )(p2_va, p3_va)") // ov2 - (i0_bb(p2_vb, h1_ob) += 1.0 * _a01_os("bb")(h3_ob, h1_ob, cind) * _a06_os("bb")(p2_vb, h3_ob, cind), + (i0_bb(p2_vb, h1_ob) += 1.0 * _a01_os("bb")(h3_ob, h1_ob, cind) * _a06_os("bb")(p2_vb, h3_ob, cind), "i0_bb(p2_vb, h1_ob) += 1.0 * _a01_os( bb )(h3_ob, h1_ob, cind) * _a06_os( bb )(p2_vb, h3_ob, cind)") // o2vm - // (i0_bb(p2_vb, h1_ob) += -1.0 * t1_bb(p2_vb, h7_ob) * f1_oo("bb")(h7_ob, h1_ob), + // (i0_bb(p2_vb, h1_ob) += -1.0 * t1_bb(p2_vb, h7_ob) * f1_oo("bb")(h7_ob, h1_ob), // "i0_bb(p2_vb, h1_ob) += -1.0 * t1_bb(p2_vb, h7_ob) * f1_oo( bb )(h7_ob, h1_ob)") // MOVED ABOVE // o2v - (i0_bb(p2_vb, h1_ob) += 1.0 * t1_bb(p3_vb, h1_ob) * f1_vv("bb")(p2_vb, p3_vb), + (i0_bb(p2_vb, h1_ob) += 1.0 * t1_bb(p3_vb, h1_ob) * f1_vv("bb")(p2_vb, p3_vb), "i0_bb(p2_vb, h1_ob) += 1.0 * t1_bb(p3_vb, h1_ob) * f1_vv( bb )(p2_vb, p3_vb)") // ov2 ; // clang-format on @@ -414,31 +413,19 @@ void 
ccsd_t2_os(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace SizeVec cdims_sz; for(const auto v: cdims) { cdims_sz.push_back(v); } - bool isgpu = false; - + AddBuf* ab{nullptr}; #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) TensorElType2* th_a{nullptr}; TensorElType3* th_b{nullptr}; - TensorElType1* th_c{nullptr}; - auto& thandle = tamm::GPUStreamPool::getInstance().getStream(); + auto& thandle = GPUStreamPool::getInstance().getStream(); - AddBuf* ab{nullptr}; - if(hw == ExecutionHW::GPU) { - ab = new AddBuf{ - isgpu, &thandle, th_a, th_b, th_c, {}, translated_cblockid}; - } - else { - ab = new AddBuf{ - isgpu, &thandle, th_a, th_b, th_c, {}, translated_cblockid}; - } - add_bufs.push_back(ab); + ab = + new AddBuf{th_a, th_b, {}, translated_cblockid}; #else - gpuStream_t thandle{}; - AddBuf* ab = - new AddBuf{ - isgpu, ctensor, {}, translated_cblockid}; - add_bufs.push_back(ab); + gpuStream_t thandle{}; + ab = new AddBuf{ctensor, {}, translated_cblockid}; #endif + add_bufs.push_back(ab); // LabelLoopNest inner_loop{reduction_lbls}; LabelLoopNest inner_loop{reduction_labels}; @@ -447,20 +434,24 @@ void ccsd_t2_os(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace TensorElType1* cbuf_dev_ptr{nullptr}; TensorElType1* cbuf_tmp_dev_ptr{nullptr}; + auto& memHostPool = tamm::RMMMemoryManager::getInstance().getHostMemoryPool(); + #if(defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)) - auto& memPool = tamm::GPUPooledStorageManager::getInstance(); + auto& memDevicePool = tamm::RMMMemoryManager::getInstance().getDeviceMemoryPool(); if(hw == ExecutionHW::GPU) { - cbuf_dev_ptr = static_cast(memPool.allocate(csize * sizeof(TensorElType1))); + cbuf_dev_ptr = + static_cast(memDevicePool.allocate(csize * sizeof(TensorElType1))); cbuf_tmp_dev_ptr = - static_cast(memPool.allocate(csize * sizeof(TensorElType1))); + static_cast(memDevicePool.allocate(csize * sizeof(TensorElType1))); - memPool.gpuMemset(reinterpret_cast(cbuf_dev_ptr), csize * sizeof(TensorElType1)); - memPool.gpuMemset(reinterpret_cast(cbuf_tmp_dev_ptr), csize * sizeof(TensorElType1)); + gpuMemsetAsync(reinterpret_cast(cbuf_dev_ptr), csize * sizeof(TensorElType1), + thandle); + gpuMemsetAsync(reinterpret_cast(cbuf_tmp_dev_ptr), csize * sizeof(TensorElType1), + thandle); } #endif - int slc = 0; for(const auto& inner_it_val: inner_loop) { // k IndexVector a_block_id(rhs1_.labels().size()); @@ -493,11 +484,13 @@ void ccsd_t2_os(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace const size_t asize = atensor.block_size(translated_ablockid); const size_t bsize = btensor.block_size(translated_bblockid); - std::vector abuf(asize); - std::vector bbuf(bsize); + TensorElType2* abuf{nullptr}; + TensorElType3* bbuf{nullptr}; + abuf = static_cast(memHostPool.allocate(asize * sizeof(TensorElType2))); + bbuf = static_cast(memHostPool.allocate(bsize * sizeof(TensorElType3))); - atensor.get(translated_ablockid, abuf); - btensor.get(translated_bblockid, bbuf); + atensor.get(translated_ablockid, {abuf, asize}); + btensor.get(translated_bblockid, {bbuf, bsize}); const auto& adims = atensor.block_dims(translated_ablockid); const auto& bdims = btensor.block_dims(translated_bblockid); @@ -510,38 +503,41 @@ void ccsd_t2_os(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace for(const auto v: bdims) { bdims_sz.push_back(v); } // A*B - { AddBuf* abptr{nullptr}; #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) if(hw == ExecutionHW::GPU) { - abptr = ab; - ab->ta_ = 
static_cast(memPool.allocate(asize * sizeof(TensorElType2))); - ab->tb_ = static_cast(memPool.allocate(bsize * sizeof(TensorElType3))); + abptr = ab; + ab->ta_ = + static_cast(memDevicePool.allocate(asize * sizeof(TensorElType2))); + ab->tb_ = + static_cast(memDevicePool.allocate(bsize * sizeof(TensorElType3))); } else abptr = add_bufs[0]; #else abptr = add_bufs[0]; #endif - abptr->abuf_ = std::move(abuf); - abptr->bbuf_ = std::move(bbuf); + abptr->abuf_ = abuf; + abptr->bbuf_ = bbuf; kernels::block_multiply( - abptr->isgpu_, #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) abptr->ta_, abptr->tb_, #endif - thandle, 1.0, (abptr->abuf_).data(), adims_sz, rhs1_int_labels_, (abptr->bbuf_).data(), - bdims_sz, rhs2_int_labels_, cscale, cbuf.data(), cdims_sz, lhs_int_labels_, hw, - has_gpu_tmp_os, false, cbuf_dev_ptr, cbuf_tmp_dev_ptr); - } + thandle, 1.0, abptr->abuf_, adims_sz, rhs1_int_labels_, abptr->bbuf_, bdims_sz, + rhs2_int_labels_, cscale, cbuf.data(), cdims_sz, lhs_int_labels_, hw, false, cbuf_dev_ptr, + cbuf_tmp_dev_ptr); + #if(defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)) - if(hw == ExecutionHW::GPU) { - memPool.deallocate(static_cast(ab->ta_), (ab->abuf_).size() * sizeof(TensorElType2)); - memPool.deallocate(static_cast(ab->tb_), (ab->bbuf_).size() * sizeof(TensorElType3)); - } + if(hw == ExecutionHW::GPU) { + memDevicePool.deallocate(ab->ta_, asize * sizeof(TensorElType2)); + memDevicePool.deallocate(ab->tb_, bsize * sizeof(TensorElType3)); + } #endif - slc++; + } // A * B + + memHostPool.deallocate(abuf, asize * sizeof(TensorElType2)); + memHostPool.deallocate(bbuf, bsize * sizeof(TensorElType3)); } // end of reduction loop // add the computed update to the tensor @@ -549,20 +545,22 @@ void ccsd_t2_os(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace #if(defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)) // copy to host if(hw == ExecutionHW::GPU) { - std::vector cbuf_tmp(csize, 0); - kernels::copy_result_to_host(hw, thandle, cbuf_tmp, cbuf_dev_ptr); + TensorElType1* cbuf_tmp{nullptr}; + cbuf_tmp = static_cast(memHostPool.allocate(csize * sizeof(TensorElType1))); + std::memset(cbuf_tmp, 0, csize * sizeof(TensorElType1)); + gpuMemcpyAsync(cbuf_tmp, cbuf_dev_ptr, csize, gpuMemcpyDeviceToHost, + thandle); // cbuf+=cbuf_tmp - kernels::stream_synchronize(thandle); - blas::axpy(csize, TensorElType1{1}, cbuf_tmp.data(), 1, cbuf.data(), 1); + gpuStreamSynchronize(thandle); + blas::axpy(csize, TensorElType1{1}, cbuf_tmp, 1, cbuf.data(), 1); + + memHostPool.deallocate(cbuf_tmp, csize * sizeof(TensorElType1)); - // free cbuf_dev_ptr - auto& memPool = tamm::GPUPooledStorageManager::getInstance(); - memPool.deallocate(static_cast(cbuf_dev_ptr), csize * sizeof(TensorElType1)); - memPool.deallocate(static_cast(cbuf_tmp_dev_ptr), csize * sizeof(TensorElType1)); + memDevicePool.deallocate(static_cast(cbuf_dev_ptr), csize * sizeof(TensorElType1)); + memDevicePool.deallocate(static_cast(cbuf_tmp_dev_ptr), + csize * sizeof(TensorElType1)); } #endif - // ctensor.add(translated_cblockid, cbuf); - // for (size_t i=0;i{{v_beta_os, v_beta_os, v_beta_os, v_beta_os}, compute_v4_term}; // clang-format off - sch - (_a017_os("aa")(p3_va, h2_oa, cind) = -1.0 * t2_aaaa(p1_va, p3_va, h3_oa, h2_oa) * chol3d_ov("aa")(h3_oa, p1_va, cind), + sch + (_a017_os("aa")(p3_va, h2_oa, cind) = -1.0 * t2_aaaa(p1_va, p3_va, h3_oa, h2_oa) * chol3d_ov("aa")(h3_oa, p1_va, cind), "_a017_os( aa )(p3_va, h2_oa, cind) = -1.0 * t2_aaaa(p1_va, p3_va, h3_oa, h2_oa) * chol3d_ov( aa 
)(h3_oa, p1_va, cind)") - (_a017_os("bb")(p3_vb, h2_ob, cind) = -1.0 * t2_bbbb(p1_vb, p3_vb, h3_ob, h2_ob) * chol3d_ov("bb")(h3_ob, p1_vb, cind), + (_a017_os("bb")(p3_vb, h2_ob, cind) = -1.0 * t2_bbbb(p1_vb, p3_vb, h3_ob, h2_ob) * chol3d_ov("bb")(h3_ob, p1_vb, cind), "_a017_os( bb )(p3_vb, h2_ob, cind) = -1.0 * t2_bbbb(p1_vb, p3_vb, h3_ob, h2_ob) * chol3d_ov( bb )(h3_ob, p1_vb, cind)") - (_a017_os("bb")(p3_vb, h2_ob, cind) += -1.0 * t2_abab(p1_va, p3_vb, h3_oa, h2_ob) * chol3d_ov("aa")(h3_oa, p1_va, cind), + (_a017_os("bb")(p3_vb, h2_ob, cind) += -1.0 * t2_abab(p1_va, p3_vb, h3_oa, h2_ob) * chol3d_ov("aa")(h3_oa, p1_va, cind), "_a017_os( bb )(p3_vb, h2_ob, cind) += -1.0 * t2_abab(p1_va, p3_vb, h3_oa, h2_ob) * chol3d_ov( aa )(h3_oa, p1_va, cind)") - (_a017_os("aa")(p3_va, h2_oa, cind) += -1.0 * t2_abab(p3_va, p1_vb, h2_oa, h3_ob) * chol3d_ov("bb")(h3_ob, p1_vb, cind), + (_a017_os("aa")(p3_va, h2_oa, cind) += -1.0 * t2_abab(p3_va, p1_vb, h2_oa, h3_ob) * chol3d_ov("bb")(h3_ob, p1_vb, cind), "_a017_os( aa )(p3_va, h2_oa, cind) += -1.0 * t2_abab(p3_va, p1_vb, h2_oa, h3_ob) * chol3d_ov( bb )(h3_ob, p1_vb, cind)") - (_a006_os("aa")(h4_oa, h1_oa) = -1.0 * chol3d_ov("aa")(h4_oa, p2_va, cind) * _a017_os("aa")(p2_va, h1_oa, cind), + (_a006_os("aa")(h4_oa, h1_oa) = -1.0 * chol3d_ov("aa")(h4_oa, p2_va, cind) * _a017_os("aa")(p2_va, h1_oa, cind), "_a006_os( aa )(h4_oa, h1_oa) = -1.0 * chol3d_ov( aa )(h4_oa, p2_va, cind) * _a017_os( aa )(p2_va, h1_oa, cind)") - (_a006_os("bb")(h4_ob, h1_ob) = -1.0 * chol3d_ov("bb")(h4_ob, p2_vb, cind) * _a017_os("bb")(p2_vb, h1_ob, cind), + (_a006_os("bb")(h4_ob, h1_ob) = -1.0 * chol3d_ov("bb")(h4_ob, p2_vb, cind) * _a017_os("bb")(p2_vb, h1_ob, cind), "_a006_os( bb )(h4_ob, h1_ob) = -1.0 * chol3d_ov( bb )(h4_ob, p2_vb, cind) * _a017_os( bb )(p2_vb, h1_ob, cind)") - (_a007V_os(cind) = 1.0 * chol3d_ov("aa")(h4_oa, p1_va, cind) * t1_aa(p1_va, h4_oa), + (_a007V_os(cind) = 1.0 * chol3d_ov("aa")(h4_oa, p1_va, cind) * t1_aa(p1_va, h4_oa), "_a007V_os(cind) = 1.0 * chol3d_ov( aa )(h4_oa, p1_va, cind) * t1_aa(p1_va, h4_oa)") - (_a007V_os(cind) += 1.0 * chol3d_ov("bb")(h4_ob, p1_vb, cind) * t1_bb(p1_vb, h4_ob), + (_a007V_os(cind) += 1.0 * chol3d_ov("bb")(h4_ob, p1_vb, cind) * t1_bb(p1_vb, h4_ob), "_a007V_os(cind) += 1.0 * chol3d_ov( bb )(h4_ob, p1_vb, cind) * t1_bb(p1_vb, h4_ob)") - (_a009_os("aa")(h3_oa, h2_oa, cind) = 1.0 * chol3d_ov("aa")(h3_oa, p1_va, cind) * t1_aa(p1_va, h2_oa), + (_a009_os("aa")(h3_oa, h2_oa, cind) = 1.0 * chol3d_ov("aa")(h3_oa, p1_va, cind) * t1_aa(p1_va, h2_oa), "_a009_os( aa )(h3_oa, h2_oa, cind) = 1.0 * chol3d_ov( aa )(h3_oa, p1_va, cind) * t1_aa(p1_va, h2_oa)") - (_a009_os("bb")(h3_ob, h2_ob, cind) = 1.0 * chol3d_ov("bb")(h3_ob, p1_vb, cind) * t1_bb(p1_vb, h2_ob), + (_a009_os("bb")(h3_ob, h2_ob, cind) = 1.0 * chol3d_ov("bb")(h3_ob, p1_vb, cind) * t1_bb(p1_vb, h2_ob), "_a009_os( bb )(h3_ob, h2_ob, cind) = 1.0 * chol3d_ov( bb )(h3_ob, p1_vb, cind) * t1_bb(p1_vb, h2_ob)") - (_a021_os("aa")(p3_va, p1_va, cind) = -0.5 * chol3d_ov("aa")(h3_oa, p1_va, cind) * t1_aa(p3_va, h3_oa), + (_a021_os("aa")(p3_va, p1_va, cind) = -0.5 * chol3d_ov("aa")(h3_oa, p1_va, cind) * t1_aa(p3_va, h3_oa), "_a021_os( aa )(p3_va, p1_va, cind) = -0.5 * chol3d_ov( aa )(h3_oa, p1_va, cind) * t1_aa(p3_va, h3_oa)") - (_a021_os("bb")(p3_vb, p1_vb, cind) = -0.5 * chol3d_ov("bb")(h3_ob, p1_vb, cind) * t1_bb(p3_vb, h3_ob), + (_a021_os("bb")(p3_vb, p1_vb, cind) = -0.5 * chol3d_ov("bb")(h3_ob, p1_vb, cind) * t1_bb(p3_vb, h3_ob), "_a021_os( bb )(p3_vb, p1_vb, cind) = -0.5 * chol3d_ov( bb )(h3_ob, 
p1_vb, cind) * t1_bb(p3_vb, h3_ob)") - (_a021_os("aa")(p3_va, p1_va, cind) += 0.5 * chol3d_vv("aa")(p3_va, p1_va, cind), + (_a021_os("aa")(p3_va, p1_va, cind) += 0.5 * chol3d_vv("aa")(p3_va, p1_va, cind), "_a021_os( aa )(p3_va, p1_va, cind) += 0.5 * chol3d_vv( aa )(p3_va, p1_va, cind)") - (_a021_os("bb")(p3_vb, p1_vb, cind) += 0.5 * chol3d_vv("bb")(p3_vb, p1_vb, cind), + (_a021_os("bb")(p3_vb, p1_vb, cind) += 0.5 * chol3d_vv("bb")(p3_vb, p1_vb, cind), "_a021_os( bb )(p3_vb, p1_vb, cind) += 0.5 * chol3d_vv( bb )(p3_vb, p1_vb, cind)") - (_a017_os("aa")(p3_va, h2_oa, cind) += -2.0 * t1_aa(p2_va, h2_oa) * _a021_os("aa")(p3_va, p2_va, cind), + (_a017_os("aa")(p3_va, h2_oa, cind) += -2.0 * t1_aa(p2_va, h2_oa) * _a021_os("aa")(p3_va, p2_va, cind), "_a017_os( aa )(p3_va, h2_oa, cind) += -2.0 * t1_aa(p2_va, h2_oa) * _a021_os( aa )(p3_va, p2_va, cind)") - (_a017_os("bb")(p3_vb, h2_ob, cind) += -2.0 * t1_bb(p2_vb, h2_ob) * _a021_os("bb")(p3_vb, p2_vb, cind), + (_a017_os("bb")(p3_vb, h2_ob, cind) += -2.0 * t1_bb(p2_vb, h2_ob) * _a021_os("bb")(p3_vb, p2_vb, cind), "_a017_os( bb )(p3_vb, h2_ob, cind) += -2.0 * t1_bb(p2_vb, h2_ob) * _a021_os( bb )(p3_vb, p2_vb, cind)") - (_a008_os("aa")(h3_oa, h1_oa, cind) = 1.0 * _a009_os("aa")(h3_oa, h1_oa, cind), + (_a008_os("aa")(h3_oa, h1_oa, cind) = 1.0 * _a009_os("aa")(h3_oa, h1_oa, cind), "_a008_os( aa )(h3_oa, h1_oa, cind) = 1.0 * _a009_os( aa )(h3_oa, h1_oa, cind)") - (_a008_os("bb")(h3_ob, h1_ob, cind) = 1.0 * _a009_os("bb")(h3_ob, h1_ob, cind), + (_a008_os("bb")(h3_ob, h1_ob, cind) = 1.0 * _a009_os("bb")(h3_ob, h1_ob, cind), "_a008_os( bb )(h3_ob, h1_ob, cind) = 1.0 * _a009_os( bb )(h3_ob, h1_ob, cind)") - (_a009_os("aa")(h3_oa, h1_oa, cind) += 1.0 * chol3d_oo("aa")(h3_oa, h1_oa, cind), + (_a009_os("aa")(h3_oa, h1_oa, cind) += 1.0 * chol3d_oo("aa")(h3_oa, h1_oa, cind), "_a009_os( aa )(h3_oa, h1_oa, cind) += 1.0 * chol3d_oo( aa )(h3_oa, h1_oa, cind)") - (_a009_os("bb")(h3_ob, h1_ob, cind) += 1.0 * chol3d_oo("bb")(h3_ob, h1_ob, cind), + (_a009_os("bb")(h3_ob, h1_ob, cind) += 1.0 * chol3d_oo("bb")(h3_ob, h1_ob, cind), "_a009_os( bb )(h3_ob, h1_ob, cind) += 1.0 * chol3d_oo( bb )(h3_ob, h1_ob, cind)") - (_a001_os("aa")(p4_va, p2_va) = -2.0 * _a021_os("aa")(p4_va, p2_va, cind) * _a007V_os(cind), + (_a001_os("aa")(p4_va, p2_va) = -2.0 * _a021_os("aa")(p4_va, p2_va, cind) * _a007V_os(cind), "_a001_os( aa )(p4_va, p2_va) = -2.0 * _a021_os( aa )(p4_va, p2_va, cind) * _a007V_os(cind)") - (_a001_os("bb")(p4_vb, p2_vb) = -2.0 * _a021_os("bb")(p4_vb, p2_vb, cind) * _a007V_os(cind), + (_a001_os("bb")(p4_vb, p2_vb) = -2.0 * _a021_os("bb")(p4_vb, p2_vb, cind) * _a007V_os(cind), "_a001_os( bb )(p4_vb, p2_vb) = -2.0 * _a021_os( bb )(p4_vb, p2_vb, cind) * _a007V_os(cind)") - (_a001_os("aa")(p4_va, p2_va) += -1.0 * _a017_os("aa")(p4_va, h2_oa, cind) * chol3d_ov("aa")(h2_oa, p2_va, cind), + (_a001_os("aa")(p4_va, p2_va) += -1.0 * _a017_os("aa")(p4_va, h2_oa, cind) * chol3d_ov("aa")(h2_oa, p2_va, cind), "_a001_os( aa )(p4_va, p2_va) += -1.0 * _a017_os( aa )(p4_va, h2_oa, cind) * chol3d_ov( aa )(h2_oa, p2_va, cind)") - (_a001_os("bb")(p4_vb, p2_vb) += -1.0 * _a017_os("bb")(p4_vb, h2_ob, cind) * chol3d_ov("bb")(h2_ob, p2_vb, cind), + (_a001_os("bb")(p4_vb, p2_vb) += -1.0 * _a017_os("bb")(p4_vb, h2_ob, cind) * chol3d_ov("bb")(h2_ob, p2_vb, cind), "_a001_os( bb )(p4_vb, p2_vb) += -1.0 * _a017_os( bb )(p4_vb, h2_ob, cind) * chol3d_ov( bb )(h2_ob, p2_vb, cind)") - (_a006_os("aa")(h4_oa, h1_oa) += 1.0 * _a009_os("aa")(h4_oa, h1_oa, cind) * _a007V_os(cind), + (_a006_os("aa")(h4_oa, h1_oa) 
+= 1.0 * _a009_os("aa")(h4_oa, h1_oa, cind) * _a007V_os(cind), "_a006_os( aa )(h4_oa, h1_oa) += 1.0 * _a009_os( aa )(h4_oa, h1_oa, cind) * _a007V_os(cind)") - (_a006_os("bb")(h4_ob, h1_ob) += 1.0 * _a009_os("bb")(h4_ob, h1_ob, cind) * _a007V_os(cind), + (_a006_os("bb")(h4_ob, h1_ob) += 1.0 * _a009_os("bb")(h4_ob, h1_ob, cind) * _a007V_os(cind), "_a006_os( bb )(h4_ob, h1_ob) += 1.0 * _a009_os( bb )(h4_ob, h1_ob, cind) * _a007V_os(cind)") - (_a006_os("aa")(h4_oa, h1_oa) += -1.0 * _a009_os("aa")(h3_oa, h1_oa, cind) * _a008_os("aa")(h4_oa, h3_oa, cind), + (_a006_os("aa")(h4_oa, h1_oa) += -1.0 * _a009_os("aa")(h3_oa, h1_oa, cind) * _a008_os("aa")(h4_oa, h3_oa, cind), "_a006_os( aa )(h4_oa, h1_oa) += -1.0 * _a009_os( aa )(h3_oa, h1_oa, cind) * _a008_os( aa )(h4_oa, h3_oa, cind)") - (_a006_os("bb")(h4_ob, h1_ob) += -1.0 * _a009_os("bb")(h3_ob, h1_ob, cind) * _a008_os("bb")(h4_ob, h3_ob, cind), + (_a006_os("bb")(h4_ob, h1_ob) += -1.0 * _a009_os("bb")(h3_ob, h1_ob, cind) * _a008_os("bb")(h4_ob, h3_ob, cind), "_a006_os( bb )(h4_ob, h1_ob) += -1.0 * _a009_os( bb )(h3_ob, h1_ob, cind) * _a008_os( bb )(h4_ob, h3_ob, cind)") - (_a019_os("aaaa")(h4_oa, h3_oa, h1_oa, h2_oa) = 0.25 * _a009_os("aa")(h4_oa, h1_oa, cind) * _a009_os("aa")(h3_oa, h2_oa, cind), - "_a019_os( aaaa )(h4_oa, h3_oa, h1_oa, h2_oa) = 0.25 * _a009_os( aa )(h4_oa, h1_oa, cind) * _a009_os( aa )(h3_oa, h2_oa, cind)") - (_a019_os("abab")(h4_oa, h3_ob, h1_oa, h2_ob) = 0.25 * _a009_os("aa")(h4_oa, h1_oa, cind) * _a009_os("bb")(h3_ob, h2_ob, cind), + (_a019_os("aaaa")(h4_oa, h3_oa, h1_oa, h2_oa) = 0.25 * _a009_os("aa")(h4_oa, h1_oa, cind) * _a009_os("aa")(h3_oa, h2_oa, cind), + "_a019_os( aaaa )(h4_oa, h3_oa, h1_oa, h2_oa) = 0.25 * _a009_os( aa )(h4_oa, h1_oa, cind) * _a009_os( aa )(h3_oa, h2_oa, cind)") + (_a019_os("abab")(h4_oa, h3_ob, h1_oa, h2_ob) = 0.25 * _a009_os("aa")(h4_oa, h1_oa, cind) * _a009_os("bb")(h3_ob, h2_ob, cind), "_a019_os( abab )(h4_oa, h3_ob, h1_oa, h2_ob) = 0.25 * _a009_os( aa )(h4_oa, h1_oa, cind) * _a009_os( bb )(h3_ob, h2_ob, cind)") - (_a019_os("bbbb")(h4_ob, h3_ob, h1_ob, h2_ob) = 0.25 * _a009_os("bb")(h4_ob, h1_ob, cind) * _a009_os("bb")(h3_ob, h2_ob, cind), - "_a019_os( bbbb )(h4_ob, h3_ob, h1_ob, h2_ob) = 0.25 * _a009_os( bb )(h4_ob, h1_ob, cind) * _a009_os( bb )(h3_ob, h2_ob, cind)") - (_a020_os("aaaa")(p4_va, h4_oa, p1_va, h1_oa) = -2.0 * _a009_os("aa")(h4_oa, h1_oa, cind) * _a021_os("aa")(p4_va, p1_va, cind), + (_a019_os("bbbb")(h4_ob, h3_ob, h1_ob, h2_ob) = 0.25 * _a009_os("bb")(h4_ob, h1_ob, cind) * _a009_os("bb")(h3_ob, h2_ob, cind), + "_a019_os( bbbb )(h4_ob, h3_ob, h1_ob, h2_ob) = 0.25 * _a009_os( bb )(h4_ob, h1_ob, cind) * _a009_os( bb )(h3_ob, h2_ob, cind)") + (_a020_os("aaaa")(p4_va, h4_oa, p1_va, h1_oa) = -2.0 * _a009_os("aa")(h4_oa, h1_oa, cind) * _a021_os("aa")(p4_va, p1_va, cind), "_a020_os( aaaa )(p4_va, h4_oa, p1_va, h1_oa) = -2.0 * _a009_os( aa )(h4_oa, h1_oa, cind) * _a021_os( aa )(p4_va, p1_va, cind)") - (_a020_os("abab")(p4_va, h4_ob, p1_va, h1_ob) = -2.0 * _a009_os("bb")(h4_ob, h1_ob, cind) * _a021_os("aa")(p4_va, p1_va, cind), + (_a020_os("abab")(p4_va, h4_ob, p1_va, h1_ob) = -2.0 * _a009_os("bb")(h4_ob, h1_ob, cind) * _a021_os("aa")(p4_va, p1_va, cind), "_a020_os( abab )(p4_va, h4_ob, p1_va, h1_ob) = -2.0 * _a009_os( bb )(h4_ob, h1_ob, cind) * _a021_os( aa )(p4_va, p1_va, cind)") - (_a020_os("baba")(p4_vb, h4_oa, p1_vb, h1_oa) = -2.0 * _a009_os("aa")(h4_oa, h1_oa, cind) * _a021_os("bb")(p4_vb, p1_vb, cind), + (_a020_os("baba")(p4_vb, h4_oa, p1_vb, h1_oa) = -2.0 * _a009_os("aa")(h4_oa, 
h1_oa, cind) * _a021_os("bb")(p4_vb, p1_vb, cind), "_a020_os( baba )(p4_vb, h4_oa, p1_vb, h1_oa) = -2.0 * _a009_os( aa )(h4_oa, h1_oa, cind) * _a021_os( bb )(p4_vb, p1_vb, cind)") - (_a020_os("bbbb")(p4_vb, h4_ob, p1_vb, h1_ob) = -2.0 * _a009_os("bb")(h4_ob, h1_ob, cind) * _a021_os("bb")(p4_vb, p1_vb, cind), + (_a020_os("bbbb")(p4_vb, h4_ob, p1_vb, h1_ob) = -2.0 * _a009_os("bb")(h4_ob, h1_ob, cind) * _a021_os("bb")(p4_vb, p1_vb, cind), "_a020_os( bbbb )(p4_vb, h4_ob, p1_vb, h1_ob) = -2.0 * _a009_os( bb )(h4_ob, h1_ob, cind) * _a021_os( bb )(p4_vb, p1_vb, cind)") - (_a017_os("aa")(p3_va, h2_oa, cind) += 1.0 * t1_aa(p3_va, h3_oa) * chol3d_oo("aa")(h3_oa, h2_oa, cind), + (_a017_os("aa")(p3_va, h2_oa, cind) += 1.0 * t1_aa(p3_va, h3_oa) * chol3d_oo("aa")(h3_oa, h2_oa, cind), "_a017_os( aa )(p3_va, h2_oa, cind) += 1.0 * t1_aa(p3_va, h3_oa) * chol3d_oo( aa )(h3_oa, h2_oa, cind)") - (_a017_os("bb")(p3_vb, h2_ob, cind) += 1.0 * t1_bb(p3_vb, h3_ob) * chol3d_oo("bb")(h3_ob, h2_ob, cind), + (_a017_os("bb")(p3_vb, h2_ob, cind) += 1.0 * t1_bb(p3_vb, h3_ob) * chol3d_oo("bb")(h3_ob, h2_ob, cind), "_a017_os( bb )(p3_vb, h2_ob, cind) += 1.0 * t1_bb(p3_vb, h3_ob) * chol3d_oo( bb )(h3_ob, h2_ob, cind)") - (_a017_os("aa")(p3_va, h2_oa, cind) += -1.0 * chol3d_vo("aa")(p3_va, h2_oa, cind), + (_a017_os("aa")(p3_va, h2_oa, cind) += -1.0 * chol3d_vo("aa")(p3_va, h2_oa, cind), "_a017_os( aa )(p3_va, h2_oa, cind) += -1.0 * chol3d_vo( aa )(p3_va, h2_oa, cind)") - (_a017_os("bb")(p3_vb, h2_ob, cind) += -1.0 * chol3d_vo("bb")(p3_vb, h2_ob, cind), + (_a017_os("bb")(p3_vb, h2_ob, cind) += -1.0 * chol3d_vo("bb")(p3_vb, h2_ob, cind), "_a017_os( bb )(p3_vb, h2_ob, cind) += -1.0 * chol3d_vo( bb )(p3_vb, h2_ob, cind)") - (i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) = 0.5 * _a017_os("aa")(p3_va, h1_oa, cind) * _a017_os("aa")(p4_va, h2_oa, cind), + (i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) = 0.5 * _a017_os("aa")(p3_va, h1_oa, cind) * _a017_os("aa")(p4_va, h2_oa, cind), "i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) = 0.5 * _a017_os( aa )(p3_va, h1_oa, cind) * _a017_os( aa )(p4_va, h2_oa, cind)") - (i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) = 0.5 * _a017_os("bb")(p3_vb, h1_ob, cind) * _a017_os("bb")(p4_vb, h2_ob, cind), + (i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) = 0.5 * _a017_os("bb")(p3_vb, h1_ob, cind) * _a017_os("bb")(p4_vb, h2_ob, cind), "i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) = 0.5 * _a017_os( bb )(p3_vb, h1_ob, cind) * _a017_os( bb )(p4_vb, h2_ob, cind)") - (i0_abab(p3_va, p4_vb, h1_oa, h2_ob) = 1.0 * _a017_os("aa")(p3_va, h1_oa, cind) * _a017_os("bb")(p4_vb, h2_ob, cind), + (i0_abab(p3_va, p4_vb, h1_oa, h2_ob) = 1.0 * _a017_os("aa")(p3_va, h1_oa, cind) * _a017_os("bb")(p4_vb, h2_ob, cind), "i0_abab(p3_va, p4_vb, h1_oa, h2_ob) = 1.0 * _a017_os( aa )(p3_va, h1_oa, cind) * _a017_os( bb )(p4_vb, h2_ob, cind)").execute(hw); - // sch(_a022("aaaa")(p3_va,p4_va,p2_va,p1_va) = 1.0 * _a021_os("aa")(p3_va,p2_va,cind) * _a021_os("aa")(p4_va,p1_va,cind), + // sch(_a022("aaaa")(p3_va,p4_va,p2_va,p1_va) = 1.0 * _a021_os("aa")(p3_va,p2_va,cind) * _a021_os("aa")(p4_va,p1_va,cind), // "_a022( aaaa )(p3_va,p4_va,p2_va,p1_va) = 1.0 * _a021_os( aa )(p3_va,p2_va,cind) * _a021_os( aa )(p4_va,p1_va,cind)") - // (_a022("abab")(p3_va,p4_vb,p2_va,p1_vb) = 1.0 * _a021_os("aa")(p3_va,p2_va,cind) * _a021_os("bb")(p4_vb,p1_vb,cind), + // (_a022("abab")(p3_va,p4_vb,p2_va,p1_vb) = 1.0 * _a021_os("aa")(p3_va,p2_va,cind) * _a021_os("bb")(p4_vb,p1_vb,cind), // "_a022( abab )(p3_va,p4_vb,p2_va,p1_vb) = 1.0 * _a021_os( aa )(p3_va,p2_va,cind) * _a021_os( bb )(p4_vb,p1_vb,cind)") - // 
(_a022("bbbb")(p3_vb,p4_vb,p2_vb,p1_vb) = 1.0 * _a021_os("bb")(p3_vb,p2_vb,cind) * _a021_os("bb")(p4_vb,p1_vb,cind), + // (_a022("bbbb")(p3_vb,p4_vb,p2_vb,p1_vb) = 1.0 * _a021_os("bb")(p3_vb,p2_vb,cind) * _a021_os("bb")(p4_vb,p1_vb,cind), // "_a022( bbbb )(p3_vb,p4_vb,p2_vb,p1_vb) = 1.0 * _a021_os( bb )(p3_vb,p2_vb,cind) * _a021_os( bb )(p4_vb,p1_vb,cind)") - // (i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * _a022("aaaa")(p3_va, p4_va, p2_va, p1_va) * t2_aaaa(p2_va,p1_va,h1_oa,h2_oa), + // (i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * _a022("aaaa")(p3_va, p4_va, p2_va, p1_va) * t2_aaaa(p2_va,p1_va,h1_oa,h2_oa), // "i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * _a022( aaaa )(p3_va, p4_va, p2_va, p1_va) * t2_aaaa(p2_va,p1_va,h1_oa,h2_oa)") - // (i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * _a022("bbbb")(p3_vb, p4_vb, p2_vb, p1_vb) * t2_bbbb(p2_vb,p1_vb,h1_ob,h2_ob), + // (i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * _a022("bbbb")(p3_vb, p4_vb, p2_vb, p1_vb) * t2_bbbb(p2_vb,p1_vb,h1_ob,h2_ob), // "i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * _a022( bbbb )(p3_vb, p4_vb, p2_vb, p1_vb) * t2_bbbb(p2_vb,p1_vb,h1_ob,h2_ob)") - // (i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += 4.0 * _a022("abab")(p3_va, p4_vb, p2_va, p1_vb) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob), + // (i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += 4.0 * _a022("abab")(p3_va, p4_vb, p2_va, p1_vb) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob), // "i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += 4.0 * _a022( abab )(p3_va, p4_vb, p2_va, p1_vb) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob)"); - + a22_flag = 1; - sch(i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * a22_aaaa_os(p3_va, p4_va, p2_va, p1_va) * t2_aaaa(p2_va,p1_va,h1_oa,h2_oa), + sch(i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * a22_aaaa_os(p3_va, p4_va, p2_va, p1_va) * t2_aaaa(p2_va,p1_va,h1_oa,h2_oa), "i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * a22_aaaa_os(p3_va, p4_va, p2_va, p1_va) * t2_aaaa(p2_va,p1_va,h1_oa,h2_oa)").execute(hw); a22_flag = 2; - sch(i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * a22_bbbb_os(p3_vb, p4_vb, p2_vb, p1_vb) * t2_bbbb(p2_vb,p1_vb,h1_ob,h2_ob), + sch(i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * a22_bbbb_os(p3_vb, p4_vb, p2_vb, p1_vb) * t2_bbbb(p2_vb,p1_vb,h1_ob,h2_ob), "i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * a22_bbbb_os(p3_vb, p4_vb, p2_vb, p1_vb) * t2_bbbb(p2_vb,p1_vb,h1_ob,h2_ob)").execute(hw); a22_flag = 3; - sch(i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += 4.0 * a22_abab_os(p3_va, p4_vb, p2_va, p1_vb) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob), + sch(i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += 4.0 * a22_abab_os(p3_va, p4_vb, p2_va, p1_vb) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob), "i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += 4.0 * a22_abab_os(p3_va, p4_vb, p2_va, p1_vb) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob)").execute(hw); - sch(_a019_os("aaaa")(h4_oa, h3_oa, h1_oa, h2_oa) += -0.125 * _a004_os("aaaa")(p1_va, p2_va, h3_oa, h4_oa) * t2_aaaa(p1_va,p2_va,h1_oa,h2_oa), + sch(_a019_os("aaaa")(h4_oa, h3_oa, h1_oa, h2_oa) += -0.125 * _a004_os("aaaa")(p1_va, p2_va, h3_oa, h4_oa) * t2_aaaa(p1_va,p2_va,h1_oa,h2_oa), "_a019_os( aaaa )(h4_oa, h3_oa, h1_oa, h2_oa) += -0.125 * _a004_os( aaaa )(p1_va, p2_va, h3_oa, h4_oa) * t2_aaaa(p1_va,p2_va,h1_oa,h2_oa)") - (_a019_os("abab")(h4_oa, h3_ob, h1_oa, h2_ob) += 0.25 * _a004_os("abab")(p1_va, p2_vb, h4_oa, h3_ob) * t2_abab(p1_va,p2_vb,h1_oa,h2_ob), - "_a019_os( abab )(h4_oa, h3_ob, h1_oa, h2_ob) += 0.25 * _a004_os( abab )(p1_va, p2_vb, h4_oa, h3_ob) * t2_abab(p1_va,p2_vb,h1_oa,h2_ob)") - (_a019_os("bbbb")(h4_ob, h3_ob, h1_ob, h2_ob) += -0.125 * _a004_os("bbbb")(p1_vb, p2_vb, h3_ob, h4_ob) * 
t2_bbbb(p1_vb,p2_vb,h1_ob,h2_ob), + (_a019_os("abab")(h4_oa, h3_ob, h1_oa, h2_ob) += 0.25 * _a004_os("abab")(p1_va, p2_vb, h4_oa, h3_ob) * t2_abab(p1_va,p2_vb,h1_oa,h2_ob), + "_a019_os( abab )(h4_oa, h3_ob, h1_oa, h2_ob) += 0.25 * _a004_os( abab )(p1_va, p2_vb, h4_oa, h3_ob) * t2_abab(p1_va,p2_vb,h1_oa,h2_ob)") + (_a019_os("bbbb")(h4_ob, h3_ob, h1_ob, h2_ob) += -0.125 * _a004_os("bbbb")(p1_vb, p2_vb, h3_ob, h4_ob) * t2_bbbb(p1_vb,p2_vb,h1_ob,h2_ob), "_a019_os( bbbb )(h4_ob, h3_ob, h1_ob, h2_ob) += -0.125 * _a004_os( bbbb )(p1_vb, p2_vb, h3_ob, h4_ob) * t2_bbbb(p1_vb,p2_vb,h1_ob,h2_ob)") - (i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * _a019_os("aaaa")(h4_oa, h3_oa, h1_oa, h2_oa) * t2_aaaa(p3_va, p4_va, h4_oa, h3_oa), + (i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * _a019_os("aaaa")(h4_oa, h3_oa, h1_oa, h2_oa) * t2_aaaa(p3_va, p4_va, h4_oa, h3_oa), "i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * _a019_os( aaaa )(h4_oa, h3_oa, h1_oa, h2_oa) * t2_aaaa(p3_va, p4_va, h4_oa, h3_oa)") - (i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * _a019_os("bbbb")(h4_ob, h3_ob, h1_ob, h2_ob) * t2_bbbb(p3_vb, p4_vb, h4_ob, h3_ob), + (i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * _a019_os("bbbb")(h4_ob, h3_ob, h1_ob, h2_ob) * t2_bbbb(p3_vb, p4_vb, h4_ob, h3_ob), "i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * _a019_os( bbbb )(h4_ob, h3_ob, h1_ob, h2_ob) * t2_bbbb(p3_vb, p4_vb, h4_ob, h3_ob)") - (i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += 4.0 * _a019_os("abab")(h4_oa, h3_ob, h1_oa, h2_ob) * t2_abab(p3_va, p4_vb, h4_oa, h3_ob), + (i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += 4.0 * _a019_os("abab")(h4_oa, h3_ob, h1_oa, h2_ob) * t2_abab(p3_va, p4_vb, h4_oa, h3_ob), "i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += 4.0 * _a019_os( abab )(h4_oa, h3_ob, h1_oa, h2_ob) * t2_abab(p3_va, p4_vb, h4_oa, h3_ob)") - (_a020_os("aaaa")(p1_va, h3_oa, p4_va, h2_oa) += 0.5 * _a004_os("aaaa")(p2_va, p4_va, h3_oa, h1_oa) * t2_aaaa(p1_va,p2_va,h1_oa,h2_oa), - "_a020_os( aaaa )(p1_va, h3_oa, p4_va, h2_oa) += 0.5 * _a004_os( aaaa )(p2_va, p4_va, h3_oa, h1_oa) * t2_aaaa(p1_va,p2_va,h1_oa,h2_oa)") - (_a020_os("baab")(p1_vb, h3_oa, p4_va, h2_ob) = -0.5 * _a004_os("aaaa")(p2_va, p4_va, h3_oa, h1_oa) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob), - "_a020_os( baab )(p1_vb, h3_oa, p4_va, h2_ob) = -0.5 * _a004_os( aaaa )(p2_va, p4_va, h3_oa, h1_oa) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob)") - (_a020_os("abba")(p1_va, h3_ob, p4_vb, h2_oa) = -0.5 * _a004_os("bbbb")(p2_vb, p4_vb, h3_ob, h1_ob) * t2_abab(p1_va,p2_vb,h2_oa,h1_ob), + (_a020_os("aaaa")(p1_va, h3_oa, p4_va, h2_oa) += 0.5 * _a004_os("aaaa")(p2_va, p4_va, h3_oa, h1_oa) * t2_aaaa(p1_va,p2_va,h1_oa,h2_oa), + "_a020_os( aaaa )(p1_va, h3_oa, p4_va, h2_oa) += 0.5 * _a004_os( aaaa )(p2_va, p4_va, h3_oa, h1_oa) * t2_aaaa(p1_va,p2_va,h1_oa,h2_oa)") + (_a020_os("baab")(p1_vb, h3_oa, p4_va, h2_ob) = -0.5 * _a004_os("aaaa")(p2_va, p4_va, h3_oa, h1_oa) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob), + "_a020_os( baab )(p1_vb, h3_oa, p4_va, h2_ob) = -0.5 * _a004_os( aaaa )(p2_va, p4_va, h3_oa, h1_oa) * t2_abab(p2_va,p1_vb,h1_oa,h2_ob)") + (_a020_os("abba")(p1_va, h3_ob, p4_vb, h2_oa) = -0.5 * _a004_os("bbbb")(p2_vb, p4_vb, h3_ob, h1_ob) * t2_abab(p1_va,p2_vb,h2_oa,h1_ob), "_a020_os( abba )(p1_va, h3_ob, p4_vb, h2_oa) = -0.5 * _a004_os( bbbb )(p2_vb, p4_vb, h3_ob, h1_ob) * t2_abab(p1_va,p2_vb,h2_oa,h1_ob)") - (_a020_os("bbbb")(p1_vb, h3_ob, p4_vb, h2_ob) += 0.5 * _a004_os("bbbb")(p2_vb, p4_vb, h3_ob, h1_ob) * t2_bbbb(p1_vb,p2_vb,h1_ob,h2_ob), + (_a020_os("bbbb")(p1_vb, h3_ob, p4_vb, h2_ob) += 0.5 * _a004_os("bbbb")(p2_vb, p4_vb, h3_ob, h1_ob) * 
t2_bbbb(p1_vb,p2_vb,h1_ob,h2_ob), "_a020_os( bbbb )(p1_vb, h3_ob, p4_vb, h2_ob) += 0.5 * _a004_os( bbbb )(p2_vb, p4_vb, h3_ob, h1_ob) * t2_bbbb(p1_vb,p2_vb,h1_ob,h2_ob)") - (_a020_os("baba")(p1_vb, h7_oa, p6_vb, h2_oa) += 1.0 * _a004_os("abab")(p5_va, p6_vb, h7_oa, h8_ob) * t2_abab(p5_va,p1_vb,h2_oa,h8_ob), + (_a020_os("baba")(p1_vb, h7_oa, p6_vb, h2_oa) += 1.0 * _a004_os("abab")(p5_va, p6_vb, h7_oa, h8_ob) * t2_abab(p5_va,p1_vb,h2_oa,h8_ob), "_a020_os( baba )(p1_vb, h7_oa, p6_vb, h2_oa) += 1.0 * _a004_os( abab )(p5_va, p6_vb, h7_oa, h8_ob) * t2_abab(p5_va,p1_vb,h2_oa,h8_ob)") - (i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * _a020_os("aaaa")(p4_va, h4_oa, p1_va, h1_oa) * t2_aaaa(p3_va, p1_va, h4_oa, h2_oa), + (i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * _a020_os("aaaa")(p4_va, h4_oa, p1_va, h1_oa) * t2_aaaa(p3_va, p1_va, h4_oa, h2_oa), "i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * _a020_os( aaaa )(p4_va, h4_oa, p1_va, h1_oa) * t2_aaaa(p3_va, p1_va, h4_oa, h2_oa)") - (i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += -1.0 * _a020_os("abba")(p4_va, h4_ob, p1_vb, h1_oa) * t2_abab(p3_va, p1_vb, h2_oa, h4_ob), + (i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += -1.0 * _a020_os("abba")(p4_va, h4_ob, p1_vb, h1_oa) * t2_abab(p3_va, p1_vb, h2_oa, h4_ob), "i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += -1.0 * _a020_os( abba )(p4_va, h4_ob, p1_vb, h1_oa) * t2_abab(p3_va, p1_vb, h2_oa, h4_ob)") - (i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * _a020_os("bbbb")(p4_vb, h4_ob, p1_vb, h1_ob) * t2_bbbb(p3_vb, p1_vb, h4_ob, h2_ob), + (i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * _a020_os("bbbb")(p4_vb, h4_ob, p1_vb, h1_ob) * t2_bbbb(p3_vb, p1_vb, h4_ob, h2_ob), "i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * _a020_os( bbbb )(p4_vb, h4_ob, p1_vb, h1_ob) * t2_bbbb(p3_vb, p1_vb, h4_ob, h2_ob)") - (i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += -1.0 * _a020_os("baab")(p4_vb, h4_oa, p1_va, h1_ob) * t2_abab(p1_va, p3_vb, h4_oa, h2_ob), + (i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += -1.0 * _a020_os("baab")(p4_vb, h4_oa, p1_va, h1_ob) * t2_abab(p1_va, p3_vb, h4_oa, h2_ob), "i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += -1.0 * _a020_os( baab )(p4_vb, h4_oa, p1_va, h1_ob) * t2_abab(p1_va, p3_vb, h4_oa, h2_ob)") - (i0_abab(p3_va, p1_vb, h2_oa, h4_ob) += 1.0 * _a020_os("baba")(p1_vb, h7_oa, p6_vb, h2_oa) * t2_abab(p3_va, p6_vb, h7_oa, h4_ob), + (i0_abab(p3_va, p1_vb, h2_oa, h4_ob) += 1.0 * _a020_os("baba")(p1_vb, h7_oa, p6_vb, h2_oa) * t2_abab(p3_va, p6_vb, h7_oa, h4_ob), "i0_abab(p3_va, p1_vb, h2_oa, h4_ob) += 1.0 * _a020_os( baba )(p1_vb, h7_oa, p6_vb, h2_oa) * t2_abab(p3_va, p6_vb, h7_oa, h4_ob)") - (i0_abab(p3_va, p1_vb, h2_oa, h4_ob) += 1.0 * _a020_os("abab")(p3_va, h8_ob, p5_va, h4_ob) * t2_abab(p5_va, p1_vb, h2_oa, h8_ob), + (i0_abab(p3_va, p1_vb, h2_oa, h4_ob) += 1.0 * _a020_os("abab")(p3_va, h8_ob, p5_va, h4_ob) * t2_abab(p5_va, p1_vb, h2_oa, h8_ob), "i0_abab(p3_va, p1_vb, h2_oa, h4_ob) += 1.0 * _a020_os( abab )(p3_va, h8_ob, p5_va, h4_ob) * t2_abab(p5_va, p1_vb, h2_oa, h8_ob)") - (i0_abab(p3_va, p4_vb, h2_oa, h1_ob) += 1.0 * _a020_os("bbbb")(p4_vb, h4_ob, p1_vb, h1_ob) * t2_abab(p3_va, p1_vb, h2_oa, h4_ob), + (i0_abab(p3_va, p4_vb, h2_oa, h1_ob) += 1.0 * _a020_os("bbbb")(p4_vb, h4_ob, p1_vb, h1_ob) * t2_abab(p3_va, p1_vb, h2_oa, h4_ob), "i0_abab(p3_va, p4_vb, h2_oa, h1_ob) += 1.0 * _a020_os( bbbb )(p4_vb, h4_ob, p1_vb, h1_ob) * t2_abab(p3_va, p1_vb, h2_oa, h4_ob)") - (i0_abab(p3_va, p4_vb, h2_oa, h1_ob) += -1.0 * _a020_os("baab")(p4_vb, h4_oa, p1_va, h1_ob) * t2_aaaa(p3_va, p1_va, h4_oa, h2_oa), + (i0_abab(p3_va, p4_vb, h2_oa, h1_ob) += -1.0 * 
_a020_os("baab")(p4_vb, h4_oa, p1_va, h1_ob) * t2_aaaa(p3_va, p1_va, h4_oa, h2_oa), "i0_abab(p3_va, p4_vb, h2_oa, h1_ob) += -1.0 * _a020_os( baab )(p4_vb, h4_oa, p1_va, h1_ob) * t2_aaaa(p3_va, p1_va, h4_oa, h2_oa)") - (i0_abab(p4_va, p3_vb, h1_oa, h2_ob) += 1.0 * _a020_os("aaaa")(p4_va, h4_oa, p1_va, h1_oa) * t2_abab(p1_va, p3_vb, h4_oa, h2_ob), + (i0_abab(p4_va, p3_vb, h1_oa, h2_ob) += 1.0 * _a020_os("aaaa")(p4_va, h4_oa, p1_va, h1_oa) * t2_abab(p1_va, p3_vb, h4_oa, h2_ob), "i0_abab(p4_va, p3_vb, h1_oa, h2_ob) += 1.0 * _a020_os( aaaa )(p4_va, h4_oa, p1_va, h1_oa) * t2_abab(p1_va, p3_vb, h4_oa, h2_ob)") - (i0_abab(p4_va, p3_vb, h1_oa, h2_ob) += -1.0 * _a020_os("abba")(p4_va, h4_ob, p1_vb, h1_oa) * t2_bbbb(p3_vb, p1_vb, h4_ob, h2_ob), + (i0_abab(p4_va, p3_vb, h1_oa, h2_ob) += -1.0 * _a020_os("abba")(p4_va, h4_ob, p1_vb, h1_oa) * t2_bbbb(p3_vb, p1_vb, h4_ob, h2_ob), "i0_abab(p4_va, p3_vb, h1_oa, h2_ob) += -1.0 * _a020_os( abba )(p4_va, h4_ob, p1_vb, h1_oa) * t2_bbbb(p3_vb, p1_vb, h4_ob, h2_ob)") - (_a001_os("aa")(p4_va, p1_va) += -1.0 * f1_vv("aa")(p4_va, p1_va), + (_a001_os("aa")(p4_va, p1_va) += -1.0 * f1_vv("aa")(p4_va, p1_va), "_a001_os( aa )(p4_va, p1_va) += -1.0 * f1_vv( aa )(p4_va, p1_va)") - (_a001_os("bb")(p4_vb, p1_vb) += -1.0 * f1_vv("bb")(p4_vb, p1_vb), + (_a001_os("bb")(p4_vb, p1_vb) += -1.0 * f1_vv("bb")(p4_vb, p1_vb), "_a001_os( bb )(p4_vb, p1_vb) += -1.0 * f1_vv( bb )(p4_vb, p1_vb)") - (_a001_os("aa")(p4_va, p1_va) += 1.0 * t1_aa(p4_va, h1_oa) * f1_ov("aa")(h1_oa, p1_va), + (_a001_os("aa")(p4_va, p1_va) += 1.0 * t1_aa(p4_va, h1_oa) * f1_ov("aa")(h1_oa, p1_va), "_a001_os( aa )(p4_va, p1_va) += 1.0 * t1_aa(p4_va, h1_oa) * f1_ov( aa )(h1_oa, p1_va)") // NEW TERM - (_a001_os("bb")(p4_vb, p1_vb) += 1.0 * t1_bb(p4_vb, h1_ob) * f1_ov("bb")(h1_ob, p1_vb), + (_a001_os("bb")(p4_vb, p1_vb) += 1.0 * t1_bb(p4_vb, h1_ob) * f1_ov("bb")(h1_ob, p1_vb), "_a001_os( bb )(p4_vb, p1_vb) += 1.0 * t1_bb(p4_vb, h1_ob) * f1_ov( bb )(h1_ob, p1_vb)") // NEW TERM - (_a006_os("aa")(h9_oa, h1_oa) += 1.0 * f1_oo("aa")(h9_oa, h1_oa), + (_a006_os("aa")(h9_oa, h1_oa) += 1.0 * f1_oo("aa")(h9_oa, h1_oa), "_a006_os( aa )(h9_oa, h1_oa) += 1.0 * f1_oo( aa )(h9_oa, h1_oa)") - (_a006_os("bb")(h9_ob, h1_ob) += 1.0 * f1_oo("bb")(h9_ob, h1_ob), + (_a006_os("bb")(h9_ob, h1_ob) += 1.0 * f1_oo("bb")(h9_ob, h1_ob), "_a006_os( bb )(h9_ob, h1_ob) += 1.0 * f1_oo( bb )(h9_ob, h1_ob)") - (_a006_os("aa")(h9_oa, h1_oa) += 1.0 * t1_aa(p8_va, h1_oa) * f1_ov("aa")(h9_oa, p8_va), + (_a006_os("aa")(h9_oa, h1_oa) += 1.0 * t1_aa(p8_va, h1_oa) * f1_ov("aa")(h9_oa, p8_va), "_a006_os( aa )(h9_oa, h1_oa) += 1.0 * t1_aa(p8_va, h1_oa) * f1_ov( aa )(h9_oa, p8_va)") - (_a006_os("bb")(h9_ob, h1_ob) += 1.0 * t1_bb(p8_vb, h1_ob) * f1_ov("bb")(h9_ob, p8_vb), + (_a006_os("bb")(h9_ob, h1_ob) += 1.0 * t1_bb(p8_vb, h1_ob) * f1_ov("bb")(h9_ob, p8_vb), "_a006_os( bb )(h9_ob, h1_ob) += 1.0 * t1_bb(p8_vb, h1_ob) * f1_ov( bb )(h9_ob, p8_vb)") - (i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += -0.5 * t2_aaaa(p3_va, p2_va, h1_oa, h2_oa) * _a001_os("aa")(p4_va, p2_va), + (i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += -0.5 * t2_aaaa(p3_va, p2_va, h1_oa, h2_oa) * _a001_os("aa")(p4_va, p2_va), "i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) += -0.5 * t2_aaaa(p3_va, p2_va, h1_oa, h2_oa) * _a001_os( aa )(p4_va, p2_va)") - (i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += -0.5 * t2_bbbb(p3_vb, p2_vb, h1_ob, h2_ob) * _a001_os("bb")(p4_vb, p2_vb), + (i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += -0.5 * t2_bbbb(p3_vb, p2_vb, h1_ob, h2_ob) * _a001_os("bb")(p4_vb, p2_vb), "i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) += 
-0.5 * t2_bbbb(p3_vb, p2_vb, h1_ob, h2_ob) * _a001_os( bb )(p4_vb, p2_vb)") - (i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p3_va, p2_vb, h1_oa, h2_ob) * _a001_os("bb")(p4_vb, p2_vb), + (i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p3_va, p2_vb, h1_oa, h2_ob) * _a001_os("bb")(p4_vb, p2_vb), "i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p3_va, p2_vb, h1_oa, h2_ob) * _a001_os( bb )(p4_vb, p2_vb)") - (i0_abab(p4_va, p3_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p2_va, p3_vb, h1_oa, h2_ob) * _a001_os("aa")(p4_va, p2_va), + (i0_abab(p4_va, p3_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p2_va, p3_vb, h1_oa, h2_ob) * _a001_os("aa")(p4_va, p2_va), "i0_abab(p4_va, p3_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p2_va, p3_vb, h1_oa, h2_ob) * _a001_os( aa )(p4_va, p2_va)") - (i0_aaaa(p3_va, p4_va, h2_oa, h1_oa) += -0.5 * t2_aaaa(p3_va, p4_va, h3_oa, h1_oa) * _a006_os("aa")(h3_oa, h2_oa), + (i0_aaaa(p3_va, p4_va, h2_oa, h1_oa) += -0.5 * t2_aaaa(p3_va, p4_va, h3_oa, h1_oa) * _a006_os("aa")(h3_oa, h2_oa), "i0_aaaa(p3_va, p4_va, h2_oa, h1_oa) += -0.5 * t2_aaaa(p3_va, p4_va, h3_oa, h1_oa) * _a006_os( aa )(h3_oa, h2_oa)") - (i0_bbbb(p3_vb, p4_vb, h2_ob, h1_ob) += -0.5 * t2_bbbb(p3_vb, p4_vb, h3_ob, h1_ob) * _a006_os("bb")(h3_ob, h2_ob), + (i0_bbbb(p3_vb, p4_vb, h2_ob, h1_ob) += -0.5 * t2_bbbb(p3_vb, p4_vb, h3_ob, h1_ob) * _a006_os("bb")(h3_ob, h2_ob), "i0_bbbb(p3_vb, p4_vb, h2_ob, h1_ob) += -0.5 * t2_bbbb(p3_vb, p4_vb, h3_ob, h1_ob) * _a006_os( bb )(h3_ob, h2_ob)") - (i0_abab(p3_va, p4_vb, h2_oa, h1_ob) += -1.0 * t2_abab(p3_va, p4_vb, h3_oa, h1_ob) * _a006_os("aa")(h3_oa, h2_oa), + (i0_abab(p3_va, p4_vb, h2_oa, h1_ob) += -1.0 * t2_abab(p3_va, p4_vb, h3_oa, h1_ob) * _a006_os("aa")(h3_oa, h2_oa), "i0_abab(p3_va, p4_vb, h2_oa, h1_ob) += -1.0 * t2_abab(p3_va, p4_vb, h3_oa, h1_ob) * _a006_os( aa )(h3_oa, h2_oa)") - (i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p3_va, p4_vb, h1_oa, h3_ob) * _a006_os("bb")(h3_ob, h2_ob), + (i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p3_va, p4_vb, h1_oa, h3_ob) * _a006_os("bb")(h3_ob, h2_ob), "i0_abab(p3_va, p4_vb, h1_oa, h2_ob) += -1.0 * t2_abab(p3_va, p4_vb, h1_oa, h3_ob) * _a006_os( bb )(h3_ob, h2_ob)") - (i0tmp("aaaa")(p3_va, p4_va, h1_oa, h2_oa) = 1.0 * i0_aaaa(p3_va, p4_va, h1_oa, h2_oa), - "i0tmp( aaaa )(p3_va, p4_va, h1_oa, h2_oa) = 1.0 * i0_aaaa(p3_va, p4_va, h1_oa, h2_oa)") - (i0tmp("aaaa")(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * i0_aaaa(p4_va, p3_va, h2_oa, h1_oa), - "i0tmp( aaaa )(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * i0_aaaa(p4_va, p3_va, h2_oa, h1_oa)") - (i0tmp("aaaa")(p3_va, p4_va, h1_oa, h2_oa) += -1.0 * i0_aaaa(p3_va, p4_va, h2_oa, h1_oa), - "i0tmp( aaaa )(p3_va, p4_va, h1_oa, h2_oa) += -1.0 * i0_aaaa(p3_va, p4_va, h2_oa, h1_oa)") - (i0tmp("aaaa")(p3_va, p4_va, h1_oa, h2_oa) += -1.0 * i0_aaaa(p4_va, p3_va, h1_oa, h2_oa), + (i0tmp("aaaa")(p3_va, p4_va, h1_oa, h2_oa) = 1.0 * i0_aaaa(p3_va, p4_va, h1_oa, h2_oa), + "i0tmp( aaaa )(p3_va, p4_va, h1_oa, h2_oa) = 1.0 * i0_aaaa(p3_va, p4_va, h1_oa, h2_oa)") + (i0tmp("aaaa")(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * i0_aaaa(p4_va, p3_va, h2_oa, h1_oa), + "i0tmp( aaaa )(p3_va, p4_va, h1_oa, h2_oa) += 1.0 * i0_aaaa(p4_va, p3_va, h2_oa, h1_oa)") + (i0tmp("aaaa")(p3_va, p4_va, h1_oa, h2_oa) += -1.0 * i0_aaaa(p3_va, p4_va, h2_oa, h1_oa), + "i0tmp( aaaa )(p3_va, p4_va, h1_oa, h2_oa) += -1.0 * i0_aaaa(p3_va, p4_va, h2_oa, h1_oa)") + (i0tmp("aaaa")(p3_va, p4_va, h1_oa, h2_oa) += -1.0 * i0_aaaa(p4_va, p3_va, h1_oa, h2_oa), "i0tmp( aaaa )(p3_va, p4_va, h1_oa, h2_oa) += -1.0 * i0_aaaa(p4_va, p3_va, h1_oa, h2_oa)") - 
(i0tmp("bbbb")(p3_vb, p4_vb, h1_ob, h2_ob) = 1.0 * i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob), + (i0tmp("bbbb")(p3_vb, p4_vb, h1_ob, h2_ob) = 1.0 * i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob), "i0tmp( bbbb )(p3_vb, p4_vb, h1_ob, h2_ob) = 1.0 * i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob)") - (i0tmp("bbbb")(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * i0_bbbb(p4_vb, p3_vb, h2_ob, h1_ob), + (i0tmp("bbbb")(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * i0_bbbb(p4_vb, p3_vb, h2_ob, h1_ob), "i0tmp( bbbb )(p3_vb, p4_vb, h1_ob, h2_ob) += 1.0 * i0_bbbb(p4_vb, p3_vb, h2_ob, h1_ob)") - (i0tmp("bbbb")(p3_vb, p4_vb, h1_ob, h2_ob) += -1.0 * i0_bbbb(p3_vb, p4_vb, h2_ob, h1_ob), - "i0tmp( bbbb )(p3_vb, p4_vb, h1_ob, h2_ob) += -1.0 * i0_bbbb(p3_vb, p4_vb, h2_ob, h1_ob)") - (i0tmp("bbbb")(p3_vb, p4_vb, h1_ob, h2_ob) += -1.0 * i0_bbbb(p4_vb, p3_vb, h1_ob, h2_ob), + (i0tmp("bbbb")(p3_vb, p4_vb, h1_ob, h2_ob) += -1.0 * i0_bbbb(p3_vb, p4_vb, h2_ob, h1_ob), + "i0tmp( bbbb )(p3_vb, p4_vb, h1_ob, h2_ob) += -1.0 * i0_bbbb(p3_vb, p4_vb, h2_ob, h1_ob)") + (i0tmp("bbbb")(p3_vb, p4_vb, h1_ob, h2_ob) += -1.0 * i0_bbbb(p4_vb, p3_vb, h1_ob, h2_ob), "i0tmp( bbbb )(p3_vb, p4_vb, h1_ob, h2_ob) += -1.0 * i0_bbbb(p4_vb, p3_vb, h1_ob, h2_ob)") - (i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) = 1.0 * i0tmp("aaaa")(p3_va, p4_va, h1_oa, h2_oa), + (i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) = 1.0 * i0tmp("aaaa")(p3_va, p4_va, h1_oa, h2_oa), "i0_aaaa(p3_va, p4_va, h1_oa, h2_oa) = 1.0 * i0tmp( aaaa )(p3_va, p4_va, h1_oa, h2_oa)") - (i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) = 1.0 * i0tmp("bbbb")(p3_vb, p4_vb, h1_ob, h2_ob), + (i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) = 1.0 * i0tmp("bbbb")(p3_vb, p4_vb, h1_ob, h2_ob), "i0_bbbb(p3_vb, p4_vb, h1_ob, h2_ob) = 1.0 * i0tmp( bbbb )(p3_vb, p4_vb, h1_ob, h2_ob)") ; // clang-format on @@ -1179,4 +1177,4 @@ cd_ccsd_os_driver(SystemData& sys_data, ExecutionContext& ec, const TiledInde Tensor& d_r1, Tensor& d_r2, std::vector>& d_r1s, std::vector>& d_r2s, std::vector>& d_t1s, std::vector>& d_t2s, std::vector& p_evl_sorted, Tensor& cv3d, - bool ccsd_restart, std::string out_fp, bool computeTData); \ No newline at end of file + bool ccsd_restart, std::string out_fp, bool computeTData); diff --git a/exachem/cc/ccsd_t/ccsd_t.cpp b/exachem/cc/ccsd_t/ccsd_t.cpp index 56ac1b2..c76afa5 100644 --- a/exachem/cc/ccsd_t/ccsd_t.cpp +++ b/exachem/cc/ccsd_t/ccsd_t.cpp @@ -450,6 +450,11 @@ void ccsd_t_driver(std::string filename, OptionsMap options_map) { bool is_restricted = is_rhf; + // Given the singleton pool created by the TAMM is not used by the (T) kernel calculation. 
+ // We artifically destroy the pool + tamm::reset_rmm_pool(); + // tamm::reinitialize_rmm_pool(); + if(rank == 0) { if(is_restricted) cout << endl << "Running Closed Shell CCSD(T) calculation" << endl; else cout << endl << "Running Open Shell CCSD(T) calculation" << endl; diff --git a/exachem/cc/ccsd_t/ccsd_t_all_fused.hpp b/exachem/cc/ccsd_t/ccsd_t_all_fused.hpp index 9a58405..9e96236 100644 --- a/exachem/cc/ccsd_t/ccsd_t_all_fused.hpp +++ b/exachem/cc/ccsd_t/ccsd_t_all_fused.hpp @@ -111,8 +111,6 @@ void ccsd_t_fully_fused_none_df_none_task( #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) // get (round-robin) GPU stream from pool gpuStream_t& stream = tamm::GPUStreamPool::getInstance().getStream(); - // get GPU memory handle from pool - auto& memPool = tamm::GPUPooledStorageManager::getInstance(); #endif // Index p4b,p5b,p6b,h1b,h2b,h3b; @@ -140,12 +138,12 @@ void ccsd_t_fully_fused_none_df_none_task( T* host_evl_sorted_p6b = &k_evl_sorted[k_offset[t_p6b]]; #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) - T* dev_evl_sorted_h1b = static_cast(memPool.allocate(sizeof(T) * base_size_h1b)); - T* dev_evl_sorted_h2b = static_cast(memPool.allocate(sizeof(T) * base_size_h2b)); - T* dev_evl_sorted_h3b = static_cast(memPool.allocate(sizeof(T) * base_size_h3b)); - T* dev_evl_sorted_p4b = static_cast(memPool.allocate(sizeof(T) * base_size_p4b)); - T* dev_evl_sorted_p5b = static_cast(memPool.allocate(sizeof(T) * base_size_p5b)); - T* dev_evl_sorted_p6b = static_cast(memPool.allocate(sizeof(T) * base_size_p6b)); + T* dev_evl_sorted_h1b = static_cast(getGpuMem(sizeof(T) * base_size_h1b)); + T* dev_evl_sorted_h2b = static_cast(getGpuMem(sizeof(T) * base_size_h2b)); + T* dev_evl_sorted_h3b = static_cast(getGpuMem(sizeof(T) * base_size_h3b)); + T* dev_evl_sorted_p4b = static_cast(getGpuMem(sizeof(T) * base_size_p4b)); + T* dev_evl_sorted_p5b = static_cast(getGpuMem(sizeof(T) * base_size_p5b)); + T* dev_evl_sorted_p6b = static_cast(getGpuMem(sizeof(T) * base_size_p6b)); if(!gpuEventQuery(*done_copy)) { gpuEventSynchronize(*done_copy); } #endif @@ -218,38 +216,6 @@ void ccsd_t_fully_fused_none_df_none_task( size_t num_blocks = CEIL(base_size_h3b, 4) * CEIL(base_size_h2b, 4) * CEIL(base_size_h1b, 4) * CEIL(base_size_p6b, 4) * CEIL(base_size_p5b, 4) * CEIL(base_size_p4b, 4); -#ifdef OPT_KERNEL_TIMING - // - long double task_num_ops_s1 = 0; - long double task_num_ops_d1 = 0; - long double task_num_ops_d2 = 0; - long double task_num_ops_total = 0; - - // - helper_calculate_num_ops(noab, nvab, df_simple_s1_size, df_simple_d1_size, df_simple_d2_size, - df_simple_s1_exec, df_simple_d1_exec, df_simple_d2_exec, task_num_ops_s1, - task_num_ops_d1, task_num_ops_d2, total_num_ops_s1, total_num_ops_d1, - total_num_ops_d2); - - // - task_num_ops_total = task_num_ops_s1 + task_num_ops_d1 + task_num_ops_d2; -#endif - -#ifdef OPT_KERNEL_TIMING - gpuEvent_t start_kernel_only, stop_kernel_only; - -#if defined(USE_CUDA) - CUDA_SAFE(cudaEventCreate(&start_kernel_only)); - CUDA_SAFE(cudaEventCreate(&stop_kernel_only)); - CUDA_SAFE(cudaEventRecord(start_kernel_only)); -#elif defined(USE_HIP) - HIP_SAFE(hipEventCreate(&start_kernel_only)); - HIP_SAFE(hipEventCreate(&stop_kernel_only)); - HIP_SAFE(hipEventRecord(start_kernel_only)); -#endif - -#endif // OPT_KERNEL_TIMING - #if defined(USE_DPCPP) || defined(USE_HIP) || (defined(USE_CUDA) && !defined(USE_NV_TC)) fully_fused_ccsd_t_gpu(stream, num_blocks, k_range[t_h1b], k_range[t_h2b], k_range[t_h3b], k_range[t_p4b], k_range[t_p5b], k_range[t_p6b], @@ 
-303,25 +269,23 @@ void ccsd_t_fully_fused_none_df_none_task( reduceData->factor = factor; #ifdef USE_CUDA - CUDA_SAFE(cudaLaunchHostFunc(stream, hostEnergyReduce, reduceData)); - CUDA_SAFE(cudaEventRecord(*done_compute, stream)); + CUDA_SAFE(cudaLaunchHostFunc(stream.first, hostEnergyReduce, reduceData)); + CUDA_SAFE(cudaEventRecord(*done_compute, stream.first)); #elif defined(USE_HIP) - HIP_SAFE(hipLaunchHostFunc(stream, hostEnergyReduce, reduceData)); - HIP_SAFE(hipEventRecord(*done_compute, stream)); + HIP_SAFE(hipLaunchHostFunc(stream.first, hostEnergyReduce, reduceData)); + HIP_SAFE(hipEventRecord(*done_compute, stream.first)); #elif defined(USE_DPCPP) // TODO: the sync might not be needed (stream.first.ext_oneapi_submit_barrier) - auto host_task_event = stream.submit( + auto host_task_event = stream.first.submit( [&](sycl::handler& cgh) { cgh.host_task([=]() { hostEnergyReduce(reduceData); }); }); - (*done_compute) = stream.ext_oneapi_submit_barrier({host_task_event}); + (*done_compute) = stream.first.ext_oneapi_submit_barrier({host_task_event}); #endif - // free device mem back to pool - memPool.deallocate(static_cast(dev_evl_sorted_h1b), sizeof(T) * base_size_h1b); - memPool.deallocate(static_cast(dev_evl_sorted_h2b), sizeof(T) * base_size_h2b); - memPool.deallocate(static_cast(dev_evl_sorted_h3b), sizeof(T) * base_size_h3b); - memPool.deallocate(static_cast(dev_evl_sorted_p4b), sizeof(T) * base_size_p4b); - memPool.deallocate(static_cast(dev_evl_sorted_p5b), sizeof(T) * base_size_p5b); - memPool.deallocate(static_cast(dev_evl_sorted_p6b), sizeof(T) * base_size_p6b); - -#endif // if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) + freeGpuMem(dev_evl_sorted_h1b); + freeGpuMem(dev_evl_sorted_h2b); + freeGpuMem(dev_evl_sorted_h3b); + freeGpuMem(dev_evl_sorted_p4b); + freeGpuMem(dev_evl_sorted_p5b); + freeGpuMem(dev_evl_sorted_p6b); +#endif } diff --git a/exachem/cc/ccsd_t/ccsd_t_all_fused_gpu.cu b/exachem/cc/ccsd_t/ccsd_t_all_fused_gpu.cu index 53753a5..c75cc67 100644 --- a/exachem/cc/ccsd_t/ccsd_t_all_fused_gpu.cu +++ b/exachem/cc/ccsd_t/ccsd_t_all_fused_gpu.cu @@ -2591,18 +2591,18 @@ void ccsd_t_fully_fused_nvidia_tc_fp64(gpuStream_t& stream_id, size_t numBlks, s // constant memories // cudaMemcpyToSymbolAsync(const_d1_h7b, host_size_d1_h7b, sizeof(int) * size_noab, 0, - cudaMemcpyHostToDevice, stream_id); + cudaMemcpyHostToDevice, stream_id.first); cudaMemcpyToSymbolAsync(const_d2_p7b, host_size_d2_p7b, sizeof(int) * size_nvab, 0, - cudaMemcpyHostToDevice, stream_id); + cudaMemcpyHostToDevice, stream_id.first); cudaMemcpyToSymbolAsync(const_s1_exec, host_exec_s1, sizeof(int) * (9), 0, cudaMemcpyHostToDevice, - stream_id); + stream_id.first); cudaMemcpyToSymbolAsync(const_d1_exec, host_exec_d1, sizeof(int) * (9 * size_noab), 0, - cudaMemcpyHostToDevice, stream_id); + cudaMemcpyHostToDevice, stream_id.first); cudaMemcpyToSymbolAsync(const_d2_exec, host_exec_d2, sizeof(int) * (9 * size_nvab), 0, - cudaMemcpyHostToDevice, stream_id); + cudaMemcpyHostToDevice, stream_id.first); - CUDA_SAFE(cudaEventRecord(*done_copy, stream_id)); + CUDA_SAFE(cudaEventRecord(*done_copy, stream_id.first)); // printf ("[new] s1: %d,%d,%d/%d,%d,%d/%d,%d,%d\n", host_exec_s1[0], host_exec_s1[1], // host_exec_s1[2], host_exec_s1[3], host_exec_s1[4], host_exec_s1[5], host_exec_s1[6], @@ -2637,7 +2637,7 @@ void ccsd_t_fully_fused_nvidia_tc_fp64(gpuStream_t& stream_id, size_t numBlks, s // T host_energies_zero[2] = {0.0, 0.0}; // cudaMemcpyAsync(dev_energies, host_energies_zero, sizeof(T) * 2, 
cudaMemcpyHostToDevice, - // stream_id); + // stream_id.first); // // cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); @@ -2647,7 +2647,7 @@ void ccsd_t_fully_fused_nvidia_tc_fp64(gpuStream_t& stream_id, size_t numBlks, s // CUCHK(cudaFuncSetAttribute(fused_kernel_d2, cudaFuncAttributeMaxDynamicSharedMemorySize, // maxbytes)); fully_fused_kernel_ccsd_t_nvidia_tc_fp64 - <<>>( + <<>>( (int) size_noab, (int) size_nvab, // (int) size_max_dim_s1_t1, (int) size_max_dim_s1_v2, (int) size_max_dim_d1_t2, diff --git a/exachem/cc/ccsd_t/ccsd_t_all_fused_nontcCuda_Hip_Sycl.cpp b/exachem/cc/ccsd_t/ccsd_t_all_fused_nontcCuda_Hip_Sycl.cpp index f9f3d99..637d3c2 100644 --- a/exachem/cc/ccsd_t/ccsd_t_all_fused_nontcCuda_Hip_Sycl.cpp +++ b/exachem/cc/ccsd_t/ccsd_t_all_fused_nontcCuda_Hip_Sycl.cpp @@ -188,7 +188,9 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( int base_size_h7b, base_size_p7b; +#pragma unroll 4 for(int i = 0; i < 4; i++) +#pragma unroll 4 for(int j = 0; j < 4; j++) { reg_tile[i][j] = 0.0; reg_singles[i][j] = 0.0; @@ -327,6 +329,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_b[ll][idx_h3 + (idx_h2) *FUSION_SIZE_SLICE_2_H3 + 32]; temp_bv[3] = sm_b[ll][idx_h3 + (idx_h2) *FUSION_SIZE_SLICE_2_H3 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_a[ll][idx_p6 + (idx_h1) *FUSION_SIZE_SLICE_1_P6 + (xx * 16)]; @@ -395,6 +398,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_b[ll][idx_h3 + (idx_h1) *FUSION_SIZE_SLICE_2_H3 + 32]; temp_bv[3] = sm_b[ll][idx_h3 + (idx_h1) *FUSION_SIZE_SLICE_2_H3 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_a[ll][idx_p6 + (idx_h2) *FUSION_SIZE_SLICE_2_P4 + (xx * 16)]; @@ -462,6 +466,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_b[ll][idx_h2 + (idx_h1) *FUSION_SIZE_SLICE_2_H2 + 32]; temp_bv[3] = sm_b[ll][idx_h2 + (idx_h1) *FUSION_SIZE_SLICE_2_H2 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_a[ll][idx_p6 + (idx_h3) *FUSION_SIZE_SLICE_2_P4 + (xx * 16)]; @@ -604,6 +609,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_a[ll][idx_h1 + (idx_h2) *FUSION_SIZE_SLICE_2_H1 + 32]; temp_bv[3] = sm_a[ll][idx_h1 + (idx_h2) *FUSION_SIZE_SLICE_2_H1 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_b[ll][idx_h3 + (idx_p6) *FUSION_SIZE_SLICE_2_H3 + (xx * 16)]; @@ -678,6 +684,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_a[ll][idx_h2 + (idx_h3) *FUSION_SIZE_SLICE_2_H2 + 32]; temp_bv[3] = sm_a[ll][idx_h2 + (idx_h3) *FUSION_SIZE_SLICE_2_H2 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_b[ll][idx_h1 + (idx_p6) *FUSION_SIZE_SLICE_2_H1 + (xx * 16)]; @@ -752,6 +759,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_a[ll][idx_h1 + (idx_h3) *FUSION_SIZE_SLICE_2_H1 + 32]; temp_bv[3] = sm_a[ll][idx_h1 + (idx_h3) *FUSION_SIZE_SLICE_2_H1 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_b[ll][idx_h2 + (idx_p6) *FUSION_SIZE_SLICE_2_H2 + (xx * 16)]; @@ -1106,6 +1114,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_b[ll][idx_h3 + (idx_h2) *FUSION_SIZE_SLICE_1_H3 + 32]; temp_bv[3] = sm_b[ll][idx_h3 + (idx_h2) *FUSION_SIZE_SLICE_1_H3 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_a[ll][idx_p6 + (idx_h1) *FUSION_SIZE_SLICE_1_P6 + (xx * 16)]; @@ -1175,6 +1184,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_b[ll][idx_h3 + (idx_h1) 
*FUSION_SIZE_SLICE_1_H3 + 32]; temp_bv[3] = sm_b[ll][idx_h3 + (idx_h1) *FUSION_SIZE_SLICE_1_H3 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_a[ll][idx_p6 + (idx_h2) *FUSION_SIZE_SLICE_1_P6 + (xx * 16)]; @@ -1244,6 +1254,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_b[ll][idx_h2 + (idx_h1) *FUSION_SIZE_SLICE_1_H2 + 32]; temp_bv[3] = sm_b[ll][idx_h2 + (idx_h1) *FUSION_SIZE_SLICE_1_H2 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_a[ll][idx_p6 + (idx_h3) *FUSION_SIZE_SLICE_1_P6 + (xx * 16)]; @@ -1312,6 +1323,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_a[ll][idx_p6 + (idx_h1) *FUSION_SIZE_SLICE_1_P6 + 32]; temp_bv[3] = sm_a[ll][idx_p6 + (idx_h1) *FUSION_SIZE_SLICE_1_P6 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_b[ll][idx_h3 + (idx_h2) *FUSION_SIZE_SLICE_1_H3 + (xx * 16)]; @@ -1380,6 +1392,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_a[ll][idx_p6 + (idx_h2) *FUSION_SIZE_SLICE_1_P6 + 32]; temp_bv[3] = sm_a[ll][idx_p6 + (idx_h2) *FUSION_SIZE_SLICE_1_P6 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_b[ll][idx_h3 + (idx_h1) *FUSION_SIZE_SLICE_1_H3 + (xx * 16)]; @@ -1448,6 +1461,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_a[ll][idx_p6 + (idx_h3) *FUSION_SIZE_SLICE_1_P6 + 32]; temp_bv[3] = sm_a[ll][idx_p6 + (idx_h3) *FUSION_SIZE_SLICE_1_P6 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_b[ll][idx_h2 + (idx_h1) *FUSION_SIZE_SLICE_1_H2 + (xx * 16)]; @@ -1589,6 +1603,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_a[ll][idx_h1 + (idx_h2) *FUSION_SIZE_SLICE_1_H1 + 32]; temp_bv[3] = sm_a[ll][idx_h1 + (idx_h2) *FUSION_SIZE_SLICE_1_H1 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_b[ll][idx_h3 + (idx_p6) *FUSION_SIZE_SLICE_1_H3 + (xx * 16)]; @@ -1658,6 +1673,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_a[ll][idx_h2 + (idx_h3) *FUSION_SIZE_SLICE_1_H2 + 32]; temp_bv[3] = sm_a[ll][idx_h2 + (idx_h3) *FUSION_SIZE_SLICE_1_H2 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_b[ll][idx_h1 + (idx_p6) *FUSION_SIZE_SLICE_1_H1 + (xx * 16)]; @@ -1727,6 +1743,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_a[ll][idx_h1 + (idx_h3) *FUSION_SIZE_SLICE_1_H1 + 32]; temp_bv[3] = sm_a[ll][idx_h1 + (idx_h3) *FUSION_SIZE_SLICE_1_H1 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_b[ll][idx_h2 + (idx_p6) *FUSION_SIZE_SLICE_1_H2 + (xx * 16)]; @@ -1796,6 +1813,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_b[ll][idx_h3 + (idx_p6) *FUSION_SIZE_SLICE_1_H1 + 32]; temp_bv[3] = sm_b[ll][idx_h3 + (idx_p6) *FUSION_SIZE_SLICE_1_H1 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_a[ll][idx_h1 + (idx_h2) *FUSION_SIZE_SLICE_1_H1 + (xx * 16)]; @@ -1865,6 +1883,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_b[ll][idx_h1 + (idx_p6) *FUSION_SIZE_SLICE_1_H1 + 32]; temp_bv[3] = sm_b[ll][idx_h1 + (idx_p6) *FUSION_SIZE_SLICE_1_H1 + 48]; +#pragma unroll 4 for(int xx = 0; xx < 4; xx++) { temp_av = sm_a[ll][idx_h2 + (idx_h3) *FUSION_SIZE_SLICE_1_H2 + (xx * 16)]; @@ -1934,6 +1953,7 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( temp_bv[2] = sm_b[ll][idx_h2 + (idx_p6) *FUSION_SIZE_SLICE_1_H2 + 32]; temp_bv[3] = sm_b[ll][idx_h2 + (idx_p6) *FUSION_SIZE_SLICE_1_H2 + 48]; +#pragma unroll 4 
for(int xx = 0; xx < 4; xx++) // 4 -> rng_p4: Local Transactions... { temp_av = sm_a[ll][idx_h1 + (idx_h3) *FUSION_SIZE_SLICE_1_H1 + (xx * 16)]; @@ -2699,7 +2719,9 @@ __global__ void revised_jk_ccsd_t_fully_fused_kernel( // if(idx_h3 < energy_rng_h3 && idx_h2 < energy_rng_h2 && idx_p6 < energy_rng_p6 && idx_h1 < energy_rng_h1) { +#pragma unroll 4 for(int i = 0; i < FUSION_SIZE_SLICE_1_P5; i++) { +#pragma unroll 4 for(int j = 0; j < FUSION_SIZE_SLICE_1_P4; j++) { if(i < energy_rng_p5 && j < energy_rng_p4) { // @@ -2798,28 +2820,28 @@ void fully_fused_ccsd_t_gpu(gpuStream_t& stream, size_t num_blocks, size_t base_ T* partial_energies, gpuEvent_t* done_copy) { #ifdef USE_CUDA cudaMemcpyToSymbolAsync(const_df_s1_size, host_s1_size, sizeof(int) * (6), 0, - cudaMemcpyHostToDevice, stream); + cudaMemcpyHostToDevice, stream.first); cudaMemcpyToSymbolAsync(const_df_s1_exec, host_s1_exec, sizeof(int) * (9), 0, - cudaMemcpyHostToDevice, stream); + cudaMemcpyHostToDevice, stream.first); cudaMemcpyToSymbolAsync(const_df_d1_size, host_d1_size, sizeof(int) * (7 * size_noab), 0, - cudaMemcpyHostToDevice, stream); + cudaMemcpyHostToDevice, stream.first); cudaMemcpyToSymbolAsync(const_df_d1_exec, host_d1_exec, sizeof(int) * (9 * size_noab), 0, - cudaMemcpyHostToDevice, stream); + cudaMemcpyHostToDevice, stream.first); cudaMemcpyToSymbolAsync(const_df_d2_size, host_d2_size, sizeof(int) * (7 * size_nvab), 0, - cudaMemcpyHostToDevice, stream); + cudaMemcpyHostToDevice, stream.first); cudaMemcpyToSymbolAsync(const_df_d2_exec, host_d2_exec, sizeof(int) * (9 * size_nvab), 0, - cudaMemcpyHostToDevice, stream); + cudaMemcpyHostToDevice, stream.first); - CUDA_SAFE(cudaEventRecord(*done_copy, stream)); + CUDA_SAFE(cudaEventRecord(*done_copy, stream.first)); // Depends on # of Fused Kernel dim3 gridsize_1(num_blocks); dim3 blocksize_1(FUSION_SIZE_TB_1_X, FUSION_SIZE_TB_1_Y); // to call the fused kernel for singles, doubles and energies. 
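Note on the stream-argument changes in this file: every native launch and async copy now receives stream.first rather than the gpuStream_t wrapper itself, which indicates that the TAMM stream type now bundles the native cudaStream_t / hipStream_t / sycl::queue as its .first member. The sketch below is illustrative only, written under that assumption; copy_to_device_on_pool_stream is a hypothetical helper, not code from this patch, and it presumes a USE_CUDA build where ccsd_t_common.hpp supplies CUDA_SAFE and the CUDA runtime headers.

// Illustrative sketch only (not part of the patch): shows the stream.first idiom
// used throughout fully_fused_ccsd_t_gpu and ccsd_t_fully_fused_nvidia_tc_fp64.
#include "ccsd_t_common.hpp" // tamm::gpuStream_t, tamm::GPUStreamPool, CUDA_SAFE (assumed)
#include <cstddef>

template <typename T>
void copy_to_device_on_pool_stream(T* dev_dst, const T* host_src, std::size_t nelem) {
#if defined(USE_CUDA)
  // Round-robin stream from the TAMM pool, as in ccsd_t_fully_fused_none_df_none_task.
  tamm::gpuStream_t& stream = tamm::GPUStreamPool::getInstance().getStream();
  // Native CUDA APIs take the unwrapped handle, i.e. stream.first rather than stream.
  CUDA_SAFE(cudaMemcpyAsync(dev_dst, host_src, nelem * sizeof(T),
                            cudaMemcpyHostToDevice, stream.first));
  CUDA_SAFE(cudaStreamSynchronize(stream.first));
#endif
}
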
- revised_jk_ccsd_t_fully_fused_kernel<<>>( + revised_jk_ccsd_t_fully_fused_kernel<<>>( (int) size_noab, (int) size_nvab, (int) size_max_dim_s1_t1, (int) size_max_dim_s1_v2, (int) size_max_dim_d1_t2, (int) size_max_dim_d1_v2, (int) size_max_dim_d2_t2, (int) size_max_dim_d2_v2, df_dev_d1_t2_all, df_dev_d1_v2_all, df_dev_d2_t2_all, @@ -2833,19 +2855,23 @@ void fully_fused_ccsd_t_gpu(gpuStream_t& stream, size_t num_blocks, size_t base_ #elif defined(USE_HIP) HIP_SAFE(hipMemcpyToSymbolAsync(HIP_SYMBOL(const_df_s1_size), host_s1_size, sizeof(int) * (6), 0, - hipMemcpyHostToDevice, stream)); + hipMemcpyHostToDevice, stream.first)); HIP_SAFE(hipMemcpyToSymbolAsync(HIP_SYMBOL(const_df_s1_exec), host_s1_exec, sizeof(int) * (9), 0, - hipMemcpyHostToDevice, stream)); + hipMemcpyHostToDevice, stream.first)); HIP_SAFE(hipMemcpyToSymbolAsync(HIP_SYMBOL(const_df_d1_size), host_d1_size, - sizeof(int) * (7 * size_noab), 0, hipMemcpyHostToDevice, stream)); + sizeof(int) * (7 * size_noab), 0, hipMemcpyHostToDevice, + stream.first)); HIP_SAFE(hipMemcpyToSymbolAsync(HIP_SYMBOL(const_df_d1_exec), host_d1_exec, - sizeof(int) * (9 * size_noab), 0, hipMemcpyHostToDevice, stream)); + sizeof(int) * (9 * size_noab), 0, hipMemcpyHostToDevice, + stream.first)); HIP_SAFE(hipMemcpyToSymbolAsync(HIP_SYMBOL(const_df_d2_size), host_d2_size, - sizeof(int) * (7 * size_nvab), 0, hipMemcpyHostToDevice, stream)); + sizeof(int) * (7 * size_nvab), 0, hipMemcpyHostToDevice, + stream.first)); HIP_SAFE(hipMemcpyToSymbolAsync(HIP_SYMBOL(const_df_d2_exec), host_d2_exec, - sizeof(int) * (9 * size_nvab), 0, hipMemcpyHostToDevice, stream)); + sizeof(int) * (9 * size_nvab), 0, hipMemcpyHostToDevice, + stream.first)); - HIP_SAFE(hipEventRecord(*done_copy, stream)); + HIP_SAFE(hipEventRecord(*done_copy, stream.first)); // Depends on # of Fused Kernel dim3 gridsize_1(num_blocks); @@ -2854,12 +2880,12 @@ void fully_fused_ccsd_t_gpu(gpuStream_t& stream, size_t num_blocks, size_t base_ // to call the fused kernel for singles, doubles and energies. 
hipLaunchKernelGGL( HIP_KERNEL_NAME(revised_jk_ccsd_t_fully_fused_kernel), dim3(gridsize_1), dim3(blocksize_1), - 0, stream, (int) size_noab, (int) size_nvab, (int) size_max_dim_s1_t1, (int) size_max_dim_s1_v2, - (int) size_max_dim_d1_t2, (int) size_max_dim_d1_v2, (int) size_max_dim_d2_t2, - (int) size_max_dim_d2_v2, df_dev_d1_t2_all, df_dev_d1_v2_all, df_dev_d2_t2_all, - df_dev_d2_v2_all, df_dev_s1_t1_all, df_dev_s1_v2_all, dev_evl_sorted_h1b, dev_evl_sorted_h2b, - dev_evl_sorted_h3b, dev_evl_sorted_p4b, dev_evl_sorted_p5b, dev_evl_sorted_p6b, - partial_energies, CEIL(base_size_h3b, FUSION_SIZE_SLICE_1_H3), + 0, stream.first, (int) size_noab, (int) size_nvab, (int) size_max_dim_s1_t1, + (int) size_max_dim_s1_v2, (int) size_max_dim_d1_t2, (int) size_max_dim_d1_v2, + (int) size_max_dim_d2_t2, (int) size_max_dim_d2_v2, df_dev_d1_t2_all, df_dev_d1_v2_all, + df_dev_d2_t2_all, df_dev_d2_v2_all, df_dev_s1_t1_all, df_dev_s1_v2_all, dev_evl_sorted_h1b, + dev_evl_sorted_h2b, dev_evl_sorted_h3b, dev_evl_sorted_p4b, dev_evl_sorted_p5b, + dev_evl_sorted_p6b, partial_energies, CEIL(base_size_h3b, FUSION_SIZE_SLICE_1_H3), CEIL(base_size_h2b, FUSION_SIZE_SLICE_1_H2), CEIL(base_size_h1b, FUSION_SIZE_SLICE_1_H1), CEIL(base_size_p6b, FUSION_SIZE_SLICE_1_P6), CEIL(base_size_p5b, FUSION_SIZE_SLICE_1_P5), CEIL(base_size_p4b, FUSION_SIZE_SLICE_1_P4), (int) base_size_h1b, (int) base_size_h2b, @@ -2870,7 +2896,7 @@ void fully_fused_ccsd_t_gpu(gpuStream_t& stream, size_t num_blocks, size_t base_ sycl::range<2> blocksize(FUSION_SIZE_TB_1_Y, FUSION_SIZE_TB_1_X); auto global_range = gridsize * blocksize; - stream.parallel_for(sycl::nd_range<2>(global_range, blocksize), [=](auto item) { + stream.first.parallel_for(sycl::nd_range<2>(global_range, blocksize), [=](auto item) { revised_jk_ccsd_t_fully_fused_kernel( size_noab, size_nvab, size_max_dim_s1_t1, size_max_dim_s1_v2, size_max_dim_d1_t2, size_max_dim_d1_v2, size_max_dim_d2_t2, size_max_dim_d2_v2, df_dev_d1_t2_all, diff --git a/exachem/cc/ccsd_t/ccsd_t_common.hpp b/exachem/cc/ccsd_t/ccsd_t_common.hpp index 796e6de..aef84c2 100644 --- a/exachem/cc/ccsd_t/ccsd_t_common.hpp +++ b/exachem/cc/ccsd_t/ccsd_t_common.hpp @@ -14,7 +14,7 @@ #include #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) -#include "tamm/gpu_memory_pool.hpp" +#include "tamm/gpu_streams.hpp" using tamm::gpuEvent_t; using tamm::gpuStream_t; #endif @@ -65,9 +65,13 @@ void initMemModule(); std::string check_memory_req(const int cc_t_ts, const int nbf); void* getGpuMem(size_t bytes); +void* getPinnedMem(size_t bytes); void* getHostMem(size_t bytes); +void* getPinnedMem(size_t bytes); void freeHostMem(void* p); +void freePinnedMem(void* p); void freeGpuMem(void* p); +void freePinnedMem(void* p); void finalizeMemModule(); diff --git a/exachem/cc/ccsd_t/ccsd_t_fused_driver.hpp b/exachem/cc/ccsd_t/ccsd_t_fused_driver.hpp index 3f0807d..1293df3 100644 --- a/exachem/cc/ccsd_t/ccsd_t_fused_driver.hpp +++ b/exachem/cc/ccsd_t/ccsd_t_fused_driver.hpp @@ -166,47 +166,54 @@ std::tuple ccsd_t_fused_driver_new( size_t size_T_d2_t2 = max_d2_kernels_pertask * (max_pdim * max_pdim) * (max_hdim * max_hdim); size_t size_T_d2_v2 = max_d2_kernels_pertask * (max_pdim * max_pdim * max_pdim) * (max_hdim); - T* df_host_pinned_s1_t1 = (T*) getHostMem(sizeof(T) * size_T_s1_t1); - T* df_host_pinned_s1_v2 = (T*) getHostMem(sizeof(T) * size_T_s1_v2); - T* df_host_pinned_d1_t2 = (T*) getHostMem(sizeof(T) * size_T_d1_t2); - T* df_host_pinned_d1_v2 = (T*) getHostMem(sizeof(T) * size_T_d1_v2); - T* df_host_pinned_d2_t2 = 
(T*) getHostMem(sizeof(T) * size_T_d2_t2); - T* df_host_pinned_d2_v2 = (T*) getHostMem(sizeof(T) * size_T_d2_v2); + T* df_host_pinned_s1_t1{nullptr}; + T* df_host_pinned_s1_v2{nullptr}; + T* df_host_pinned_d1_t2{nullptr}; + T* df_host_pinned_d1_v2{nullptr}; + T* df_host_pinned_d2_t2{nullptr}; + T* df_host_pinned_d2_v2{nullptr}; + + int* df_simple_s1_size = static_cast(getHostMem(sizeof(int) * (6))); + int* df_simple_s1_exec = static_cast(getHostMem(sizeof(int) * (9))); + int* df_simple_d1_size = static_cast(getHostMem(sizeof(int) * (7 * noab))); + int* df_simple_d1_exec = static_cast(getHostMem(sizeof(int) * (9 * noab))); + int* df_simple_d2_size = static_cast(getHostMem(sizeof(int) * (7 * nvab))); + int* df_simple_d2_exec = static_cast(getHostMem(sizeof(int) * (9 * nvab))); + + int* host_d1_size = static_cast(getHostMem(sizeof(int) * (noab))); + int* host_d2_size = static_cast(getHostMem(sizeof(int) * (nvab))); - // - int* df_simple_s1_size = (int*) getHostMem(sizeof(int) * (6)); - int* df_simple_s1_exec = (int*) getHostMem(sizeof(int) * (9)); - - int* host_d1_size = (int*) getHostMem(sizeof(int) * (noab)); - int* df_simple_d1_size = (int*) getHostMem(sizeof(int) * (7 * noab)); - int* df_simple_d1_exec = (int*) getHostMem(sizeof(int) * (9 * noab)); - - int* host_d2_size = (int*) getHostMem(sizeof(int) * (nvab)); - int* df_simple_d2_size = (int*) getHostMem(sizeof(int) * (7 * nvab)); - int* df_simple_d2_exec = (int*) getHostMem(sizeof(int) * (9 * nvab)); #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) - // get GPU memory handle from pool - auto& memPool = tamm::GPUPooledStorageManager::getInstance(); - - T* df_dev_s1_t1_all = static_cast(memPool.allocate(sizeof(T) * size_T_s1_t1)); - T* df_dev_s1_v2_all = static_cast(memPool.allocate(sizeof(T) * size_T_s1_v2)); - T* df_dev_d1_t2_all = static_cast(memPool.allocate(sizeof(T) * size_T_d1_t2)); - T* df_dev_d1_v2_all = static_cast(memPool.allocate(sizeof(T) * size_T_d1_v2)); - T* df_dev_d2_t2_all = static_cast(memPool.allocate(sizeof(T) * size_T_d2_t2)); - T* df_dev_d2_v2_all = static_cast(memPool.allocate(sizeof(T) * size_T_d2_v2)); + T* df_dev_s1_t1_all = static_cast(getGpuMem(sizeof(T) * size_T_s1_t1)); + T* df_dev_s1_v2_all = static_cast(getGpuMem(sizeof(T) * size_T_s1_v2)); + T* df_dev_d1_t2_all = static_cast(getGpuMem(sizeof(T) * size_T_d1_t2)); + T* df_dev_d1_v2_all = static_cast(getGpuMem(sizeof(T) * size_T_d1_v2)); + T* df_dev_d2_t2_all = static_cast(getGpuMem(sizeof(T) * size_T_d2_t2)); + T* df_dev_d2_v2_all = static_cast(getGpuMem(sizeof(T) * size_T_d2_v2)); + + df_host_pinned_s1_t1 = static_cast(getPinnedMem(sizeof(T) * size_T_s1_t1)); + df_host_pinned_s1_v2 = static_cast(getPinnedMem(sizeof(T) * size_T_s1_v2)); + df_host_pinned_d1_t2 = static_cast(getPinnedMem(sizeof(T) * size_T_d1_t2)); + df_host_pinned_d1_v2 = static_cast(getPinnedMem(sizeof(T) * size_T_d1_v2)); + df_host_pinned_d2_t2 = static_cast(getPinnedMem(sizeof(T) * size_T_d2_t2)); + df_host_pinned_d2_v2 = static_cast(getPinnedMem(sizeof(T) * size_T_d2_v2)); +#else // cpu + df_host_pinned_s1_t1 = static_cast(getHostMem(sizeof(T) * size_T_s1_t1)); + df_host_pinned_s1_v2 = static_cast(getHostMem(sizeof(T) * size_T_s1_v2)); + df_host_pinned_d1_t2 = static_cast(getHostMem(sizeof(T) * size_T_d1_t2)); + df_host_pinned_d1_v2 = static_cast(getHostMem(sizeof(T) * size_T_d1_v2)); + df_host_pinned_d2_t2 = static_cast(getHostMem(sizeof(T) * size_T_d2_t2)); + df_host_pinned_d2_v2 = static_cast(getHostMem(sizeof(T) * size_T_d2_v2)); #endif - // size_t max_num_blocks = 
sys_data.options_map.ccsd_options.ccsdt_tilesize; max_num_blocks = std::ceil((max_num_blocks + 4 - 1) / 4.0); - T* df_host_energies = (T*) getHostMem(sizeof(T) * std::pow(max_num_blocks, 6) * 2); + T* df_host_energies = static_cast(getHostMem(sizeof(T) * std::pow(max_num_blocks, 6) * 2)); #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) - T* df_dev_energies = - static_cast(memPool.allocate(sizeof(T) * std::pow(max_num_blocks, 6) * 2)); + T* df_dev_energies = static_cast(getGpuMem(sizeof(T) * std::pow(max_num_blocks, 6) * 2)); #endif - // int num_task = 0; if(!seq_h3b) { if(rank == 0) { @@ -406,20 +413,10 @@ std::tuple ccsd_t_fused_driver_new( #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) gpuDeviceSynchronize(); #endif - // + energy1 = energy_l[0]; energy2 = energy_l[1]; - // - // free shared host mem. - // - freeHostMem(df_host_pinned_s1_t1); - freeHostMem(df_host_pinned_s1_v2); - freeHostMem(df_host_pinned_d1_t2); - freeHostMem(df_host_pinned_d1_v2); - freeHostMem(df_host_pinned_d2_t2); - freeHostMem(df_host_pinned_d2_v2); - freeHostMem(df_simple_s1_exec); freeHostMem(df_simple_s1_size); freeHostMem(df_simple_d1_exec); @@ -428,22 +425,33 @@ std::tuple ccsd_t_fused_driver_new( freeHostMem(df_simple_d2_exec); freeHostMem(df_simple_d2_size); freeHostMem(host_d2_size); - freeHostMem(df_host_energies); #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) - memPool.deallocate(static_cast(df_dev_s1_t1_all), sizeof(T) * size_T_s1_t1); - memPool.deallocate(static_cast(df_dev_s1_v2_all), sizeof(T) * size_T_s1_v2); - memPool.deallocate(static_cast(df_dev_d1_t2_all), sizeof(T) * size_T_d1_t2); - memPool.deallocate(static_cast(df_dev_d1_v2_all), sizeof(T) * size_T_d1_v2); - memPool.deallocate(static_cast(df_dev_d2_t2_all), sizeof(T) * size_T_d2_t2); - memPool.deallocate(static_cast(df_dev_d2_v2_all), sizeof(T) * size_T_d2_v2); - - memPool.deallocate(static_cast(df_dev_energies), - sizeof(T) * std::pow(max_num_blocks, 6) * 2); + freeGpuMem(df_dev_s1_t1_all); + freeGpuMem(df_dev_s1_v2_all); + freeGpuMem(df_dev_d1_t2_all); + freeGpuMem(df_dev_d1_v2_all); + freeGpuMem(df_dev_d2_t2_all); + freeGpuMem(df_dev_d2_v2_all); + freeGpuMem(df_dev_energies); + + freePinnedMem(df_host_pinned_s1_t1); + freePinnedMem(df_host_pinned_s1_v2); + freePinnedMem(df_host_pinned_d1_t2); + freePinnedMem(df_host_pinned_d1_v2); + freePinnedMem(df_host_pinned_d2_t2); + freePinnedMem(df_host_pinned_d2_v2); + +#else // cpu +freeHostMem(df_host_pinned_s1_t1); +freeHostMem(df_host_pinned_s1_v2); +freeHostMem(df_host_pinned_d1_t2); +freeHostMem(df_host_pinned_d1_v2); +freeHostMem(df_host_pinned_d2_t2); +freeHostMem(df_host_pinned_d2_v2); #endif - // finalizememmodule(); auto cc_t2 = std::chrono::high_resolution_clock::now(); diff --git a/exachem/cc/ccsd_t/memory.cpp b/exachem/cc/ccsd_t/memory.cpp index d47bfa1..6bc554d 100644 --- a/exachem/cc/ccsd_t/memory.cpp +++ b/exachem/cc/ccsd_t/memory.cpp @@ -13,12 +13,6 @@ #include using namespace std; -// #define NO_OPT - -// extern "C" { - -// static int is_init=0; - static map> free_list_gpu, free_list_host; static map live_ptrs_gpu, live_ptrs_host; @@ -31,9 +25,7 @@ static void clearGpuFreeList() { #elif defined(USE_HIP) HIP_SAFE(hipFree(*it2)); #elif defined(USE_DPCPP) - auto& pool = tamm::GPUStreamPool::getInstance(); - gpuStream_t& stream = pool.getStream(); - sycl::free(*it2, stream); + sycl::free(*it2, tamm::GPUStreamPool::getInstance().getStream().first); #endif } } @@ -49,9 +41,7 @@ static void clearHostFreeList() { #elif defined(USE_HIP) 
    HIP_SAFE(hipHostFree(*it2));
 #elif defined(USE_DPCPP)
-    auto& pool = tamm::GPUStreamPool::getInstance();
-    gpuStream_t& stream = pool.getStream();
-    sycl::free(*it2, stream);
+    sycl::free(*it2, tamm::GPUStreamPool::getInstance().getStream().first);
 #else
    free(*it2);
 #endif
@@ -70,9 +60,7 @@ static void* moreDeviceMem(size_t bytes) {
 #elif defined(USE_HIP)
   HIP_SAFE(hipMalloc(&ptr, bytes));
 #elif defined(USE_DPCPP)
-  auto& pool = tamm::GPUStreamPool::getInstance();
-  gpuStream_t& stream = pool.getStream();
-  ptr = sycl::malloc_device(bytes, stream);
+  ptr = sycl::malloc_device(bytes, tamm::GPUStreamPool::getInstance().getStream().first);
 #endif
   assert(ptr != nullptr); /*We hopefully have a pointer*/
@@ -81,14 +69,13 @@ static void* moreHostMem(size_t bytes) {
   void* ptr = nullptr;
+
 #if defined(USE_CUDA)
   CUDA_SAFE(cudaMallocHost(&ptr, bytes));
 #elif defined(USE_HIP)
   HIP_SAFE(hipHostMalloc(&ptr, bytes));
 #elif defined(USE_DPCPP)
-  auto& pool = tamm::GPUStreamPool::getInstance();
-  gpuStream_t& stream = pool.getStream();
-  ptr = sycl::malloc_host(bytes, stream);
+  ptr = sycl::malloc_host(bytes, tamm::GPUStreamPool::getInstance().getStream().first);
 #else
   ptr = (void*) malloc(bytes);
 #endif
@@ -113,18 +100,18 @@ static inline void* resurrect_from_free_list(map<size_t, set<void*>>& free_map,
 }
 
 void* getGpuMem(size_t bytes) {
-  // assert(is_init);
   void* ptr = nullptr;
+
 #ifdef NO_OPT
+
 #if defined(USE_CUDA)
   CUDA_SAFE(cudaMalloc((void**) &ptr, bytes));
 #elif defined(USE_HIP)
   HIP_SAFE(hipMalloc((void**) &ptr, bytes));
 #elif defined(USE_DPCPP)
-  auto& pool = tamm::GPUStreamPool::getInstance();
-  gpuStream_t& stream = pool.getStream();
-  ptr = sycl::malloc_device(bytes, stream);
+  ptr = sycl::malloc_device(bytes, tamm::GPUStreamPool::getInstance().getStream().first);
 #endif
+
 #else
   if(free_list_gpu.find(bytes) != free_list_gpu.end()) {
     set<void*>& lst = free_list_gpu.find(bytes)->second;
@@ -150,29 +137,39 @@ void* getGpuMem(size_t bytes) {
   return ptr;
 }
 
-void* getHostMem(size_t bytes) {
-  // assert(is_init);
+void* getPinnedMem(size_t bytes) {
   void* ptr = nullptr;
-#ifdef NO_OPT
+
 #if defined(USE_CUDA)
   CUDA_SAFE(cudaMallocHost((void**) &ptr, bytes));
 #elif defined(USE_HIP)
-  HIP_SAFE(hipHostMalloc((void**) &ptr, bytes));
+  HIP_SAFE(hipMallocHost((void**) &ptr, bytes));
 #elif defined(USE_DPCPP)
-  auto& pool = tamm::GPUStreamPool::getInstance();
-  gpuStream_t& stream = pool.getStream();
-  ptr = sycl::malloc_host(bytes, stream);
-#else // cpu
-  ptr = (void*) malloc(bytes);
+  ptr = sycl::malloc_host(bytes, tamm::GPUStreamPool::getInstance().getStream().first);
 #endif
+  return ptr;
+}
+
+void freePinnedMem(void* ptr) {
+#if defined(USE_CUDA)
+  CUDA_SAFE(cudaFreeHost(ptr));
+#elif defined(USE_HIP)
+  HIP_SAFE(hipFreeHost(ptr));
+#elif defined(USE_DPCPP)
+  sycl::free(ptr, tamm::GPUStreamPool::getInstance().getStream().first);
+#endif
+}
+
+void* getHostMem(size_t bytes) {
+  void* ptr = nullptr;
+
+#ifdef NO_OPT
+  ptr = std::malloc(bytes);
 #else // NO_OPT
   if(free_list_host.find(bytes) != free_list_host.end()) {
     set<void*>& lst = free_list_host.find(bytes)->second;
     if(lst.size() != 0) {
       ptr = resurrect_from_free_list(free_list_host, bytes, live_ptrs_host);
-      /* ptr = *lst.begin(); */
-      /* lst.erase(lst.begin()); */
-      /* live_ptrs_host[ptr] = bytes; */
       return ptr;
     }
   }
@@ -181,10 +178,6 @@ void* getHostMem(size_t bytes) {
       ++it) {
     if(it->first >= bytes && it->second.size() > 0) {
       ptr = resurrect_from_free_list(free_list_host, it->first, live_ptrs_host);
-      /* set &lst = it->second; */
-      /* ptr = *lst.begin(); */
-      /* lst.erase(lst.begin()); */
-      /* live_ptrs_gpu[ptr] = bytes; */
       return ptr;
     }
   }
@@ -197,22 +190,10 @@ void* getHostMem(size_t bytes) {
 }
 
 void freeHostMem(void* p) {
-  size_t bytes;
-  // assert(is_init);
 #ifdef NO_OPT
-#if defined(USE_CUDA)
-  CUDA_SAFE(cudaFreeHost(p));
-#elif defined(USE_HIP)
-  HIP_SAFE(hipHostFree(p));
-#elif defined(USE_DPCPP)
-  auto& pool = tamm::GPUStreamPool::getInstance();
-  gpuStream_t& stream = pool.getStream();
-  sycl::free(p, stream);
-#else
-  free(p);
-#endif
-
-#else
+  std::free(p);
+#else // NO_OPT
+  size_t bytes;
   assert(live_ptrs_host.find(p) != live_ptrs_host.end());
   bytes = live_ptrs_host[p];
   live_ptrs_host.erase(p);
@@ -221,8 +202,6 @@ void freeHostMem(void* p) {
 }
 
 void freeGpuMem(void* p) {
-  size_t bytes;
-  // assert(is_init);
 
 #ifdef NO_OPT
 #if defined(USE_CUDA)
@@ -230,16 +209,17 @@
 #elif defined(USE_HIP)
   HIP_SAFE(hipFree(p));
 #elif defined(USE_DPCPP)
-  auto& pool = tamm::GPUStreamPool::getInstance();
-  gpuStream_t& stream = pool.getStream();
-  sycl::free(p, stream);
+  sycl::free(p, tamm::GPUStreamPool::getInstance().getStream().first);
 #endif // NO_OPT
 #else
+
+  size_t bytes;
  assert(live_ptrs_gpu.find(p) != live_ptrs_gpu.end());
  bytes = live_ptrs_gpu[p];
  live_ptrs_gpu.erase(p);
  free_list_gpu[bytes].insert(p);
+
 #endif
 }
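// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the patch): how a caller is
// expected to pair the entry points changed above. The prototypes are
// re-declared here so the sketch is self-contained; in the tree they
// presumably come from the ccsd_t headers. The helper names alloc_host_buf /
// free_host_buf are hypothetical and simply mirror the #if pattern used in
// ccsd_t_fused_driver_new: pinned host memory on GPU builds, pooled pageable
// host memory otherwise.
// ---------------------------------------------------------------------------
#include <cstddef>

void* getGpuMem(size_t bytes);    // pooled device allocation
void  freeGpuMem(void* p);        // returns the block to the device free list
void* getPinnedMem(size_t bytes); // page-locked host allocation (GPU builds only)
void  freePinnedMem(void* ptr);
void* getHostMem(size_t bytes);   // pooled pageable host allocation
void  freeHostMem(void* p);

template<typename T> T* alloc_host_buf(size_t count) {
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
  return static_cast<T*>(getPinnedMem(sizeof(T) * count)); // release with free_host_buf -> freePinnedMem
#else
  return static_cast<T*>(getHostMem(sizeof(T) * count));   // release with free_host_buf -> freeHostMem
#endif
}

template<typename T> void free_host_buf(T* p) {
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
  freePinnedMem(p);
#else
  freeHostMem(p);
#endif
}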
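// ---------------------------------------------------------------------------
// Pooling note (illustrative only, not part of the patch): with NO_OPT left
// undefined, getHostMem()/freeHostMem() above never hand memory back to the
// OS. freeHostMem() parks the block in free_list_host keyed by its size, and
// getHostMem() first tries resurrect_from_free_list() for an exact-size block,
// then for any block at least as large, before (presumably) falling back to a
// fresh allocation via moreHostMem(). The standalone sketch below reproduces
// just the exact-size reuse path; all names here are local to the sketch.
// ---------------------------------------------------------------------------
#include <cassert>
#include <cstdlib>
#include <map>
#include <set>

static std::map<std::size_t, std::set<void*>> sketch_free_list; // bytes -> idle blocks
static std::map<void*, std::size_t>           sketch_live;      // block -> bytes

void* sketch_get(std::size_t bytes) {
  auto it = sketch_free_list.find(bytes);
  if(it != sketch_free_list.end() && !it->second.empty()) {
    // Reuse a previously freed block of exactly this size.
    void* p = *it->second.begin();
    it->second.erase(it->second.begin());
    sketch_live[p] = bytes;
    return p;
  }
  // No reusable block: fall back to a fresh allocation
  // (std::malloc stands in for moreHostMem()).
  void* p        = std::malloc(bytes);
  sketch_live[p] = bytes;
  return p;
}

void sketch_put(void* p) {
  // The block goes back on the free list instead of being released, so later
  // requests of the same size are served without reallocating.
  assert(sketch_live.find(p) != sketch_live.end());
  std::size_t bytes = sketch_live[p];
  sketch_live.erase(p);
  sketch_free_list[bytes].insert(p);
}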