diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index aac84fab9..46f59b798 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -1,36 +1,137 @@ { volScalarField& he = thermo.he(); - +#if defined GPUSolverNew_ + double *h_he = dfDataBase.getFieldPointer("he", location::cpu, position::internal); + double *h_boundary_he = dfDataBase.getFieldPointer("he", location::cpu, position::boundary); + + EEqn_GPU.process(); + EEqn_GPU.sync(); + // EEqn_GPU.postProcess(h_he, h_boundary_he); + + // copy h_he to he(cpu) + // memcpy(&he[0], h_he, dfDataBase.cell_value_bytes); + + //DEBUG_TRACE; + //he.correctBoundaryConditions(); + //DEBUG_TRACE; + +#if defined DEBUG_ + fvScalarMatrix EEqn + ( + + fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he) + + fvc::ddt(rho, K) + fvc::div(phi, K) + - dpdt + == + ( + turbName == "laminar" + ? + ( + fvm::laplacian(turbulence->alpha(), he) + - diffAlphaD + + fvc::div(hDiffCorrFlux) + ) + : + ( + fvm::laplacian(turbulence->alphaEff(), he) + ) + ) + ); + // EEqn.relax(); + EEqn.solve("ha"); + // checkResult + // TODO: for temp, now we compare ldu, finally we compare csr + std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces); + std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces); + + offset = 0; + forAll(he.boundaryField(), patchi) + { + const fvPatchScalarField& patchHe = he.boundaryField()[patchi]; + int patchSize = patchHe.size(); + const double* internal_coeff_ptr = &EEqn.internalCoeffs()[patchi][0]; + const double* boundary_coeff_ptr = &EEqn.boundaryCoeffs()[patchi][0]; + if (patchHe.type() == "processor" + || patchHe.type() == "processorCyclic") { + memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double)); + memset(h_internal_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double)); + memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double)); + memset(h_boundary_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double)); + offset += patchSize * 2; + } else { + memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double)); + memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double)); + offset += patchSize; + } + } + + double *h_boundary_he_tmp = new double[dfDataBase.num_boundary_surfaces]; + offset = 0; + forAll(he.boundaryField(), patchi) + { + const fvPatchScalarField& patchHe = he.boundaryField()[patchi]; + int patchSize = patchHe.size(); + if (patchHe.type() == "processor" + || patchHe.type() == "processorCyclic") { + const scalarField& patchHeInternal = dynamic_cast&>(patchHe).patchInternalField()(); + memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double)); + memcpy(h_boundary_he_tmp + offset + patchSize, &patchHeInternal[0], patchSize * sizeof(double)); + offset += patchSize * 2; + } else { + memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double)); + offset += patchSize; + } + } + + bool printFlag = false; + int rank = -1; + if (mpi_init_flag) { + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + } + if (!mpi_init_flag || rank == 0) { + // DEBUG_TRACE; + // EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0], + // h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag); + // DEBUG_TRACE; + // EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag); + } + + delete h_boundary_he_tmp; + +#endif + +#else start1 = std::clock(); fvScalarMatrix EEqn ( - fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he) - + fvc::ddt(rho, K) + fvc::div(phi, K) - - dpdt - == + fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he) + + fvc::ddt(rho, K) + fvc::div(phi, K) + - dpdt + == + ( + turbName == "laminar" + ? + ( + fvm::laplacian(turbulence->alpha(), he) + - diffAlphaD + + fvc::div(hDiffCorrFlux) + ) + : ( - turbName == "laminar" - ? - ( - fvm::laplacian(turbulence->alpha(), he) - - diffAlphaD - + fvc::div(hDiffCorrFlux) - ) - : - ( - fvm::laplacian(turbulence->alphaEff(), he) - ) + fvm::laplacian(turbulence->alphaEff(), he) ) - ); + ) + ); end1 = std::clock(); time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - EEqn.relax(); + // EEqn.relax(); start1 = std::clock(); EEqn.solve("ha"); end1 = std::clock(); time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +#endif } diff --git a/applications/solvers/dfLowMachFoam/Make/options b/applications/solvers/dfLowMachFoam/Make/options index 67c743453..668a3133a 100644 --- a/applications/solvers/dfLowMachFoam/Make/options +++ b/applications/solvers/dfLowMachFoam/Make/options @@ -9,7 +9,6 @@ EXE_INC = -std=c++14 \ $(PFLAGS) $(PINC) \ $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \ $(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \ - $(if $(AMGX_DIR),-DGPUSolver_,) \ -I$(LIB_SRC)/transportModels/compressible/lnInclude \ -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ @@ -28,7 +27,7 @@ EXE_INC = -std=c++14 \ $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \ $(PYTHON_INC_DIR) \ $(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \ - $(if $(AMGX_DIR), -I/usr/local/cuda-11.6/include,) \ + $(if $(AMGX_DIR), -I/usr/local/cuda/include,) \ $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) EXE_LIBS = \ @@ -50,6 +49,7 @@ EXE_LIBS = \ $(if $(LIBTORCH_ROOT),-lpthread,) \ $(if $(LIBTORCH_ROOT),$(DF_SRC)/dfChemistryModel/DNNInferencer/build/libDNNInferencer.so,) \ $(if $(PYTHON_LIB_DIR),$(PYTHON_LIB_DIR),) \ - $(if $(AMGX_DIR), /usr/local/cuda-11.6/lib64/libcudart.so,) \ + $(if $(AMGX_DIR), /usr/local/cuda/lib64/libcudart.so,) \ + $(if $(AMGX_DIR), /usr/local/cuda/lib64/libnccl.so,) \ $(if $(AMGX_DIR), $(DF_ROOT)/src_gpu/build/libdfMatrix.so,) \ - $(if $(AMGX_DIR), $(AMGX_DIR)/build/libamgxsh.so,) + $(if $(AMGX_DIR), $(AMGX_DIR)/build/libamgxsh.so,) \ No newline at end of file diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index 40067eac5..a0a9689b1 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -1,24 +1,148 @@ -start1 = std::clock(); -tmp tUEqn -( - fvm::ddt(rho, U) + fvm::div(phi, U) - + turbulence->divDevRhoReff(U) -); -fvVectorMatrix& UEqn = tUEqn.ref(); - -end1 = std::clock(); -time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); -time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - -UEqn.relax(); -start1 = std::clock(); -if (pimple.momentumPredictor()) -{ - solve(UEqn == -fvc::grad(p)); +// Solve the Momentum equation +#ifdef GPUSolverNew_ + +#if defined DEBUG_ + // run CPU, for temp + TICK_START; + tmp tUEqn + ( + fvm::ddt(rho, U) + + + fvm::div(phi, U) + + + turbulence->divDevRhoReff(U) + ); + fvVectorMatrix& UEqn = tUEqn.ref(); + TICK_STOP(CPU assembly time); + + volTensorField gradU = fvc::grad(U); + + double *h_boundary_gradU = new double[dfDataBase.num_boundary_surfaces * 9]; + offset = 0; + forAll(U.boundaryField(), patchi) + { + const fvPatchTensorField& patchGradU = gradU.boundaryField()[patchi]; + int patchsize = patchGradU.size(); + if (patchGradU.type() == "processor" + || patchGradU.type() == "processorCyclic") { + // print info + if (dynamic_cast&>(patchGradU).doTransform()) { + Info << "gradU transform = true" << endl; + } else { + Info << "gradU transform = false" << endl; + } + Info << "rank = " << dynamic_cast&>(patchGradU).rank() << endl; + + memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double)); + tensorField patchGradUInternal = + dynamic_cast&>(patchGradU).patchInternalField()(); + memcpy(h_boundary_gradU + 9*offset + patchsize * 9, &patchGradUInternal[0][0], patchsize * 9 * sizeof(double)); + offset += patchsize * 2; + } else { + memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double)); + offset += patchsize; + } + } +#endif + + // process + TICK_START; + UEqn_GPU.process(); + UEqn_GPU.sync(); + TICK_STOP(GPU process time); + + // postProcess + // TICK_START; + // UEqn_GPU.postProcess(h_u); + // memcpy(&U[0][0], h_u, dfDataBase.cell_value_vec_bytes); + // U.correctBoundaryConditions(); + // K = 0.5*magSqr(U); + // DEBUG_TRACE; + // TICK_STOP(post process time); +#if defined DEBUG_ + // UEqn.relax(); + TICK_START; + solve(UEqn == -fvc::grad(p)); + K.oldTime(); K = 0.5*magSqr(U); -} -end1 = std::clock(); -time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); -time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + TICK_STOP(CPU solve time); + // checkResult + // TODO: for temp, now we compare ldu, finally we compare csr + std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); + std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); + + offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + const fvPatchVectorField& patchU = U.boundaryField()[patchi]; + int patchsize = dfDataBase.patch_size[patchi]; + const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; + const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; + memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); + memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + if (patchU.type() == "processor" || patchU.type() == "processorCyclic") offset += 2 * patchsize; + else offset += patchsize; + } + + double *h_boundary_u_tmp = new double[dfDataBase.num_boundary_surfaces * 3]; + offset = 0; + forAll(U.boundaryField(), patchi) + { + const fvPatchVectorField& patchU = U.boundaryField()[patchi]; + int patchsize = dfDataBase.patch_size[patchi]; + + if (patchU.type() == "processor" + || patchU.type() == "processorCyclic") { + memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double)); + vectorField patchUInternal = + dynamic_cast&>(patchU).patchInternalField()(); + memcpy(h_boundary_u_tmp + 3*offset + 3*patchsize, &patchUInternal[0][0], 3*patchsize * sizeof(double)); + offset += 2 * patchsize; + } else { + memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double)); + offset += patchsize; + } + } + + bool printFlag = false; + + int rank = -1; + if (mpi_init_flag) { + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + } + + if (!mpi_init_flag || rank == 0) { + // UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], + // h_internal_coeffs.data(), h_boundary_coeffs.data(), + // // &gradU[0][0], h_boundary_gradU, + // printFlag); + // UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag); + } + DEBUG_TRACE; +#endif + +#else + start1 = std::clock(); + tmp tUEqn + ( + fvm::ddt(rho, U) + fvm::div(phi, U) + + turbulence->divDevRhoReff(U) + ); + fvVectorMatrix& UEqn = tUEqn.ref(); + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + // UEqn.relax(); + start1 = std::clock(); + if (pimple.momentumPredictor()) + { + solve(UEqn == -fvc::grad(p)); + K.oldTime(); + K = 0.5*magSqr(U); + } + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +#endif diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index f2b557b81..2120408d0 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -1,27 +1,67 @@ -hDiffCorrFlux = Zero; -diffAlphaD = Zero; -sumYDiffError = Zero; +#ifdef GPUSolverNew_ +#if defined DEBUG_ + hDiffCorrFlux = Zero; + diffAlphaD = Zero; + sumYDiffError = Zero; -tmp> mvConvection -( - fv::convectionScheme::New + tmp> mvConvection ( - mesh, - fields, - phi, - mesh.divScheme("div(phi,Yi_h)") - ) -); - -start1 = std::clock(); -forAll(Y, i) -{ - sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]); -} -const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); -start1 = std::clock(); -time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); -time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + fv::convectionScheme::New + ( + mesh, + fields, + phi, + mesh.divScheme("div(phi,Yi_h)") + ) + ); + // auto& mgcs = dynamic_cast&>(mvConvection.ref()); + // tmp> tinterpScheme_ = mgcs.interpolationScheme()()(Y[0]); + // tmp tweights = tinterpScheme_().weights(Y[0]); + // const surfaceScalarField& weights = tweights(); + // Info << "CPU weights\n" << weights << endl; + + // auto& limitedScheme_ = dynamic_cast&>(tinterpScheme_()); + // Info << "CPU limiter\n" << limitedScheme_.limiter(Y[0]) << endl; + + forAll(Y, i) + { + sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]); + } + const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); +#endif +#else // should only for CPUSolver + hDiffCorrFlux = Zero; + diffAlphaD = Zero; + sumYDiffError = Zero; + + tmp> mvConvection + ( + fv::convectionScheme::New + ( + mesh, + fields, + phi, + mesh.divScheme("div(phi,Yi_h)") + ) + ); + + // auto& mgcs = dynamic_cast&>(mvConvection.ref()); + // tmp> tinterpScheme_ = mgcs.interpolationScheme()()(Y[0]); + // tmp tweights = tinterpScheme_().weights(Y[0]); + // const surfaceScalarField& weights = tweights(); + // Info << "CPU weights\n" << weights << endl; + + start1 = std::clock(); + forAll(Y, i) + { + sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]); + } + // Info << "sumYDiffError\n" << sumYDiffError << endl; + const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); + start1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +#endif //MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); label flag_mpi_init; @@ -29,6 +69,198 @@ MPI_Initialized(&flag_mpi_init); if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); { + +#ifdef GPUSolverNew_ +#if defined DEBUG_ + // run CPU + volScalarField Yt(0.0*Y[0]); + int speciesIndex = 0; + forAll(Y, i) + { + volScalarField& Yi = Y[i]; + hDiffCorrFlux += chemistry->hai(i)*(chemistry->rhoD(i)*fvc::grad(Yi) - Yi*sumYDiffError); + diffAlphaD += fvc::laplacian(thermo.alpha()*chemistry->hai(i), Yi); + if (i != inertIndex) + { + start1 = std::clock(); + tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; + + fvScalarMatrix YiEqn + ( + fvm::ddt(rho, Yi) + + + ( + turbName == "laminar" + ? (mvConvection->fvmDiv(phi, Yi) + mvConvection->fvmDiv(phiUc, Yi)) + : mvConvection->fvmDiv(phi, Yi) + ) + == + ( + splitting + ? fvm::laplacian(DEff(), Yi) + : (fvm::laplacian(DEff(), Yi) + combustion->R(Yi)) + ) + ); + + end1 = std::clock(); + time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + // YiEqn.relax(); + + start1 = std::clock(); + YiEqn.solve("Yi"); + end1 = std::clock(); + time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + + Yi.max(0.0); + Yt += Yi; + ++speciesIndex; + } + } + Y[inertIndex] = scalar(1) - Yt; + Y[inertIndex].max(0.0); + + int specie_index = 0; + + // should compute grad_yi before YiEqn.solve() + const volVectorField grad_yi = fvc::grad(Y[specie_index]); + + tmp DEff = chemistry->rhoD(specie_index) + turbulence->mut()/Sct; + fvScalarMatrix YiEqn + ( + fvm::ddt(rho, Y[specie_index]) + + mvConvection->fvmDiv(phi, Y[specie_index]) + + mvConvection->fvmDiv(phiUc, Y[specie_index]) + == + fvm::laplacian(DEff(), Y[specie_index]) + ); + // YiEqn.relax(); + // YiEqn.solve("Yi"); + // Y[specie_index].max(0.0); +#endif + + // process + YEqn_GPU.process(); + YEqn_GPU.sync(); + +#if defined DEBUG_ + std::vector h_boundary_diffAlphaD; + std::vector h_boundary_grad_yi; + std::vector h_boundary_sumYDiffError; + std::vector h_boundary_hDiffCorrFlux; + std::vector h_boundary_phiUc; + h_boundary_diffAlphaD.resize(dfDataBase.num_boundary_surfaces); + h_boundary_grad_yi.resize(dfDataBase.num_boundary_surfaces * 3); + h_boundary_sumYDiffError.resize(dfDataBase.num_boundary_surfaces * 3); + h_boundary_hDiffCorrFlux.resize(dfDataBase.num_boundary_surfaces * 3); + h_boundary_phiUc.resize(dfDataBase.num_boundary_surfaces); + offset = 0; + forAll(diffAlphaD.boundaryField(), patchi) + { + //const scalarField& patchdiffAlphaD = diffAlphaD.boundaryField()[patchi]; + const fvPatchScalarField& patchdiffAlphaD = diffAlphaD.boundaryField()[patchi]; + const fvPatchVectorField& patchgradyi = grad_yi.boundaryField()[patchi]; + const fvPatchVectorField& patchsumYDiffError = sumYDiffError.boundaryField()[patchi]; + const fvPatchVectorField& patchhDiffCorrFlux = hDiffCorrFlux.boundaryField()[patchi]; + const fvsPatchScalarField& patchphiUc = phiUc.boundaryField()[patchi]; + int patchSize = patchdiffAlphaD.size(); + if (patchdiffAlphaD.type() == "processor" + || patchdiffAlphaD.type() == "processorCyclic") { + scalarField patchdiffAlphaDInternal = dynamic_cast&>(patchdiffAlphaD).patchInternalField()(); + vectorField patchgradyiInternal = dynamic_cast&>(patchgradyi).patchInternalField()(); + vectorField patchsumYDiffErrorInternal = dynamic_cast&>(patchsumYDiffError).patchInternalField()(); + vectorField patchhDiffCorrFluxInternal = dynamic_cast&>(patchhDiffCorrFlux).patchInternalField()(); + memcpy(h_boundary_diffAlphaD.data() + offset, &patchdiffAlphaD[0], patchSize*sizeof(double)); + memcpy(h_boundary_diffAlphaD.data() + offset + patchSize, &patchdiffAlphaDInternal[0], patchSize*sizeof(double)); + memcpy(h_boundary_grad_yi.data() + offset * 3, &patchgradyi[0][0], patchSize * 3 *sizeof(double)); + memcpy(h_boundary_grad_yi.data() + (offset + patchSize) * 3, &patchgradyiInternal[0][0], patchSize * 3 *sizeof(double)); + memcpy(h_boundary_sumYDiffError.data() + offset * 3, &patchsumYDiffError[0][0], patchSize * 3 *sizeof(double)); + memcpy(h_boundary_sumYDiffError.data() + (offset + patchSize) * 3, &patchsumYDiffErrorInternal[0][0], patchSize * 3 *sizeof(double)); + memcpy(h_boundary_hDiffCorrFlux.data() + offset * 3, &patchhDiffCorrFlux[0][0], patchSize * 3 *sizeof(double)); + memcpy(h_boundary_hDiffCorrFlux.data() + (offset + patchSize) * 3, &patchhDiffCorrFluxInternal[0][0], patchSize * 3 *sizeof(double)); + memcpy(h_boundary_phiUc.data() + offset, &patchphiUc[0], patchSize*sizeof(double)); + memcpy(h_boundary_phiUc.data() + offset, &patchphiUc[0], patchSize*sizeof(double)); + offset += patchSize * 2; + } else { + memcpy(h_boundary_diffAlphaD.data() + offset, &patchdiffAlphaD[0], patchSize*sizeof(double)); + memcpy(h_boundary_grad_yi.data() + offset * 3, &patchgradyi[0][0], patchSize * 3 *sizeof(double)); + memcpy(h_boundary_sumYDiffError.data() + offset * 3, &patchsumYDiffError[0][0], patchSize * 3 *sizeof(double)); + memcpy(h_boundary_hDiffCorrFlux.data() + offset * 3, &patchhDiffCorrFlux[0][0], patchSize * 3 *sizeof(double)); + memcpy(h_boundary_phiUc.data() + offset, &patchphiUc[0], patchSize*sizeof(double)); + offset += patchSize; + } + } + DEBUG_TRACE; + // YEqn_GPU.comparediffAlphaD(&diffAlphaD[0], h_boundary_diffAlphaD.data(), false); + // YEqn_GPU.comparegradyi(&grad_yi[0][0], h_boundary_grad_yi.data(), specie_index, false); + // YEqn_GPU.comparesumYDiffError(&sumYDiffError[0][0], h_boundary_sumYDiffError.data(), false); + // YEqn_GPU.comparehDiffCorrFlux(&hDiffCorrFlux[0][0], h_boundary_hDiffCorrFlux.data(), false); + // YEqn_GPU.comparephiUc(&phiUc[0], h_boundary_phiUc.data(), false); + DEBUG_TRACE; + + // checkResult + // TODO: for temp, now we compare ldu, finally we compare csr + std::vector yeqn_h_internal_coeffs(dfDataBase.num_boundary_surfaces); + std::vector yeqn_h_boundary_coeffs(dfDataBase.num_boundary_surfaces); + + offset = 0; + forAll(Y[specie_index].boundaryField(), patchi) + { + const fvPatchScalarField& patchYi = Y[specie_index].boundaryField()[patchi]; + int patchsize = patchYi.size(); + const double* internal_coeff_ptr = &YiEqn.internalCoeffs()[patchi][0]; + const double* boundary_coeff_ptr = &YiEqn.boundaryCoeffs()[patchi][0]; + if (patchYi.type() == "processor" + || patchYi.type() == "processorCyclic") { + memcpy(yeqn_h_internal_coeffs.data() + offset, internal_coeff_ptr, patchsize * sizeof(double)); + memset(yeqn_h_internal_coeffs.data() + offset + patchsize, 0, patchsize * sizeof(double)); + memcpy(yeqn_h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchsize * sizeof(double)); + memset(yeqn_h_boundary_coeffs.data() + offset + patchsize, 0, patchsize * sizeof(double)); + offset += patchsize * 2; + } else { + memcpy(yeqn_h_internal_coeffs.data() + offset, internal_coeff_ptr, patchsize * sizeof(double)); + memcpy(yeqn_h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchsize * sizeof(double)); + offset += patchsize; + } + } + // NOTE: ldu and yi can't be compared at the same time + // to compare ldu data, you should open both DEBUG_ and DEBUG_CHECK_LDU in src_gpu + // to compare yi, you should only open DEBUG_ in src_gpu. + // Besides, if you compare ldu data, be patient to keep specie_index in YEqn.H and dfYEqn.cu the same. + //DEBUG_TRACE; + bool printFlag = false; + int rank = -1; + if (mpi_init_flag) { + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + } + if (!mpi_init_flag || rank == 0) { + // YEqn_GPU.compareResult(&YiEqn.lower()[0], &YiEqn.upper()[0], &YiEqn.diag()[0], &YiEqn.source()[0], + // yeqn_h_internal_coeffs.data(), yeqn_h_boundary_coeffs.data(), printFlag); + } + + DEBUG_TRACE; + // YEqn_GPU.compareYi(&Y[specie_index][0], specie_index, false); + // DEBUG_TRACE; +#endif + + // postProcess + double *h_y = dfDataBase.getFieldPointer("y", location::cpu, position::internal); + double *h_boundary_y = dfDataBase.getFieldPointer("y", location::cpu, position::boundary); + // YEqn_GPU.postProcess(h_y, h_boundary_y); + DEBUG_TRACE; + + // copy h_y to Y(cpu) + // offset = 0; + // forAll(Y, i) + // { + // volScalarField& Yi = Y[i]; + // memcpy(&Yi[0], h_y + offset, Yi.size() * sizeof(double)); + // offset += Yi.size(); + // Yi.correctBoundaryConditions(); + // } + DEBUG_TRACE; + + fflush(stderr); +#else if (!splitting) { std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); @@ -72,7 +304,7 @@ if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); end1 = std::clock(); time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - YiEqn.relax(); + // YiEqn.relax(); start1 = std::clock(); YiEqn.solve("Yi"); @@ -88,4 +320,5 @@ if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); Y[inertIndex].max(0.0); end2 = std::clock(); time_monitor_YEqn += double(end2 - start2) / double(CLOCKS_PER_SEC); +#endif } diff --git a/applications/solvers/dfLowMachFoam/createFields.H b/applications/solvers/dfLowMachFoam/createFields.H index 15413d462..b505c63ab 100644 --- a/applications/solvers/dfLowMachFoam/createFields.H +++ b/applications/solvers/dfLowMachFoam/createFields.H @@ -10,6 +10,7 @@ fluidThermo& thermo = *pThermo; const volScalarField& psi = thermo.psi(); volScalarField& p = thermo.p(); volScalarField& T = thermo.T(); +T.correctBoundaryConditions(); volScalarField rho ( IOobject @@ -40,6 +41,9 @@ volVectorField U #include "compressibleCreatePhi.H" +U.correctBoundaryConditions(); +p.correctBoundaryConditions(); + pressureControl pressureControl(p, rho, pimple.dict(), false); mesh.setFluxRequired(p.name()); @@ -87,6 +91,10 @@ const word turbName(mesh.objectRegistry::lookupObject("turbulenceP dfChemistryModel* chemistry = combustion->chemistry(); PtrList& Y = chemistry->Y(); +forAll(Y, i) +{ + Y[i].correctBoundaryConditions(); +} const word inertSpecie(chemistry->lookup("inertSpecie")); const label inertIndex(chemistry->species()[inertSpecie]); chemistry->setEnergyName("ha"); @@ -98,6 +106,7 @@ if (combModelName != "flareFGM") chemistry->correctThermo(); Info<< "At initial time, min/max(T) = " << min(T).value() << ", " << max(T).value() << endl; } +rho = thermo.rho(); //for dpdt @@ -168,6 +177,22 @@ IOdictionary CanteraTorchProperties IOobject::NO_WRITE ) ); + +volScalarField UEqn_A +( + IOobject + ( + "A("+U.name()+')', + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar(dimensionSet(1,-3,-1,0,0,0,0), Zero), + extrapolatedCalculatedFvPatchScalarField::typeName +); + const Switch splitting = CanteraTorchProperties.lookupOrDefault("splittingStrategy", false); #ifdef USE_PYTORCH const Switch log_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H new file mode 100644 index 000000000..79455fb22 --- /dev/null +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -0,0 +1,709 @@ +#include + +ncclUniqueId nccl_id; +ncclComm_t nccl_comm; +int nRanks, myRank, localRank, mpi_init_flag = 0; + +dfMatrixDataBase dfDataBase; +dfThermo thermo_GPU(dfDataBase); +dfChemistrySolver chemistrySolver_GPU(dfDataBase); +dfRhoEqn rhoEqn_GPU(dfDataBase); +dfUEqn UEqn_GPU(dfDataBase); +dfYEqn YEqn_GPU(dfDataBase, chemistrySolver_GPU); +dfEEqn EEqn_GPU(dfDataBase, thermo_GPU); +dfpEqn pEqn_GPU(dfDataBase); + +#if defined(DEBUG_) +template +void getTypeInfo(size_t *stride, size_t *internal_size, size_t *boundary_size) { + size_t s = 1; + bool isVol = false; + if (typeid(T) == typeid(surfaceScalarField)) { + s = 1; + isVol = false; + } else if (typeid(T) == typeid(surfaceVectorField)) { + s = 3; + isVol = false; + } else if (typeid(T) == typeid(surfaceTensorField)) { + s = 9; + isVol = false; + } else if (typeid(T) == typeid(volScalarField)) { + s = 1; + isVol = true; + } else if (typeid(T) == typeid(volVectorField)) { + s = 3; + isVol = true; + } else if (typeid(T) == typeid(volTensorField)) { + s = 9; + isVol = true; + } else { + fprintf(stderr, "ERROR! Unsupported field type()!\n"); + exit(EXIT_FAILURE); + } + *stride = s; + *internal_size = (isVol ? dfDataBase.num_cells : dfDataBase.num_surfaces) * s; + *boundary_size = dfDataBase.num_boundary_surfaces * s; +} + +template +void getFieldPtr(std::queue& fieldPtrQue, T& field){ + fieldPtrQue.push(&field[0]); + forAll(field.boundaryField(), patchi){ + auto& patchField = field.boundaryFieldRef()[patchi]; + fieldPtrQue.push(&patchField[0]); + } +}; + +template +void randomInitField(T& field) { + size_t stride = 0; + size_t internal_size = 0; + size_t boundary_size = 0; + getTypeInfo(&stride, &internal_size, &boundary_size); + size_t internal_value_bytes = internal_size * sizeof(double) * stride; + std::queue fieldPtrQue; + // std::vector fieldPtrQue; + getFieldPtr(fieldPtrQue, field); + + // random init field value to (-0.5, 0.5) + // internal + double *&field_internal_ptr = fieldPtrQue.front(); fieldPtrQue.pop(); + // double *field_internal_ptr = fieldPtrQue[0]; + std::vector init_field_internal; + init_field_internal.resize(internal_size * stride); + for (size_t i = 0; i < internal_size * stride; i++) { + init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(field_internal_ptr, init_field_internal.data(), internal_value_bytes); + // boundary + int ptrIndex = 1; + forAll(field.boundaryField(), patchi) + { + auto& patchField = field.boundaryFieldRef()[patchi]; + size_t patchsize = patchField.size(); + double *&field_boundary_ptr = fieldPtrQue.front(); fieldPtrQue.pop(); + // double *field_boundary_ptr = fieldPtrQue[ptrIndex]; + // ptrIndex ++; + std::vector init_field_boundary; + init_field_boundary.resize(patchsize * stride); + for (size_t i = 0; i < patchsize * stride; i++) { + init_field_boundary[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(field_boundary_ptr, init_field_boundary.data(), patchsize * stride * sizeof(double)); + } + + field.correctBoundaryConditions(); +} +#endif + +void initNccl() { + ncclInit(PstreamGlobals::MPI_COMM_FOAM, nccl_comm, nccl_id, &nRanks, &myRank, &localRank, &mpi_init_flag); +} + +void createGPUBase(const IOdictionary& CanteraTorchProperties, fvMesh& mesh, PtrList& Y) { + // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, + // num_patches, patch_size, num_species, rdelta_t + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int num_cells = mesh.nCells(); + int num_total_cells = Foam::returnReduce(num_cells, sumOp