pyscf · henryw7 · Jan 10, 2025 · Jan 14, 2025 · Jan 15, 2025 · Jan 16, 2025
diff --git a/gpu4pyscf/gto/int3c1e_ipip.py b/gpu4pyscf/gto/int3c1e_ipip.py
diff --git a/gpu4pyscf/gto/tests/test_int1e_grids_ip.py b/gpu4pyscf/gto/tests/test_int1e_grids_ip.py
@@ -364,5 +364,5 @@ def test_int1e_grids_ip1_density_contracted(self):
         cp.testing.assert_allclose(ref_int1e_dA, test_int1e_dA, atol = integral_threshold)
 
 if __name__ == "__main__":
-    print("Full Tests for One Electron Coulomb Integrals")
+    print("Full Tests for One Electron Coulomb Integrals 1st Derivative")
     unittest.main()
diff --git a/gpu4pyscf/gto/tests/test_int1e_grids_ipip.py b/gpu4pyscf/gto/tests/test_int1e_grids_ipip.py
diff --git a/gpu4pyscf/lib/gint/CMakeLists.txt b/gpu4pyscf/lib/gint/CMakeLists.txt
@@ -26,6 +26,7 @@ add_library(gint SHARED
   nr_fill_ao_ints.cu
   nr_fill_ao_int3c1e.cu
   nr_fill_ao_int3c1e_ip.cu
+  nr_fill_ao_int3c1e_ipip.cu
   nr_fill_ao_int3c2e.cu
   nr_fill_ao_int3c2e_ip1.cu
   nr_fill_ao_int3c2e_ip2.cu

diff --git a/gpu4pyscf/lib/gint/g3c1e_ipip.cu b/gpu4pyscf/lib/gint/g3c1e_ipip.cu
diff --git a/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu b/gpu4pyscf/lib/gint/nr_fill_ao_int3c1e_ipip.cu
diff --git a/gpu4pyscf/lib/solvent/pcm.cu b/gpu4pyscf/lib/solvent/pcm.cu
@@ -78,9 +78,9 @@ static void _pcm_d_s(double *matrix_d, double *matrix_s,
 
 __global__
 static void _pcm_dD_dS(double *matrix_dd, double *matrix_ds,
-                    const double *coords, const double *norm_vec, const double *r_vdw,
-                    const double *charge_exp, const double *switch_fun,
-                    int n)
+                       const double *coords, const double *norm_vec,
+                       const double *charge_exp,
+                       int n)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     int j = blockIdx.y * blockDim.y + threadIdx.y;
@@ -130,6 +130,105 @@ static void _pcm_dD_dS(double *matrix_dd, double *matrix_ds,
     }
 }
 
+
+__global__
+static void _pcm_d2D_d2S(double *matrix_d2D, double *matrix_d2S,
+                         const double *coords, const double *norm_vec,
+                         const double *charge_exp,
+                         int n)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if (i >= n || j >= n) {
+        return;
+    }
+
+    // calculate xi
+    const double ei = charge_exp[i];
+    const double ej = charge_exp[j];
+    const double eij = ei * ej / sqrt(ei*ei + ej*ej);
+
+    // calculate r
+    const double dx = coords[3*i]   - coords[3*j];
+    const double dy = coords[3*i+1] - coords[3*j+1];
+    const double dz = coords[3*i+2] - coords[3*j+2];
+    const double rij = norm3d(dx, dy, dz);
+    const double rij_1 = 1.0 / rij;
+    const double rij_2 = rij_1 * rij_1;
+    const double rij_3 = rij_2 * rij_1;
+    const double rij_4 = rij_2 * rij_2;
+    const double rij_5 = rij_2 * rij_3;
+    const double eij2 = eij * eij;
+
+    const double eij_rij = eij * rij;
+    const double erf_eij_rij = erf(eij_rij);
+    const double exp_minus_eij2_rij2 = exp(-eij_rij * eij_rij);
+    const double two_eij_over_sqrt_pi = 2.0 * eij / SQRT_PI;
+    const double two_eij_over_sqrt_pi_exp_minus_eij2_rij2 = exp_minus_eij2_rij2 * two_eij_over_sqrt_pi;
+
+    const double S_direct_product_prefactor = -two_eij_over_sqrt_pi_exp_minus_eij2_rij2 * (3 * rij_4 + 2 * eij2 * rij_2)
+                                              + 3 * rij_5 * erf_eij_rij;
+    const double S_xyz_diagonal_prefactor = two_eij_over_sqrt_pi_exp_minus_eij2_rij2 * rij_2 - rij_3 * erf_eij_rij;
+
+    const int n2 = n * n;
+    if (i == j) {
+        matrix_d2S[i*n + j         ] = 0.0;
+        matrix_d2S[i*n + j + n2    ] = 0.0;
+        matrix_d2S[i*n + j + n2 * 2] = 0.0;
+        matrix_d2S[i*n + j + n2 * 3] = 0.0;
+        matrix_d2S[i*n + j + n2 * 4] = 0.0;
+        matrix_d2S[i*n + j + n2 * 5] = 0.0;
+        matrix_d2S[i*n + j + n2 * 6] = 0.0;
+        matrix_d2S[i*n + j + n2 * 7] = 0.0;
+        matrix_d2S[i*n + j + n2 * 8] = 0.0;
+    } else {
+        matrix_d2S[i*n + j         ] = dx * dx * S_direct_product_prefactor + S_xyz_diagonal_prefactor;
+        matrix_d2S[i*n + j + n2    ] = dx * dy * S_direct_product_prefactor;
+        matrix_d2S[i*n + j + n2 * 2] = dx * dz * S_direct_product_prefactor;
+        matrix_d2S[i*n + j + n2 * 3] = dy * dx * S_direct_product_prefactor;
+        matrix_d2S[i*n + j + n2 * 4] = dy * dy * S_direct_product_prefactor + S_xyz_diagonal_prefactor;
+        matrix_d2S[i*n + j + n2 * 5] = dy * dz * S_direct_product_prefactor;
+        matrix_d2S[i*n + j + n2 * 6] = dz * dx * S_direct_product_prefactor;
+        matrix_d2S[i*n + j + n2 * 7] = dz * dy * S_direct_product_prefactor;
+        matrix_d2S[i*n + j + n2 * 8] = dz * dz * S_direct_product_prefactor + S_xyz_diagonal_prefactor;
+    }
+
+    if (matrix_d2D != NULL) {
+        const double nxj = norm_vec[3*j];
+        const double nyj = norm_vec[3*j+1];
+        const double nzj = norm_vec[3*j+2];
+        const double nj_rij = dx * nxj + dy * nyj + dz * nzj;
+
+        const double eij4 = eij2 * eij2;
+        const double rij_6 = rij_4 * rij_2;
+        const double rij_7 = rij_4 * rij_3;
+
+        const double D_direct_product_prefactor = (-two_eij_over_sqrt_pi_exp_minus_eij2_rij2 * (15 * rij_6 + 10 * eij2 * rij_4 + 4 * eij4 * rij_2)
+                                                   + 15 * rij_7 * erf_eij_rij) * nj_rij;
+        if (i == j) {
+            matrix_d2D[i*n + j         ] = 0.0;
+            matrix_d2D[i*n + j + n2    ] = 0.0;
+            matrix_d2D[i*n + j + n2 * 2] = 0.0;
+            matrix_d2D[i*n + j + n2 * 3] = 0.0;
+            matrix_d2D[i*n + j + n2 * 4] = 0.0;
+            matrix_d2D[i*n + j + n2 * 5] = 0.0;
+            matrix_d2D[i*n + j + n2 * 6] = 0.0;
+            matrix_d2D[i*n + j + n2 * 7] = 0.0;
+            matrix_d2D[i*n + j + n2 * 8] = 0.0;
+        } else {
+            matrix_d2D[i*n + j         ] = D_direct_product_prefactor * dx * dx - S_direct_product_prefactor * (dx * nxj + dx * nxj + nj_rij);
+            matrix_d2D[i*n + j + n2    ] = D_direct_product_prefactor * dx * dy - S_direct_product_prefactor * (dy * nxj + dx * nyj);
+            matrix_d2D[i*n + j + n2 * 2] = D_direct_product_prefactor * dx * dz - S_direct_product_prefactor * (dz * nxj + dx * nzj);
+            matrix_d2D[i*n + j + n2 * 3] = D_direct_product_prefactor * dy * dx - S_direct_product_prefactor * (dx * nyj + dy * nxj);
+            matrix_d2D[i*n + j + n2 * 4] = D_direct_product_prefactor * dy * dy - S_direct_product_prefactor * (dy * nyj + dy * nyj + nj_rij);
+            matrix_d2D[i*n + j + n2 * 5] = D_direct_product_prefactor * dy * dz - S_direct_product_prefactor * (dz * nyj + dy * nzj);
+            matrix_d2D[i*n + j + n2 * 6] = D_direct_product_prefactor * dz * dx - S_direct_product_prefactor * (dx * nzj + dz * nxj);
+            matrix_d2D[i*n + j + n2 * 7] = D_direct_product_prefactor * dz * dy - S_direct_product_prefactor * (dy * nzj + dz * nyj);
+            matrix_d2D[i*n + j + n2 * 8] = D_direct_product_prefactor * dz * dz - S_direct_product_prefactor * (dz * nzj + dz * nzj + nj_rij);
+        }
+    }
+}
+
 extern "C" {
 int pcm_d_s(cudaStream_t stream, double *matrix_d, double *matrix_s,
                     const double *coords, const double *norm_vec, const double *r_vdw,
@@ -149,15 +248,32 @@ int pcm_d_s(cudaStream_t stream, double *matrix_d, double *matrix_s,
 }
 
 int pcm_dd_ds(cudaStream_t stream, double *matrix_dD, double *matrix_dS,
-                    const double *coords, const double *norm_vec, const double *r_vdw,
-                    const double *charge_exp, const double *switch_fun,
-                    int n)
+              const double *coords, const double *norm_vec,
+              const double *charge_exp,
+              int n)
 {
     int ntilex = (n + THREADS - 1) / THREADS;
     int ntiley = (n + THREADS - 1) / THREADS;
     dim3 threads(THREADS, THREADS);
     dim3 blocks(ntilex, ntiley);
-    _pcm_dD_dS<<<blocks, threads, 0, stream>>>(matrix_dD, matrix_dS, coords, norm_vec, r_vdw, charge_exp, switch_fun, n);
+    _pcm_dD_dS<<<blocks, threads, 0, stream>>>(matrix_dD, matrix_dS, coords, norm_vec, charge_exp, n);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        return 1;
+    }
+    return 0;
+}
+
+int pcm_d2d_d2s(cudaStream_t stream, double *matrix_d2D, double *matrix_d2S,
+                const double *coords, const double *norm_vec,
+                const double *charge_exp,
+                int n)
+{
+    const int ntilex = (n + THREADS - 1) / THREADS;
+    const int ntiley = (n + THREADS - 1) / THREADS;
+    const dim3 threads(THREADS, THREADS);
+    const dim3 blocks(ntilex, ntiley);
+    _pcm_d2D_d2S<<<blocks, threads, 0, stream>>>(matrix_d2D, matrix_d2S, coords, norm_vec, charge_exp, n);
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
         return 1;

diff --git a/gpu4pyscf/solvent/grad/pcm.py b/gpu4pyscf/solvent/grad/pcm.py
@@ -40,13 +40,6 @@ def grad_switch_h(x):
     dy[x>1] = 0.0
     return dy
 
-def gradgrad_switch_h(x):
-    ''' 2nd derivative of h(x) '''
-    ddy = 60.0*x - 180.0*x**2 + 120*x**3
-    ddy[x<0] = 0.0
-    ddy[x>1] = 0.0
-    return ddy
-
 def get_dF_dA(surface):
     '''
     J. Chem. Phys. 133, 244111 (2010), Appendix C
@@ -63,10 +56,9 @@ def get_dF_dA(surface):
     dF = cupy.zeros([ngrids, natom, 3])
     dA = cupy.zeros([ngrids, natom, 3])
 
-    for ia in range(atom_coords.shape[0]):
+    for ia in range(natom):
         p0,p1 = surface['gslice_by_atom'][ia]
         coords = grid_coords[p0:p1]
-        p1 = p0 + coords.shape[0]
         ri_rJ = cupy.expand_dims(coords, axis=1) - atom_coords
         riJ = cupy.linalg.norm(ri_rJ, axis=-1)
         diJ = (riJ - R_in_J) / R_sw_J
@@ -145,9 +137,7 @@ def get_dD_dS(surface, with_S=True, with_D=False, stream=None):
     '''
     charge_exp  = surface['charge_exp']
     grid_coords = surface['grid_coords']
-    switch_fun  = surface['switch_fun']
     norm_vec    = surface['norm_vec']
-    R_vdw       = surface['R_vdw']
     n = charge_exp.shape[0]
     dS = cupy.empty([3,n,n])
     dD = None
@@ -163,9 +153,7 @@ def get_dD_dS(surface, with_S=True, with_D=False, stream=None):
         dD_ptr, dS_ptr,
         ctypes.cast(grid_coords.data.ptr, ctypes.c_void_p),
         ctypes.cast(norm_vec.data.ptr, ctypes.c_void_p),
-        ctypes.cast(R_vdw.data.ptr, ctypes.c_void_p),
         ctypes.cast(charge_exp.data.ptr, ctypes.c_void_p),
-        ctypes.cast(switch_fun.data.ptr, ctypes.c_void_p),
         ctypes.c_int(n)
     )
     if err != 0:
@@ -181,7 +169,7 @@ def get_dSii(surface, dF):
     dSii = dSii_dF[:,None] * dF
     return dSii
 
-def grad_nuc(pcmobj, dm):
+def grad_nuc(pcmobj, dm, q_sym = None):
     mol = pcmobj.mol
     log = logger.new_logger(mol, mol.verbose)
     t1 = log.init_timer()
@@ -194,7 +182,8 @@ def grad_nuc(pcmobj, dm):
         pcmobj._get_vind(dm)
 
     mol = pcmobj.mol
-    q_sym        = pcmobj._intermediates['q_sym'].get()
+    if q_sym is None:
+        q_sym = pcmobj._intermediates['q_sym'].get()
     gridslice    = pcmobj.surface['gslice_by_atom']
     grid_coords  = pcmobj.surface['grid_coords'].get()
     exponents    = pcmobj.surface['charge_exp'].get()
@@ -220,7 +209,7 @@ def grad_nuc(pcmobj, dm):
     t1 = log.timer_debug1('grad nuc', *t1)
     return de
 
-def grad_qv(pcmobj, dm):
+def grad_qv(pcmobj, dm, q_sym = None):
     '''
     contributions due to integrals
     '''
@@ -237,7 +226,8 @@ def grad_qv(pcmobj, dm):
     gridslice   = pcmobj.surface['gslice_by_atom']
     charge_exp  = pcmobj.surface['charge_exp']
     grid_coords = pcmobj.surface['grid_coords']
-    q_sym       = pcmobj._intermediates['q_sym']
+    if q_sym is None:
+        q_sym = pcmobj._intermediates['q_sym']
 
     intopt = int3c1e.VHFOpt(mol)
     intopt.build(1e-14, aosym=False)
@@ -282,37 +272,37 @@ def grad_solver(pcmobj, dm):
     vK_1 = cupy.linalg.solve(K.T, v_grids)
     epsilon = pcmobj.eps
 
+    def contract_bra(a, B, c):
+        ''' i,xij,j->jx '''
+        tmp = a.dot(B)
+        return (tmp*c).T
+
+    def contract_ket(a, B, c):
+        ''' i,xij,j->ix '''
+        tmp = B.dot(c)
+        return (a*tmp).T
+
     de = cupy.zeros([pcmobj.mol.natm,3])
     if pcmobj.method.upper() in ['C-PCM', 'CPCM', 'COSMO']:
         dD, dS = get_dD_dS(pcmobj.surface, with_D=False, with_S=True)
 
         # dR = 0, dK = dS
-        de_dS = (vK_1 * dS.dot(q)).T                  # cupy.einsum('i,xij,j->ix', vK_1, dS, q)
+        de_dS  = 0.5 * contract_ket(vK_1, dS, q)
+        de_dS -= 0.5 * contract_bra(vK_1, dS, q)
         de -= cupy.asarray([cupy.sum(de_dS[p0:p1], axis=0) for p0,p1 in gridslice])
         dD = dS = None
 
         dF, dA = get_dF_dA(pcmobj.surface)
         dSii = get_dSii(pcmobj.surface, dF)
         de -= 0.5*contract('i,xij->jx', vK_1*q, dSii) # 0.5*cupy.einsum('i,xij,i->jx', vK_1, dSii, q)
 
-    elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SS(V)PE', 'SMD']:
+    elif pcmobj.method.upper() in ['IEF-PCM', 'IEFPCM', 'SMD']:
         dF, dA = get_dF_dA(pcmobj.surface)
         dSii = get_dSii(pcmobj.surface, dF)
         dF = None
 
         dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True)
 
-        def contract_bra(a, B, c):
-            ''' i,xij,j->jx '''
-            tmp = a.dot(B)
-            return (tmp*c).T
-
-        def contract_ket(a, B, c):
-            ''' i,xij,j->ix '''
-            tmp = B.dot(c)
-            return (a*tmp).T
-
-        # IEF-PCM and SS(V)PE formally are the same in gradient calculation
         # dR = f_eps/(2*pi) * (dD*A + D*dA),
         # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS)
         f_epsilon = (epsilon - 1.0)/(epsilon + 1.0)
@@ -352,6 +342,67 @@ def contract_ket(a, B, c):
 
         de_dK = de_dS0 - fac * (de_dD + de_dA + de_dS1)
         de += de_dR - de_dK
+    elif pcmobj.method.upper() in [ 'SS(V)PE' ]:
+        dF, dA = get_dF_dA(pcmobj.surface)
+        dSii = get_dSii(pcmobj.surface, dF)
+        dF = None
+
+        dD, dS = get_dD_dS(pcmobj.surface, with_D=True, with_S=True)
+
+        # dR = f_eps/(2*pi) * (dD*A + D*dA),
+        # dK = dS - f_eps/(2*pi) * (dD*A*S + D*dA*S + D*A*dS)
+        f_epsilon = (epsilon - 1.0)/(epsilon + 1.0)
+        fac = f_epsilon/(2.0*PI)
+
+        Av = A*v_grids
+        de_dR  = 0.5*fac * contract_ket(vK_1, dD, Av)
+        de_dR -= 0.5*fac * contract_bra(vK_1, dD, Av)
+        de_dR  = cupy.asarray([cupy.sum(de_dR[p0:p1], axis=0) for p0,p1 in gridslice])
+
+        vK_1_D = vK_1.dot(D)
+        vK_1_Dv = vK_1_D * v_grids
+        de_dR += 0.5*fac * contract('j,xjn->nx', vK_1_Dv, dA)
+
+        de_dS0  = 0.5*contract_ket(vK_1, dS, q)
+        de_dS0 -= 0.5*contract_bra(vK_1, dS, q)
+        de_dS0  = cupy.asarray([cupy.sum(de_dS0[p0:p1], axis=0) for p0,p1 in gridslice])
+
+        vK_1_q = vK_1 * q
+        de_dS0 += 0.5*contract('i,xin->nx', vK_1_q, dSii)
+
+        vK_1_DA = vK_1_D*A
+        de_dS1  = 0.5*contract_ket(vK_1_DA, dS, q)
+        de_dS1 -= 0.5*contract_bra(vK_1_DA, dS, q)
+        de_dS1  = cupy.asarray([cupy.sum(de_dS1[p0:p1], axis=0) for p0,p1 in gridslice])
+        vK_1_DAq = vK_1_DA*q
+        de_dS1 += 0.5*contract('j,xjn->nx', vK_1_DAq, dSii)
+
+        DT_q = cupy.dot(D.T, q)
+        ADT_q = A * DT_q
+        de_dS1_T  = 0.5*contract_ket(vK_1, dS, ADT_q)
+        de_dS1_T -= 0.5*contract_bra(vK_1, dS, ADT_q)
+        de_dS1_T  = cupy.asarray([cupy.sum(de_dS1_T[p0:p1], axis=0) for p0,p1 in gridslice])
+        vK_1_ADT_q = vK_1 * ADT_q
+        de_dS1_T += 0.5*contract('j,xjn->nx', vK_1_ADT_q, dSii)
+
+        Sq = cupy.dot(S,q)
+        ASq = A*Sq
+        de_dD  = 0.5*contract_ket(vK_1, dD, ASq)
+        de_dD -= 0.5*contract_bra(vK_1, dD, ASq)
+        de_dD  = cupy.asarray([cupy.sum(de_dD[p0:p1], axis=0) for p0,p1 in gridslice])
+
+        vK_1_S = cupy.dot(vK_1, S)
+        vK_1_SA = vK_1_S * A
+        de_dD_T  = 0.5*contract_ket(vK_1_SA, -dD.transpose(0,2,1), q)
+        de_dD_T -= 0.5*contract_bra(vK_1_SA, -dD.transpose(0,2,1), q)
+        de_dD_T  = cupy.asarray([cupy.sum(de_dD_T[p0:p1], axis=0) for p0,p1 in gridslice])
+
+        de_dA = 0.5*contract('j,xjn->nx', vK_1_D*Sq, dA)   # 0.5*cupy.einsum('j,xjn,j->nx', vK_1_D, dA, Sq)
+
+        de_dA_T = 0.5*contract('j,xjn->nx', vK_1_S*DT_q, dA)
+
+        de_dK = de_dS0 - 0.5 * fac * (de_dD + de_dA + de_dS1 + de_dD_T + de_dA_T + de_dS1_T)
+        de += de_dR - de_dK
     else:
         raise RuntimeError(f"Unknown implicit solvent model: {pcmobj.method}")
     t1 = log.timer_debug1('grad solver', *t1)