-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathAutoNMR.py
449 lines (342 loc) · 16.8 KB
/
AutoNMR.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
from rdkit import Chem
from rdkit.Chem import AllChem
import tempfile
import os
import re
import subprocess
import time
import sys
import pandas as pd
import numpy as np
import argparse
import matplotlib.pyplot as plt
from rdkit import Chem
def generate_conformers(smiles, num_conformers):
"""
Generate conformers for a molecule from SMILES and return them with energies.
Args:
smiles (str): SMILES string of the molecule.
num_conformers (int): Number of conformers to generate.
Returns:
tuple: (sdf_data, energies)
sdf_data (str): SDF file content containing all conformers.
energies (list): List of tuples containing (conformer_id, energy).
"""
mol = Chem.MolFromSmiles(smiles)
mol = Chem.AddHs(mol) # Add hydrogen atoms
# Generate 3D coordinates
AllChem.EmbedMolecule(mol)
# Optimize geometry using UFF
AllChem.UFFOptimizeMolecule(mol)
# Generate conformers
cids = AllChem.EmbedMultipleConfs(mol, numConfs=num_conformers)
# Optimize each conformer using UFF and calculate energies
energies = []
for cid in cids:
AllChem.UFFOptimizeMolecule(mol, confId=cid)
energy = AllChem.UFFGetMoleculeForceField(mol, confId=cid).CalcEnergy()
energies.append((cid, energy))
# Sort conformers by energy
energies.sort(key=lambda x: x[1])
# Write conformers to an SDF file in a temporary directory
with tempfile.TemporaryDirectory() as tmp_dir:
sdf_file = os.path.join(tmp_dir, 'conformers.sdf')
writer = Chem.SDWriter(sdf_file)
for cid, energy in energies:
mol.SetProp('_Name', f'Conformer_{cid}_Energy_{energy:.2f}')
writer.write(mol, confId=cid)
writer.close()
# Read the SDF file content
with open(sdf_file, 'rb') as f:
sdf_data = f.read().decode('utf-8')
return sdf_data, energies
def generate_3d_coordinates(smiles, sdf_data, xc_functional, output_file_prefix, case):
"""
Generates 3D geometry input files for NWChem based on optimized conformers.
Args:
smiles (str): The SMILES string of the molecule (for information purposes).
sdf_data (str): SDF file content containing all optimized conformers.
xc_functional (str): The exchange-correlation functional to use in the DFT calculations.
output_file_prefix (str): The prefix for the output NWChem input files.
Returns:
list: A list of generated NWChem input file paths.
"""
atomic_number_to_symbol = {
1: 'H', 2: 'He', 3: 'Li', 4: 'Be', 5: 'B',
6: 'C', 7: 'N', 8: 'O', 9: 'F', 10: 'Ne',
11: 'Na', 12: 'Mg', 13: 'Al', 14: 'Si', 15: 'P',
16: 'S', 17: 'Cl', 18: 'Ar', 19: 'K', 20: 'Ca',
# Extend this mapping as needed for other elements
}
atomic_numbers_to_include = {
1: 'Hydrogen', # Case 1: Hydrogen atoms only
6: 'Carbon' # Case 2: Carbon atoms only
}
if case not in atomic_numbers_to_include:
raise ValueError("Invalid case selection. Choose 1 for Hydrogen or 2 for Carbon.")
target_atomic_number = case
target_element = atomic_numbers_to_include[target_atomic_number]
suppl = Chem.SDMolSupplier('conformers.sdf', removeHs=False)
nwchem_input_files = []
for mol_idx, mol in enumerate(suppl):
if mol is None:
print(f"Error: Could not read molecule from SDF data.")
continue
for conf_idx in range(mol.GetNumConformers()):
conformer = mol.GetConformer(conf_idx)
nwchem_input_file = f"{output_file_prefix}_mol_{mol_idx}_conformer_{conf_idx}.nw"
nwchem_input_files.append(nwchem_input_file)
with open(nwchem_input_file, 'w') as f:
f.write(f"start\n\ntitle \"Molecule conformer {conf_idx}\"\n\nmemory total 4000 mb\n\necho\ngeometry units angstroms\n")
atom_indices = []
for atom_idx in range(mol.GetNumAtoms()):
atom = mol.GetAtomWithIdx(atom_idx)
atomic_num = atom.GetAtomicNum()
symbol = atomic_number_to_symbol.get(atomic_num, 'Xx') # 'Xx' for unknown symbols
pos = conformer.GetAtomPosition(atom_idx)
x, y, z = pos.x, pos.y, pos.z
f.write(f" {symbol:<2s} {x:>10.4f} {y:>10.4f} {z:>10.4f}\n")
# Collect indices of target atoms
if atomic_num == target_atomic_number:
atom_indices.append(atom_idx + 1) # 1-based index for NWChem
f.write(
"end\n\n\nbasis spherical\n* library 6-311G\nend\n\n"
"driver\n"
" maxiter 500\n xyz final\n"
"end\n\n"
"relativistic\n"
" zora on\n"
" zora:cutoff_NMR 1d-8\n"
" zora:cutoff 1d-30\n"
"end\n\n"
"dft\n"
" direct\n"
" grid fine\n"
f" xc {xc_functional}\n"
" mult 1\n"
" maxiter 1500\n"
' noprint "final vectors analysis" multipole\n'
"end\n\n"
"task dft optimize\n\n"
"task dft freq\n\n"
"property\n"
)
# Specify target atoms for shielding tensor calculation
if atom_indices:
f.write(f" shielding {len(atom_indices)} {' '.join(map(str, atom_indices))}\n")
f.write(
"end\n\n"
"cosmo\n"
" solvent cdcl3\n"
"end\n\n"
"task dft property\n"
)
return nwchem_input_files
def run_nwchem(commands):
"""
Executes an NWChem command using the system shell and measures its execution time.
This function:
1. Prints a message indicating that NWChem is running.
2. Records the start time of the NWChem command.
3. Executes the provided NWChem command using the system shell.
4. Records the end time of the NWChem command.
5. Calculates and prints the execution time of the NWChem command.
Args:
commands (str): The NWChem command to be executed.
Notes:
- The function uses `subprocess.run` to execute the command.
- The `shell=True` argument allows the command to be executed through the shell, which is necessary for complex commands.
- The execution time is printed to the standard error stream.
Example:
run_nwchem("mpiexec --use-hwthread-cpus -np 40 nwchem input_file.nw > output_file.out")
"""
print("Running NWChem...")
start_time = time.time() # Record start time
subprocess.run(commands, shell=True) # Execute the command
end_time = time.time() # Record end time
execution_time = end_time - start_time # Calculate execution time
print("NWChem execution time: {:.2f} seconds".format(execution_time), file=sys.stderr) # Print execution time
def run(smiles):
"""
Generates NWChem input and output file names based on a given SMILES string.
This function:
1. Checks if the given SMILES string contains any special characters.
2. If special characters are found, the SMILES string is enclosed in double quotes.
3. Generates NWChem input and output file names using the (possibly modified) SMILES string.
Args:
smiles (str): A SMILES string representing a molecule.
Returns:
tuple: A tuple containing the names of the NWChem input file and output file.
Global Variables:
output_file_prefix (str): The prefix for the NWChem input files.
output_file_name (str): The name of the NWChem output file. atomic_number_to_symbol = {
Notes:
- Special characters in file names can cause issues in shell commands. Enclosing the SMILES string in
double quotes helps prevent these issues.
- The special characters checked for are: @, _, !, #, $, %, ^, &, *, (, ), <, >, ?, /, \, |, }, {, ~, and :.
"""
# Make own character set and pass this as argument in compile method
regex = re.compile(r'[@_!#$%^&*()<>?/\|}{~:]')
# Check for special characters in the string
if regex.search(smiles) is not None:
smiles = f'"{smiles}"'
# Generate NWChem input and output file names
output_file_prefix = f"{smiles}"
output_file_name = f"{smiles}.out"
return output_file_prefix, output_file_name
def free_energy_output(output_file):
with open(output_file, 'r') as f:
content = f.read()
# Extract the final DFT energy
energy_matches = re.findall(r'Total DFT energy =\s+([-]?\d+\.\d+)', content)
if energy_matches:
e_elec = float(energy_matches[-1]) * 627.5095
# Extract thermal correction to energy
thermal_corr_match = re.search(r'Thermal correction to Energy\s+=\s+(\d+\.\d+)\s+kcal/mol', content)
if thermal_corr_match:
thermal_corr = float(thermal_corr_match.group(1))
# Extract total entropy
entropy_match = re.search(r'Total Entropy\s+=\s+(\d+\.\d+)\s+cal/mol-K', content)
if entropy_match:
entropy = float(entropy_match.group(1)) / 1000 # Convert to kcal/mol-K
# Calculate free energy
T = 298.15 # Temperature in K
G = e_elec + thermal_corr - T * entropy
return G
def compute_boltzmann_weights(relative_energies, temperature):
"""
Compute Boltzmann weights based on relative free energies.
Args:
- relative_energies (list or numpy array): List of relative free energies for each conformer.
- temperature (float): Temperature in Kelvin.
Returns:
- numpy array: Boltzmann weights for each conformer.
"""
# Normalize relative energies to prevent overflow
min_energy = np.min(relative_energies)
normalized_energies = relative_energies - min_energy
beta = 1 / (0.0083145 * temperature) # Boltzmann constant in kJ/mol/K
exp_terms = np.exp(-beta * normalized_energies)
boltz_weights = exp_terms / np.sum(exp_terms)
return boltz_weights
def apply_boltzmann_weights_to_tensors(tensors, boltz_weights):
"""
Apply Boltzmann weights to NMR shielding tensors.
Args:
- tensors (dict): Dictionary containing NMR shielding tensors for each atom.
Example: {'H1': np.array([...]), ...}
- boltz_weights (numpy array): Boltzmann weights for each conformer.
Returns:
- dict: Boltzmann-weighted average NMR shielding tensors for each atom.
"""
weighted_tensors = {}
for atom, tensor_data in tensors.items():
weighted_tensor = np.zeros(len(tensor_data[0]))
for i in range(len(boltz_weights)):
weighted_tensor += boltz_weights[i] * tensor_data[i]
weighted_tensors[atom] = weighted_tensor
return weighted_tensors
def parse_shielding_tensor_from_output(output_file):
"""
Parse isotropic shielding tensor values from NWChem output file.
Args:
output_file (str): Path to the NWChem output file.
Returns:
dict: A dictionary with combined (atom name + atom index) as keys and isotropic shielding values as values.
"""
shielding_values = {}
with open(output_file, 'r') as file:
data = file.read()
# Find all matches of atom information using regular expressions
matches = re.findall(r'Atom:\s+(\d+)\s+(\w+)\s+.*?isotropic\s+=\s+([\d.-]+)', data, re.DOTALL)
# Iterate over the matches and store the isotropic values
for match in matches:
atom_number = int(match[0])
atom_name = match[1]
isotropic_value = float(match[2])
key = f"{atom_name}{atom_number}"
shielding_values[key] = isotropic_value
return shielding_values
def write_chemical_shifts_to_excel(output_dir, output_excel):
"""
Write the boltzmann weighted average NMR shielding tensor for each conformer to an Excel sheet.
Args:
output_dir (str): Directory containing the NWChem output files.
output_excel (str): Path to the output Excel file.
Returns:
None
"""
data = []
free_energies = {}
tensors = {}
# Read NWChem output files and extract free energies and shielding values
for output_file in os.listdir(output_dir):
if output_file.endswith('.out'):
conformer_id = os.path.splitext(output_file)[0]
output_path = os.path.join(output_dir, output_file)
free_energies[conformer_id] = free_energy_output(output_path)
shielding_values = parse_shielding_tensor_from_output(output_path)
if conformer_id not in tensors:
tensors[conformer_id] = {}
for atom, shielding in shielding_values.items():
if atom in tensors[conformer_id]:
tensors[conformer_id][atom].append(shielding)
else:
tensors[conformer_id][atom] = [shielding]
# Calculate Boltzmann weights
relative_energies = np.array(list(free_energies.values()))
boltz_weights = compute_boltzmann_weights(relative_energies, temperature=298.15)
# Combine tensors for Boltzmann-weighted average
combined_tensors = {}
unique_atoms = set(atom for conformer in tensors.values() for atom in conformer.keys())
for atom in unique_atoms:
atom_tensors = np.array([tensors[conformer][atom] for conformer in tensors if atom in tensors[conformer]])
combined_tensors[atom] = apply_boltzmann_weights_to_tensors({atom: atom_tensors}, boltz_weights)[atom]
print(f"Boltzmann-weighted average NMR shielding tensors: {combined_tensors}")
for atom, shielding_tensor in combined_tensors.items():
# Convert numpy.ndarray to float if it contains a single value
if isinstance(shielding_tensor, np.ndarray) and shielding_tensor.size == 1:
shielding_tensor = shielding_tensor.item() # or shielding_tensor = shielding_tensor[0]
data.append([atom,shielding_tensor])
# Write results to Excel
df = pd.DataFrame(data, columns=['Atom', 'Shielding Tensor'])
df.to_excel(output_excel, index=False)
def main():
"""
Main function to generate conformers, perform NWChem calculations, and write chemical shifts to an Excel file.
Steps:
1. Parse command-line arguments to obtain SMILES string, exchange-correlation functional, case, number of conformers, and output Excel file name.
2. Generate conformers for the given SMILES string and print their energies for verification.
3. Write SDF data to a file if needed.
4. Generate 3D coordinates and create NWChem input files for each conformer.
5. Run NWChem calculations for each input file using MPI execution.
6. Write the calculated chemical shifts to an Excel file.
"""
parser = argparse.ArgumentParser(description="Generate conformers and calculate chemical shifts.")
parser.add_argument("smiles", type=str, help="SMILES string of the molecule.")
parser.add_argument("xc_functional", type=str, help="Exchange-correlation functional for NWChem calculations.")
parser.add_argument("--case", type=int, help="Case 1: 1H Spectrum , Case 2: 13C Spectrum")
parser.add_argument("--num_conformers", type=int, default=10, help="Number of conformers to generate (default: 10).")
parser.add_argument("--output_excel", type=str, default="chemical_shifts.xlsx", help="Output Excel file (default: chemical_shifts.xlsx).")
args = parser.parse_args()
# Generate conformers
sdf_data, energies = generate_conformers(args.smiles, args.num_conformers)
# Print energies for verification
print("Conformer Energies:")
for cid, energy in energies:
print(f"Conformer {cid}: Energy = {energy:.2f}")
# Example: Write sdf_data to a file if needed
with open('conformers.sdf', 'w') as f:
f.write(sdf_data)
# Generate 3D coordinates and create NWChem input files
nwchem_input_files = generate_3d_coordinates(args.smiles, sdf_data, args.xc_functional, args.smiles, args.case)
for nwchem_input_file in nwchem_input_files:
nwchem_input_file, nwchem_output_file = run(nwchem_input_file)
nwchem_command = f"nwchem {nwchem_input_file} > {nwchem_output_file}"
final_command = f"mpiexec --use-hwthread-cpus -np 1 {nwchem_command}"
run_nwchem(final_command)
output_dir = './' # Directory containing NWChem output files
# Write chemical shifts to Excel
write_chemical_shifts_to_excel(output_dir, args.output_excel)
if __name__ == "__main__":
main()