From 8f1d02bf592236a944c5f85dd041d0d2795998d0 Mon Sep 17 00:00:00 2001
From: Anindit Gopalakrishnan
Date: Fri, 13 Dec 2024 08:16:42 +0000
Subject: [PATCH 1/6] paddle helix fork

---
 .../helixfold3/helixfold/data/templates.py   |  4 ++--
 apps/protein_folding/helixfold3/run_infer.sh | 18 ++++++------------
 2 files changed, 8 insertions(+), 14 deletions(-)
 mode change 100644 => 100755 apps/protein_folding/helixfold3/run_infer.sh

diff --git a/apps/protein_folding/helixfold3/helixfold/data/templates.py b/apps/protein_folding/helixfold3/helixfold/data/templates.py
index aa7b83b1..95f00837 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/templates.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/templates.py
@@ -817,7 +817,7 @@ def _process_single_hit(
                          TemplateAtomMaskAllZerosError) as e:
     # These 3 errors indicate missing mmCIF experimental data rather than a
     # problem with the template search, so turn them into warnings.
-    warning = ('%s_%s (sum_probs: %s, rank: %s): feature extracting errors: '
+    warning = ('%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: '
                '%s, mmCIF parsing errors: %s'
                % (hit_pdb_code, hit_chain_id, hit.sum_probs, hit.index,
                   str(e), parsing_result.errors))
@@ -826,7 +826,7 @@ def _process_single_hit(
     else:
       return SingleHitResult(features=None, error=None, warning=warning)
   except Error as e:
-    error = ('%s_%s (sum_probs: %s, rank: %s): feature extracting errors: '
+    error = ('%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: '
             '%s, mmCIF parsing errors: %s'
             % (hit_pdb_code, hit_chain_id, hit.sum_probs, hit.index,
                str(e), parsing_result.errors))
diff --git a/apps/protein_folding/helixfold3/run_infer.sh b/apps/protein_folding/helixfold3/run_infer.sh
old mode 100644
new mode 100755
index 5b0644e5..0da270f4
--- a/apps/protein_folding/helixfold3/run_infer.sh
+++ b/apps/protein_folding/helixfold3/run_infer.sh
@@ -1,14 +1,11 @@
 #!/bin/bash
 
-PYTHON_BIN="/usr/bin/python3" # changes to your python
-ENV_BIN="/root/miniconda3/bin" # change to your env
-MAXIT_SRC="PATH/TO/MAXIT/SRC" # changes to your MAXIT
-export OBABEL_BIN="PATH/TO/OBABEL/BIN" # changes to your openbabel
+PYTHON_BIN="/home/anindit/.conda/envs/helixfold/bin/python" # changes to your python
+ENV_BIN="/home/anindit/.conda/envs/helixfold/bin" # change to your env
 DATA_DIR="./data"
-export PATH="$MAXIT_SRC/bin:$PATH"
+export OBABEL_BIN="/opt/schrodinger2024-3/utilities/obabel"
 
 CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
-    --maxit_binary "$MAXIT_SRC/bin/maxit" \
     --jackhmmer_binary_path "$ENV_BIN/jackhmmer" \
     --hhblits_binary_path "$ENV_BIN/hhblits" \
     --hhsearch_binary_path "$ENV_BIN/hhsearch" \
@@ -17,10 +14,7 @@ CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
     --hmmbuild_binary_path "$ENV_BIN/hmmbuild" \
     --nhmmer_binary_path "$ENV_BIN/nhmmer" \
     --preset='reduced_dbs' \
-    --bfd_database_path "$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \
     --small_bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
-    --bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
-    --uniclust30_database_path "$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \
     --uniprot_database_path "$DATA_DIR/uniprot/uniprot.fasta" \
     --pdb_seqres_database_path "$DATA_DIR/pdb_seqres/pdb_seqres.txt" \
     --uniref90_database_path "$DATA_DIR/uniref90/uniref90.fasta" \
@@ -30,10 +24,10 @@ CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
     --ccd_preprocessed_path "$DATA_DIR/ccd_preprocessed_etkdg.pkl.gz" \
     --rfam_database_path "$DATA_DIR/Rfam-14.9_rep_seq.fasta" \
     --max_template_date=2020-05-14 \
-    --input_json data/demo_6zcy.json \
+    --input_json $1\
     --output_dir ./output \
     --model_name allatom_demo \
     --init_model init_models/HelixFold3-240814.pdparams \
-    --infer_times 1 \
+    --infer_times 5 \
     --diff_batch_size 1 \
-    --precision "fp32"
\ No newline at end of file
+    --precision "fp32"

From 22e16c69589ff2035e7d10da98a108871d4922a3 Mon Sep 17 00:00:00 2001
From: Anindit Gopalakrishnan
Date: Fri, 13 Dec 2024 08:18:04 +0000
Subject: [PATCH 2/6] add script to run inference on all internal xtal inputs

---
 .../helixfold3/run_all_internal_xtal.sh | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100755 apps/protein_folding/helixfold3/run_all_internal_xtal.sh

diff --git a/apps/protein_folding/helixfold3/run_all_internal_xtal.sh b/apps/protein_folding/helixfold3/run_all_internal_xtal.sh
new file mode 100755
index 00000000..8fb4914f
--- /dev/null
+++ b/apps/protein_folding/helixfold3/run_all_internal_xtal.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+DIRECTORY="data/internal_xtal_inputs"
+
+# Check if the provided path is a valid directory
+if [ ! -d "$DIRECTORY" ]; then
+    echo "Error: $DIRECTORY is not a valid directory."
+    exit 1
+fi
+
+echo "Files in $DIRECTORY:"
+for FILE in "$DIRECTORY"/*; do
+    if [ -f "$FILE" ]; then
+        ./run_infer.sh "$FILE"
+    fi
+done
\ No newline at end of file

From 36855051e53195b54851313cb962766e24f67fe1 Mon Sep 17 00:00:00 2001
From: Anindit Gopalakrishnan
Date: Fri, 13 Dec 2024 08:27:54 +0000
Subject: [PATCH 3/6] update maxit

---
 .../helixfold3/helixfold/common/all_atom_pdb_save.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
index deb8e087..3ce63bdd 100644
--- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
+++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
@@ -164,14 +164,14 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor],
         - maxit_binary: path to maxit_binary, use to convert pdb to cif
         - mmcif_path: path to save *.cif
     """
-    assert maxit_binary is not None and os.path.exists(maxit_binary), (
-        f'maxit_binary: {maxit_binary} not exists. '
-        f'link: https://sw-tools.rcsb.org/apps/MAXIT/source.html')
+#    assert maxit_binary is not None and os.path.exists(maxit_binary), (
+#        f'maxit_binary: {maxit_binary} not exists. '
+#        f'link: https://sw-tools.rcsb.org/apps/MAXIT/source.html')
     assert mmcif_path.endswith('.cif'), f'mmcif_path should endswith .cif; got {mmcif_path}'
 
     pdb_path = mmcif_path.replace('.cif', '.pdb')
     pdb_path = prediction_to_pdb(pred_atom_pos, FeatsDict, pdb_path)
-    msg = os.system(f'{maxit_binary} -i {pdb_path} -o 1 -output {mmcif_path}')
+    msg = os.system(f'structconvert -PDBx {pdb_path} {mmcif_path}')
     if msg != 0:
         print(f'convert pdb to cif failed, error message: {msg}')
     return mmcif_path
\ No newline at end of file

From eadb2719f70351af8a7a47a32d43ce44791d7aa3 Mon Sep 17 00:00:00 2001
From: Anindit Gopalakrishnan
Date: Fri, 10 Jan 2025 20:25:55 +0000
Subject: [PATCH 4/6] create script to highlight issue

---
 .../helixfold3/isolate_msa_issue.py | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 apps/protein_folding/helixfold3/isolate_msa_issue.py

diff --git a/apps/protein_folding/helixfold3/isolate_msa_issue.py b/apps/protein_folding/helixfold3/isolate_msa_issue.py
new file mode 100644
index 00000000..5c419307
--- /dev/null
+++ b/apps/protein_folding/helixfold3/isolate_msa_issue.py
@@ -0,0 +1,20 @@
+from helixfold.data.pipeline_parallel import make_msa_features
+from helixfold.data import parsers
+import os
+
+def get_text(p):
+    with open(p, 'r') as f:
+        return f.read()
+
+def try_creating_msa_features(folder):
+    uniref90_msa = parsers.parse_stockholm(get_text(os.path.join(folder, 'uniref90_hits.sto')))
+    mgnify_msa = parsers.parse_stockholm(get_text(os.path.join(folder, 'mgnify_hits.sto')))
+    bfd_msa = parsers.parse_stockholm(get_text(os.path.join(folder, 'small_bfd_hits.sto')))
+    msa_features = make_msa_features((uniref90_msa, bfd_msa, mgnify_msa))
+
+if __name__ == '__main__':
+    internal_xtal_folder = "/home/anindit/paddle-helix-fork/apps/protein_folding/helixfold3/output/row-0/msas/protein_A/A"
+    posebuster_folder = "/home/anindit/deep-affinity/experimental/users/anindit/posebuster_5SAK_ZRY"
+    msa_features = try_creating_msa_features(posebuster_folder)
+
+

From 94ac1858e6dcc613aeeaafe1465ddea76be10ca3 Mon Sep 17 00:00:00 2001
From: Anindit Gopalakrishnan
Date: Fri, 10 Jan 2025 20:52:57 +0000
Subject: [PATCH 5/6] make it easy to see where the MSAs fail

---
 .../helixfold/data/pipeline_parallel.py      | 55 ++++++++++---------
 apps/protein_folding/helixfold3/inference.py | 38 +++++++------
 .../helixfold3/isolate_msa_issue.py          |  4 +-
 3 files changed, 52 insertions(+), 45 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
index 3ca52693..0c18b26f 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
@@ -62,6 +62,8 @@ def make_msa_features(msas: Sequence[parsers.Msa]) -> FeatureDict:
   for msa_index, msa in enumerate(msas):
     if not msa:
       raise ValueError(f'MSA {msa_index} must contain at least one sequence.')
+
+    print("MSA SEQUENCE LENGTH", len(msa.sequences))
     for sequence_index, sequence in enumerate(msa.sequences):
       if sequence in seen_sequences:
         continue
@@ -239,40 +241,40 @@ def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
       except Exception as exc:
         print(f'Task {task} generated an exception : {exc}')
 
-    msa_for_templates = msa_results['uniref90']['sto']
-    msa_for_templates = parsers.deduplicate_stockholm_msa(msa_for_templates)
-    msa_for_templates = parsers.remove_empty_columns_from_stockholm_msa(msa_for_templates)
+    # msa_for_templates = msa_results['uniref90']['sto']
+    # msa_for_templates = parsers.deduplicate_stockholm_msa(msa_for_templates)
+    # msa_for_templates = parsers.remove_empty_columns_from_stockholm_msa(msa_for_templates)
 
-    if self.template_searcher.input_format == 'sto':
-      pdb_templates_result = self.template_searcher.query(msa_for_templates)
-    elif self.template_searcher.input_format == 'a3m':
-      uniref90_msa_as_a3m = parsers.convert_stockholm_to_a3m(msa_for_templates)
-      pdb_templates_result = self.template_searcher.query(uniref90_msa_as_a3m)
-    else:
-      raise ValueError('Unrecognized template input format: '
-                       f'{self.template_searcher.input_format}')
+    # if self.template_searcher.input_format == 'sto':
+    #   pdb_templates_result = self.template_searcher.query(msa_for_templates)
+    # elif self.template_searcher.input_format == 'a3m':
+    #   uniref90_msa_as_a3m = parsers.convert_stockholm_to_a3m(msa_for_templates)
+    #   pdb_templates_result = self.template_searcher.query(uniref90_msa_as_a3m)
+    # else:
+    #   raise ValueError('Unrecognized template input format: '
+    #                    f'{self.template_searcher.input_format}')
 
-    pdb_hits_out_path = os.path.join(
-        msa_output_dir, f'pdb_hits.{self.template_searcher.output_format}')
-    with open(pdb_hits_out_path, 'w') as f:
-      f.write(pdb_templates_result)
+    # pdb_hits_out_path = os.path.join(
+    #     msa_output_dir, f'pdb_hits.{self.template_searcher.output_format}')
+    # with open(pdb_hits_out_path, 'w') as f:
+    #   f.write(pdb_templates_result)
 
     uniref90_msa = parsers.parse_stockholm(msa_results['uniref90']['sto'])
     mgnify_msa = parsers.parse_stockholm(msa_results['mgnify']['sto'])
 
-    pdb_template_hits = self.template_searcher.get_template_hits(
-        output_string=pdb_templates_result, input_sequence=input_sequence)
+    # pdb_template_hits = self.template_searcher.get_template_hits(
+    #     output_string=pdb_templates_result, input_sequence=input_sequence)
 
     if self._use_small_bfd:
       bfd_msa = parsers.parse_stockholm(msa_results['small_bfd']['sto'])
     else:
       raise ValueError("Doesn't support full BFD yet.")
 
-    templates_result = self.template_featurizer.get_templates(
-        query_sequence=input_sequence,
-        hits=pdb_template_hits,
-        query_pdb_code=None,
-        query_release_date=None)
+    # templates_result = self.template_featurizer.get_templates(
+    #     query_sequence=input_sequence,
+    #     hits=pdb_template_hits,
+    #     query_pdb_code=None,
+    #     query_release_date=None)
 
     sequence_features = make_sequence_features(
         sequence=input_sequence,
@@ -286,8 +288,9 @@ def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
     logging.info('MGnify MSA size: %d sequences.', len(mgnify_msa))
     logging.info('Final (deduplicated) MSA size: %d sequences.',
                  msa_features['num_alignments'][0])
-    logging.info('Total number of templates (NB: this can include bad '
-                 'templates and is later filtered to top 4): %d.',
-                 templates_result.features['template_domain_names'].shape[0])
+    # logging.info('Total number of templates (NB: this can include bad '
+    #              'templates and is later filtered to top 4): %d.',
+    #              templates_result.features['template_domain_names'].shape[0])
 
-    return {**sequence_features, **msa_features, **templates_result.features}
+    # return {**sequence_features, **msa_features, **templates_result.features}
+    return {**sequence_features, **msa_features}
diff --git a/apps/protein_folding/helixfold3/inference.py b/apps/protein_folding/helixfold3/inference.py
index 51cf6ec6..e05adeb0 100644
--- a/apps/protein_folding/helixfold3/inference.py
+++ b/apps/protein_folding/helixfold3/inference.py
@@ -467,24 +467,7 @@ def main(args):
     msa_templ_data_pipeline_dict = get_msa_templates_pipeline(args)
 
-    ### create model
-    model_config = config.model_config(args.model_name)
-    print(f'>>> model_config:\n{model_config}')
-
-    model = RunModel(model_config)
-
-    if (not args.init_model is None) and (not args.init_model == ""):
-        print(f"Load pretrain model from {args.init_model}")
-        pd_params = paddle.load(args.init_model)
-
-        has_opt = 'optimizer' in pd_params
-        if has_opt:
-            model.helixfold.set_state_dict(pd_params['model'])
-        else:
-            model.helixfold.set_state_dict(pd_params)
 
-    if args.precision == "bf16" and args.amp_level == "O2":
-        raise NotImplementedError("bf16 O2 is not supported yet.")
 
     print(f"============ Data Loading ============")
     job_base = pathlib.Path(args.input_json).stem
@@ -506,6 +489,27 @@ def main(args):
     feature_dict['feat'] = batch_convert(feature_dict['feat'], add_batch=True)
     feature_dict['label'] = batch_convert(feature_dict['label'], add_batch=True)
 
+    return
+    print(f"============ Model Loading ============")
+    ### create model
+    model_config = config.model_config(args.model_name)
+    print(f'>>> model_config:\n{model_config}')
+
+    model = RunModel(model_config)
+
+    if (not args.init_model is None) and (not args.init_model == ""):
+        print(f"Load pretrain model from {args.init_model}")
+        pd_params = paddle.load(args.init_model)
+
+        has_opt = 'optimizer' in pd_params
+        if has_opt:
+            model.helixfold.set_state_dict(pd_params['model'])
+        else:
+            model.helixfold.set_state_dict(pd_params)
+
+    if args.precision == "bf16" and args.amp_level == "O2":
+        raise NotImplementedError("bf16 O2 is not supported yet.")
+
     print(f"============ Start Inference ============")
     infer_times = args.infer_times
diff --git a/apps/protein_folding/helixfold3/isolate_msa_issue.py b/apps/protein_folding/helixfold3/isolate_msa_issue.py
index 5c419307..fe6d7c41 100644
--- a/apps/protein_folding/helixfold3/isolate_msa_issue.py
+++ b/apps/protein_folding/helixfold3/isolate_msa_issue.py
@@ -13,8 +13,8 @@ def try_creating_msa_features(folder):
     msa_features = make_msa_features((uniref90_msa, bfd_msa, mgnify_msa))
 
 if __name__ == '__main__':
-    internal_xtal_folder = "/home/anindit/paddle-helix-fork/apps/protein_folding/helixfold3/output/row-0/msas/protein_A/A"
-    posebuster_folder = "/home/anindit/deep-affinity/experimental/users/anindit/posebuster_5SAK_ZRY"
+    # internal_xtal_folder = "/home/anindit/paddle-helix-fork/apps/protein_folding/helixfold3/output/row-0/msas/protein_A/A"
+    posebuster_folder = "/home/anindit/deep-affinity/experimental/users/anindit/posebuster_5SAK_ZRY/sto"
     msa_features = try_creating_msa_features(posebuster_folder)
 
 

From c21d206f99cf25a72af8625db1904f77d1eb9ead Mon Sep 17 00:00:00 2001
From: Anindit Gopalakrishnan
Date: Fri, 10 Jan 2025 20:58:02 +0000
Subject: [PATCH 6/6] isolate msa issue

---
 apps/protein_folding/helixfold3/isolate_msa_issue.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/apps/protein_folding/helixfold3/isolate_msa_issue.py b/apps/protein_folding/helixfold3/isolate_msa_issue.py
index fe6d7c41..2d23074a 100644
--- a/apps/protein_folding/helixfold3/isolate_msa_issue.py
+++ b/apps/protein_folding/helixfold3/isolate_msa_issue.py
@@ -10,11 +10,11 @@ def try_creating_msa_features(folder):
     uniref90_msa = parsers.parse_stockholm(get_text(os.path.join(folder, 'uniref90_hits.sto')))
     mgnify_msa = parsers.parse_stockholm(get_text(os.path.join(folder, 'mgnify_hits.sto')))
     bfd_msa = parsers.parse_stockholm(get_text(os.path.join(folder, 'small_bfd_hits.sto')))
-    msa_features = make_msa_features((uniref90_msa, bfd_msa, mgnify_msa))
+    make_msa_features((uniref90_msa, bfd_msa, mgnify_msa))
 
 if __name__ == '__main__':
-    # internal_xtal_folder = "/home/anindit/paddle-helix-fork/apps/protein_folding/helixfold3/output/row-0/msas/protein_A/A"
-    posebuster_folder = "/home/anindit/deep-affinity/experimental/users/anindit/posebuster_5SAK_ZRY/sto"
-    msa_features = try_creating_msa_features(posebuster_folder)
+    failed_posebuster_folder = "/home/anindit/deep-affinity/experimental/users/anindit/posebuster_5SAK_ZRY/sto"
+    working_posebuster_folder = "/home/anindit/paddle-helix-fork/apps/protein_folding/helixfold3/output/posebuster_5SAK_ZRY/msas/protein_A/A-HF3"
+    msa_features = try_creating_msa_features(failed_posebuster_folder)
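
Note on the debugging flow in PATCH 4/6 through 6/6: isolate_msa_issue.py feeds all three alignments into a single make_msa_features call, so a crash there still does not say which hits file is at fault. The sketch below is not part of the patches above; it assumes the same helixfold.data modules and the uniref90_hits.sto / mgnify_hits.sto / small_bfd_hits.sto layout used in the series, and the example folder path is a placeholder. It featurizes each alignment on its own so the offending database can be read straight from the output.

import os
import traceback

from helixfold.data import parsers
from helixfold.data.pipeline_parallel import make_msa_features


def read_text(path):
    with open(path, 'r') as f:
        return f.read()


def check_each_msa(folder):
    """Parse and featurize each Stockholm hits file separately to localize a failure."""
    for name in ('uniref90_hits.sto', 'mgnify_hits.sto', 'small_bfd_hits.sto'):
        msa = parsers.parse_stockholm(read_text(os.path.join(folder, name)))
        print(f'{name}: {len(msa.sequences)} sequences')
        try:
            # Featurize this MSA alone; a crash here implicates only this database.
            make_msa_features((msa,))
            print(f'{name}: make_msa_features OK')
        except Exception:
            print(f'{name}: make_msa_features FAILED')
            traceback.print_exc()


if __name__ == '__main__':
    # Placeholder path; point this at the msas/<chain> folder of a failing run.
    check_each_msa('output/posebuster_5SAK_ZRY/msas/protein_A/A')

Pointing this at the failing sto/ folder referenced in PATCH 6/6 should print which hits file trips make_msa_features, instead of only observing the combined call fail.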