From 8f1d02bf592236a944c5f85dd041d0d2795998d0 Mon Sep 17 00:00:00 2001
From: Anindit Gopalakrishnan
Date: Fri, 13 Dec 2024 08:16:42 +0000
Subject: [PATCH 1/6] paddle helix fork

---
 .../helixfold3/helixfold/data/templates.py   |  4 ++--
 apps/protein_folding/helixfold3/run_infer.sh | 18 ++++++------------
 2 files changed, 8 insertions(+), 14 deletions(-)
 mode change 100644 => 100755 apps/protein_folding/helixfold3/run_infer.sh

diff --git a/apps/protein_folding/helixfold3/helixfold/data/templates.py b/apps/protein_folding/helixfold3/helixfold/data/templates.py
index aa7b83b1..95f00837 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/templates.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/templates.py
@@ -817,7 +817,7 @@ def _process_single_hit(
                          TemplateAtomMaskAllZerosError) as e:
     # These 3 errors indicate missing mmCIF experimental data rather than a
     # problem with the template search, so turn them into warnings.
-    warning = ('%s_%s (sum_probs: %s, rank: %s): feature extracting errors: '
+    warning = ('%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: '
                '%s, mmCIF parsing errors: %s'
                % (hit_pdb_code, hit_chain_id, hit.sum_probs, hit.index,
                   str(e), parsing_result.errors))
@@ -826,7 +826,7 @@ def _process_single_hit(
     else:
       return SingleHitResult(features=None, error=None, warning=warning)
   except Error as e:
-    error = ('%s_%s (sum_probs: %s, rank: %s): feature extracting errors: '
+    error = ('%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: '
             '%s, mmCIF parsing errors: %s'
             % (hit_pdb_code, hit_chain_id, hit.sum_probs, hit.index,
                str(e), parsing_result.errors))
diff --git a/apps/protein_folding/helixfold3/run_infer.sh b/apps/protein_folding/helixfold3/run_infer.sh
old mode 100644
new mode 100755
index 5b0644e5..0da270f4
--- a/apps/protein_folding/helixfold3/run_infer.sh
+++ b/apps/protein_folding/helixfold3/run_infer.sh
@@ -1,14 +1,11 @@
 #!/bin/bash
 
-PYTHON_BIN="/usr/bin/python3" # changes to your python
-ENV_BIN="/root/miniconda3/bin" # change to your env
-MAXIT_SRC="PATH/TO/MAXIT/SRC" # changes to your MAXIT
-export OBABEL_BIN="PATH/TO/OBABEL/BIN" # changes to your openbabel
+PYTHON_BIN="/home/anindit/.conda/envs/helixfold/bin/python" # changes to your python
+ENV_BIN="/home/anindit/.conda/envs/helixfold/bin" # change to your env
 DATA_DIR="./data"
-export PATH="$MAXIT_SRC/bin:$PATH"
+export OBABEL_BIN="/opt/schrodinger2024-3/utilities/obabel"
 
 CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
-    --maxit_binary "$MAXIT_SRC/bin/maxit" \
     --jackhmmer_binary_path "$ENV_BIN/jackhmmer" \
     --hhblits_binary_path "$ENV_BIN/hhblits" \
     --hhsearch_binary_path "$ENV_BIN/hhsearch" \
@@ -17,10 +14,7 @@ CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
     --hmmbuild_binary_path "$ENV_BIN/hmmbuild" \
     --nhmmer_binary_path "$ENV_BIN/nhmmer" \
     --preset='reduced_dbs' \
-    --bfd_database_path "$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \
     --small_bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
-    --bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
-    --uniclust30_database_path "$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \
     --uniprot_database_path "$DATA_DIR/uniprot/uniprot.fasta" \
     --pdb_seqres_database_path "$DATA_DIR/pdb_seqres/pdb_seqres.txt" \
     --uniref90_database_path "$DATA_DIR/uniref90/uniref90.fasta" \
@@ -30,10 +24,10 @@ CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
     --ccd_preprocessed_path "$DATA_DIR/ccd_preprocessed_etkdg.pkl.gz" \
     --rfam_database_path "$DATA_DIR/Rfam-14.9_rep_seq.fasta" \
     --max_template_date=2020-05-14 \
-    --input_json data/demo_6zcy.json \
+    --input_json $1\
     --output_dir ./output \
     --model_name allatom_demo \
     --init_model init_models/HelixFold3-240814.pdparams \
-    --infer_times 1 \
+    --infer_times 5 \
     --diff_batch_size 1 \
-    --precision "fp32"
\ No newline at end of file
+    --precision "fp32"

From 22e16c69589ff2035e7d10da98a108871d4922a3 Mon Sep 17 00:00:00 2001
From: Anindit Gopalakrishnan
Date: Fri, 13 Dec 2024 08:18:04 +0000
Subject: [PATCH 2/6] add script to run inference on all internal xtal inputs

---
 .../helixfold3/run_all_internal_xtal.sh | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100755 apps/protein_folding/helixfold3/run_all_internal_xtal.sh

diff --git a/apps/protein_folding/helixfold3/run_all_internal_xtal.sh b/apps/protein_folding/helixfold3/run_all_internal_xtal.sh
new file mode 100755
index 00000000..8fb4914f
--- /dev/null
+++ b/apps/protein_folding/helixfold3/run_all_internal_xtal.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+DIRECTORY="data/internal_xtal_inputs"
+
+# Check if the provided path is a valid directory
+if [ ! -d "$DIRECTORY" ]; then
+    echo "Error: $DIRECTORY is not a valid directory."
+    exit 1
+fi
+
+echo "Files in $DIRECTORY:"
+for FILE in "$DIRECTORY"/*; do
+    if [ -f "$FILE" ]; then
+        ./run_infer.sh "$FILE"
+    fi
+done
\ No newline at end of file

From 36855051e53195b54851313cb962766e24f67fe1 Mon Sep 17 00:00:00 2001
From: Anindit Gopalakrishnan
Date: Fri, 13 Dec 2024 08:27:54 +0000
Subject: [PATCH 3/6] update maxit

---
 .../helixfold3/helixfold/common/all_atom_pdb_save.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
index deb8e087..3ce63bdd 100644
--- a/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
+++ b/apps/protein_folding/helixfold3/helixfold/common/all_atom_pdb_save.py
@@ -164,14 +164,14 @@ def prediction_to_mmcif(pred_atom_pos: Union[np.ndarray, paddle.Tensor],
         - maxit_binary: path to maxit_binary, use to convert pdb to cif
         - mmcif_path: path to save *.cif
     """
-    assert maxit_binary is not None and os.path.exists(maxit_binary), (
-        f'maxit_binary: {maxit_binary} not exists. '
-        f'link: https://sw-tools.rcsb.org/apps/MAXIT/source.html')
+#    assert maxit_binary is not None and os.path.exists(maxit_binary), (
+#        f'maxit_binary: {maxit_binary} not exists. '
+#        f'link: https://sw-tools.rcsb.org/apps/MAXIT/source.html')
     assert mmcif_path.endswith('.cif'), f'mmcif_path should endswith .cif; got {mmcif_path}'
 
     pdb_path = mmcif_path.replace('.cif', '.pdb')
     pdb_path = prediction_to_pdb(pred_atom_pos, FeatsDict, pdb_path)
-    msg = os.system(f'{maxit_binary} -i {pdb_path} -o 1 -output {mmcif_path}')
+    msg = os.system(f'structconvert -PDBx {pdb_path} {mmcif_path}')
     if msg != 0:
         print(f'convert pdb to cif failed, error message: {msg}')
     return mmcif_path
\ No newline at end of file

From eadb2719f70351af8a7a47a32d43ce44791d7aa3 Mon Sep 17 00:00:00 2001
From: Anindit Gopalakrishnan
Date: Fri, 10 Jan 2025 20:25:55 +0000
Subject: [PATCH 4/6] create script to highlight issue

---
 .../helixfold3/isolate_msa_issue.py | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 apps/protein_folding/helixfold3/isolate_msa_issue.py

diff --git a/apps/protein_folding/helixfold3/isolate_msa_issue.py b/apps/protein_folding/helixfold3/isolate_msa_issue.py
new file mode 100644
index 00000000..5c419307
--- /dev/null
+++ b/apps/protein_folding/helixfold3/isolate_msa_issue.py
@@ -0,0 +1,20 @@
+from helixfold.data.pipeline_parallel import make_msa_features
+from helixfold.data import parsers
+import os
+
+def get_text(p):
+    with open(p, 'r') as f:
+        return f.read()
+
+def try_creating_msa_features(folder):
+    uniref90_msa = parsers.parse_stockholm(get_text(os.path.join(folder, 'uniref90_hits.sto')))
+    mgnify_msa = parsers.parse_stockholm(get_text(os.path.join(folder, 'mgnify_hits.sto')))
+    bfd_msa = parsers.parse_stockholm(get_text(os.path.join(folder, 'small_bfd_hits.sto')))
+    msa_features = make_msa_features((uniref90_msa, bfd_msa, mgnify_msa))
+
+if __name__ == '__main__':
+    internal_xtal_folder = "/home/anindit/paddle-helix-fork/apps/protein_folding/helixfold3/output/row-0/msas/protein_A/A"
+    posebuster_folder = "/home/anindit/deep-affinity/experimental/users/anindit/posebuster_5SAK_ZRY"
+    msa_features = try_creating_msa_features(posebuster_folder)
+
+

From 94ac1858e6dcc613aeeaafe1465ddea76be10ca3 Mon Sep 17 00:00:00 2001
From: Anindit Gopalakrishnan
Date: Fri, 10 Jan 2025 20:52:57 +0000
Subject: [PATCH 5/6] make it easy to see where the MSAs fail

---
 .../helixfold/data/pipeline_parallel.py      | 55 ++++++++++---------
 apps/protein_folding/helixfold3/inference.py | 38 +++++++------
 .../helixfold3/isolate_msa_issue.py          |  4 +-
 3 files changed, 52 insertions(+), 45 deletions(-)

diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
index 3ca52693..0c18b26f 100644
--- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
+++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_parallel.py
@@ -62,6 +62,8 @@ def make_msa_features(msas: Sequence[parsers.Msa]) -> FeatureDict:
   for msa_index, msa in enumerate(msas):
     if not msa:
       raise ValueError(f'MSA {msa_index} must contain at least one sequence.')
+
+    print("MSA SEQUENCE LENGTH", len(msa.sequences))
     for sequence_index, sequence in enumerate(msa.sequences):
       if sequence in seen_sequences:
         continue
@@ -239,40 +241,40 @@ def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
       except Exception as exc:
         print(f'Task {task} generated an exception : {exc}')
 
-    msa_for_templates = msa_results['uniref90']['sto']
-    msa_for_templates = parsers.deduplicate_stockholm_msa(msa_for_templates)
-    msa_for_templates = parsers.remove_empty_columns_from_stockholm_msa(msa_for_templates)
+    # msa_for_templates = msa_results['uniref90']['sto']
+    # msa_for_templates = parsers.deduplicate_stockholm_msa(msa_for_templates)
+    # msa_for_templates = parsers.remove_empty_columns_from_stockholm_msa(msa_for_templates)
 
-    if self.template_searcher.input_format == 'sto':
-      pdb_templates_result = self.template_searcher.query(msa_for_templates)
-    elif self.template_searcher.input_format == 'a3m':
-      uniref90_msa_as_a3m = parsers.convert_stockholm_to_a3m(msa_for_templates)
-      pdb_templates_result = self.template_searcher.query(uniref90_msa_as_a3m)
-    else:
-      raise ValueError('Unrecognized template input format: '
-                       f'{self.template_searcher.input_format}')
+    # if self.template_searcher.input_format == 'sto':
+    #   pdb_templates_result = self.template_searcher.query(msa_for_templates)
+    # elif self.template_searcher.input_format == 'a3m':
+    #   uniref90_msa_as_a3m = parsers.convert_stockholm_to_a3m(msa_for_templates)
+    #   pdb_templates_result = self.template_searcher.query(uniref90_msa_as_a3m)
+    # else:
+    #   raise ValueError('Unrecognized template input format: '
+    #                    f'{self.template_searcher.input_format}')
 
-    pdb_hits_out_path = os.path.join(
-        msa_output_dir, f'pdb_hits.{self.template_searcher.output_format}')
-    with open(pdb_hits_out_path, 'w') as f:
-      f.write(pdb_templates_result)
+    # pdb_hits_out_path = os.path.join(
+    #     msa_output_dir, f'pdb_hits.{self.template_searcher.output_format}')
+    # with open(pdb_hits_out_path, 'w') as f:
+    #   f.write(pdb_templates_result)
 
     uniref90_msa = parsers.parse_stockholm(msa_results['uniref90']['sto'])
     mgnify_msa = parsers.parse_stockholm(msa_results['mgnify']['sto'])
 
-    pdb_template_hits = self.template_searcher.get_template_hits(
-        output_string=pdb_templates_result, input_sequence=input_sequence)
+    # pdb_template_hits = self.template_searcher.get_template_hits(
+    #     output_string=pdb_templates_result, input_sequence=input_sequence)
 
     if self._use_small_bfd:
       bfd_msa = parsers.parse_stockholm(msa_results['small_bfd']['sto'])
     else:
       raise ValueError("Doesn't support full BFD yet.")
 
-    templates_result = self.template_featurizer.get_templates(
-        query_sequence=input_sequence,
-        hits=pdb_template_hits,
-        query_pdb_code=None,
-        query_release_date=None)
+    # templates_result = self.template_featurizer.get_templates(
+    #     query_sequence=input_sequence,
+    #     hits=pdb_template_hits,
+    #     query_pdb_code=None,
+    #     query_release_date=None)
 
     sequence_features = make_sequence_features(
         sequence=input_sequence,
@@ -286,8 +288,9 @@ def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
     logging.info('MGnify MSA size: %d sequences.', len(mgnify_msa))
     logging.info('Final (deduplicated) MSA size: %d sequences.',
                  msa_features['num_alignments'][0])
-    logging.info('Total number of templates (NB: this can include bad '
-                 'templates and is later filtered to top 4): %d.',
-                 templates_result.features['template_domain_names'].shape[0])
+    # logging.info('Total number of templates (NB: this can include bad '
+    #              'templates and is later filtered to top 4): %d.',
+    #              templates_result.features['template_domain_names'].shape[0])
 
-    return {**sequence_features, **msa_features, **templates_result.features}
+    # return {**sequence_features, **msa_features, **templates_result.features}
+    return {**sequence_features, **msa_features}
diff --git a/apps/protein_folding/helixfold3/inference.py b/apps/protein_folding/helixfold3/inference.py
index 51cf6ec6..e05adeb0 100644
--- a/apps/protein_folding/helixfold3/inference.py
+++ b/apps/protein_folding/helixfold3/inference.py
@@ -467,24 +467,7 @@ def main(args):
     msa_templ_data_pipeline_dict = get_msa_templates_pipeline(args)
 
-    ### create model
-    model_config = config.model_config(args.model_name)
-    print(f'>>> model_config:\n{model_config}')
-
-    model = RunModel(model_config)
-
-    if (not args.init_model is None) and (not args.init_model == ""):
-        print(f"Load pretrain model from {args.init_model}")
-        pd_params = paddle.load(args.init_model)
-
-        has_opt = 'optimizer' in pd_params
-        if has_opt:
-            model.helixfold.set_state_dict(pd_params['model'])
-        else:
-            model.helixfold.set_state_dict(pd_params)
 
-    if args.precision == "bf16" and args.amp_level == "O2":
-        raise NotImplementedError("bf16 O2 is not supported yet.")
 
     print(f"============ Data Loading ============")
     job_base = pathlib.Path(args.input_json).stem
@@ -506,6 +489,27 @@ def main(args):
     feature_dict['feat'] = batch_convert(feature_dict['feat'], add_batch=True)
     feature_dict['label'] = batch_convert(feature_dict['label'], add_batch=True)
 
+    return
+    print(f"============ Model Loading ============")
+    ### create model
+    model_config = config.model_config(args.model_name)
+    print(f'>>> model_config:\n{model_config}')
+
+    model = RunModel(model_config)
+
+    if (not args.init_model is None) and (not args.init_model == ""):
+        print(f"Load pretrain model from {args.init_model}")
+        pd_params = paddle.load(args.init_model)
+
+        has_opt = 'optimizer' in pd_params
+        if has_opt:
+            model.helixfold.set_state_dict(pd_params['model'])
+        else:
+            model.helixfold.set_state_dict(pd_params)
+
+    if args.precision == "bf16" and args.amp_level == "O2":
+        raise NotImplementedError("bf16 O2 is not supported yet.")
+
     print(f"============ Start Inference ============")
     infer_times = args.infer_times
diff --git a/apps/protein_folding/helixfold3/isolate_msa_issue.py b/apps/protein_folding/helixfold3/isolate_msa_issue.py
index 5c419307..fe6d7c41 100644
--- a/apps/protein_folding/helixfold3/isolate_msa_issue.py
+++ b/apps/protein_folding/helixfold3/isolate_msa_issue.py
@@ -13,8 +13,8 @@ def try_creating_msa_features(folder):
     msa_features = make_msa_features((uniref90_msa, bfd_msa, mgnify_msa))
 
 if __name__ == '__main__':
-    internal_xtal_folder = "/home/anindit/paddle-helix-fork/apps/protein_folding/helixfold3/output/row-0/msas/protein_A/A"
-    posebuster_folder = "/home/anindit/deep-affinity/experimental/users/anindit/posebuster_5SAK_ZRY"
+    # internal_xtal_folder = "/home/anindit/paddle-helix-fork/apps/protein_folding/helixfold3/output/row-0/msas/protein_A/A"
+    posebuster_folder = "/home/anindit/deep-affinity/experimental/users/anindit/posebuster_5SAK_ZRY/sto"
     msa_features = try_creating_msa_features(posebuster_folder)
 
 

From c21d206f99cf25a72af8625db1904f77d1eb9ead Mon Sep 17 00:00:00 2001
From: Anindit Gopalakrishnan
Date: Fri, 10 Jan 2025 20:58:02 +0000
Subject: [PATCH 6/6] isolate msa issue

---
 apps/protein_folding/helixfold3/isolate_msa_issue.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/apps/protein_folding/helixfold3/isolate_msa_issue.py b/apps/protein_folding/helixfold3/isolate_msa_issue.py
index fe6d7c41..2d23074a 100644
--- a/apps/protein_folding/helixfold3/isolate_msa_issue.py
+++ b/apps/protein_folding/helixfold3/isolate_msa_issue.py
@@ -10,11 +10,11 @@ def try_creating_msa_features(folder):
     uniref90_msa = parsers.parse_stockholm(get_text(os.path.join(folder, 'uniref90_hits.sto')))
     mgnify_msa = parsers.parse_stockholm(get_text(os.path.join(folder, 'mgnify_hits.sto')))
     bfd_msa = parsers.parse_stockholm(get_text(os.path.join(folder, 'small_bfd_hits.sto')))
-    msa_features = make_msa_features((uniref90_msa, bfd_msa, mgnify_msa))
+    make_msa_features((uniref90_msa, bfd_msa, mgnify_msa))
 
 if __name__ == '__main__':
-    # internal_xtal_folder = "/home/anindit/paddle-helix-fork/apps/protein_folding/helixfold3/output/row-0/msas/protein_A/A"
-    posebuster_folder = "/home/anindit/deep-affinity/experimental/users/anindit/posebuster_5SAK_ZRY/sto"
-    msa_features = try_creating_msa_features(posebuster_folder)
+    failed_posebuster_folder = "/home/anindit/deep-affinity/experimental/users/anindit/posebuster_5SAK_ZRY/sto"
+    working_posebuster_folder = "/home/anindit/paddle-helix-fork/apps/protein_folding/helixfold3/output/posebuster_5SAK_ZRY/msas/protein_A/A-HF3"
+    msa_features = try_creating_msa_features(failed_posebuster_folder)
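
Note on the debugging flow in PATCH 4/6 through 6/6: isolate_msa_issue.py feeds all three alignments into a single make_msa_features call, so a crash there still does not say which hits file is at fault. The sketch below is not part of the patches above; it assumes the same helixfold.data modules and the uniref90_hits.sto / mgnify_hits.sto / small_bfd_hits.sto layout used in the series, and the example folder path is a placeholder. It featurizes each alignment on its own so the offending database can be read straight from the output.

import os
import traceback

from helixfold.data import parsers
from helixfold.data.pipeline_parallel import make_msa_features


def read_text(path):
    with open(path, 'r') as f:
        return f.read()


def check_each_msa(folder):
    """Parse and featurize each Stockholm hits file separately to localize a failure."""
    for name in ('uniref90_hits.sto', 'mgnify_hits.sto', 'small_bfd_hits.sto'):
        msa = parsers.parse_stockholm(read_text(os.path.join(folder, name)))
        print(f'{name}: {len(msa.sequences)} sequences')
        try:
            # Featurize this MSA alone; a crash here implicates only this database.
            make_msa_features((msa,))
            print(f'{name}: make_msa_features OK')
        except Exception:
            print(f'{name}: make_msa_features FAILED')
            traceback.print_exc()


if __name__ == '__main__':
    # Placeholder path; point this at the msas/<chain> folder of a failing run.
    check_each_msa('output/posebuster_5SAK_ZRY/msas/protein_A/A')

Pointing this at the failing sto/ folder referenced in PATCH 6/6 should print which hits file trips make_msa_features, instead of only observing the combined call fail.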