From 3f8000342989b4ab86282de302f355f11f2be522 Mon Sep 17 00:00:00 2001
From: Amanda Paulson <amanda.paulson@ucsf.edu>
Date: Thu, 16 Jan 2025 16:42:43 -0800
Subject: [PATCH 1/6] exclude all-NaN columns from mordred features and impute
 the column mean for other NaNs to calculate AD index

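---
Note (review): the all-NaN column mask is computed separately for
pred_data and for train_X. If a column is all-NaN in one matrix but not
in the other, the two matrices passed to the AD index calculation end up
with different column sets. Likewise, NaNs in pred_data are imputed with
the column means of pred_data itself, not with the training-set means.
Consider deriving the mask and the means once from train_X and applying
them to both matrices.
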
 atomsci/ddm/pipeline/model_pipeline.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py
index 7d49092a..2ae45c56 100644
--- a/atomsci/ddm/pipeline/model_pipeline.py
+++ b/atomsci/ddm/pipeline/model_pipeline.py
@@ -902,6 +902,10 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
                 pred_data = self.predict_embedding(dset_df, dset_params=dset_params)
             else:
                 pred_data = copy.deepcopy(self.data.dataset.X)
+                
+            if self.featurization.descriptor_type=='mordred_filtered':
+                pred_data = pred_data[:,~np.isnan(pred_data).all(axis=0)]
+                pred_data = np.where(np.isnan(pred_data), np.nanmean(pred_data, axis=0), pred_data)
 
             try:
                 if not hasattr(self, 'featurized_train_data'):
@@ -926,6 +930,9 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
                         train_dset = dc.data.NumpyDataset(train_X)
                         self.featurized_train_data = self.model_wrapper.generate_embeddings(train_dset)
                     else:
+                        if self.featurization.descriptor_type=='mordred_filtered':
+                            train_X = train_X[:,~np.isnan(train_X).all(axis=0)]
+                            train_X = np.where(np.isnan(train_X), np.nanmean(train_X, axis=0), train_X)
                         self.featurized_train_data = train_X
 
                 if not hasattr(self, "train_pair_dis") or not hasattr(self, "train_pair_dis_metric") or self.train_pair_dis_metric != dist_metric:
@@ -933,7 +940,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
                     self.train_pair_dis_metric = dist_metric
 
                 self.log.debug("Calculating AD index.")
-
+                
                 if AD_method == "local_density":
                     result_df["AD_index"] = calc_AD_kmean_local_density(self.featurized_train_data, pred_data, k, train_dset_pair_distance=self.train_pair_dis, dist_metric=dist_metric)
                 else:

From fe252c10d4f815d6389176482c8e1f8dd89bf008 Mon Sep 17 00:00:00 2001
From: Amanda Paulson <amanda.paulson@ucsf.edu>
Date: Fri, 17 Jan 2025 13:09:26 -0800
Subject: [PATCH 2/6] check for existence of descriptor_type

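---
Note (review): hasattr() takes two arguments (object, name). The
single-argument call added in this patch raises TypeError when the line
is executed; the two-argument form would be
hasattr(self.featurization, 'descriptor_type'). The condition is replaced
by an explicit feat_type check in PATCH 5/6.
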
 atomsci/ddm/pipeline/model_pipeline.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py
index 2ae45c56..d4b4d201 100644
--- a/atomsci/ddm/pipeline/model_pipeline.py
+++ b/atomsci/ddm/pipeline/model_pipeline.py
@@ -903,7 +903,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
             else:
                 pred_data = copy.deepcopy(self.data.dataset.X)
                 
-            if self.featurization.descriptor_type=='mordred_filtered':
+            if hasattr(self.featurization.descriptor_type) and self.featurization.descriptor_type=='mordred_filtered':
                 pred_data = pred_data[:,~np.isnan(pred_data).all(axis=0)]
                 pred_data = np.where(np.isnan(pred_data), np.nanmean(pred_data, axis=0), pred_data)
 

From 13e9cd41ec792ae540a956ebdfb6344c3e513b1f Mon Sep 17 00:00:00 2001
From: Amanda Paulson <amanda.paulson@ucsf.edu>
Date: Fri, 17 Jan 2025 13:18:55 -0800
Subject: [PATCH 3/6] add tests for each feature type

---
 .../integrative/ad_index/test_ad_index.py     | 48 +++++++++++--------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/atomsci/ddm/test/integrative/ad_index/test_ad_index.py b/atomsci/ddm/test/integrative/ad_index/test_ad_index.py
index cca6e9f4..b96bdfc9 100644
--- a/atomsci/ddm/test/integrative/ad_index/test_ad_index.py
+++ b/atomsci/ddm/test/integrative/ad_index/test_ad_index.py
@@ -18,7 +18,7 @@ def clean():
             os.remove("./output/"+f)
 
 def test():
-    """Test full model pipeline: Curate data, fit model, and predict property for new compounds"""
+    """Test AD index calculation: Curate data, fit model, and predict property for new compounds for each feature set"""
 
     # Clean
     # -----
@@ -33,31 +33,37 @@ def test():
     python_path = sys.executable
     hp_params["script_dir"] = script_dir
     hp_params["python_path"] = python_path
+    
+    for feat in ['ECFP','mordred_filtered','rdkit_raw','graphconv']:
+        if feat in ['ECFP','graphconv']:
+            hp_params['featurizer']=feat
+        else:
+            hp_params['featurizer']='computed_descriptors'
+            hp_params['descriptor_type']=feat
+        params = parse.wrapper(hp_params)
+        if not os.path.isfile(params.dataset_key):
+            params.dataset_key = os.path.join(params.script_dir, params.dataset_key)
 
-    params = parse.wrapper(hp_params)
-    if not os.path.isfile(params.dataset_key):
-        params.dataset_key = os.path.join(params.script_dir, params.dataset_key)
+        train_df = pd.read_csv(params.dataset_key)
 
-    train_df = pd.read_csv(params.dataset_key)
+        print(f"Train an RF models with {feat}")
+        pl = mp.ModelPipeline(params)
+        pl.train_model()
 
-    print("Train a RF models with ECFP")
-    pl = mp.ModelPipeline(params)
-    pl.train_model()
+        print("Calculate AD index with the just trained model.")
+        pred_df_mp = pl.predict_on_dataframe(train_df[:10], contains_responses=True, AD_method="z_score")
 
-    print("Calculate AD index with the just trained model.")
-    pred_df_mp = pl.predict_on_dataframe(train_df[:10], contains_responses=True, AD_method="z_score")
+        assert("AD_index" in pred_df_mp.columns.values), 'Error: No AD_index column pred_df_mp'
 
-    assert("AD_index" in pred_df_mp.columns.values), 'Error: No AD_index column pred_df_mp'
-
-    print("Calculate AD index with the saved model tarball file.")
-    pred_df_file = pfm.predict_from_model_file(model_path=pl.params.model_tarball_path,
-                                         input_df=train_df[:10],
-                                         id_col="compound_id",
-                                         smiles_col="base_rdkit_smiles",
-                                         response_col="pKi_mean",
-                                         dont_standardize=True,
-                                         AD_method="z_score")
-    assert("AD_index" in pred_df_file.columns.values), 'Error: No AD_index column in pred_df_file'
+        print("Calculate AD index with the saved model tarball file.")
+        pred_df_file = pfm.predict_from_model_file(model_path=pl.params.model_tarball_path,
+                                            input_df=train_df[:10],
+                                            id_col="compound_id",
+                                            smiles_col="base_rdkit_smiles",
+                                            response_col="pKi_mean",
+                                            dont_standardize=True,
+                                            AD_method="z_score")
+        assert("AD_index" in pred_df_file.columns.values), 'Error: No AD_index column in pred_df_file'
 
 if __name__ == '__main__':
     test()

From f7115a723a77cf321c7933bdcc5c4442e958f187 Mon Sep 17 00:00:00 2001
From: Amanda Paulson <amanda.paulson@ucsf.edu>
Date: Fri, 17 Jan 2025 13:26:49 -0800
Subject: [PATCH 4/6] add descriptor_type check for train_X and fix 'ecfp' typo

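---
Note (review): the hasattr() call added here for the train_X branch is
written with a single argument, which raises TypeError when executed
(hasattr() requires an object and an attribute name). The condition is
replaced by an explicit feat_type check in PATCH 5/6.
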
 atomsci/ddm/pipeline/model_pipeline.py                 | 2 +-
 atomsci/ddm/test/integrative/ad_index/test_ad_index.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py
index d4b4d201..5c1cb30f 100644
--- a/atomsci/ddm/pipeline/model_pipeline.py
+++ b/atomsci/ddm/pipeline/model_pipeline.py
@@ -930,7 +930,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
                         train_dset = dc.data.NumpyDataset(train_X)
                         self.featurized_train_data = self.model_wrapper.generate_embeddings(train_dset)
                     else:
-                        if self.featurization.descriptor_type=='mordred_filtered':
+                        if hasattr(self.featurization.descriptor_type) and self.featurization.descriptor_type=='mordred_filtered':
                             train_X = train_X[:,~np.isnan(train_X).all(axis=0)]
                             train_X = np.where(np.isnan(train_X), np.nanmean(train_X, axis=0), train_X)
                         self.featurized_train_data = train_X
diff --git a/atomsci/ddm/test/integrative/ad_index/test_ad_index.py b/atomsci/ddm/test/integrative/ad_index/test_ad_index.py
index b96bdfc9..443700b3 100644
--- a/atomsci/ddm/test/integrative/ad_index/test_ad_index.py
+++ b/atomsci/ddm/test/integrative/ad_index/test_ad_index.py
@@ -34,8 +34,8 @@ def test():
     hp_params["script_dir"] = script_dir
     hp_params["python_path"] = python_path
     
-    for feat in ['ECFP','mordred_filtered','rdkit_raw','graphconv']:
-        if feat in ['ECFP','graphconv']:
+    for feat in ['ecfp','mordred_filtered','rdkit_raw','graphconv']:
+        if feat in ['ecfp','graphconv']:
             hp_params['featurizer']=feat
         else:
             hp_params['featurizer']='computed_descriptors'

From edb9a7457646de7a6eafc9a184b638db5d58840e Mon Sep 17 00:00:00 2001
From: Amanda Paulson <amanda.paulson@ucsf.edu>
Date: Tue, 21 Jan 2025 14:54:48 -0800
Subject: [PATCH 5/6] change hasattr to check explicitly for feat_type

---
 atomsci/ddm/pipeline/model_pipeline.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py
index 5c1cb30f..9ac2aa33 100644
--- a/atomsci/ddm/pipeline/model_pipeline.py
+++ b/atomsci/ddm/pipeline/model_pipeline.py
@@ -903,7 +903,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
             else:
                 pred_data = copy.deepcopy(self.data.dataset.X)
                 
-            if hasattr(self.featurization.descriptor_type) and self.featurization.descriptor_type=='mordred_filtered':
+            if self.featurization.feat_type=="computed_descriptors" and self.featurization.descriptor_type=='mordred_filtered':
                 pred_data = pred_data[:,~np.isnan(pred_data).all(axis=0)]
                 pred_data = np.where(np.isnan(pred_data), np.nanmean(pred_data, axis=0), pred_data)
 
@@ -930,7 +930,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
                         train_dset = dc.data.NumpyDataset(train_X)
                         self.featurized_train_data = self.model_wrapper.generate_embeddings(train_dset)
                     else:
-                        if hasattr(self.featurization.descriptor_type) and self.featurization.descriptor_type=='mordred_filtered':
+                        if self.featurization.feat_type=="computed_descriptors" and self.featurization.descriptor_type=='mordred_filtered':
                             train_X = train_X[:,~np.isnan(train_X).all(axis=0)]
                             train_X = np.where(np.isnan(train_X), np.nanmean(train_X, axis=0), train_X)
                         self.featurized_train_data = train_X

From 3d818333dcaf4e70872b72852bd1ba9055f85ce6 Mon Sep 17 00:00:00 2001
From: Amanda Paulson <amanda.paulson@ucsf.edu>
Date: Tue, 21 Jan 2025 15:42:43 -0800
Subject: [PATCH 6/6] update test to work with graphconv nn

---
 atomsci/ddm/test/integrative/ad_index/test_ad_index.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/atomsci/ddm/test/integrative/ad_index/test_ad_index.py b/atomsci/ddm/test/integrative/ad_index/test_ad_index.py
index 443700b3..7774f31b 100644
--- a/atomsci/ddm/test/integrative/ad_index/test_ad_index.py
+++ b/atomsci/ddm/test/integrative/ad_index/test_ad_index.py
@@ -35,7 +35,10 @@ def test():
     hp_params["python_path"] = python_path
     
     for feat in ['ecfp','mordred_filtered','rdkit_raw','graphconv']:
-        if feat in ['ecfp','graphconv']:
+        if feat == 'ecfp':
+            hp_params['featurizer']=feat
+        elif feat =='graphconv':
+            hp_params['model_type']='NN'
             hp_params['featurizer']=feat
         else:
             hp_params['featurizer']='computed_descriptors'