From 3f8000342989b4ab86282de302f355f11f2be522 Mon Sep 17 00:00:00 2001 From: Amanda Paulson Date: Thu, 16 Jan 2025 16:42:43 -0800 Subject: [PATCH 1/6] exclude all-NaN columns from mordred features and impute the column mean for other NaNs to calculate AD index --- atomsci/ddm/pipeline/model_pipeline.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py index 7d49092a..2ae45c56 100644 --- a/atomsci/ddm/pipeline/model_pipeline.py +++ b/atomsci/ddm/pipeline/model_pipeline.py @@ -902,6 +902,10 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses= pred_data = self.predict_embedding(dset_df, dset_params=dset_params) else: pred_data = copy.deepcopy(self.data.dataset.X) + + if self.featurization.descriptor_type=='mordred_filtered': + pred_data = pred_data[:,~np.isnan(pred_data).all(axis=0)] + pred_data = np.where(np.isnan(pred_data), np.nanmean(pred_data, axis=0), pred_data) try: if not hasattr(self, 'featurized_train_data'): @@ -926,6 +930,9 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses= train_dset = dc.data.NumpyDataset(train_X) self.featurized_train_data = self.model_wrapper.generate_embeddings(train_dset) else: + if self.featurization.descriptor_type=='mordred_filtered': + train_X = train_X[:,~np.isnan(train_X).all(axis=0)] + train_X = np.where(np.isnan(train_X), np.nanmean(train_X, axis=0), train_X) self.featurized_train_data = train_X if not hasattr(self, "train_pair_dis") or not hasattr(self, "train_pair_dis_metric") or self.train_pair_dis_metric != dist_metric: @@ -933,7 +940,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses= self.train_pair_dis_metric = dist_metric self.log.debug("Calculating AD index.") - + if AD_method == "local_density": result_df["AD_index"] = calc_AD_kmean_local_density(self.featurized_train_data, pred_data, k, train_dset_pair_distance=self.train_pair_dis, dist_metric=dist_metric) else: From fe252c10d4f815d6389176482c8e1f8dd89bf008 Mon Sep 17 00:00:00 2001 From: Amanda Paulson Date: Fri, 17 Jan 2025 13:09:26 -0800 Subject: [PATCH 2/6] check for existence of descriptor_type --- atomsci/ddm/pipeline/model_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py index 2ae45c56..d4b4d201 100644 --- a/atomsci/ddm/pipeline/model_pipeline.py +++ b/atomsci/ddm/pipeline/model_pipeline.py @@ -903,7 +903,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses= else: pred_data = copy.deepcopy(self.data.dataset.X) - if self.featurization.descriptor_type=='mordred_filtered': + if hasattr(self.featurization.descriptor_type) and self.featurization.descriptor_type=='mordred_filtered': pred_data = pred_data[:,~np.isnan(pred_data).all(axis=0)] pred_data = np.where(np.isnan(pred_data), np.nanmean(pred_data, axis=0), pred_data) From 13e9cd41ec792ae540a956ebdfb6344c3e513b1f Mon Sep 17 00:00:00 2001 From: Amanda Paulson Date: Fri, 17 Jan 2025 13:18:55 -0800 Subject: [PATCH 3/6] add tests for each feature type --- .../integrative/ad_index/test_ad_index.py | 48 +++++++++++-------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/atomsci/ddm/test/integrative/ad_index/test_ad_index.py b/atomsci/ddm/test/integrative/ad_index/test_ad_index.py index cca6e9f4..b96bdfc9 100644 --- a/atomsci/ddm/test/integrative/ad_index/test_ad_index.py +++ b/atomsci/ddm/test/integrative/ad_index/test_ad_index.py @@ -18,7 +18,7 @@ def clean(): os.remove("./output/"+f) def test(): - """Test full model pipeline: Curate data, fit model, and predict property for new compounds""" + """Test AD index calculation: Curate data, fit model, and predict property for new compounds for each feature set""" # Clean # ----- @@ -33,31 +33,37 @@ def test(): python_path = sys.executable hp_params["script_dir"] = script_dir hp_params["python_path"] = python_path + + for feat in ['ECFP','mordred_filtered','rdkit_raw','graphconv']: + if feat in ['ECFP','graphconv']: + hp_params['featurizer']=feat + else: + hp_params['featurizer']='computed_descriptors' + hp_params['descriptor_type']=feat + params = parse.wrapper(hp_params) + if not os.path.isfile(params.dataset_key): + params.dataset_key = os.path.join(params.script_dir, params.dataset_key) - params = parse.wrapper(hp_params) - if not os.path.isfile(params.dataset_key): - params.dataset_key = os.path.join(params.script_dir, params.dataset_key) + train_df = pd.read_csv(params.dataset_key) - train_df = pd.read_csv(params.dataset_key) + print(f"Train an RF models with {feat}") + pl = mp.ModelPipeline(params) + pl.train_model() - print("Train a RF models with ECFP") - pl = mp.ModelPipeline(params) - pl.train_model() + print("Calculate AD index with the just trained model.") + pred_df_mp = pl.predict_on_dataframe(train_df[:10], contains_responses=True, AD_method="z_score") - print("Calculate AD index with the just trained model.") - pred_df_mp = pl.predict_on_dataframe(train_df[:10], contains_responses=True, AD_method="z_score") + assert("AD_index" in pred_df_mp.columns.values), 'Error: No AD_index column pred_df_mp' - assert("AD_index" in pred_df_mp.columns.values), 'Error: No AD_index column pred_df_mp' - - print("Calculate AD index with the saved model tarball file.") - pred_df_file = pfm.predict_from_model_file(model_path=pl.params.model_tarball_path, - input_df=train_df[:10], - id_col="compound_id", - smiles_col="base_rdkit_smiles", - response_col="pKi_mean", - dont_standardize=True, - AD_method="z_score") - assert("AD_index" in pred_df_file.columns.values), 'Error: No AD_index column in pred_df_file' + print("Calculate AD index with the saved model tarball file.") + pred_df_file = pfm.predict_from_model_file(model_path=pl.params.model_tarball_path, + input_df=train_df[:10], + id_col="compound_id", + smiles_col="base_rdkit_smiles", + response_col="pKi_mean", + dont_standardize=True, + AD_method="z_score") + assert("AD_index" in pred_df_file.columns.values), 'Error: No AD_index column in pred_df_file' if __name__ == '__main__': test() From f7115a723a77cf321c7933bdcc5c4442e958f187 Mon Sep 17 00:00:00 2001 From: Amanda Paulson Date: Fri, 17 Jan 2025 13:26:49 -0800 Subject: [PATCH 4/6] add second check and fix typo --- atomsci/ddm/pipeline/model_pipeline.py | 2 +- atomsci/ddm/test/integrative/ad_index/test_ad_index.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py index d4b4d201..5c1cb30f 100644 --- a/atomsci/ddm/pipeline/model_pipeline.py +++ b/atomsci/ddm/pipeline/model_pipeline.py @@ -930,7 +930,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses= train_dset = dc.data.NumpyDataset(train_X) self.featurized_train_data = self.model_wrapper.generate_embeddings(train_dset) else: - if self.featurization.descriptor_type=='mordred_filtered': + if hasattr(self.featurization.descriptor_type) and self.featurization.descriptor_type=='mordred_filtered': train_X = train_X[:,~np.isnan(train_X).all(axis=0)] train_X = np.where(np.isnan(train_X), np.nanmean(train_X, axis=0), train_X) self.featurized_train_data = train_X diff --git a/atomsci/ddm/test/integrative/ad_index/test_ad_index.py b/atomsci/ddm/test/integrative/ad_index/test_ad_index.py index b96bdfc9..443700b3 100644 --- a/atomsci/ddm/test/integrative/ad_index/test_ad_index.py +++ b/atomsci/ddm/test/integrative/ad_index/test_ad_index.py @@ -34,8 +34,8 @@ def test(): hp_params["script_dir"] = script_dir hp_params["python_path"] = python_path - for feat in ['ECFP','mordred_filtered','rdkit_raw','graphconv']: - if feat in ['ECFP','graphconv']: + for feat in ['ecfp','mordred_filtered','rdkit_raw','graphconv']: + if feat in ['ecfp','graphconv']: hp_params['featurizer']=feat else: hp_params['featurizer']='computed_descriptors' From edb9a7457646de7a6eafc9a184b638db5d58840e Mon Sep 17 00:00:00 2001 From: Amanda Paulson Date: Tue, 21 Jan 2025 14:54:48 -0800 Subject: [PATCH 5/6] change hasattr to check explicitly for feat_type --- atomsci/ddm/pipeline/model_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py index 5c1cb30f..9ac2aa33 100644 --- a/atomsci/ddm/pipeline/model_pipeline.py +++ b/atomsci/ddm/pipeline/model_pipeline.py @@ -903,7 +903,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses= else: pred_data = copy.deepcopy(self.data.dataset.X) - if hasattr(self.featurization.descriptor_type) and self.featurization.descriptor_type=='mordred_filtered': + if self.featurization.feat_type=="computed_descriptors" and self.featurization.descriptor_type=='mordred_filtered': pred_data = pred_data[:,~np.isnan(pred_data).all(axis=0)] pred_data = np.where(np.isnan(pred_data), np.nanmean(pred_data, axis=0), pred_data) @@ -930,7 +930,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses= train_dset = dc.data.NumpyDataset(train_X) self.featurized_train_data = self.model_wrapper.generate_embeddings(train_dset) else: - if hasattr(self.featurization.descriptor_type) and self.featurization.descriptor_type=='mordred_filtered': + if self.featurization.feat_type=="computed_descriptors" and self.featurization.descriptor_type=='mordred_filtered': train_X = train_X[:,~np.isnan(train_X).all(axis=0)] train_X = np.where(np.isnan(train_X), np.nanmean(train_X, axis=0), train_X) self.featurized_train_data = train_X From 3d818333dcaf4e70872b72852bd1ba9055f85ce6 Mon Sep 17 00:00:00 2001 From: Amanda Paulson Date: Tue, 21 Jan 2025 15:42:43 -0800 Subject: [PATCH 6/6] update test to work with graphconv nn --- atomsci/ddm/test/integrative/ad_index/test_ad_index.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/atomsci/ddm/test/integrative/ad_index/test_ad_index.py b/atomsci/ddm/test/integrative/ad_index/test_ad_index.py index 443700b3..7774f31b 100644 --- a/atomsci/ddm/test/integrative/ad_index/test_ad_index.py +++ b/atomsci/ddm/test/integrative/ad_index/test_ad_index.py @@ -35,7 +35,10 @@ def test(): hp_params["python_path"] = python_path for feat in ['ecfp','mordred_filtered','rdkit_raw','graphconv']: - if feat in ['ecfp','graphconv']: + if feat == 'ecfp': + hp_params['featurizer']=feat + elif feat =='graphconv': + hp_params['model_type']='NN' hp_params['featurizer']=feat else: hp_params['featurizer']='computed_descriptors'