Merge pull request #192 from PNNL-CompBio/drug_descrip
Drug descriptor addition to pipeline, other fixes included
sgosline authored Jul 3, 2024
2 parents df7702c + 1b4c201 commit ce5afeb
Showing 32 changed files with 2,155 additions and 1,579 deletions.
1,612 changes: 171 additions & 1,441 deletions .github/workflows/build.yml

Large diffs are not rendered by default.

1,587 changes: 1,587 additions & 0 deletions .github/workflows/old_ci

Large diffs are not rendered by default.

20 changes: 12 additions & 8 deletions build/beatAML/GetBeatAML.py
@@ -409,7 +409,7 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
     # Process each dataframe based on its data_type
     if data_type == "transcriptomics":
         df['Gene'] = df['Gene'].str.replace(r'\.\d+$', '', regex=True)
-        mapped_df = df.merge(genes, left_on='Gene', right_on='gene_symbol', how='left').reindex(
+        mapped_df = df.merge(genes, left_on='Gene', right_on='other_id', how='left').reindex(
             columns=['transcriptomics', 'entrez_id', "sample_id","Gene"])
         mapped_df = mapped_df.merge(mapped_ids[['dbgap_rnaseq_sample', 'labId']],
                                     left_on='sample_id',
@@ -631,8 +631,8 @@ def generate_drug_list(drug_map_path,drug_path):
         exit()
     else:
         improve_map_file = args.curSamples
-    transcriptomics_file = "beataml_waves1to4_norm_exp_dbgap.txt"
-    transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
+    transcriptomics_file = "beataml_waves1to4_counts_dbgap.txt" #"beataml_waves1to4_norm_exp_dbgap.txt" ##this was the wrong file; those are the normalized values
+    transcriptomics_url = "https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_counts_dbgap.txt" #"https://github.com/biodev/beataml2.0_data/raw/main/beataml_waves1to4_norm_exp_dbgap.txt"
     download_from_github(transcriptomics_url, transcriptomics_file)
 
     mutations_file = "beataml_wes_wv1to4_mutations_dbgap.txt"
@@ -644,15 +644,19 @@ def generate_drug_list(drug_map_path,drug_path):
     download_from_github(mutation_map_url, mutation_map_file)
     # New Transcriptomics Data
     print("Starting Transcriptomics Data")
-    t_df = pd.read_csv(transcriptomics_file, sep = '\t')
-    t_df.index = t_df.display_label
-    t_df = t_df.iloc[:, 4:]
-    t_df = t_df.reset_index().rename(columns={'display_label': 'Gene'})
+    ##first run conversion tool
+    os.system("python tpmFromCounts.py --counts "+transcriptomics_file)
+
+
+    t_df = pd.read_csv('tpm_'+transcriptomics_file, sep = '\t')
+    # t_df.index = t_df.stable_id #display_label
+    # t_df = t_df.iloc[:, 4:]
+    t_df = t_df.reset_index().rename(columns={'stable_id': 'Gene'})
     t_df = pd.melt(t_df, id_vars=['Gene'], var_name='sample_id', value_name='transcriptomics')
     print(improve_map_file)
     t_df = map_and_combine(t_df, "transcriptomics", args.genes, improve_map_file, sample_mapping_file)
     t_df = t_df[t_df.entrez_id.notna()]
-    t_df = t_df[["improve_sample_id","transcriptomics","entrez_id","source","study"]]
+    t_df = t_df[["improve_sample_id","transcriptomics","entrez_id","source","study"]].drop_duplicates()
     t_df.to_csv("/tmp/beataml_transcriptomics.csv.gz",index=False,compression='gzip')
 
     # New Proteomics Data
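The rewritten block above shells out to `tpmFromCounts.py` (newly copied into the image, per the Dockerfile.beataml change below) to convert raw counts to TPM before melting and mapping. A minimal sketch of such a converter, assuming the standard counts-to-TPM formula and a hypothetical `gene_length` metadata column; the real `build/utils/tpmFromCounts.py` may use different column names:

```python
# Illustrative counts-to-TPM conversion, not the verbatim build/utils/tpmFromCounts.py.
# TPM = rate / sum(rate) * 1e6, where rate = count / gene length in kilobases.
import argparse
import pandas as pd

def counts_to_tpm(counts: pd.DataFrame, lengths_kb: pd.Series) -> pd.DataFrame:
    rate = counts.div(lengths_kb, axis=0)             # reads per kilobase, per gene
    return rate.div(rate.sum(axis=0), axis=1) * 1e6   # scale each sample to 1M

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--counts', required=True)
    args = parser.parse_args()
    df = pd.read_csv(args.counts, sep='\t')
    # Assume the leading columns are gene metadata, including a (hypothetical)
    # 'gene_length' column in base pairs; the remaining columns are per-sample counts.
    meta, expr = df.iloc[:, :4], df.iloc[:, 4:]
    tpm = counts_to_tpm(expr, meta['gene_length'] / 1000.0)
    pd.concat([meta, tpm], axis=1).to_csv('tpm_' + args.counts, sep='\t', index=False)
```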
64 changes: 54 additions & 10 deletions build/beatAML/README.md
@@ -1,24 +1,68 @@
 ## BeatAML Data generation
 
-This directory builds the data for the BeatAML samples
+This directory builds the data for the BeatAML samples. To build and
+test this module, run the following commands from the root directory.
 
-### Sample generation
+## Build with test data
+Build commands should be similar to every other coderdata build
+module.
 
-To generate samples, you need to pass in the path of the previous
-sample file.
-
-```
-python GetBeatAML.py --token $SYNAPSE_AUTH_TOKEN --prevSamples=[path to previous samples]
-```
-
-### Drug generation
-
-How are the drugs generated???
-
-### Omics and Experiment Data
-
-```
-python GetBeatAML.py --token $SYNAPSE_AUTH_TOKEN --curSamples=[path togenerated sample file]
-```
+### Build gene table
+First we need to build the gene table.
+
+1. Build genes docker
+```
+docker build -f build/docker/Dockerfile.genes -t genes . --build-arg HTTPS_PROXY=$HTTPS_PROXY
+```
+
+2. Build gene file
+```
+docker run -v $PWD:/tmp genes sh build_genes.sh
+```
+
+### Build AML data
+1. Build the Docker image:
+```
+docker build -f build/docker/Dockerfile.beataml -t beataml . --build-arg HTTPS_PROXY=$HTTPS_PROXY
+```
+
+2. Generate new identifiers for these samples to create a
+`beataml_samples.csv` file. This pulls from the latest synapse
+project metadata table.
+```
+docker run -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN -v $PWD:/tmp beataml sh build_samples.sh /tmp/build/build_test/test_samples.csv
+```
+
+3. Pull the data and map it to the samples. This uses the metadata
+table pulled above.
+```
+docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN beataml sh build_omics.sh /tmp/build/build_test/test_genes.csv /tmp/beataml_samples.csv
+```
+
+4. Process drug data
+```
+docker run -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN -v $PWD:/tmp beataml sh build_drugs.sh /tmp/build/build_test/test_drugs.tsv
+```
+
+5. Process experiment data. This uses the metadata from above as well as the file metadata on synapse:
+```
+docker run -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN -v $PWD:/tmp beataml sh build_exp.sh /tmp/beataml_samples.csv /tmp/beataml_drugs.tsv.gz
+```
+
+Please ensure that each step is followed in order for correct dataset
+compilation.
+
+### BeatAML Dataset structure
+The build commands above create the following files in the local directory:
+```
+├── beataml_samples.csv.gz
+├── beataml_transcriptomics.csv.gz
+├── beataml_mutations.csv.gz
+├── beataml_proteomics.csv.gz
+├── beataml_drugs.tsv.gz
+├── beataml_drug_descriptors.tsv.gz
+├── beataml_experiments.tsv.gz
+```
1 change: 1 addition & 0 deletions build/beatAML/build_drugs.sh
@@ -1 +1,2 @@
 python GetBeatAML.py --token $SYNAPSE_AUTH_TOKEN --drugs --drugFile $1
+python build_drug_desc.py --drugtable /tmp/beataml_drugs.tsv --desctable /tmp/beataml_drug_descriptors.tsv.gz
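`build_drug_desc.py` is the new step that produces the `*_drug_descriptors.tsv.gz` tables. Judging from the `mordredcommunity` and `rdkit` dependencies added to the requirements files below, it computes structure-based descriptors from SMILES; a plausible sketch follows, with the `canSMILES` column name and the wide output layout as assumptions rather than the script's confirmed interface:

```python
# Hypothetical sketch of Mordred/RDKit descriptor generation; the real
# build/utils/build_drug_desc.py may differ in column names and output shape.
import argparse
import pandas as pd
from rdkit import Chem
from mordred import Calculator, descriptors  # mordredcommunity keeps the 'mordred' import name

def build_descriptors(drug_table: str, desc_table: str) -> None:
    drugs = pd.read_csv(drug_table, sep='\t')
    smiles = drugs['canSMILES'].dropna().unique()        # assumed column name
    mols = {s: Chem.MolFromSmiles(s) for s in smiles}
    valid = [s for s, m in mols.items() if m is not None]
    calc = Calculator(descriptors, ignore_3D=True)       # ~1600 2D descriptors
    desc = calc.pandas([mols[s] for s in valid])
    desc.insert(0, 'smiles', valid)
    desc.to_csv(desc_table, sep='\t', index=False, compression='gzip')

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--drugtable', required=True)
    parser.add_argument('--desctable', required=True)
    args = parser.parse_args()
    build_descriptors(args.drugtable, args.desctable)
```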
4 changes: 3 additions & 1 deletion build/beatAML/requirements.txt
@@ -3,7 +3,9 @@ wget==3.2
 requests
 synapseclient
 argparse
-numpy
+numpy==1.26.4
 openpyxl
 matplotlib
 scikit-learn
+mordredcommunity
+rdkit
2 changes: 1 addition & 1 deletion build/broad_sanger/01-broadSangerSamples.R
@@ -127,5 +127,5 @@ long.df<-full.df%>%
   subset(other_id!="")
 
 
-write.table(long.df,'/tmp/broad_sanger_samples.csv',sep=',',row.names=F,col.names=T)
+readr::write_csv(long.df,'/tmp/broad_sanger_samples.csv',quote='needed')

90 changes: 52 additions & 38 deletions build/broad_sanger/README.md
@@ -1,56 +1,75 @@
 ## Building Broad and Sanger cell line data
 The Broad and Sanger data is the first to be built, and requires the
 following commands. All scripts write files in to the `/tmp/`
-directory, so mounting to that directly will help output the files
+directory, so mounting to that directly will help output the files. We
+broke the Docker image into two to reduce overall size and complexity
+of each image.
 
 
-### Docker image and gene file
-First step is to build the docker file and the genes.csv file. This is
-required for all future data files.
-```
-docker build -f ../../build/docker/Dockerfile.broad_sanger -t broad_sanger ../../
-docker run -v $PWD:/tmp/ broad_sanger Rscript 00-buildGeneFile.R
-```
+### Build gene table
+First we need to build the gene table.
+
+1. Build genes docker
+```
+docker build -f build/docker/Dockerfile.genes -t genes . --build-arg HTTPS_PROXY=$HTTPS_PROXY
+```
 
-### DepMap reference samples and identifiers
-Next we retrieve all the standard cell line identifiers we can, from diverse
-sources, and map them to IMPROVE sample identifiers for future reference.
-```
-docker run -v $PWD:/tmp/ broad_sanger Rscript 01-broadSangerSamples.R
-```
+2. Build gene file
+```
+docker run -v $PWD:/tmp genes sh build_genes.sh
+```
 
-### Omics data for Broad/Sanger cell lines
-Third we collect the omics data for these cell lines, again from
-diverse sources. Currently we have a single script for each
-source. Each script takes our list of gene and sample identifiers
-```
-docker run -v $PWD:/tmp/ broad_sanger Rscript 02-broadSangerOmics.R /tmp/genes.csv /tmp/broad_sanger_samples.csv
-```
+### Build sample and omics data
+Below are the steps required to build and test the gene/sample/omics
+builds. Commands are designed to be run from the root of the repo.
+
+1. Build omics docker
+```
+docker build -f build/docker/Dockerfile.broad_sanger_omics -t broad_sanger_omics . --build-arg HTTPS_PROXY=$HTTPS_PROXY
+```
+2. Build sample file
+```
+docker run -v $PWD:/tmp broad_sanger_omics sh build_samples.sh
+```
+3. Build omics files
+```
+docker run -v $PWD:/tmp broad_sanger_omics sh build_omics.sh
+```
+This should leave you with the following files for all the cell lines
+```
+├── broad_sanger_samples.csv.gz
+├── broad_sanger_transcriptomics.csv.gz
+├── broad_sanger_mutations.csv.gz
+├── broad_sanger_copy_number.csv.gz
+├── genes.csv
+```
 
-### Drug data for all experiments
-
-Fourth we collect drugs and map them to pubchem, then structure. This
-is a slow step as we collect from diverse studies including:
-1. CTRPv2
-2. GDSCv1
-3. GDSCv2
-4. gCSI
-5. PRISM2020
-6. CCLE
-7. FIMM
-8. NCI60
-
-```
-docker run -v $PWD:/tmp broad_sanger Rscript 03-createDrugFile.R CTRPv2,GDSC,gCSI,PRISM,CCLE,FIMM,NCI60
-```
-### Dose response and curve fitting
-This last command will generate the drug file, drugs.tsv.gz, which we
-can pass into the next commands. Then we will collect the dose
-response data and fit the curves for the following experiments:
+### Build out drug files and experiments
+Both of these steps can be lengthy - the experiment fitting can be
+parallelized but the drug information requires querying PubChem which
+can take a while.
+
+1. Build experiment docker file
+```
+docker build -f build/docker/Dockerfile.broad_sanger_exp -t broad_sanger_exp . --build-arg HTTPS_PROXY=$HTTPS_PROXY
+```
+2. Build drug files
+```
+docker run -v $PWD:/tmp broad_sanger_exp sh build_drugs.sh /tmp/build/build_test/test_drugs.tsv
+```
+3. Build experiment files
+```
+docker run -v $PWD:/tmp broad_sanger_exp sh build_exp.sh /tmp/broad_sanger_samples.csv /tmp/broad_sanger_drugs.tsv.gz
+```
 
+### Datasets collected
+
+Fourth we collect drugs and map them to pubchem, then structure. This
+is a slow step as we collect from diverse studies including:
 1. CTRPv2
 2. GDSCv1
 3. GDSCv2
@@ -60,9 +79,4 @@ response data and fit the curves for the following experiments:
 7. FIMM
 8. NCI60
 
-```
-docker run -v $PWD:/tmp/ broad_sanger /opt/venv/bin/python 04-drug_dosage_and_curves.py --drugfile=/tmp/broad_sanger_drugs.tsv --curSampleFile=/tmp/broad_sanger_samples.csv
-```
-
-
 
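`build_exp.sh` wraps the dose-response fitting (`build/utils/fit_curve.py` in the experiment image). As a rough illustration of the kind of curve fit involved, here is a Hill-style fit for a single drug/sample series; the actual model, parameterization, and reported metrics in `fit_curve.py` may differ:

```python
# Illustrative Hill-curve fit on made-up data; fit_curve.py's actual
# model and outputs (e.g. IC50, AUC definitions) may differ.
import numpy as np
from scipy.optimize import curve_fit

def hill(dose, einf, ec50, hs):
    """Viability vs. dose: ~1 at low dose, einf at saturation."""
    return einf + (1.0 - einf) / (1.0 + (dose / ec50) ** hs)

doses = np.array([0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0])         # uM, example values
viability = np.array([0.98, 0.95, 0.83, 0.60, 0.35, 0.22, 0.18])

popt, _ = curve_fit(hill, doses, viability, p0=[0.1, 1.0, 1.0],
                    bounds=([0.0, 1e-4, 0.1], [1.0, 1e4, 10.0]), maxfev=10000)
einf, ec50, hs = popt
# Mean fitted viability over the tested log-dose range: a crude AUC stand-in.
auc = float(np.mean(hill(np.logspace(-2, 1, 50), *popt)))
print(f"Einf={einf:.2f}  EC50={ec50:.2f} uM  Hill slope={hs:.2f}  AUC~{auc:.2f}")
```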
1 change: 1 addition & 0 deletions build/broad_sanger/build_drugs.sh
@@ -1,2 +1,3 @@
 /opt/venv/bin/python 03a-nci60Drugs.py
 Rscript 03-createDrugFile.R CTRPv2,GDSC,gCSI,PRISM,CCLE,FIMM
+/opt/venv/bin/python build_drug_desc.py --drugtable /tmp/broad_sanger_drugs.tsv --desctable /tmp/broad_sanger_drug_descriptors.tsv.gz
4 changes: 3 additions & 1 deletion build/broad_sanger/requirements.txt
@@ -1,10 +1,12 @@
 pandas
 matplotlib
-numpy
+numpy==1.26.4
 argparse
 tqdm
 scikit-learn
 scipy
 requests
 openpyxl
 polars
+mordredcommunity
+rdkit
31 changes: 15 additions & 16 deletions build/build_all.py
@@ -91,22 +91,23 @@ def process_drugs(executor, datasets):
     dflist = []
 
     # WE NEED A METHOD TO CONFIRM THAT DRUG FILES ARE NOT INCOMPLETE
+    ##THIS IS BUILT IN- always rerun drug code to check
     # Check for existing files and update dflist with processed files
     for da in datasets:
         if da not in ['cptac', 'hcmi']:
             file_path = f'local/{da}_drugs.tsv'
-            if os.path.exists(file_path):
-                dflist.append(f'/tmp/{da}_drugs.tsv')  # Add to dflist if already processed
+            desc_path = f'local/{da}_drug_descriptor.tsv.gz'
+            #if os.path.exists(file_path): ##always rerun drug process
+            #    dflist.append(f'/tmp/{da}_drugs.tsv')  # Add to dflist if already processed
 
     for da in datasets:
         if da not in ['cptac', 'hcmi']:
             di = 'broad_sanger_exp' if da == 'broad_sanger' else da
-            if not os.path.exists(f'local/{da}_drugs.tsv'):
-                if last_drug_future:
-                    last_drug_future.result()  # Ensure the last drug process is completed before starting the next
-                last_drug_future = executor.submit(run_docker_cmd, [di, 'sh', 'build_drugs.sh', ','.join(dflist)], f'{da} drugs')
+            #if not os.path.exists(f'local/{da}_drugs.tsv'):
+            if last_drug_future:
+                last_drug_future.result()  # Ensure the last drug process is completed before starting the next
+            last_drug_future = executor.submit(run_docker_cmd, [di, 'sh', 'build_drugs.sh', ','.join(dflist)], f'{da} drugs')
             dflist.append(f'/tmp/{da}_drugs.tsv')
 
 def process_samples(executor, datasets):
     '''
@@ -224,8 +225,7 @@ def compress_file(file_path):
     # Ouput is logged at local/docker.log
     if args.docker or args.all:
         process_docker()
-
-        print("Docker image generation completed")
+    print("Docker image generation completed")
 
 
     ### Build Drugs files, Samples files, and Genes file. These two steps are run in Parallel.
@@ -241,12 +241,12 @@ def compress_file(file_path):
     # Wait for both processes to complete before proceeding to omics and experiments
     if args.drugs or args.all:
         drug_thread.result()
-    if args.samples or args.all:
+    if args.samples or args.all: ##need to wait for samples for all of these
        sample_thread.result()
     if args.samples or args.omics or args.exp or args.all:
         gene_thread.result()
 
-    print("All samples, drugs files, and genes file completed")
+    print("All samples, drugs files, and genes file completed or skipped")
 
 
     ### At this point in the pipeline, all samples and drugs files have been created. There are no blockers to proceed.
@@ -261,11 +261,10 @@ def compress_file(file_path):
 
     if args.omics or args.all:
         omics_thread.result()
+        print("All omics files completed")
     if args.exp or args.all:
         exp_thread.result()
-
-
-    print("All omics and experiments files completed")
+        print("All experiments files completed")
 
     ######
     ### Begin Upload
@@ -338,4 +337,4 @@ def compress_file(file_path):
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
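The `process_drugs` change above drops the file-existence check so drug files are always rebuilt, while keeping the builds strictly sequential: each dataset's `build_drugs.sh` receives the comma-joined list of drug files built so far (keeping drug identifiers consistent across datasets), and each submission waits on the previous future. A condensed, self-contained sketch of that pattern; the dataset list and the stubbed `run_docker_cmd` are illustrative stand-ins for the real helpers in `build_all.py`:

```python
# Condensed sketch of the sequential drug-build chaining used in build_all.py.
from concurrent.futures import ThreadPoolExecutor

def run_docker_cmd(cmd, name):
    # Stand-in: the real helper shells out to `docker run` and logs output.
    print(f"[{name}] docker {' '.join(cmd)}")

def process_drugs(executor, datasets):
    last_drug_future = None
    dflist = []  # drug files built so far, passed to each later build
    for da in datasets:
        if da not in ['cptac', 'hcmi']:  # datasets without drug data
            di = 'broad_sanger_exp' if da == 'broad_sanger' else da
            if last_drug_future:
                last_drug_future.result()  # serialize: wait for the previous build
            last_drug_future = executor.submit(
                run_docker_cmd,
                [di, 'sh', 'build_drugs.sh', ','.join(dflist)], f'{da} drugs')
            dflist.append(f'/tmp/{da}_drugs.tsv')
    if last_drug_future:
        last_drug_future.result()  # drain the final build before returning

with ThreadPoolExecutor() as executor:
    process_drugs(executor, ['broad_sanger', 'beataml'])  # example dataset list
```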
2 changes: 2 additions & 0 deletions build/docker/Dockerfile.beataml
@@ -5,6 +5,8 @@ WORKDIR /usr/src/app
 
 COPY build/beatAML/GetBeatAML.py .
 COPY build/utils/fit_curve.py .
+COPY build/utils/build_drug_desc.py .
+COPY build/utils/tpmFromCounts.py .
 COPY build/beatAML/*sh ./
 COPY build/beatAML/requirements.txt .
 
2 changes: 1 addition & 1 deletion build/docker/Dockerfile.broad_sanger_exp
@@ -1,4 +1,4 @@
-FROM r-base:4.2.1
+FROM r-base:4.4.1
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update --fix-missing
 #RUN apt-get install -y --fix-missing --allow-unauthenticated build-essential libpq-dev python3.10 python3-pip python3-setuptools python3-dev python3-venv libcurl4-openssl-dev libxml2-dev libglpk-dev
3 changes: 1 addition & 2 deletions build/docker/Dockerfile.broad_sanger_omics
@@ -1,7 +1,6 @@
-FROM r-base:4.2.1
+FROM r-base:4.4.1
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update --fix-missing
-#RUN apt-get install -y --fix-missing --allow-unauthenticated build-essential libpq-dev python3.10 python3-pip python3-setuptools python3-dev python3-venv libcurl4-openssl-dev libxml2-dev libglpk-dev
 
 RUN apt-get install -y --fix-missing --allow-unauthenticated build-essential python3-pip python3-setuptools python3-dev python3-venv libcurl4-openssl-dev libglpk-dev libxml2-dev libpq-dev
 
8 changes: 3 additions & 5 deletions build/docker/Dockerfile.genes
@@ -1,10 +1,8 @@
 #FROM bioconductor/bioconductor_docker
-FROM r-base:4.2.1
+FROM r-base:4.4.1
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update --fix-missing
-RUN apt-get install -y --fix-missing --allow-unauthenticated build-essential libpq-dev libcurl4-openssl-dev
-RUN apt-get install -y --fix-missing libxml2-dev
-#libxml2 libglpk-dev
+RUN apt-get install -y --fix-missing --allow-unauthenticated build-essential python3-pip python3-setuptools python3-dev python3-venv libcurl4-openssl-dev libglpk-dev libxml2-dev libpq-dev
 
 
 WORKDIR /app
 