From 49457ec6fd14981d32bca9a2add2b211c79a7837 Mon Sep 17 00:00:00 2001 From: Yunhai Luo Date: Thu, 18 Jul 2019 13:25:17 -0700 Subject: [PATCH] Fix for missing biospecimen in one TCGA-BRCA case --- xena_gdc_etl/xena_dataset.py | 47 +++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/xena_gdc_etl/xena_dataset.py b/xena_gdc_etl/xena_dataset.py index 60622e8..4d30bbf 100644 --- a/xena_gdc_etl/xena_dataset.py +++ b/xena_gdc_etl/xena_dataset.py @@ -2267,26 +2267,19 @@ def transform(self): .replace(r'\r\n', ' ', regex=True) .replace(r'^\s*$', np.nan, regex=True) .dropna(axis=1, how='all') + .rename(columns={ + 'bcr_sample_barcode': 'submitter_id.samples', + 'bcr_patient_barcode': 'submitter_id', + }) ) clin_matrix = ( pd.concat(clin_dfs, axis=0) .replace(r'\r\n', ' ', regex=True) .replace(r'^\s*$', np.nan, regex=True) .dropna(axis=1, how='all') - ) - bio_columns = bio_matrix.columns.difference( - clin_matrix.columns - ).insert(0, 'bcr_patient_barcode') - xena_matrix = ( - pd.merge( - clin_matrix, - bio_matrix[bio_columns], - how='outer', - on='bcr_patient_barcode', - ) - .replace(r'^\s*$', np.nan, regex=True) - .set_index('bcr_sample_barcode') - .fillna(bio_matrix.set_index('bcr_sample_barcode')) + .rename(columns={ + 'bcr_patient_barcode': 'submitter_id', + }) ) # Query GDC API for GDC harmonized phenotype info api_clin = gdc.get_samples_clinical(self.projects) @@ -2316,22 +2309,36 @@ def transform(self): pass for c in self._RAW_DROPS: try: - xena_matrix.drop(c, axis=1, inplace=True) + clin_matrix.drop(c, axis=1, inplace=True) + except Exception: + pass + try: + bio_matrix.drop(c, axis=1, inplace=True) except Exception: pass - # Merge phenotype matrix from raw data and that from GDC's API - xena_matrix = xena_matrix.reset_index().rename( - columns={'bcr_sample_barcode': 'submitter_id.samples'} + # Merge phenotype matrices from raw data and that from GDC's API + bio_columns = bio_matrix.columns.difference( + clin_matrix.columns + ).insert(0, 'submitter_id') + xena_matrix = ( + pd.merge( + bio_matrix[bio_columns], + api_clin.reset_index(), + how='outer', + on=['submitter_id.samples', 'submitter_id'], + ) + .replace(r'^\s*$', np.nan, regex=True) ) xena_matrix = ( pd.merge( + clin_matrix, xena_matrix, - api_clin.reset_index(), how='outer', - on='submitter_id.samples', + on='submitter_id', ) .replace(r'^\s*$', np.nan, regex=True) .set_index('submitter_id.samples') + .fillna(bio_matrix.set_index('submitter_id.samples')) ) print('Dropping TCGA-**-****-**Z samples ...') xena_matrix = xena_matrix[~xena_matrix.index.str.endswith('Z')]