Skip to content

Commit

Permalink
Fix for missing biospecimen in one TCGA-BRCA case
Browse files Browse the repository at this point in the history
  • Loading branch information
yunhailuo committed Jul 18, 2019
1 parent 25e990b commit 49457ec
Showing 1 changed file with 27 additions and 20 deletions.
47 changes: 27 additions & 20 deletions xena_gdc_etl/xena_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2267,26 +2267,19 @@ def transform(self):
.replace(r'\r\n', ' ', regex=True)
.replace(r'^\s*$', np.nan, regex=True)
.dropna(axis=1, how='all')
.rename(columns={
'bcr_sample_barcode': 'submitter_id.samples',
'bcr_patient_barcode': 'submitter_id',
})
)
clin_matrix = (
pd.concat(clin_dfs, axis=0)
.replace(r'\r\n', ' ', regex=True)
.replace(r'^\s*$', np.nan, regex=True)
.dropna(axis=1, how='all')
)
bio_columns = bio_matrix.columns.difference(
clin_matrix.columns
).insert(0, 'bcr_patient_barcode')
xena_matrix = (
pd.merge(
clin_matrix,
bio_matrix[bio_columns],
how='outer',
on='bcr_patient_barcode',
)
.replace(r'^\s*$', np.nan, regex=True)
.set_index('bcr_sample_barcode')
.fillna(bio_matrix.set_index('bcr_sample_barcode'))
.rename(columns={
'bcr_patient_barcode': 'submitter_id',
})
)
# Query GDC API for GDC harmonized phenotype info
api_clin = gdc.get_samples_clinical(self.projects)
Expand Down Expand Up @@ -2316,22 +2309,36 @@ def transform(self):
pass
for c in self._RAW_DROPS:
try:
xena_matrix.drop(c, axis=1, inplace=True)
clin_matrix.drop(c, axis=1, inplace=True)
except Exception:
pass
try:
bio_matrix.drop(c, axis=1, inplace=True)
except Exception:
pass
# Merge phenotype matrix from raw data and that from GDC's API
xena_matrix = xena_matrix.reset_index().rename(
columns={'bcr_sample_barcode': 'submitter_id.samples'}
# Merge phenotype matrices from raw data and that from GDC's API
bio_columns = bio_matrix.columns.difference(
clin_matrix.columns
).insert(0, 'submitter_id')
xena_matrix = (
pd.merge(
bio_matrix[bio_columns],
api_clin.reset_index(),
how='outer',
on=['submitter_id.samples', 'submitter_id'],
)
.replace(r'^\s*$', np.nan, regex=True)
)
xena_matrix = (
pd.merge(
clin_matrix,
xena_matrix,
api_clin.reset_index(),
how='outer',
on='submitter_id.samples',
on='submitter_id',
)
.replace(r'^\s*$', np.nan, regex=True)
.set_index('submitter_id.samples')
.fillna(bio_matrix.set_index('submitter_id.samples'))
)
print('Dropping TCGA-**-****-**Z samples ...')
xena_matrix = xena_matrix[~xena_matrix.index.str.endswith('Z')]
Expand Down

0 comments on commit 49457ec

Please sign in to comment.