-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPertussis_challenge_4.1.Rmd
433 lines (363 loc) · 15.1 KB
/
Pertussis_challenge_4.1.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
---
title: "Pertussis Challenge 4.1_+ge"
author: "Runqi Zhang"
date: "`r Sys.Date()`"
output:
pdf_document:
latex_engine: xelatex
keep_tex: true
number_sections: true
---
```{r package}
# Load required libraries
suppressPackageStartupMessages({
library(dplyr)
library(ggplot2)
library(tidyr)
library(tidyverse)
library(readr)
library(kableExtra)
library(glmnet)
library(here)
library(knitr)
library(GSVA)
library(agua)
library(tibble) # For handling row names
library(impute)
#source(here("./scripts/codebase.R"))
})
# Define working directory and start H2O
workDir <- "C:/Users/zhang/Desktop/cmi-pb-3rd-final/Runqi/CMI-PB"
agua::h2o_start()
options(readr.show_col_types = FALSE)
```
```{r load ge}
read_data <- function(year, type = "LD") {
list(
pts = read_tsv(file.path(workDir, paste0("data/", year, type, "_subject.tsv"))),
sample = read_tsv(file.path(workDir, paste0("data/", year, type, "_specimen.tsv"))),
ge = read_tsv(file.path(workDir, paste0("data/", year, type, "_pbmc_gene_expression.tsv")))
)
}
# Load datasets for 2020–2023
data2020 <- read_data("2020")
data2021 <- read_data("2021")
data2022 <- read_data("2022")
data2023 <- read_data("2023", type = "BD")
```
```{r target gene entry}
# List of base gene IDs (genes of interest)
target_genes <- c(
#"ENSG00000113525.9", #IL5
"ENSG00000107485.15", #GATA3
#"ENSG00000136574.17", #GATA4
"ENSG00000102145.13", #GATA1
#"ENSG00000130700.6", #GATA5
#"ENSG00000141448.8", #GATA6
"ENSG00000179348.11", #GATA2
"ENSG00000220201.7", #ZGLP1
"ENSG00000104447.12", #TRPS1
"ENSG00000071564.14", #TCF3
"ENSG00000196628.15", #TCF4
"ENSG00000140262.17", #TCF12
#"ENSG00000112499.12", #SLC22A2
"ENSG00000172216.5", #CEBPB
"ENSG00000185591.9", #SP1
"ENSG00000167182.14", #SP2
"ENSG00000172845.14", #SP3
"ENSG00000105866.13" #SP4
)
```
to be tested
"ENSG00000111537.4", #IFNG
"ENSG00000105829.11", #BET1
"ENSG00000131196.17", #NFATC1
"ENSG00000101096.19", #NFATC2
"ENSG00000072736.18", #NFATC3
#"ENSG00000100968.13", #NFATC4
"ENSG00000102908.20", #NFAT5
"ENSG00000109320.11", #NFKB1
"ENSG00000077150.18", #NFKB2
"ENSG00000162924.13", #REL
"ENSG00000100811.12", #YY1
#"ENSG00000230797.2", #YY2
#"ENSG00000179059.9", #ZFP42
"ENSG00000115415.18", #STAT1
"ENSG00000170581.13", #STAT2
"ENSG00000168610.14", #STAT3
"ENSG00000138378.17", #STAT4
"ENSG00000126561.16", #STAT5A
"ENSG00000166888.11", #STAT6
"ENSG00000173757.9" #STAT5B
```{r match gene emselble version id}
# Function to extract day 0 TPM values
extract_day0_tpm <- function(ge_data, sample_data, pts_data, target_genes) {
ge_data %>%
filter(versioned_ensembl_gene_id %in% target_genes) %>%
inner_join(sample_data, by = "specimen_id") %>%
inner_join(pts_data, by = "subject_id") %>%
filter(planned_day_relative_to_boost == 0) %>%
dplyr::select(subject_id, versioned_ensembl_gene_id, tpm) %>%
pivot_wider(
names_from = versioned_ensembl_gene_id,
values_from = tpm,
names_prefix = "tpm_"
) %>%
replace(is.na(.), 0) %>% # Replace NA with 0
column_to_rownames("subject_id") # Use subject_id as rownames
}
# Extract Day 0 TPM values for 2021, 2022, and 2023
x21_gene_expr <- extract_day0_tpm(data2021$ge, data2021$sample, data2021$pts, target_genes)
x22_gene_expr <- extract_day0_tpm(data2022$ge, data2022$sample, data2022$pts, target_genes)
x23_gene_expr <- extract_day0_tpm(data2023$ge, data2023$sample, data2023$pts, target_genes)
```
```{r task4 t-cell activation data loading}
# Load data for 2021
pts2021DF <- read_tsv(file.path(workDir, "data/2021LD_subject.tsv"))
sample2021DF <- read_tsv(file.path(workDir, "data/2021LD_specimen.tsv"))
tcp2021DF <- read_tsv(file.path(workDir, "data/2021LD_t_cell_polarization.tsv"))
tca2021DF <- read_tsv(file.path(workDir, "data/2021LD_t_cell_activation.tsv"))
# Load data for 2022
pts2022DF <- read_tsv(file.path(workDir, "data/2022LD_subject.tsv"))
sample2022DF <- read_tsv(file.path(workDir, "data/2022LD_specimen.tsv"))
tcp2022DF <- read_tsv(file.path(workDir, "data/2022LD_t_cell_polarization.tsv"))
tca2022DF <- read_tsv(file.path(workDir, "data/2022LD_t_cell_activation.tsv"))
# Load data for 2023
pts2023DF <- read_tsv(file.path(workDir, "data/2023BD_subject.tsv"))
sample2023DF <- read_tsv(file.path(workDir, "data/2023BD_specimen.tsv"))
tcp2023DF <- read_tsv(file.path(workDir, "data/2023BD_t_cell_polarization.tsv"))
tca2023DF <- read_tsv(file.path(workDir, "data/2023BD_t_cell_activation.tsv"))
```
```{r preparing outcome: calculate Th1_Th2_ratio}
# Function to calculate Th1/Th2 ratio for a given dataset
calculate_th1_th2_ratio <- function(tcpDF, sampleDF, ptsDF) {
tcpDF %>%
filter(stimulation == "PT", protein_id %in% c("P01579", "P05113")) %>% # Select IFN-γ and IL-5 with PT stimulation
merge(y = sampleDF, by = "specimen_id", all.x = TRUE) %>% # Merge sample data
merge(y = ptsDF, by = "subject_id", all.x = TRUE) %>% # Merge patient data
filter(planned_day_relative_to_boost == 30) %>% # Filter for Day 30 post-booster
dplyr::select(subject_id, protein_id, analyte_counts) %>% # Select relevant columns
group_by(subject_id, protein_id) %>% # Group by subject and protein
summarize(analyte_counts = mean(analyte_counts, na.rm = TRUE), .groups = "drop") %>% # Aggregate duplicates
pivot_wider(names_from = protein_id, values_from = analyte_counts) %>% # Reshape to wide format
mutate(Th1_Th2_ratio = P01579 / P05113) %>% # Calculate Th1/Th2 ratio
dplyr::select(subject_id, Th1_Th2_ratio) # Select output columns
}
# Calculate Th1/Th2 ratio for 2021 and 2022
y21DF <- calculate_th1_th2_ratio(tcp2021DF, sample2021DF, pts2021DF)
y22DF <- calculate_th1_th2_ratio(tcp2022DF, sample2022DF, pts2022DF)
```
```{r data preparation}
# Process TCP data for a single year
preprocess_tcp <- function(tcpDF, sampleDF, ptsDF, stimulation_filter, proteins, timepoint) {
tcpDF %>%
filter(stimulation %in% stimulation_filter, protein_id %in% proteins) %>%
merge(y = sampleDF, by = "specimen_id", all.x = TRUE) %>%
merge(y = ptsDF, by = "subject_id", all.x = TRUE) %>%
filter(planned_day_relative_to_boost == timepoint) %>%
dplyr::select(subject_id, stimulation, protein_id, analyte_counts) %>%
pivot_wider(
names_from = c(stimulation, protein_id),
values_from = analyte_counts
) %>%
replace(is.na(.), 0) %>%
column_to_rownames("subject_id") %>%
setNames(make.names(names(.), unique = TRUE))
}
# Process TCA data for a single year
preprocess_tca <- function(tcaDF, sampleDF, ptsDF, stimulation_filter, timepoint) {
tcaDF %>%
filter(stimulation %in% stimulation_filter) %>%
merge(y = sampleDF, by = "specimen_id", all.x = TRUE) %>%
merge(y = ptsDF, by = "subject_id", all.x = TRUE) %>%
filter(planned_day_relative_to_boost == timepoint) %>%
dplyr::select(subject_id, stimulation, analyte_percentages) %>%
pivot_wider(
names_from = stimulation,
values_from = analyte_percentages
) %>%
replace(is.na(.), 0) %>%
column_to_rownames("subject_id") %>%
setNames(make.names(names(.), unique = TRUE))
}
# Combine TCP and TCA predictors for a single year
combine_tcp_tca <- function(tcpDF, tcaDF, sampleDF, ptsDF, yDF, tcp_stimulation, tcp_proteins, tca_stimulation, timepoint) {
# Process TCP
tcp_processed <- preprocess_tcp(tcpDF, sampleDF, ptsDF, tcp_stimulation, tcp_proteins, timepoint) %>%
rownames_to_column(var = "subject_id") # Add row names back as a column
# Process TCA
tca_processed <- preprocess_tca(tcaDF, sampleDF, ptsDF, tca_stimulation, timepoint) %>%
rownames_to_column(var = "subject_id") # Add row names back as a column
# Merge TCP and TCA predictors
combined_data <- full_join(
tcp_processed, tca_processed,
by = "subject_id" # Merge by subject_id
) %>%
replace(is.na(.), 0) %>% # Replace NA with 0
column_to_rownames(var = "subject_id") # Convert subject_id back to row names
# Align with response variable
combined_data <- combined_data[rownames(combined_data) %in% yDF$subject_id, ]
return(combined_data)
}
# Define constants
tcp_stimulation <- c("PT") # TCP stimulation
tcp_proteins <- c("P01579", "P05113") # IFN-γ and IL-5
tca_stimulation <- c("PT", "PHA", "DMSO", "TT") # TCA stimulation
timepoint <- 0 # Baseline (Day 0)
# Process TCP and TCA data for each year
x21DF <- combine_tcp_tca(tcp2021DF, tca2021DF, sample2021DF, pts2021DF, y21DF, tcp_stimulation, tcp_proteins, tca_stimulation, timepoint)
x22DF <- combine_tcp_tca(tcp2022DF, tca2022DF, sample2022DF, pts2022DF, y22DF, tcp_stimulation, tcp_proteins, tca_stimulation, timepoint)
# Process TCP and TCA data for 2023
x23DF <- combine_tcp_tca(
tcp2023DF,
tca2023DF,
sample2023DF,
pts2023DF,
data.frame(subject_id = pts2023DF$subject_id), # Correct alignment with 2023 subjects
tcp_stimulation,
tcp_proteins,
tca_stimulation,
timepoint
)
# Combine predictors (TCP/TCA data) with Day 0 gene expression
combine_predictors <- function(predictor_data, gene_expr_data) {
# Add rownames as a column for merging
predictor_data <- predictor_data %>% rownames_to_column("subject_id")
gene_expr_data <- gene_expr_data %>% rownames_to_column("subject_id")
# Merge on subject_id to ensure alignment
combined_data <- predictor_data %>%
full_join(gene_expr_data, by = "subject_id") %>%
replace(is.na(.), 0) %>% # Replace NA with 0
column_to_rownames("subject_id") # Restore rownames
return(combined_data)
}
# Combine datasets for each year
x21DF <- combine_predictors(x21DF, x21_gene_expr)
x22DF <- combine_predictors(x22DF, x22_gene_expr)
x23DF <- combine_predictors(x23DF, x23_gene_expr)
# Ensure consistent columns across datasets
common_cols <- Reduce(intersect, list(colnames(x21DF), colnames(x22DF), colnames(x23DF)))
x21DF <- x21DF[, common_cols]
x22DF <- x22DF[, common_cols]
x23DF <- x23DF[, common_cols]
# Match x21DF rows to y21DF subject IDs
x21DF <- x21DF %>%
rownames_to_column("subject_id") %>% # Convert rownames to a column for matching
filter(subject_id %in% y21DF$subject_id) %>% # Filter rows to include only those in y21DF
arrange(match(subject_id, y21DF$subject_id)) %>% # Reorder rows to match y21DF order
column_to_rownames("subject_id") # Restore subject_id as rownames
# Match x22DF rows to y22DF subject IDs
x22DF <- x22DF %>%
rownames_to_column("subject_id") %>% # Convert rownames to a column for matching
filter(subject_id %in% y22DF$subject_id) %>% # Filter rows to include only those in y22DF
arrange(match(subject_id, y22DF$subject_id)) %>% # Reorder rows to match y22DF order
column_to_rownames("subject_id") # Restore subject_id as rownames
```
```{r aqua model, include=FALSE}
# Combine training data and response variable
trainDF <- rbind(x21DF, x22DF) %>%
as.data.frame() %>%
mutate(Th1_Th2_ratio = c(y21DF$Th1_Th2_ratio, y22DF$Th1_Th2_ratio))
# Impute missing values if needed
train_matrix <- as.matrix(trainDF) # Convert to matrix
imputed_matrix <- impute.knn(train_matrix)$data
trainDF <- as.data.frame(imputed_matrix) # Convert back to dataframe
# Train regression model
set.seed(3)
auto_fit <- auto_ml() %>%
set_engine("h2o", max_runtime_secs = 5) %>%
set_mode("regression") %>%
fit(Th1_Th2_ratio ~ ., data = trainDF)
# Validate model on training data
train_predictions <- predict(auto_fit, new_data = trainDF)$.pred
# Calculate correlations
pearson_cor <- cor(train_predictions, trainDF$Th1_Th2_ratio, method = "pearson")
spearman_cor <- cor(train_predictions, trainDF$Th1_Th2_ratio, method = "spearman")
# Print correlations
cat("Pearson Correlation: ", pearson_cor, "\n")
cat("Spearman Correlation: ", spearman_cor, "\n")
# Plot validation results
ggplot(data = data.frame(
Predicted = train_predictions,
Actual = trainDF$Th1_Th2_ratio
), aes(x = Predicted, y = Actual)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", color = "blue", se = FALSE) +
labs(
title = "Model Validation: Predicted vs Actual",
subtitle = paste0("Pearson: ", round(pearson_cor, 2), " | Spearman: ", round(spearman_cor, 2)),
x = "Predicted Th1/Th2 Ratio",
y = "Actual Th1/Th2 Ratio"
) +
theme_minimal()
```
```{r log transformation}
# Combine training data and response variable
trainDF <- rbind(x21DF, x22DF) %>%
as.data.frame() %>%
mutate(Th1_Th2_ratio = c(y21DF$Th1_Th2_ratio, y22DF$Th1_Th2_ratio))
# Impute missing values if needed
train_matrix <- as.matrix(trainDF) # Convert to matrix
imputed_matrix <- impute.knn(train_matrix)$data
trainDF <- as.data.frame(imputed_matrix) # Convert back to dataframe
trainDF$Th1_Th2_ratio <- log1p(trainDF$Th1_Th2_ratio)
# Train regression model
set.seed(3)
auto_fit <- auto_ml() %>%
set_engine("h2o", max_runtime_secs = 5) %>%
set_mode("regression") %>%
fit(Th1_Th2_ratio ~ ., data = trainDF)
# Validate model on training data
train_predictions <- predict(auto_fit, new_data = trainDF)$.pred
# Calculate correlations
pearson_cor <- cor(train_predictions, trainDF$Th1_Th2_ratio, method = "pearson")
spearman_cor <- cor(train_predictions, trainDF$Th1_Th2_ratio, method = "spearman")
# Print correlations
cat("Pearson Correlation: ", pearson_cor, "\n")
cat("Spearman Correlation: ", spearman_cor, "\n")
# Plot validation results
ggplot(data = data.frame(
Predicted = train_predictions,
Actual = trainDF$Th1_Th2_ratio
), aes(x = Predicted, y = Actual)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", color = "blue", se = FALSE) +
labs(
title = "Model Validation: Predicted vs Actual",
subtitle = paste0("Pearson: ", round(pearson_cor, 2), " | Spearman: ", round(spearman_cor, 2)),
x = "Predicted Th1/Th2 Ratio",
y = "Actual Th1/Th2 Ratio"
) +
theme_minimal()
```
```{r predict and rank}
# Predict and rank for 2023
yhat <- predict(auto_fit, new_data = x23DF)$.pred
rhat <- rank(-1 * yhat, ties.method = "first")
# Print rankings
print(cbind(rownames(x23DF), rhat))
```
```{r submission file}
# Save the rankings into a revised submission template
data <- read_tsv(file.path(workDir, "3rdChallengeSubmissionTemplate_revised.tsv"))
ranking_df <- data.frame(
SubjectID = as.numeric(rownames(x23DF)),
"4.1) IFNG/IL5-Polarization-D30-Rank" = rhat,
check.names = FALSE
)
data <- data %>%
mutate(
`4.1) IFNG/IL5-Polarization-D30-Rank` = ifelse(
SubjectID %in% ranking_df$SubjectID,
ranking_df$`4.1) IFNG/IL5-Polarization-D30-Rank`[match(SubjectID, ranking_df$SubjectID)],
`4.1) IFNG/IL5-Polarization-D30-Rank`
)
)
# Write updated data back to file
write_tsv(data, file.path(workDir, "3rdChallengeSubmissionTemplate_revised.tsv"))
```
```{r session info}
# End H2O session and display session info
agua::h2o_end()
sessionInfo()
```