-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path03_merge_mets.R
74 lines (57 loc) · 3.12 KB
/
03_merge_mets.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
library(data.table)
library(dplyr)
library(tidyr)
## Read data ----
# Tissue to process. Every input and output path below is derived from it, so
# switching between "plasma" and "muscle" is a one-line change.
tissue <- "plasma"
base_dir <- file.path("results/no_batch_adj", tissue)

# Path of the "QCd_inv_norm" table for one method code
# (c18, c8, hilic_neg, hilic_pos).
qcd_path <- function(method) {
  file.path(
    base_dir, "QCd_data/inv_norm",
    paste0("DoD_", method, "_", tissue, "_QCd_inv_norm.csv")
  )
}

cn <- read.csv(qcd_path("c18"))
cp <- read.csv(qcd_path("c8"))
hn <- read.csv(qcd_path("hilic_neg"))
hp <- read.csv(qcd_path("hilic_pos"))
output_fnm <- file.path(
  base_dir, "merged_data",
  paste0("inv_norm_merged_QCd_", tissue, "_knowns.csv")
)
# Empty strings are read as NA; fread() returns a data.table.
met_info <- fread(file.path(base_dir, "met_info.csv"), na.strings = "")

# Create the output directory (and any missing parents); stay silent when it
# already exists so re-running the script does not emit a warning.
dir.create(dirname(output_fnm), recursive = TRUE, showWarnings = FALSE)

# The first column of each method table holds the compound identifier; give it
# the same name as the key column selected from met_info.
names(cn)[1] <- "Compound_Id"
names(cp)[1] <- "Compound_Id"
names(hn)[1] <- "Compound_Id"
names(hp)[1] <- "Compound_Id"
# Resolve "_repeat" sample columns (present in the muscle data).
#
# When a sample appears both as `<id>` and `<id>_repeat`, only the `_repeat`
# column is kept. Columns whose de-suffixed name is unique are always kept.
# Afterwards "_repeat" is stripped from the remaining column names.
#
# Args:
#   df: data.frame with one column per sample (plus identifier columns).
# Returns:
#   `df` restricted to the kept columns, with "_repeat" removed from the names.
fix_repeats <- function(df) {
  id_no_repeat <- sub("_repeat", "", colnames(df))
  is_repeat <- grepl("_repeat", colnames(df))
  # TRUE for every column whose de-suffixed name occurs more than once.
  has_twin <- duplicated(id_no_repeat) |
    duplicated(id_no_repeat, fromLast = TRUE)
  # drop = FALSE keeps a data.frame even when a single column survives;
  # without it the result collapses to a vector and `colnames<-` fails.
  df <- df[, is_repeat | !has_twin, drop = FALSE]
  colnames(df) <- sub("_repeat", "", colnames(df))
  df
}
# Append the method code ("cn", "cp", "hn", "hp") to every Compound_Id, then
# run fix_repeats() on each table to resolve "_repeat" sample columns.
cn <- fix_repeats(
  mutate(cn, Compound_Id = paste(Compound_Id, "cn", sep = "_"))
)
cp <- fix_repeats(
  mutate(cp, Compound_Id = paste(Compound_Id, "cp", sep = "_"))
)
hn <- fix_repeats(
  mutate(hn, Compound_Id = paste(Compound_Id, "hn", sep = "_"))
)
hp <- fix_repeats(
  mutate(hp, Compound_Id = paste(Compound_Id, "hp", sep = "_"))
)
## Merging the four methods ----
# Stack the four per-method tables row-wise; bind_rows() matches columns by
# name and fills columns absent from a table with NA.
merged_df <- bind_rows(cn, cp, hn, hp)
# Selecting only the annotation columns needed for merging
info <- met_info[, c("Compound_Id", "Name", "HMDB_Id")]
## Merging with metabolite info. merge() defaults to an inner join, so
## compounds without a matching Compound_Id in met_info are dropped.
data <- merge(info, merged_df, by = "Compound_Id")
# Write the files
#write.csv(data, "results/merged_data/inv_norm_merged_QCd_plasma.csv")
## Selecting only known metabolites: HMDB_Id is neither missing nor the
## literal string "NA". Bare column names inside `[` rely on `data` being a
## data.table (it is, because met_info is read with fread()).
selected_rows <- data[!is.na(HMDB_Id) & HMDB_Id != "NA", ]
## Remove Compound_Id and Name, keeping HMDB_Id as the only identifier column
df <- subset(selected_rows, select = -c(Compound_Id, Name))
# Remove the controls: drop rows whose HMDB_Id contains "PREF".
# (The original referenced a non-existent `HMDB_Idl`, which stopped the script.)
# NOTE(review): this filters metabolite rows; if "PREF" marks control *sample*
# columns instead, the filter should target column names -- confirm.
df <- df[!grepl("PREF", HMDB_Id)]
# Removing metabolite duplicates: keep the first row per HMDB_Id
df <- distinct(df, HMDB_Id, .keep_all = TRUE)
# Transposing data: samples as rows, metabolites as columns
transposed_df <- pivot_longer(
  df,
  cols = -HMDB_Id,
  names_to = "sample_id",
  values_to = "Value"
)
transposed_df <- pivot_wider(
  transposed_df,
  names_from = HMDB_Id,
  values_from = Value
)
fwrite(transposed_df, output_fnm)