-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01_find_contaminants.R
110 lines (85 loc) · 3.9 KB
/
01_find_contaminants.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
source("00_data_structuring.R")
# Filter data to keep only ASV_id and sort by abundance (count per sample)
# Add 'count_overall' = features count across the dataset (count_sample is by ... sample)
find_contaminats <- all_data %>%
select(sample_id, state, colony_ms, levels_colony_ms, state_colony_frgmt_ms, ASV_id, count_sample) %>%
group_by(ASV_id) %>%
arrange(desc(count_sample)) %>%
mutate(count_overall = sum(count_sample)) %>%
ungroup() %>%
relocate(count_overall, .after = count_sample)
# Add 'rank_overall' = ranking by relative abundance across the whole dataset
find_contaminats <- find_contaminats %>%
select(ASV_id, count_overall) %>%
unique() %>%
arrange(desc(count_overall)) %>%
mutate(rank_overall = row_number()) %>%
right_join(., find_contaminats, by = c("ASV_id", "count_overall")) %>%
relocate(count_sample, .after = rank_overall) # %>% view()
# identify the features found in C1 (control) with > 100 reads (should be 9) ............................
C1_100 <- find_contaminats %>%
filter(sample_id == "C1-16S" & count_sample > 100) %>%
pull(ASV_id)
# Use C1_100 to filter sequences to plot
# (to viz assess potential contaminates based on presence/absence between C1 and coral samples)
plot_find_contaminats <- find_contaminats %>%
filter(ASV_id %in% C1_100) # %>% filter(count > 0)
# PLOT --------------------------------------------
ggplot(data = plot_find_contaminats,
aes(x = forcats::fct_reorder(state_colony_frgmt_ms, levels_colony_ms ),
y = as_factor(rank_overall),
size = count_sample)) +
geom_point(aes(color = colony_ms),
shape = 16,
stroke = 0) +
scale_size_area(name = "seqs count \nper sample",
max_size = 10) +
ggtitle("All features found in the PCR control with >100 reads") +
ylab("Feature ranking by overall frequency\n(1 = most frequent across the whole data set)") +
theme_bw() +
theme(
text = element_text(colour = "white"),
axis.text = element_text(color = "white"),
plot.background = element_rect(fill = "black", color = "black"),
panel.background = element_rect(fill = "black", color = NA),
legend.background = element_rect(fill = "black", color = NA),
legend.key = element_rect(fill = "grey21"),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
strip.background = element_blank(),
strip.text = element_blank()
) +
ggforce::facet_row(vars(forcats::fct_rev(factor(state))),
scales = 'free_x', space = 'free')
ggsave("./out/Gfas_16S/find_contaminants/find_contaminants.png",
dpi = 330,
units = "cm", width = 25, height = 13)
# Make table to export --------------------------------------------
tab1 <- find_contaminats %>%
filter(sample_id == "C1-16S") %>%
arrange(desc(count_sample)) %>%
mutate(
rank_ctrl = row_number(),
count_ctrl = count_sample,
) %>%
# rename(ASV_id = taxon_name) %>%
select(ASV_id, count_overall, rank_overall, count_ctrl, rank_ctrl) #%>% view()
tab2 <- all_data %>%
filter(ASV_id %in% C1_100) %>%
select(ASV_id, c(Kingdom:Species)) %>%
unique()
tab3 <- asv_count_taxa %>%
select(ASV_id, Sequence) # to add the sequences!
potential_contaminants <- inner_join(tab1, tab2, by = "ASV_id") %>%
inner_join(., tab3, by = "ASV_id")
# Export table
# This will be used to BLASTn search and identify likely contaminants
write_csv(potential_contaminants,
"./out/Gfas_16S/find_contaminants/potential_contaminants.csv")
# Clean up
rm(list = setdiff(ls(), c("all_data", "asv_count_taxa")))
# Now I will open this in excel and include the info that was manually searched (BLASTn)
# then flag as to keep or remove and import again and remove contaminant sequences