-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathQCBioscan.Rmd
1745 lines (1707 loc) · 96.7 KB
/
QCBioscan.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "BIOSCAN QC Report"
author: "Alicja Witwicka"
date: '`r Sys.Date()`'
output:
html_document:
toc: true
pdf_document:
fig_caption: yes
toc: yes
github_document:
toc: yes
editor_options:
chunk_output_type: console
geometry: margin = 1cm
---
```{r setup, echo = FALSE}
knitr::opts_chunk$set(
echo = FALSE, # hide code
message = FALSE,
warning = FALSE,
cache.lazy = FALSE,
include = TRUE,
out.height = "\textheight",
out.width = "\textwidth",
comment = ""
)
```
```{bash, eval=TRUE}
# Write environment variables to a temporary file
echo "batch_path=$batch_path" > /tmp/env_vars.txt
echo "output_path=$output_path" >> /tmp/env_vars.txt
echo "batch_no=$batch_no" >> /tmp/env_vars.txt
```
```{r read_temp_file}
# Read environment variables from the file
env_vars <- readLines("/tmp/env_vars.txt")
env_list <- lapply(env_vars, function(x) strsplit(x, "=")[[1]][2])
names(env_list) <- c("batch_path", "output_path", "batch_no")
# Assign to individual variables
batch_path <- env_list$batch_path
output_path <- env_list$output_path
batch_no <- env_list$batch_no
# For manual processing:
# batch_no <- "batch19"
# batch_path <- ("/lustre/scratch126/tol/teams/lawniczak/users/aw43/2024_07_bioscan_qc/input/mbrave_batch_data/batch19/")
# output_path <- ("/lustre/scratch126/tol/teams/lawniczak/users/aw43/2024_07_bioscan_qc/output/qc_reports/batch19/")
```
```{r load_libraries}
# Load required libraries; All libraries should be automatically installed in the environment
load_pkgs <- function(pkg, bioconductor = FALSE) {
for (p in pkg) {
library(p, character.only = TRUE)
}
}
# CRAN packages
cran_pkgs <- c(
"BiocManager", "tidyverse", "RColorBrewer", "scales", "kableExtra",
"here", "dplyr", "cluster", "reshape", "reshape2", "stringdist", "pander",
"ggiraph", "e1071", "gridExtra", "knitr", "patchwork", "colorspace", "purrr"
)
# Bioconductor packages
bioconductor_pkgs <- c(
"biomaRt", "Biostrings", "msa", "ape"
)
# Load CRAN packages
load_pkgs(cran_pkgs, bioconductor = FALSE)
# Load Bioconductor packages
load_pkgs(bioconductor_pkgs, bioconductor = TRUE)
# pander::panderOptions('digits' , 2)
```
```{r ggplot_theme_setup}
# Set a custom ggplot theme - generate 97 pastel colors (for box plots)
pastel_colors <- colorRampPalette(brewer.pal(9, "Set1"))(97)
pastel_colors_small <- colorRampPalette(brewer.pal(9, "Set1"))(17) %>% head(16)
# barplot(rep(1, 17), col = pastel_colors_small, space = 0, border = NA)
```
```{r load_files}
# Collect files
network_tsv <- list.files(pattern = "*.tsv", path = batch_path)
full_fasta <- list.files(pattern = "*.fas", path = batch_path)
sample_file <- list.files(pattern = "*sample_stats.txt", path = batch_path)
n_cont_file <- list.files(pattern = "*control_neg_stats.txt", path = batch_path)
p_cont_file <- list.files(pattern = "*control_pos_stats.txt", path = batch_path)
# Load files
qc_tsv_raw <- read.table(paste(batch_path, network_tsv, sep = ""), sep="\t", header = TRUE)
sequences_all <- readDNAStringSet(paste(batch_path, full_fasta, sep = ""))
stats_upload <- function(patch, sample){
sample_table <- read.table(paste(patch, sample, sep = ""), sep="\t", header = TRUE, fill = TRUE) %>%
dplyr::select(Label, Count, Reads.Excised.from.Subsampling, Reads.in.Contigs, Contigs.Produced, Median.Read.Count.in.Contigs,
Group, UMI.Plate.ID, Sample.Plate.ID, Forward.UMI.Label, Reverse.UMI.Label, Data.File.Location)
return(sample_table)
}
sample_stats <- stats_upload(patch = batch_path, sample = sample_file)
n_cont_stats <- stats_upload(patch = batch_path, sample = n_cont_file)
p_cont_stats <- stats_upload(patch = batch_path, sample = p_cont_file)
umi_plates <- read.csv("/lustre/scratch126/tol/teams/lawniczak/users/aw43/2024_07_bioscan_qc/input/UMI_plates.csv")
```
```{r check_files}
cat(paste(batch_no, "\nQC REPORT",
"\nInput files downloaded from:\n", batch_path,
"\nOutput files are saved to:\n", output_path),
paste("\n\nThe consensus network .tsv file exists:", exists("qc_tsv_raw")),
paste("\nThe fasta file exists:", exists("sequences_all")),
paste("\nThe stample statistics file exists:", exists("sample_stats")),
paste("\nThe negative control statistics file exists:", exists("n_cont_stats")),
paste("\nThe positive control statistics file exists:", exists("p_cont_stats"))
)
```
### Statistics for the positive controls
```{r positive_control_stats}
unique_sample_ids <- unique(sample_stats$Sample.Plate.ID)
unique_p_cont_ids <- unique(p_cont_stats$Sample.Plate.ID)
plates_p_cont <- all(unique_sample_ids %in% unique_p_cont_ids)
no_p_cont <- setdiff(unique_sample_ids, unique_p_cont_ids)
p_cont_quantiles <- quantile(p_cont_stats$Count,
probs = c(0.05, 0.1, 0.25, 0.50, 0.75, 0.95, 1))
cat(
paste(
"Total number of positive controls:", (p_cont_stats %>% pull(Label) %>% unique() %>% length()),
"\nNumber of positive controls per plate:", paste(p_cont_stats$Sample.Plate.ID %>% table() %>% table() %>% names(), collapse = ", "),
"\n\nAll plates have positive controls:", plates_p_cont,
if (!plates_p_cont) {
paste("\nPlates without positive controls:", paste(no_p_cont, collapse = "\n"))
} else {
""
},
"\nTotal number of reads in positive controls:", sum(p_cont_stats$Count),
"\nMaximum number of reads:", max(p_cont_stats$Count), "in positive control sample:", p_cont_stats[(which(p_cont_stats$Count == max(p_cont_stats$Count))),] %>% pull(Label),
"\nMinimum number of reads:", min(p_cont_stats$Count), "in", p_cont_stats[(which(p_cont_stats$Count == min(p_cont_stats$Count))),] %>% pull(Label),
"\n\nAverage number of positive control reads:", mean(p_cont_stats$Count),
"\nMedian number of positive control reads:", median(p_cont_stats$Count),
"\nRead standard deviation:", sd(p_cont_stats$Count),
"\n\nQuantiles:\n", paste(names(p_cont_quantiles), p_cont_quantiles, sep = ": ", collapse = "\n")
)
)
sample_metadata <- data.frame(max_read_pcont = max(p_cont_stats$Count),
min_read_pcont = min(p_cont_stats$Count),
average_pcont = mean(p_cont_stats$Count),
median_pcont = median(p_cont_stats$Count),
sd_pcont = sd(p_cont_stats$Count))
```
```{r positive_control_histogram, fig.align='center', fig.width=2.5, fig.height=2}
# Positive control read distribution
p_cont_stats %>%
ggplot(aes(x = Count)) +
geom_histogram(binwidth = 10, fill = "#07dbd0", color = "#1c67fc", linewidth = 0.5) +
labs(title = "Distribution of reads in positive controls [binwidth = 10]", x = "Read count", y = "Frequency") +
theme_classic() +
geom_vline(aes(xintercept = mean(Count)), color = "#1c67fc", linetype = "solid", linewidth = 0.5) +
geom_vline(aes(xintercept = p_cont_quantiles[1]), color = "#db5f07", linetype = "dotted", linewidth = 0.5) +
geom_vline(aes(xintercept = p_cont_quantiles[2]), color = "#f2aa2e", linetype = "dotted", linewidth = 0.5) +
theme(
plot.title = element_text(size = 7),
axis.title = element_text(size = 7),
axis.text = element_text(size = 6)
)
```
Blue solid line: read mean <br>
Orange dotted lines: 5% and 10% lower quantiles
```{r failed_positive_control_search}
p_cont_failed <- p_cont_stats %>% filter(Count <= p_cont_quantiles[1]) %>% arrange(Label) %>% pull(Label)
p_cont_failed_partners <- p_cont_stats %>% filter(Count < p_cont_quantiles[1]) %>% arrange(Label) %>% pull(Group) %>% unique()
cat(paste("Number of positive control samples in the lower 5% quantile:",
(p_cont_stats %>% filter(Count <= p_cont_quantiles[1]) %>% nrow()),
"\n\n", paste(p_cont_failed, collapse = "\n"),
"\n\nNames of the associated partners:", paste(p_cont_failed_partners, collapse = ", ")
))
```
### Statistics for the negative controls
```{r negative_control_stats}
lysate_n_cont <- n_cont_stats[(grep("LYSATE", n_cont_stats$Label)), ]
empty_n_cont <- n_cont_stats[!grepl("LYSATE", n_cont_stats$Label), ]
n_cont_table <- n_cont_stats$Sample.Plate.ID %>%
table() %>%
as.data.frame() %>%
arrange(desc(Freq))
summary_table <- table(n_cont_table$Freq) %>% as.data.frame()
less_than_3 <- summary_table[summary_table$Var1 %in% c(1, 2), ]
more_than_2 <- summary_table[!(summary_table$Var1 %in% c(1, 2)), ]
summarized_row <- data.frame(Var1 = "more than 2", Freq = sum(more_than_2$Freq))
summary_table <- rbind(less_than_3, summarized_row)
plates_n_cont <- all(unique(sample_stats$Sample.Plate.ID) %in% unique(n_cont_stats$Sample.Plate.ID))
n_lysate_quantiles <- quantile(lysate_n_cont$Count,
probs = c(0.05, 0.1, 0.25, 0.50, 0.75, 0.95, 0.98))
n_empty_quantiles <- quantile(empty_n_cont$Count,
probs = c(0.05, 0.1, 0.25, 0.50, 0.75, 0.95, 0.98))
n_cont_quantiles <- quantile(n_cont_stats$Count,
probs = c(0.05, 0.1, 0.25, 0.50, 0.75, 0.95, 0.98))
cat(
paste(
"Total number of negative controls:", (n_cont_stats %>% pull(Label) %>% unique() %>% length()),
"\nTotal number of lysate negative controls:", (lysate_n_cont %>% pull(Label) %>% unique() %>% length()),
"\nTotal number of empty negative controls:", (empty_n_cont %>% pull(Label) %>% unique() %>% length()),
"\n\nNumber of negative controls per plate:\n"
))
knitr::kable(summary_table, col.names = c("Number of negative controls per plate", "Number of plates"), format = "html") %>%
kableExtra::kable_styling(full_width = FALSE, position = "center")
cat(
paste("\nAll plates have negative controls:", plates_n_cont),
if (!plates_n_cont) {
# Calculate plates without positive controls
no_n_cont <- sample_stats$Label[(which(!(unique(sample_stats$Sample.Plate.ID) %in% unique(n_cont_stats$Sample.Plate.ID))))]
cat("\n\nPlates without negative controls:", paste(no_n_cont, collapse = "\n"))
},
paste("\n\nTotal number of reads in lysate negative controls:", sum(lysate_n_cont$Count),
"\nTotal number of reads in empty negative controls:", sum(empty_n_cont$Count),
"\nMaximum number of reads:", max(lysate_n_cont$Count), "in lysate negative control sample:",
lysate_n_cont[(which(lysate_n_cont$Count == max(lysate_n_cont$Count))),] %>% pull(Label),
"\nMaximum number of reads:", max(empty_n_cont$Count), "in empty negative control sample:",
empty_n_cont[(which(empty_n_cont$Count == max(empty_n_cont$Count))),] %>% pull(Label),
"\n\nZero reads in:", length(which(n_cont_stats$Count == 0)), "negative control samples",
"\nIn lysate controls:", length(which(lysate_n_cont$Count == 0)),
"\nIn empty controls:", length(which(empty_n_cont$Count == 0)),
"\n\nAverage number of negative control reads:", mean(n_cont_stats$Count),
"\nIn lysate controls:", mean(lysate_n_cont$Count),
"\nIn empty controls:", mean(empty_n_cont$Count),
"\n\nMedian number of negative control reads:", median(n_cont_stats$Count),
"\nIn lysate controls:", median(lysate_n_cont$Count),
"\nIn empty controls:", median(empty_n_cont$Count),
"\n\nSkewness number of negative control reads:", skewness(n_cont_stats$Count),
"\nIn lysate controls:", skewness(lysate_n_cont$Count),
"\nIn empty controls:", skewness(empty_n_cont$Count),
"\n\nQuantiles in lysate controls:\n", paste(names(n_lysate_quantiles), n_lysate_quantiles, sep = ": ", collapse = "\n"),
"\n\nQuantiles in empty controls:\n", paste(names(n_empty_quantiles), n_empty_quantiles, sep = ": ", collapse = "\n")
)
)
sample_metadata_n <- data.frame(max_read_n_lysate = max(lysate_n_cont$Count),
max_read_n_empty = max(empty_n_cont$Count),
min_read_n_lysate = min(lysate_n_cont$Count),
min_read_n_empty = min(empty_n_cont$Count),
average_n_lysate = mean(lysate_n_cont$Count),
average_n_empty = mean(empty_n_cont$Count),
median_n_lysate = median(lysate_n_cont$Count),
median_n_empty = median(empty_n_cont$Count),
skewness_n_lysate = skewness(lysate_n_cont$Count),
skewness_n_empty = skewness(empty_n_cont$Count)
)
sample_metadata <- cbind(sample_metadata, sample_metadata_n)
```
```{r negative_control_histogram, fig.align='center', fig.width=3.5, fig.height=5}
# Scalling for y axis
stretch_trans <- scales::trans_new(
"stretch",
transform = function(x) x^0.5,
inverse = function(x) x^2
)
# Calculate maximum y
basic_plot <- n_cont_stats %>%
ggplot(aes(x = Count)) +
geom_histogram(binwidth = 1, fill = "#07dbd0", color = "#1c67fc", linewidth = 0.5)
# Extract maximum count from the plot object
max_count <- max(ggplot_build(basic_plot)$data[[1]]$count)
# Plot negative control distribution
n_cont_plot <- basic_plot +
labs(title = "Distribution of reads in all negative controls [binwidth = 1]", x = "Read count", y = "Frequency") +
theme_classic() +
geom_vline(aes(xintercept = mean(Count)), color = "#1c67fc", linetype = "solid", linewidth = 0.5) +
geom_vline(aes(xintercept = n_cont_quantiles[6]), color = "#db5f07", linetype = "dotted", linewidth = 0.5) +
geom_vline(aes(xintercept = n_cont_quantiles[7]), color = "#f2aa2e", linetype = "dotted", linewidth = 0.5) +
scale_y_continuous(limits = c(0, max_count + 10), trans = stretch_trans) +
theme(
plot.title = element_text(size = 7),
axis.title = element_text(size = 7),
axis.text = element_text(size = 6)
)
lysate_n_cont_plot <- lysate_n_cont %>%
ggplot(aes(x = Count)) +
geom_histogram(binwidth = 1, fill = "#07dbd0", color = "#1c67fc", linewidth = 0.5) +
labs(title = "Lysate", x = "Read count", y = "Frequency") +
theme_classic() +
geom_vline(aes(xintercept = mean(Count)), color = "#1c67fc", linetype = "solid", linewidth = 0.5) +
geom_vline(aes(xintercept = n_cont_quantiles[6]), color = "#db5f07", linetype = "dotted", linewidth = 0.5) +
geom_vline(aes(xintercept = n_cont_quantiles[7]), color = "#f2aa2e", linetype = "dotted", linewidth = 0.5) +
scale_y_continuous(limits = c(0, max_count + 10), trans = stretch_trans) +
theme(
plot.title = element_text(size = 7),
axis.title = element_text(size = 7),
axis.text = element_text(size = 6)
)
empty_n_cont_plot <- empty_n_cont %>%
ggplot(aes(x = Count)) +
geom_histogram(binwidth = 1, fill = "#07dbd0", color = "#1c67fc", linewidth = 0.5) +
labs(title = "Empty", x = "Read count", y = "Frequency") +
theme_classic() +
geom_vline(aes(xintercept = mean(Count)), color = "#1c67fc", linetype = "solid", linewidth = 0.5) +
geom_vline(aes(xintercept = n_cont_quantiles[6]), color = "#db5f07", linetype = "dotted", linewidth = 0.5) +
geom_vline(aes(xintercept = n_cont_quantiles[7]), color = "#f2aa2e", linetype = "dotted", linewidth = 0.5) +
scale_y_continuous(limits = c(0, max_count + 10), trans = stretch_trans) +
theme(
plot.title = element_text(size = 7),
axis.title = element_text(size = 7),
axis.text = element_text(size = 6)
)
# Combine and display plots
n_cont_plot / (lysate_n_cont_plot + empty_n_cont_plot)
```
Blue solid line: read mean <br>
Orange dotted lines: upper 5% and 2% of samples with the highers number of reads
```{r failed_negative_control_search}
# Names of the failed samples
n_cont_failed_5 <- n_cont_stats %>% filter(Count >= n_cont_quantiles[6]) %>% arrange(Label) %>% pull(Label)
n_cont_failed_2 <- n_cont_stats %>% filter(Count >= n_cont_quantiles[7]) %>% arrange(Label) %>% pull(Label)
n_cont_failed_partners <- n_cont_stats %>% filter(Count >= n_cont_quantiles[6]) %>% arrange(Label) %>% pull(Group) %>% unique()
cat(
"Number of negative control samples in the higher 5%:",
(n_cont_failed_5 %>% length()), "\n",
"Out of in the lysate controls:", (lysate_n_cont %>% filter(Count >= n_cont_quantiles[6]) %>% nrow()), "\n",
"Out of in the empty controls:", (empty_n_cont %>% filter(Count >= n_cont_quantiles[6]) %>% nrow()), "\n",
"\nNumber of negative control samples in the higher 2%:",
(n_cont_failed_2 %>% length()), "\n",
"Out of in the lysate controls:", (lysate_n_cont %>% filter(Count >= n_cont_quantiles[7]) %>% nrow()), "\n",
"Out of in the empty controls:", (empty_n_cont %>% filter(Count >= n_cont_quantiles[7]) %>% nrow()),
paste("\n\nNames of the associated partners:", paste(n_cont_failed_partners, collapse = ", ")
))
```
### Statistics for the samples
```{r sample_stats}
sample_quantiles <- quantile(sample_stats$Count,
probs = c(0.05, 0.1, 0.25, 0.50, 0.75, 0.95, 1))
total_sample_number <- sample_stats %>% pull(Label) %>% unique() %>% length()
cat(
paste(
"Number of samples in the batch (exclusing controls):", total_sample_number,
"\nTotal number of partner plates:", sample_stats$Sample.Plate.ID %>% unique() %>% length(),
"\nTotal number of sample reads:", sum(sample_stats$Count),
"\n\nMaximum number of sample reads:", max(sample_stats$Count), "in sample:",
sample_stats[(which(sample_stats$Count == max(sample_stats$Count))),] %>% pull(Label),
"\nMinimum number of sample reads:", min(sample_stats$Count), "in",
length(which(sample_stats$Count == min(sample_stats$Count))), "samples\n", "which is",
length(which(sample_stats$Count == min(sample_stats$Count)))*100/length(unique(sample_stats$Label)),
"% of all samples",
"\n\nAverage number of reads:", mean(sample_stats$Count),
"\nMedian number of reads:", median(sample_stats$Count),
"\nRead standard deviation:", sd(sample_stats$Count),
"\nSkewness number of sample reads:", skewness(sample_stats$Count),
"\n\nQuantiles:\n", paste(names(sample_quantiles), sample_quantiles, sep = ": ", collapse = "\n")
)
)
sample_metadata_s <- data.frame(sample_number = total_sample_number,
read_count_s = sum(sample_stats$Count),
max_read_s = max(sample_stats$Count),
mind_read_s = min(sample_stats$Count),
average_s = mean(sample_stats$Count),
median_s = median(sample_stats$Count),
sd_s = sd(sample_stats$Count)
)
sample_metadata <- cbind(sample_metadata_s, sample_metadata)
```
```{r sample_histogram, fig.align='center', fig.width=3.5, fig.height=2}
# Plot sample read distribution
sample_stats %>%
ggplot(aes(x = Count)) +
geom_histogram(binwidth = 10, fill = "#07dbd0", color = "#1c67fc", linewidth = 0.5) +
labs(title = "Distribution of reads in samples [binwidth = 10]", x = "Read count", y = "Frequency") +
theme_classic() +
geom_vline(aes(xintercept = mean(Count)), color = "#1c67fc", linetype = "solid", linewidth = 0.5) +
geom_vline(aes(xintercept = sample_quantiles[1]), color = "#db5f07", linetype = "dotted", linewidth = 0.5) +
geom_vline(aes(xintercept = sample_quantiles[2]), color = "#f2aa2e", linetype = "dotted", linewidth = 0.5) +
theme(
plot.title = element_text(size = 7),
axis.title = element_text(size = 7),
axis.text = element_text(size = 6)
)
```
Blue solid line: read mean <br>
Orange dotted lines: lower 5% and 10% of samples
```{r failed_sample_search}
failed_sample_table <- sample_stats %>% filter(Count <= sample_quantiles[1]) %>%
arrange(Label) %>% pull(Group) %>% table() %>% as.data.frame()
cat(
paste(
"Number of samples in the lower 10%:",
(sample_stats %>% filter(Count <= sample_quantiles[2]) %>% nrow()), "out of", length(unique(sample_stats$Label)), "samples",
"\nNumber of samples in the lower 5%:",
(sample_stats %>% filter(Count <= sample_quantiles[1]) %>% nrow()), "out of", length(unique(sample_stats$Label)), "samples",
"\n\nPartners associated with the bottom 5% of samples by read count:"
)
)
knitr::kable((failed_sample_table %>% arrange(-Freq)), col.names = c("Partner names", "Frequency"), format = "html") %>%
kableExtra::kable_styling(full_width = FALSE, position = "center")
# Samples with 0 reads are eliminated
failed_samples_quantile <- sample_stats %>% filter(Count <= sample_quantiles[1]) %>% pull(Label)
failed_samples_read <- sample_stats %>% filter(Count == 0) %>% pull(Label)
cat(paste("\nNumber of samples with 0 reads:", length(failed_samples_read)))
failed_samples <- failed_samples_read
# Save
sample_metadata <- cbind(sample_metadata, data.frame(zero_read_samples = length(failed_samples)))
```
### Plate boxplots
```{r partner_plate_assessment, fig.align='center', fig.width=12, fig.height=5.5}
# Merge the stats files
lysate_n_cont$sample_type <- "l_n_cont"
empty_n_cont$sample_type <- "e_n_cont"
sample_stats$sample_type <- "sample"
p_cont_stats$sample_type <- "p_cont"
stats_table_comb <- rbind(lysate_n_cont, empty_n_cont, sample_stats, p_cont_stats)
# Update the UMI info
umi_plates <- merge(stats_table_comb, umi_plates, by = c("Forward.UMI.Label", "Reverse.UMI.Label"), all = FALSE) %>% dplyr::select("Label", "Count", "UMI.Plate.ID", "Plate.Number", "Forward.UMI.Label", "Reverse.UMI.Label", "Well.Coordinate")
# Summarise the statistics
average_sample_count <- sample_stats %>%
summarize(
avg_count = mean(Count),
sd_count = sd(Count),
median = median(Count)
)
low_read_plates <- sample_stats %>%
group_by(Sample.Plate.ID) %>%
summarize(
avg_count = mean(Count),
median = median(Count),
quantile75 = quantile(Count, probs = 0.75)) %>%
filter(quantile75 < average_sample_count$avg_count) %>% arrange(Sample.Plate.ID) %>% pull(Sample.Plate.ID)
subgroup <- sample(pastel_colors_small)
stats_table_comb %>% filter(sample_type != "n_cont") %>%
ggplot(aes(x = Sample.Plate.ID, y = Count)) +
geom_point(alpha = 0.5, aes(colour = Group), size = 0.5) +
geom_boxplot(alpha = 0.8, aes(colour = Group), fill = "white") +
geom_boxplot(data = (stats_table_comb %>% filter(sample_type != "n_cont") %>%
filter(Sample.Plate.ID %in% low_read_plates)),
alpha = 0.1, colour = NA, fill = "black") +
geom_point(data = subset(stats_table_comb, sample_type == "l_n_cont"), color = "#000080") +
geom_point(data = subset(stats_table_comb, sample_type == "e_n_cont"), color = "#00C5CD") +
geom_point(data = subset(stats_table_comb, sample_type == "p_cont"), color = "#00EE76") +
geom_hline(yintercept = average_sample_count$avg_count, color = "#8B5742", linetype = "dotted", linewidth = 1) +
geom_hline(yintercept = average_sample_count$median, color = "grey", linetype = "dotted", linewidth = 1) +
labs(title = "Read Count per Partner Plate", x = "Plate", y = "Read Count") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
legend.position = "none") +
scale_colour_manual(values = subgroup) +
theme(
plot.title = element_text(size = 10),
axis.title = element_text(size = 8),
axis.text = element_text(size = 6)
)
cat(paste("Plates where the 75th percentile of the data is lower than expected mean read count (dark grey):\n\n", paste(low_read_plates, collapse = "\n"),"\n"),
paste("\nwhich constitutes", length(low_read_plates)*100/length(sample_stats %>% pull(Sample.Plate.ID) %>% unique()), "% of all partner plates in this batch"))
```
Grey line: median <br>
Brown line: mean <br>
Green data points: positive controls <br>
Blue data points: empty negative controls <br>
Navy data points: lysate negative controls <br><br>
```{r umi_plate_assessment, fig.align='center', fig.width=12, fig.height=5.5}
# Select low-quality UMI plates
low_read_UMI <- sample_stats %>%
group_by(UMI.Plate.ID) %>%
summarize(
avg_count = mean(Count),
median = median(Count),
quantile75 = quantile(Count, probs = 0.75)) %>%
filter(quantile75 < average_sample_count$avg_count) %>% arrange(UMI.Plate.ID) %>% pull(UMI.Plate.ID)
sample_stats %>%
ggplot(aes(x = factor(UMI.Plate.ID), y = Count)) +
geom_point(alpha = 1, aes(colour = Group), size = 0.5) +
geom_boxplot(alpha = 0.8, colour = "#CD9B9B", fill = "white") +
geom_point(data = (sample_stats %>%
filter(Sample.Plate.ID %in% low_read_plates)), colour = "#AB82FF") +
geom_point(data = subset(stats_table_comb, sample_type == "l_n_cont"), color = "#000080") +
geom_point(data = subset(stats_table_comb, sample_type == "e_n_cont"), color = "#00C5CD") +
geom_point(data = subset(stats_table_comb, sample_type == "p_cont"), color = "#54FF9F") +
labs(title = "Read Count per UMI Plate", x = "Plate", y = "Read Count") +
geom_boxplot(data = (sample_stats %>%
filter(UMI.Plate.ID %in% low_read_UMI)),
alpha = 0.1, colour = NA, fill = "black") +
geom_hline(yintercept = average_sample_count$avg_count, color = "#8B5742", linetype = "dotted", linewidth = 1) +
geom_hline(yintercept = average_sample_count$median, color = "grey", linetype = "dotted", linewidth = 1) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
legend.position = "none") +
scale_colour_manual(values = subgroup) +
theme(
plot.title = element_text(size = 10),
axis.title = element_text(size = 8),
axis.text = element_text(size = 6)
)
cat(
paste("Plates where the 75th percentile of the data is lower than expected mean read count (dark grey):", paste(low_read_UMI, collapse = ", ")),
paste("\nHow many samples from the low-performance partner plates are present in the low-performance UMI plates (purple data points):",
(((sample_stats %>% filter(Sample.Plate.ID %in% low_read_plates)) %>% filter(UMI.Plate.ID %in% low_read_UMI)) %>%
pull(Label) %>% length())*100/(sample_stats %>% filter(Sample.Plate.ID %in% low_read_plates) %>% pull(Label) %>% length()), "%"))
```
<b>Assess the positive controls with the low number of reads detected in the previous steps:</b>
```{r failed_positive_control_plate_assessment}
# Expected positive control reads per plate (average count)
obs_p_cont_count <- stats_table_comb %>% filter(Label %in% p_cont_failed) %>% dplyr::select(Label, Count) %>%
mutate(Sample.Plate.ID = gsub("CONTROL_POS_", "", (sub("_[^_]+$", "", Label))))
exp_p_cont_count <- sample_stats %>%
group_by(Sample.Plate.ID) %>%
summarize(average_count = mean(Count, na.rm = TRUE)) %>%
arrange(average_count) %>%
filter(Sample.Plate.ID %in% (obs_p_cont_count %>% pull(Sample.Plate.ID)))
cont_count <- merge(obs_p_cont_count, exp_p_cont_count, by = "Sample.Plate.ID")
colnames(cont_count) <- c("Sample.Plate.ID", "Label", "Observed.Count", "Expected.Count")
for (i in 1:nrow(cont_count)) {
observed <- cont_count$Observed.Count[i]
expected <- cont_count$Expected.Count[i]
sample_plate_id <- cont_count$Sample.Plate.ID[i]
if (observed < expected) {
cat(paste(sample_plate_id, "Positive control failed.\n", "Observed number of reads:", observed, "Expected:", expected, "\n\n"))
} else if (observed > expected) {
cat(paste(sample_plate_id, "More reads in positive control than in samples on average.\n", "Observed number of reads:", observed, "Expected:", expected, "\n\n"))
} else {
cat(paste(sample_plate_id, "Positive control failed.\n", "Observed number of reads:", observed, "Expected:", expected, "\n\n"))
}
}
# Loop through plates to detect plates with low read count AND low p_cont reads
any_true <- FALSE
failed_plates <- vector()
for (plate in 1:nrow(obs_p_cont_count)) {
if (obs_p_cont_count$Sample.Plate.ID[plate] %in% low_read_plates) {
# If the condition is TRUE, print the Sample.Plate.ID and update the flag
cat(obs_p_cont_count$Sample.Plate.ID[plate], "\n")
any_true <- TRUE
failed_plates[plate] <- obs_p_cont_count$Sample.Plate.ID[plate]
}
}
if (any_true) {
cat("The above plates have lower than expected number of reads \nAND failed positive controls. \nTHESE PLATES NEED TO BE EXAMINED FURTHER")
} else {
cat("This batch does not contain plates that have lower than expected number of reads in samples\nAND positive controls.")
}
```
<b> Low-quality plates are displayed here. All the other plates are plotted in the last part of this report. </b> <br>
Green squares: controls [any kind]<br><br>
```{r low_quality_plates, fig.width=12, fig.height=2.3}
heatmapPlate <- function(plateIDs, statsDF, empty_list){
max_count1 <- max(statsDF$Count, na.rm = TRUE) + 10 # Calculate global maximum
max_count2 <- sample_quantiles[6] + 10
max_count <- max(max_count1, max_count2)
for (i in seq_along(plateIDs)) {
plate <- plateIDs[i]
group_data <- statsDF %>% filter(partner_plate == plate)
plot <- ggplot(group_data, aes(y = factor(plate_column, levels = rev(LETTERS[1:8])), x = factor(plate_row, levels = 1:12))) +
geom_tile(aes(fill = Count), color = "#C1CDCD") +
geom_tile(data = (group_data %>% filter(sample_type == "control")),
aes(fill = Count), color = "#54FF9F", linewidth = 0.5) +
scale_fill_gradient(low = "#0000FF", high = "#FFD700", na.value = "white", limits = c(0, max_count)) +
labs(title = paste("Plate:", plate),
x = "Plate Column",
y = "Plate Row",
fill = "Count") +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
panel.grid = element_blank(),
strip.text = element_text(size = 10),
plot.title = element_text(size = 7),
axis.title = element_text(size = 7),
axis.text = element_text(size = 6)) + theme_minimal()
if (i != length(plateIDs)) {
plot <- plot + guides(fill = "none")
}
empty_list[[plate]] <- plot
}
# Display the plots in pairs or triplets
plot_index <- 1
total_plots <- length(empty_list)
while (plot_index <= total_plots) {
if (plot_index == total_plots) {
plots_to_display <- list(empty_list[[plot_index]])
} else {
plots_to_display <- empty_list[plot_index:min(plot_index + 1, total_plots)]
}
do.call(grid.arrange, c(plots_to_display, ncol = 2))
plot_index <- plot_index + 2
}
}
stats_table_comb$Label2 <- gsub("CONTROL_NEG_LYSATE_|CONTROL_NEG_|CONTROL_POS_|CONTROL_", "", stats_table_comb$Label)
stats_table_comb$Label2 <- gsub("-", "_", stats_table_comb$Label2)
stats_table_comb <- stats_table_comb %>%
mutate(
plate_row = as.integer(str_extract(Label2, "\\d+$")),
plate_column = str_extract(Label2, "(?<=_)\\D(?=\\d+$)"),
partner_plate = str_extract(Label2, "^[^_]+_[^_]+"),
plate_number = str_extract(Label2, "(?<=_)[^_]+(?=_)"),
sample_type = ifelse(grepl("CONTROL", Label, ignore.case = TRUE), "control", "sample")
)
stats_table_comb_failed <- stats_table_comb %>% filter(partner_plate %in% low_read_plates)
unique_plates <- stats_table_comb_failed %>% arrange(partner_plate) %>% pull(partner_plate) %>% unique()
plot_list_counts <- list()
heatmapPlate(unique_plates, stats_table_comb_failed, plot_list_counts)
```
### Assessment of sequence conflicts and contaminants
<b>Positive control as contamination source</b>
```{r otu_data_handling}
# Remove failed samples - samples with no reads at all!
qc_tsv <- qc_tsv_raw %>% filter(!(pid %in% failed_samples))
# Merge the consensus tables [primary and secondary hits]
# Select only primary hits
qc_primary <- qc_tsv %>%
dplyr::select(pid, run_primary, rep_count_primary,
id_similarity_primary, p_primary,
c_primary, o_primary, f_primary,
g_primary, s_primary, otu_primary) %>%
group_by(pid) %>% slice_head(n = 1)
# Select secondary hits
qc_secondary <- qc_tsv %>%
dplyr::select(pid, run_primary, rep_count_secondary, id_similarity_secondary,
p_secondary, c_secondary, o_secondary, f_secondary,
g_secondary, s_secondary, otu_secondary)
# Remove secondary hits with NA reads [these are there when there's no secondary sequence at all for a sample]
qc_secondary <- qc_secondary[!is.na(qc_secondary$rep_count_secondary) & !is.na(qc_secondary$id_similarity_secondary), ]
# Combine the tables
qc_secondary$assignment <- "secondary"
qc_primary$assignment <- "primary"
colnames(qc_secondary) <- colnames(qc_primary)
qc_AM <- rbind(qc_secondary, qc_primary)
# Combine the sequence data
# Consensus OTU formating
qc_AM <- qc_AM %>%
mutate(id = otu_primary)
qc_AM$id <- gsub("TAX:", "", qc_AM$id)
qc_AM$id <- gsub("BOLD:", "", qc_AM$id)
qc_AM$id <- paste(qc_AM$pid, qc_AM$id, sep = "_")
qc_AM$id <- paste(qc_AM$id, qc_AM$rep_count_primary, sep = "_")
qc_AM$id <- paste(qc_AM$id, qc_AM$g_primary, sep = "_")
# Define the function to merge sequences and metadata
seq_to_df <- function(sequence_object){
# Extract sequences
sequences <- as.character(sequence_object)
# Extract names and split into components
seq_names_df <- data.frame(names = names(sequence_object))
seq_names_df <- seq_names_df %>%
separate(names, into = c("pid", "Run", "Contig_ID", "Rep_Count", "ID_Similarity", "C_Count",
"CMXD", "CMND", "CNND", "p_primary", "c_primary", "o_primary", "f_primary",
"g_primary", "s_primary", "otu_primary", "Date"), sep = "\\|") %>%
mutate(across(everything(), ~ gsub(".*:", "", .))) # Remove prefixes
seq_names_df$pid2 <- gsub("CONTROL_NEG_LYSATE_|CONTROL_NEG_|CONTROL_POS_|CONTROL_", "", seq_names_df$pid)
seq_names_df <- seq_names_df %>%
mutate(
plate_row = as.integer(str_extract(pid2, "\\d+$")),
plate_column = str_extract(pid2, "(?<=_)\\D(?=\\d+$)"),
partner_plate = str_extract(pid2, "^[^_]+"),
plate_number = str_extract(pid2, "(?<=_)[^_]+(?=_)")
)
seq_names_df$sequence <- sequences
return(seq_names_df)
}
# Transform the seq objects into data frames
sequences_df <- seq_to_df(sequences_all) %>% mutate(id = otu_primary)
sequences_df$id <- gsub("TAX:", "", sequences_df$id)
sequences_df$id <- gsub("BOLD:", "", sequences_df$id)
sequences_df$id <- paste(sequences_df$pid, sequences_df$id, sep = "_")
sequences_df$id <- paste(sequences_df$id, sequences_df$Rep_Count, sep = "_")
sequences_df$id <- paste(sequences_df$id, sequences_df$g_primary, sep = "_")
qc_AM_seq <- merge(qc_AM, sequences_df, by = "id")
qc_AM_seq <- qc_AM_seq %>% dplyr::select(id, pid.x, run_primary, rep_count_primary, id_similarity_primary, p_primary.x, c_primary.x, o_primary.x, f_primary.x, g_primary.x, s_primary.x, otu_primary.x, assignment, Contig_ID, Date, pid2, plate_row, plate_column, partner_plate, plate_number, sequence )
colnames(qc_AM_seq) <- c("id", "pid", "run_primary", "rep_count_primary", "id_similarity_primary", "p_primary", "c_primary", "o_primary", "f_primary", "g_primary", "s_primary", "otu_primary", "assignment", "Contig_ID", "Date", "pid2", "plate_row", "plate_column", "partner_plate", "plate_number", "sequence")
consensus_samples <- c(names(table(sequences_df$pid %in% qc_AM$pid)),
names(table(qc_AM$pid %in% sequences_df$pid)),
names(table(qc_AM$id %in% sequences_df$id)),
names(table(qc_AM_seq$pid %in% qc_AM$pid)))
qc_AM <- qc_AM_seq
if (length(consensus_samples) == 4) {
cat("NOTE: All sample and sequence IDs match - data successfully merged\n")
} else {
cat("ERROR: Sample and sequence IDs do not match!\n")
}
```
```{r positive_control_read_contamination}
# Determine p cont OTU
p_cont_samples <- unique(p_cont_stats$Label)
p_cont_OTU <- qc_AM %>% filter(pid %in% p_cont_samples) %>% filter(assignment == "primary") %>% pull(otu_primary) %>% unique()
p_cont_contamination <- qc_AM %>% filter(!(pid %in% p_cont_samples)) %>% filter(otu_primary %in% p_cont_OTU) %>% dplyr::select(pid, rep_count_primary, id_similarity_primary, otu_primary, assignment, plate_row, plate_column, partner_plate, plate_number)
p_cont_contamination_tab <- table(p_cont_contamination$assignment)
qc_AM$plate_row <- as.factor(qc_AM$plate_row)
cat(paste("Positive control OTU is", p_cont_OTU,
"\n\nNon-positve control samples that contain positive control reads:"
))
```
```{r positive_control_read_plot_partner}
base_grid <- expand.grid(plate_row = LETTERS[1:8], plate_column = 1:12)
partner_plate <- ggplot(base_grid, aes(x = factor(plate_column, levels = 1:12), y = factor(plate_row, levels = rev(LETTERS[1:8])))) +
geom_tile(fill = "white", color = "black") +
geom_tile(
data = p_cont_contamination,
aes(
x = factor(plate_row, levels = 1:12),
y = factor(plate_column, levels = rev(LETTERS[1:8]))
),
fill = "#CCEBC5", color = "#CCEBC5", linewidth = 1, alpha = 0.8
) +
geom_tile(data = data.frame(plate_row = "G", plate_column = 12), fill = "#FBB4AE", color = "#FBB4AE", linewidth = 1, alpha = 0.8) +
geom_text(data = p_cont_contamination,
aes(
x = factor(plate_row, levels = 1:12),
y = factor(plate_column, levels = rev(LETTERS[1:8])), label = rep_count_primary), size = 1.5, color = "black") +
labs(title = "Partner Plate", x = "Column", y = "Row") +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
panel.grid = element_blank(),
strip.text = element_text(size = 0.5),
plot.title = element_text(size = 1),
axis.title = element_text(size = 0.5),
axis.text = element_text(size = 0.5)) +
theme_minimal()
```
```{r positive_control_read_plot_umi, fig.align='center', fig.height=2.7}
colnames(umi_plates) <- c("pid", "Count", "UMI.Plate.ID", "Plate.Number", "Forward.UMI.Label", "Reverse.UMI.Label", "Well.Coordinate" )
umi_p_cont_contamination <- merge(qc_AM, umi_plates, by = "pid", all = FALSE) %>%
filter(otu_primary == p_cont_OTU) %>% filter(pid %in% p_cont_contamination$pid) %>%
mutate(
row = substr(Well.Coordinate, 1, 1),
column = substr(Well.Coordinate, 2, 3)
)
kable((umi_p_cont_contamination %>% dplyr::select(pid, rep_count_primary, id_similarity_primary, assignment, UMI.Plate.ID)),
col.names = c("Sample", "Control Sequence Count", "Sequence Similarity", "Sequence Type", "UMI Plate ID"),format = "html") %>%
kableExtra::kable_styling(full_width = FALSE, position = "center")
cat(paste("Number of samples with positive control OTU as primary sequence:", p_cont_contamination_tab["primary"]),
paste("\nNumber of samples with positive control OTU as secondary sequence:", p_cont_contamination_tab["secondary"]),
paste("\nout of", length(qc_AM %>% filter(assignment == "secondary") %>% pull(pid) %>% unique()), "samples with secondary sequences"),
paste("\n\nLocation of the contaminants relative to the source:"))
base_grid <- expand.grid(row = LETTERS[1:16], column = 1:24)
umi_plate <- ggplot(base_grid, aes(x = factor(column, levels = 1:24), y = factor(row, levels = rev(LETTERS[1:16])))) +
geom_tile(fill = "white", color = "black") +
geom_tile(data = umi_p_cont_contamination, fill = "#CCEBC5", color = "#CCEBC5", linewidth = 1, alpha = 0.8) +
geom_tile(data = data.frame(row = "M", column = 23), fill = "#FBB4AE", color = "#FBB4AE", linewidth = 1, alpha = 0.8) +
geom_tile(data = data.frame(row = "N", column = 23), fill = "#FBB4AE", color = "#FBB4AE", linewidth = 1, alpha = 0.8) +
geom_tile(data = data.frame(row = "M", column = 24), fill = "#FBB4AE", color = "#FBB4AE", linewidth = 1, alpha = 0.8) +
geom_tile(data = data.frame(row = "N", column = 24), fill = "#FBB4AE", color = "#FBB4AE", linewidth = 1, alpha = 0.8) +
geom_text(data = umi_p_cont_contamination, aes(label = rep_count_primary), size = 1.5, color = "black") +
labs(title = "UMI Plate", x = "Column", y = "Row") +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
panel.grid = element_blank(),
strip.text = element_text(size = 0.5),
plot.title = element_text(size = 1),
axis.title = element_text(size = 0.5),
axis.text = element_text(size = 0.5)) +
theme_minimal()
partner_plate + umi_plate + plot_layout(widths = c(0.3, 0.7))
```
Orange square: positive contros<br>
Green squares: samples with positive control contamination<br><br>
```{r secondary_contig_histogram, fig.align='center', fig.width=2.5, fig.height=2}
qc_secondary %>%
ggplot(aes(x = rep_count_primary)) +
geom_histogram(binwidth = 1, fill = "#07dbd0", color = "#1c67fc", linewidth = 0.5) +
labs(title = "Number of reads [secondary sequences; binwidth = 1]", x = "Read count", y = "Frequency") +
theme_classic() +
geom_vline(aes(xintercept = mean(rep_count_primary)), color = "#1c67fc", linetype = "solid", linewidth = 0.5) +
geom_vline(aes(xintercept = mean(p_cont_contamination$rep_count_primary)), color = "#db5f07", linetype = "dotted", linewidth = 0.5) +
theme(
plot.title = element_text(size = 7),
axis.title = element_text(size = 7),
axis.text = element_text(size = 6)
)
cat(paste(
"Read count mean of all secondary sequences in all samples:", mean(qc_secondary$rep_count_primary),
"\nRead count mean of all positive control sequences in other samples:", mean(p_cont_contamination$rep_count_primary),
"\n\nRead count median of all secondary sequences in all samples:", median(qc_secondary$rep_count_primary),
"\nRead count median of all positive control sequences in other samples:", median(p_cont_contamination$rep_count_primary)
))
cat(paste())
```
Blue solid line: secondary hit read mean <br>
Orange dotted lines: mean of reads found as secondary contaminants from the positive controls in other samples <br>
<b>Both lines should be in close proximity meaning that the secondary contamination from positive controls is comparable to the potential contamination in other samples.</b> <br>
```{r primary_control_contig_replacement_or_removal}
p_cont_contamination_primary <- p_cont_contamination %>% filter(assignment == "primary") %>% pull(pid)
p_cont_contamination_primary_df <- qc_AM %>% filter(pid %in% p_cont_contamination_primary)
if (length(p_cont_contamination_primary) > 0) {
cat("NOTE: Non-control samples with control reads recognised as the primary hit need to be examined further!")
kable((p_cont_contamination_primary_df %>% filter(assignment == "primary") %>%
dplyr::select(pid, rep_count_primary, otu_primary, assignment)), col.names = c("Sample", "Count", "OTU", "Sequence"), format = "html") %>%
kableExtra::kable_styling(full_width = FALSE, position = "center")
} else {
cat("There are no samples that contain positive control reads as the primary hit")
}
# Secondary non-arthropod removed
p_cont_contamination_exclude1 <- p_cont_contamination_primary_df %>% filter(rep_count_primary == 1) %>% pull(pid)
p_cont_contamination_exclude2 <- qc_AM %>% filter(pid %in% p_cont_contamination_primary_df$pid) %>% filter(assignment == "secondary" & p_primary == "Arthropoda") %>% pull(pid)
p_cont_contamination_exclude2 <- p_cont_contamination_primary_df %>% filter(!(pid %in% p_cont_contamination_exclude2)) %>% pull(pid)
p_cont_contamination_exclude <- c(p_cont_contamination_exclude1, p_cont_contamination_exclude2) %>% unique()
# Exclude
if (length(p_cont_contamination_exclude) > 0) {
qc_AM <- qc_AM %>% filter(!(pid %in% p_cont_contamination_exclude))
} else {
cat("NO SAMPLES TO BE REMOVED")
}
```
NOTE: the above samples are automatically removed if: <br>
<li>There's only one primary read</li>
<li>Secondary sequence found in the same sample is not an Arthropod</li>
<br><br>
<b>Negative control contamination</b>
<br><br>
Distribution of reads in negative controls
```{r negative_control_contamination_histogram, fig.align='center', fig.width=3.7, fig.height=2}
qc_AM$Sample.Plate.ID <- paste(qc_AM$partner_plate, qc_AM$plate_number, sep = "_")
n_cont_samples_qc_AM <- qc_AM %>% filter(pid %in% n_cont_stats$Label)
# Calculate the maximum y value for both primary and secondary data
combined_data <- n_cont_samples_qc_AM %>% filter(assignment %in% c("primary", "secondary"))
basic_plot <- combined_data %>%
ggplot(aes(x = rep_count_primary)) +
geom_histogram(binwidth = 1, fill = "#07dbd0", color = "#1c67fc", linewidth = 0.5)
# Extract maximum count
max_count <- max(ggplot_build(basic_plot)$data[[1]]$count)
# Primary plot
primary_ncont_hist <- n_cont_samples_qc_AM %>% filter(assignment == "primary") %>%
ggplot(aes(x = rep_count_primary)) +
geom_histogram(binwidth = 1, fill = "#07dbd0", color = "#1c67fc", linewidth = 0.5) +
labs(title = "Primary [binwidth = 1]", x = "Read count", y = "Frequency") +
theme_classic() +
geom_vline(aes(xintercept = mean(rep_count_primary)), color = "#1c67fc", linetype = "solid", linewidth = 0.5) +
scale_y_continuous(limits = c(0, max_count + 10), trans = stretch_trans) +
theme(
plot.title = element_text(size = 7),
axis.title = element_text(size = 7),
axis.text = element_text(size = 6)
)
# Secondary plot
secondary_ncont_hist <- n_cont_samples_qc_AM %>% filter(assignment == "secondary") %>%
ggplot(aes(x = rep_count_primary)) +
geom_histogram(binwidth = 1, fill = "#07dbd0", color = "#1c67fc", linewidth = 0.5) +
labs(title = "Secondary [binwidth = 1]", x = "Read count", y = "Frequency") +
theme_classic() +
geom_vline(aes(xintercept = mean(rep_count_primary)), color = "#1c67fc", linetype = "solid", linewidth = 0.5) +
scale_y_continuous(limits = c(0, max_count + 10), trans = stretch_trans) +
theme(
plot.title = element_text(size = 7),
axis.title = element_text(size = 7),
axis.text = element_text(size = 6)
)
# Combine plots with consistent y-axis values
primary_ncont_hist + secondary_ncont_hist + plot_layout(widths = c(0.6, 0.4))
```
```{r contamination_map_partner, fig.align='center', fig.width=15, fig.height=16}
qc_AM <- qc_AM %>%
mutate(
sample = case_when(
grepl("POS", pid) ~ "p_cont",
grepl("NEG", pid) ~ "n_cont",
TRUE ~ "sample"
)
)
# Split the data frame into a list by partner plate
control_sequences_list <- split(qc_AM, qc_AM$Sample.Plate.ID)
# Remove plates that do not have contaminated negative controls
filtered_indices <- lapply(control_sequences_list, function(df) {
"sample" %in% colnames(df) && any(grepl("n_cont", df$sample))
})
control_sequences_list_filt <- control_sequences_list[unlist(filtered_indices)]
# Rename
names(control_sequences_list_filt) <- names(which(filtered_indices == TRUE))
# Create an empty list
contamination_n_cont_list <- list()
# Maximum number of substitutions/deletions
max_differences <- 2
for(df in 1:length(control_sequences_list_filt)){
control_sequences_df_temp <- control_sequences_list_filt[[df]]
plate_control_sequences <- control_sequences_df_temp %>%
filter(sample == "n_cont") %>%
pull(sequence)
# Find source samples with sequences similar to those in negative controls
source_samples <- control_sequences_df_temp %>%
filter(sample == "sample") %>%
rowwise() %>%
filter(any(stringdist::stringdist(sequence, plate_control_sequences, method = "lv") <= max_differences)) %>% # change to method='lv' when accounting for deletions and insertions!
pull(pid)
# Find source OTUs with sequences similar to those in negative controls
source_OTUs <- control_sequences_df_temp %>%
filter(sample == "sample") %>%
rowwise() %>%
filter(any(stringdist::stringdist(sequence, plate_control_sequences, method = "lv") <= max_differences)) %>%
pull(otu_primary)
contamination_df <- control_sequences_df_temp %>%
filter(sample == "n_cont" | pid %in% source_samples & otu_primary %in% source_OTUs)
contamination_n_cont_list[[df]] <- contamination_df
}
contamination_n_cont_df <- do.call(rbind, contamination_n_cont_list) %>% dplyr::select(-sequence)
contamination_n_cont_df$rep_count_primary <- as.numeric(contamination_n_cont_df$rep_count_primary)
contamination_n_cont_df_bovidae <- contamination_n_cont_df %>% filter(f_primary == "Bovidae")
text_data <- merge((contamination_n_cont_df %>% filter(!(f_primary == "Bovidae")) %>%
group_by(pid) %>% summarise(sum = sum(rep_count_primary))), contamination_n_cont_df, all = FALSE) %>%
dplyr::select(pid, pid2, sum, plate_row, plate_column, Sample.Plate.ID) %>% unique()
cont_source <- contamination_n_cont_df %>% filter(!(f_primary == "Bovidae")) %>%
filter(!(str_detect(pid, "CONTROL"))) %>% pull(f_primary) %>% table() %>% as.data.frame() %>% arrange(-Freq)
cat(paste("NOTE: contamination source can be either primary or secondary sequence within samples!"))
knitr::kable(cont_source,
col.names = c("Family", "No. Source Samples"),
format = "html") %>%
kableExtra::kable_styling(full_width = FALSE, position = "center")
# Plot heatmaps
contamination_n_cont_df %>% filter(!(f_primary == "Bovidae")) %>% arrange(rep_count_primary) %>%
ggplot(aes(y = factor(plate_column, levels = rev(LETTERS[1:8])), x = factor(plate_row, levels = 1:12))) +
geom_tile(aes(fill = rep_count_primary), color = "#C1CDCD") +
geom_tile(data = (contamination_n_cont_df %>% filter(!(f_primary == "Bovidae")) %>% filter(sample == "n_cont")),
aes(fill = rep_count_primary, color = partner_plate), linewidth = 1) +
geom_tile(data = (contamination_n_cont_df %>% filter(!(f_primary == "Bovidae")) %>% filter(pid %in% n_cont_failed_2)),
aes(fill = rep_count_primary), color = "#7FFF00", linewidth = 1.5) +
geom_text(data = (text_data),
aes(label = sum), size = 1.5, color = "white") +
# geom_tile(data = (contamination_n_cont_df_bovidae %>% filter(pid %in% n_cont_failed_2)),
# aes(fill = rep_count_primary), color = "#FF00FF", linewidth = 1.5) +
scale_fill_gradient(low = "#0000FF", high = "#FFD700", na.value = "white") +
facet_wrap(~ Sample.Plate.ID, ncol = 6) +
labs(title = paste("Heatmap of Read Counts by Partner Plate Position - Sequences in Negative Controls"),
x = "Plate Column",
y = "Plate Row",
fill = "Count", colour = "Partner [Controls]") +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
panel.grid = element_blank(),
strip.text = element_text(size = 10)) + theme_minimal()
```
```{r contamination_map_umi, fig.align='center', fig.width=18, fig.height=15}
# UMI
qc_AM <- merge(qc_AM, (umi_plates %>% dplyr::select(pid, UMI.Plate.ID, Forward.UMI.Label, Reverse.UMI.Label, Well.Coordinate)), by = "pid")
# Split the data frame into a list by partner plate ID
control_sequences_list <- split(qc_AM, qc_AM$UMI.Plate.ID)
# Remove plates that do not have contaminated negative controls
filtered_indices <- lapply(control_sequences_list, function(df) {
"sample" %in% colnames(df) && any(grepl("n_cont", df$sample))
})
control_sequences_list_filt <- control_sequences_list[unlist(filtered_indices)]
# Rename
names(control_sequences_list_filt) <- names(which(filtered_indices == TRUE))
contamination_n_cont_list <- list()
for(df in 1:length(control_sequences_list_filt)){
control_sequences_df_temp <- control_sequences_list_filt[[df]]
plate_control_sequences <- control_sequences_df_temp %>%
filter(sample == "n_cont") %>%
pull(sequence)
# Find source samples with sequences similar to those in negative controls
source_samples <- control_sequences_df_temp %>%
filter(sample == "sample") %>%
rowwise() %>%
filter(any(stringdist::stringdist(sequence, plate_control_sequences, method = "lv") <= max_differences)) %>%
pull(pid)
source_OTUs <- control_sequences_df_temp %>%
filter(sample == "sample") %>%
rowwise() %>%
filter(any(stringdist::stringdist(sequence, plate_control_sequences, method = "lv") <= max_differences)) %>%
pull(otu_primary)
contamination_df <- control_sequences_df_temp %>%
filter(sample == "n_cont" | pid %in% source_samples & otu_primary %in% source_OTUs)
contamination_n_cont_list[[df]] <- contamination_df
}
contamination_n_cont_df <- do.call(rbind, contamination_n_cont_list) %>% dplyr::select(-sequence)
contamination_n_cont_df$rep_count_primary <- as.numeric(contamination_n_cont_df$rep_count_primary)
contamination_n_cont_df <- contamination_n_cont_df %>%
mutate(
row = substr(Well.Coordinate, 1, 1),
column = substr(Well.Coordinate, 2, 3)
)
contamination_n_cont_df$column <- factor(contamination_n_cont_df$column, levels = 1:24)
text_data <- merge((contamination_n_cont_df %>% filter(!(f_primary == "Bovidae")) %>%
group_by(pid) %>% summarise(sum = sum(rep_count_primary))), contamination_n_cont_df, all = FALSE) %>% dplyr::select(pid, pid2, sum, row, column, UMI.Plate.ID) %>% unique()
# Plot heatmaps
contamination_n_cont_df %>% filter(!(f_primary == "Bovidae")) %>%
ggplot(aes(x = factor(column, levels = 1:24), y = factor(row, levels = rev(LETTERS[1:16])))) +
geom_tile(aes(fill = rep_count_primary), color = "#C1CDCD") +
geom_tile(data = (contamination_n_cont_df %>% filter(!(f_primary == "Bovidae")) %>% filter(sample == "n_cont")),
aes(fill = rep_count_primary, color = partner_plate), linewidth = 1) +
geom_tile(data = (contamination_n_cont_df %>% filter(!(f_primary == "Bovidae")) %>% filter(pid %in% n_cont_failed_2)),
aes(fill = rep_count_primary), color = "#7FFF00", linewidth = 1.5) +
# geom_tile(data = (contamination_n_cont_df_bovidae %>% filter(pid %in% n_cont_failed_2)),
# aes(fill = rep_count_primary), color = "#FF00FF", linewidth = 1.5) +
scale_fill_gradient(low = "#0000FF", high = "#FFD700", na.value = "white") +
geom_text(data = (text_data),
aes(label = sum), size = 1.5, color = "white") +
facet_wrap(~ UMI.Plate.ID, ncol = 4) +
labs(title = paste("Heatmap of Read Counts by UMI Plate Position - Sequences in Negative Controls"),
x = "Plate Column",
y = "Plate Row",
fill = "Count", colour = "Partner [Controls]") +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
panel.grid = element_blank(),
strip.text = element_text(size = 10)) + theme_minimal()
```
```{r negative_control_contamination_summary}
if (length(contamination_n_cont_df_bovidae$pid) == 0) {
non_bos_contaminated_n_cont <- n_cont_failed_2
} else {
non_bos_contaminated_n_cont <- n_cont_failed_2[-which(n_cont_failed_2 %in% contamination_n_cont_df_bovidae$pid)]
}
```
Outline: negative controls with contaminants <br>
Colour of the oultine indicates partners to track the samples between partner and UMI plates.<br>
Thicker chartreuse outline: FAILED negative controls with contaminants [2%] <br>
Numbers indicate the read count <br>
Squares that are not outlined represent potential sources of contamination within plates: identical sequences found within these wells and negative controls.