-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathentropy.R
113 lines (97 loc) · 5.15 KB
/
entropy.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
library(entropy)
library(tidyverse)
library(ggpubr)
library(ggprism)
library(bio3d)
bcr <- read_csv("bcr_heavy.csv.gz")
bcr.heavy.v <- bcr
bcr.heavy.v$V <- gsub("\\-.*", "", bcr.heavy.v$V)
bcr.heavy.v$V <- gsub("S.*", "", bcr.heavy.v$V)
bcr.heavy.v$V <- gsub("IGHV", "VH", bcr.heavy.v$V)
CDR3aa <- bcr.heavy.v %>% filter(cell_type == "B-1a") %>%
select(cell_type, condition,CDR3aa, CDR3nt, count) %>%
group_by(cell_type, condition) %>%
mutate(seq_n = sum(count)) %>%
ungroup %>%
group_by(cell_type, condition, CDR3aa) %>%
dplyr::summarize(c_CDR3 = sum(count), f_CDR3 = sum(count)/seq_n ) %>%
ungroup %>% distinct %>% arrange(-f_CDR3)
CDR3aa.V <- bcr.heavy.v %>% filter(cell_type == "B-1a") %>%
select(cell_type, condition,CDR3aa, CDR3nt, count, V) %>%
group_by(cell_type, condition) %>%
mutate(seq_n = sum(count)) %>%
ungroup %>%
group_by(cell_type, condition, CDR3aa, V) %>%
dplyr::summarize(c_CDR3 = sum(count), f_CDR3 = sum(count)/seq_n ) %>%
ungroup %>% distinct %>% arrange(-f_CDR3) %>% ungroup
CDR3_props <- CDR3aa.V %>% select(CDR3aa, condition, f_CDR3, c_CDR3) %>% group_by(condition) %>% unique %>% ungroup
table(CDR3_props$condition, CDR3_props$f_CDR3 > 0.01)
seq_clones_WT <- sum((CDR3_props[CDR3_props$f_CDR3 > 0.01 & CDR3_props$condition == "WT",])$f_CDR3)
seq_clones_KO <- sum((CDR3_props[CDR3_props$f_CDR3 > 0.01 & CDR3_props$condition == "KO",])$f_CDR3)
# Sample BCR or CDR3 sequences for two lists
list1_sequences <- CDR3_props %>% filter(condition == "WT") %>% pull(CDR3aa)
list2_sequences <- CDR3_props %>% filter(condition == "KO") %>% pull(CDR3aa)
# Function to calculate Shannon entropy
calculate_entropy <- function(sequence) {
probabilities <- table(strsplit(sequence, NULL)) / nchar(sequence)
entropy_value <- entropy::entropy(probabilities)
return(entropy_value)
}
colo <- c('#4363d8', '#f58231','#e6194b', '#3cb44b', '#ffe119', '#911eb4',
'#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
'#9a6324', '#fffac8', '#800000', '#aaffc3',
'#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000')
# Calculate entropy for each list
entropy_list1 <- sapply(list1_sequences, calculate_entropy)
entropy_list2 <- sapply(list2_sequences, calculate_entropy)
entropy_list_wt <- tibble(condition = "WT", entropies = entropy_list1)
entropy_list_ko <- tibble(condition = "KO", entropies = entropy_list2)
entropy_all <- bind_rows(entropy_list_wt, entropy_list_ko)
entropy_all <- entropy_all %>% aggregate(entropies ~ condition, sum)
entropy_all$condition <- factor(entropy_all$condition, levels = c("WT", "KO"))
entropy_all %>% ggplot(aes(x = condition, y = entropies, fill = condition)) +
geom_bar(position = "dodge", stat = "identity") + theme_prism() +
scale_fill_manual(values = colo) + ggtitle("B-1a") + ylab("Shannon Entropy")
ggsave("plots/B-1a_entropy.png", w = 3.3, h = 2.5)
CDR3aa <- bcr.heavy.v %>% filter(cell_type == "B-1b") %>%
select(cell_type, condition,CDR3aa, CDR3nt, count) %>%
group_by(cell_type, condition) %>%
mutate(seq_n = sum(count)) %>%
ungroup %>%
group_by(cell_type, condition, CDR3aa) %>%
dplyr::summarize(c_CDR3 = sum(count), f_CDR3 = sum(count)/seq_n ) %>%
ungroup %>% distinct %>% arrange(-f_CDR3)
CDR3aa.V <- bcr.heavy.v %>% filter(cell_type == "B-1b") %>%
select(cell_type, condition,CDR3aa, CDR3nt, count, V) %>%
group_by(cell_type, condition) %>%
mutate(seq_n = sum(count)) %>%
ungroup %>%
group_by(cell_type, condition, CDR3aa, V) %>%
dplyr::summarize(c_CDR3 = sum(count), f_CDR3 = sum(count)/seq_n ) %>%
ungroup %>% distinct %>% arrange(-f_CDR3) %>% ungroup
CDR3_props <- CDR3aa.V %>% select(CDR3aa, condition, f_CDR3, c_CDR3) %>% group_by(condition) %>% unique %>% ungroup
# Sample BCR or CDR3 sequences for two lists
list1_sequences <- CDR3_props %>% filter(condition == "WT") %>% pull(CDR3aa)
list2_sequences <- CDR3_props %>% filter(condition == "KO") %>% pull(CDR3aa)
# Function to calculate Shannon entropy
calculate_entropy <- function(sequence) {
probabilities <- table(strsplit(sequence, NULL)) / nchar(sequence)
entropy_value <- entropy::entropy(probabilities)
return(entropy_value)
}
colo <- c('#4363d8', '#f58231','#e6194b', '#3cb44b', '#ffe119', '#911eb4',
'#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
'#9a6324', '#fffac8', '#800000', '#aaffc3',
'#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000')
# Calculate entropy for each list
entropy_list1 <- sapply(list1_sequences, calculate_entropy)
entropy_list2 <- sapply(list2_sequences, calculate_entropy)
entropy_list_wt <- tibble(condition = "WT", entropies = entropy_list1)
entropy_list_ko <- tibble(condition = "KO", entropies = entropy_list2)
entropy_all <- bind_rows(entropy_list_wt, entropy_list_ko)
entropy_all <- entropy_all %>% aggregate(entropies ~ condition, sum)
entropy_all$condition <- factor(entropy_all$condition, levels = c("WT", "KO"))
entropy_all %>% ggplot(aes(x = condition, y = entropies, fill = condition)) +
geom_bar(position = "dodge", stat = "identity") + theme_prism() +
scale_fill_manual(values = colo) + ggtitle("B-1b") + ylab("Shannon Entropy")
ggsave("plots/B-1b_entropy.png", w = 3.3, h = 2.5 )