-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
91 lines (68 loc) · 3.42 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import re
import streamlit as st
# --- User input -----------------------------------------------------------
# Raw text from the input box; each line is treated as one sequence (or one
# nomenclature string) after trimming the whole text and upper-casing it.
_seq_text = st.text_area(
    "Paste your sequence(s) and/or nomenclatures here. Each sequence on a separate line.",
    height=100,
    key="seq",
)
input_seqs = _seq_text.strip().upper().split("\n")

# --- Lookup tables --------------------------------------------------------
# Characters kept in a sequence; everything else is filtered out later.
accepted_chars = list("ACGTN")
# Complement of each unambiguous nucleotide.
rev_comp = dict(zip("ATCG", "TAGC"))
# Minimum repeat count per motif length (1-6): 5 for mono-nucleotide motifs,
# 2 for every longer motif.
min_len = {length: (5 if length == 1 else 2) for length in range(1, 7)}

# --- Options --------------------------------------------------------------
min_repeats = st.number_input(
    "Minimum number of repeats",
    min_value=2,
    value=2,
    step=1,
    key="min_repeats",
    help="Minimum number of repeats to be considered as a motif. Note that the minimum number of repeats for mono-nucleotide motifs is always 5.",
)
convert_to_base_motif = st.checkbox(
    "Convert to base motif",
    value=True,
    key="convert_to_base_motif",
    help="Reverse complement base motifs are indicated by ~ (tilde) symbol",
)
hide_N = st.checkbox(
    "Hide N",
    value=True,
    key="hide_N",
    help="Hide N in the nomenclature",
)
# Nucleotide complement table; N is ambiguous and therefore maps to itself.
_COMPLEMENT_TABLE = str.maketrans("ACGTN", "TGCAN")


def convert_nomenclature_to_sequence(input_seq):
    """Expand a repeat nomenclature string into the plain sequence it encodes.

    The nomenclature is a series of ``MOTIF[count]`` items, e.g. ``CAG[3]``.
    A motif prefixed with ``~`` is reverse-complemented before being repeated,
    so ``~AAG[2]`` expands to ``CTTCTT``. Text that is not part of a
    ``MOTIF[count]`` item is ignored.

    Args:
        input_seq: Upper-case nomenclature string.

    Returns:
        The expanded sequence, or ``""`` when no ``MOTIF[count]`` item is found.
    """
    pieces = []
    for motif, count in re.findall(r'(~?[ATCGN]+)\[(\d+)\]', input_seq):
        if motif.startswith("~"):
            # The pattern only allows "~" as the first character. The table
            # covers N as well, so motifs such as "~ANG" no longer raise
            # KeyError.
            motif = motif[1:][::-1].translate(_COMPLEMENT_TABLE)
        pieces.append(motif * int(count))
    return "".join(pieces)
def _load_motif_tables(motifs_path="motifs.txt", base_motifs_path="motifs_dict.txt"):
    """Read the motif list and the motif-to-base-motif mapping from disk.

    ``motifs_path`` holds one motif per line; ``base_motifs_path`` holds one
    ``motif,base_motif`` pair per line. Blank lines are skipped.

    Returns:
        A ``(motifs, base_motifs)`` tuple: the list of motif strings and a
        dict mapping a motif to its base motif.

    Raises:
        ValueError: If a non-blank line of ``base_motifs_path`` does not
            split into exactly two comma-separated fields.
    """
    with open(motifs_path) as handle:
        motifs = [line.rstrip("\n") for line in handle if line.strip()]
    with open(base_motifs_path) as handle:
        pairs = [line.rstrip("\n").split(",") for line in handle if line.strip()]
    return motifs, {key: value for key, value in pairs}


def _find_repeats(sequence, motifs, min_repeats):
    """Return every tandem run of each motif in ``sequence``, ordered by start.

    Each hit is a dict with ``start``/``end`` offsets into ``sequence``, the
    ``motif`` searched for and the number of ``repeats`` in the run. A run
    needs at least ``min_repeats`` copies, or the larger per-length minimum
    from the module-level ``min_len`` table.
    """
    hits = []
    for str_motif in motifs:
        # Motif lengths missing from min_len fall back to the user's minimum
        # instead of raising KeyError.
        repeat_num = max(min_len.get(len(str_motif), min_repeats), min_repeats)
        # Cheap substring test first: skip the regex for motifs that cannot match.
        if str_motif * repeat_num not in sequence:
            continue
        pattern = f"((?:{str_motif}){{{repeat_num},}})"
        for match in re.finditer(pattern, sequence):
            hits.append({
                "start": match.start(1),
                "end": match.end(1),
                "motif": str_motif,
                "repeats": match.group(1).count(str_motif),
            })
    # sorted() is stable, so hits sharing a start keep the motif-list order.
    return sorted(hits, key=lambda d: d['start'])


def _build_nomenclature(sequence_length, repeats, base_motifs, to_base, hide_gaps):
    """Render start-ordered repeat hits as a ``MOTIF[count]`` nomenclature string.

    Stretches not covered by any hit are written as ``N[length]`` unless
    ``hide_gaps`` is true. When ``to_base`` is true, a motif found in
    ``base_motifs`` is replaced by its mapped base motif.
    """
    parts = []
    cursor = 0
    for hit in repeats:
        if to_base:
            label = base_motifs.get(hit['motif'], hit['motif'])
        else:
            label = hit['motif']
        gap = hit['start'] - cursor
        # Overlapping hits give a negative gap; only a real uncovered stretch
        # is reported, so "N[-2]" is never emitted.
        if gap > 0 and not hide_gaps:
            parts.append(f"N[{gap}]")
        parts.append(f"{label}[{hit['repeats']}]")
        # Never move backwards when a hit ends before an earlier, longer one.
        cursor = max(cursor, hit['end'])
    tail = sequence_length - cursor
    if tail > 0 and not hide_gaps:
        parts.append(f"N[{tail}]")
    return "".join(parts)


# The motif tables do not depend on the input, so read them once per script
# run (and close the files) rather than once per sequence.
motifs_list, base_motifs_dict = _load_motif_tables()

for i, input_seq in enumerate(input_seqs):
    # Lines containing brackets are nomenclature strings: expand them first.
    if "[" in input_seq or "]" in input_seq:
        input_seq = convert_nomenclature_to_sequence(input_seq)

    # Warn once per line, then drop everything that is not an accepted character.
    if any(char not in accepted_chars for char in input_seq):
        st.error("Invalid characters are ignored")
    input_seq = "".join(char for char in input_seq if char in accepted_chars)

    sorted_str_repeat_list = _find_repeats(input_seq, motifs_list, min_repeats)
    nomenclature = _build_nomenclature(
        len(input_seq),
        sorted_str_repeat_list,
        base_motifs_dict,
        convert_to_base_motif,
        hide_N,
    )

    st.subheader(f"Result {i+1}")
    st.write(input_seq)
    st.write(nomenclature)
    # Sorted so the displayed order is the same on every run.
    unique_motifs = sorted({hit['motif'].replace("~", "") for hit in sorted_str_repeat_list})
    st.write(f"Unique motifs: {', '.join(unique_motifs)}")
    st.divider()