-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtools.R
157 lines (140 loc) · 7.08 KB
/
tools.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#' @keywords internal
parse_ft <- function(text) {
text <- gsub(text, pattern = "\n\t\t\t", replacement = "\t", fixed = TRUE)
parts <- strsplit(text, "\n\n", fixed = TRUE)[[1]]
part_data <- lapply(parts, function(x) {
first_line <- sub(x, pattern = "\\n.+$", replacement = "")
acc <- stringr::str_match(first_line, pattern = "\\|(.+)\\|")[,2]
the_rest <- sub(x, pattern = "^>.+?\\n", replacement = "")
# replace extra \t with , when there are multiple features
lines <- strsplit(the_rest, "\n")[[1]]
lines <- purrr::map2(stringr::str_locate_all(lines, "\t"), lines, function(matches, line) {
if (nrow(matches) > 4) {
for (i in matches[3:(nrow(matches) - 2), 1]) {
substr(line, i, i) <- ","
}
}
return(line)
})
the_rest <- paste0(lines, collapse = "\n")
output <- readr::read_tsv(paste0(the_rest, "\n"), col_names = c("start", "end", "feature", "type", "name"), col_types = "ccccc")
output <- tibble::as_tibble(cbind(list(acc = acc), output, stringsAsFactors = FALSE))
})
output <- dplyr::bind_rows(part_data)
output$complete <- ifelse(startsWith(as.character(output$start), "<") | startsWith(as.character(output$end), ">"), FALSE, TRUE)
output$start <- as.integer(gsub(output$start, pattern = "<", replacement = ""))
output$end <- as.integer(gsub(output$end, pattern = ">", replacement = ""))
return(output)
}
#' @keywords internal
parse_seqs <- function(text) {
xml <- xml2::read_xml(text)
tibble::tibble(acc = xml2::xml_text(xml2::xml_find_all(xml, "//TSeq_accver")),
seq = xml2::xml_text(xml2::xml_find_all(xml, "//TSeq_sequence")),
header = xml2::xml_text(xml2::xml_find_all(xml, "//TSeq_defline")),
length = xml2::xml_text(xml2::xml_find_all(xml, "//TSeq_length")))
}
#' Lookup gene sequences from NCBI
#'
#' Look for sequences of a particular gene for a list of species/isolates/genes from the Genbank
#' nucleotide database.
#'
#' @param species The names of species to look up.
#' @param genes The names of the genes to look up.
#' @param isolates The names of isolates to look up. Must be the same length as \code{species} if used.
#' @param extract_features If TRUE, return the sequence for each feature in the sequence annotation
#' instead of the whole sequence.
#' @param gene_name_in_feature If TRUE, only return features that have one of the gene names
#' somewhere in their description. Only has an effect if extract_features is TRUE.
#' @param db The name of the NCBI database to query. Only tested with "nucleotide", but a few others
#' might work.
#' @param pause The number of seconds to pause between each query. This avoids annoying NCBI and
#' having them block you IP address. Should be at least 0.35 seconds if you dont have an NCBI API
#' key and at least 0.1 seconds if you do.
#' @param ... Additional terms to add to the search request for each species/isolate, see NCBI documentation for a complete list: http://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ESearch_
#'
#' @return A table
#'
#' @examples \dontrun{
#'
#' # Search for the whole seqeunces for with P. infestans Cox I
#' get_isolate_seqs(species = c("Phytophthora infestans"),
#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1"),
#' retmax = 100)
#'
#' # Search for the just the gene sequence for P. infestans Cox I
#' get_isolate_seqs(species = c("Phytophthora infestans"),
#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1"),
#' retmax = 100,
#' extract_features = TRUE)
#'
#' # Search for all the gene sequences in whole sequences that contain P. infestans Cox I
#' get_isolate_seqs(species = c("Phytophthora infestans"),
#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1"),
#' retmax = 100,
#' extract_features = TRUE,
#' gene_name_in_feature = FALSE)
#'
#' # Search for whole sequences for P. infestans Cox I for just some isolates
#' get_isolate_seqs(species = c("Phytophthora infestans", "Phytophthora infestans", "Phytophthora infestans"),
#' isolates = c("44", "580", "180"),
#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1"))
#'
#' # Search for just the gene sequences for P. infestans Cox I for just some isolates
#' get_isolate_seqs(species = c("Phytophthora infestans", "Phytophthora infestans", "Phytophthora infestans"),
#' isolates = c("44", "580", "180"),
#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1"),
#' extract_features = TRUE)
#' }
#'
#' @export
get_isolate_seqs <- function(species, genes, isolates = NULL, extract_features = FALSE, gene_name_in_feature = TRUE, db = "nucleotide", pause = 0.5, ...) {
get_one <- function(name, isolate = NULL) {
# Wait a bit so NCBI doesnt get unhappy
Sys.sleep(pause)
# Search for sequences
if (is.null(isolate)) {
query <- paste0('"', name, '"[Organism] AND (', paste0('"', genes, '"[All Fields]', collapse = " OR "), ')')
} else {
query <- paste0('"', name, '"[Organism] AND ("', isolate, '"[Isolate] OR "', isolate, '"[Strain]) AND (', paste0('"', genes, '"[All Fields]', collapse = " OR "), ')')
}
search <- rentrez::entrez_search(db, term = query, ...)
if (length(search$ids) == 0) {
return(NULL)
}
if (extract_features) {
# Parse features
features <- parse_ft(rentrez::entrez_fetch(db, id = search$ids, rettype = "ft", retmode = "text"))
if (gene_name_in_feature) {
gene_in_feature <- purrr:::map_lgl(features$name, function(text) {
purrr:::reduce(lapply(genes, function(gene) grepl(tolower(text), pattern = tolower(gene), fixed = TRUE)), `|`)
})
features <- features[gene_in_feature, ]
}
if (nrow(features) == 0) {
return(NULL)
}
# Parse sequences
sequences <- parse_seqs(rentrez::entrez_fetch(db, id = search$ids, rettype = "fasta", retmode = "xml"))
# Join feature and sequence data
output <- dplyr::left_join(features, sequences, by = "acc")
# Subset sequences to fetures
output$seq <- substr(output$seq, output$start, output$end)
output$length <- nchar(output$seq)
} else {
output <- parse_seqs(rentrez::entrez_fetch(db, id = search$ids, rettype = "fasta", retmode = "xml"))
}
# Add query info
if (is.null(isolate)) {
output <- tibble::as_tibble(cbind(list(species = name, query = query), output, stringsAsFactors = FALSE))
} else {
output <- tibble::as_tibble(cbind(list(species = name, isolate = isolate, query = query), output, stringsAsFactors = FALSE))
}
return(output)
}
if (is.null(isolates)) {
return(dplyr::bind_rows(purrr::pmap(list(species), get_one)))
} else {
return(dplyr::bind_rows(purrr::pmap(list(species, isolates), get_one)))
}
}