Skip to content

Commit

Permalink
update protein_blast to use Biostrings and pwalign for alignment issue
Browse files Browse the repository at this point in the history
  • Loading branch information
nvelden committed May 13, 2024
1 parent 360c379 commit d56df6a
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 66 deletions.
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,4 @@ importFrom(stats,setNames)
importFrom(tidyr,pivot_wider)
importFrom(tools,file_path_sans_ext)
importFrom(utils,modifyList)
importFrom(utils,packageVersion)
importFrom(utils,read.table)
55 changes: 20 additions & 35 deletions R/cluster_blast.R
Original file line number Diff line number Diff line change
Expand Up @@ -219,38 +219,23 @@ synteny_score <- function(order1, order2, identity, i = 0.5) {
#' cur_data arrange desc
#' @importFrom stats setNames ave
#' @importFrom rlang .data
#' @importFrom utils packageVersion
#' @importFrom parallel detectCores makeCluster clusterExport clusterEvalQ
#' parLapply stopCluster
#' @note This function selects the appropriate alignment package based on the
#' Bioconductor version. For versions earlier than 3.19, it relies on the
#' Biostrings package for sequence alignment. For versions 3.19 and higher, it
#' uses the pwalign package. Ensure these packages are installed using
#' BiocManager and loaded into your R session. The dplyr package is also used
#' for data manipulation.
#' @note This function relies on the Biostrings and pwalign package for sequence
#' alignment and the dplyr package for data manipulation. Ensure these packages
#' are installed and loaded into your R session.
#'
#' @export
protein_blast <- function(data, query, id = "protein_id", start = "start", end = "end", cluster = "cluster", genes = NULL, identity = 30, parallel = TRUE) {

# Check Bioconductor version and use either Biostrings or pwalign
if (requireNamespace("BiocVersion", quietly = TRUE)) {
if (utils::packageVersion("BiocVersion") < "3.19") {
if (!requireNamespace("Biostrings", quietly = TRUE)) {
stop('Biostrings package is not installed. Please install it using BiocManager::install("Biostrings").')
}
pairwise_alignment_fun <- Biostrings::pairwiseAlignment
get_pattern <- Biostrings::pattern
get_subject <- Biostrings::subject
} else {
if (!requireNamespace("pwalign", quietly = TRUE)) {
stop('pwalign package is not installed. Please install it using BiocManager::install("pwalign").')
}
pairwise_alignment_fun <- pwalign::pairwiseAlignment
get_pattern <- pwalign::pattern
get_subject <- pwalign::subject
}
} else {
stop("BiocManager is not installed. Please install it using install.packages('BiocManager').")
# Check if Biostrings package is installed
if (!requireNamespace("Biostrings", quietly = TRUE) || !"Biostrings" %in% loadedNamespaces()) {
stop('Biostrings package is not installed or not loaded. Install with BiocManager::install("Biostrings").')
}

# Check if pwalign package is installed and loaded
if (!requireNamespace("pwalign", quietly = TRUE) || !"pwalign" %in% loadedNamespaces()) {
stop('pwalign package is not installed or not loaded. Install with BiocManager::install("pwalign").')
}

# Load from .gbk files
Expand Down Expand Up @@ -337,9 +322,9 @@ protein_blast <- function(data, query, id = "protein_id", start = "start", end =
protein_combinations_alignment <- protein_combinations_all[protein_combinations_all$cluster1 != protein_combinations_all$cluster2, ]
protein_combinations_query <- protein_combinations_all[protein_combinations_all$rowID.x == protein_combinations_all$rowID.y, ]

alignments <- pairwise_alignment_fun(pattern = protein_combinations_alignment$translation1,
subject = protein_combinations_alignment$translation2,
scoreOnly = FALSE)
alignments <- pwalign::pairwiseAlignment(pattern = protein_combinations_alignment$translation1,
subject = protein_combinations_alignment$translation2,
scoreOnly = FALSE)

# Decide on parallel or sequential processing based on the number of rows
if (parallel && nrow(protein_combinations_alignment) > 1000) {
Expand All @@ -351,15 +336,15 @@ protein_blast <- function(data, query, id = "protein_id", start = "start", end =
no_cores <- parallel::detectCores() - 1
cl <- parallel::makeCluster(no_cores)

# Export the alignments list and the Biostrings library to each cluster node
# Export the alignments list and the pwalign library to each cluster node
parallel::clusterExport(cl, varlist = c("alignments"), envir = environment())
# parallel::clusterEvalQ(cl, library(Biostrings))
parallel::clusterEvalQ(cl, library(pwalign))

# Use parLapply for parallel execution
alignment_list <- parallel::parLapply(cl, alignments, function(alignment) {
list(
pattern = as.character(get_pattern(alignment)),
subject = as.character(get_subject(alignment))
pattern = as.character(pwalign::pattern(alignment)),
subject = as.character(pwalign::subject(alignment))
)
})

Expand All @@ -369,8 +354,8 @@ protein_blast <- function(data, query, id = "protein_id", start = "start", end =
# Sequential processing
alignment_list <- lapply(alignments, function(alignment) {
list(
pattern = as.character(get_pattern(alignment)),
subject = as.character(get_subject(alignment))
pattern = as.character(pwalign::pattern(alignment)),
subject = as.character(pwalign::subject(alignment))
)
})
}
Expand Down
2 changes: 1 addition & 1 deletion R/geneviewer.R
Original file line number Diff line number Diff line change
Expand Up @@ -1882,7 +1882,7 @@ GC_trackMouse <- function(
#'
#' @param GC_chart The gene chart object to be modified.
#' @param formatter A character string defining the HTML content of the tooltip. It can
#' include placeholders like `{start}` and `{end}` which will be replaced by actual
#' include placeholders like \code{\{start\}} and \code{\{end\}} which will be replaced by actual
#' data values. The default value shows start and end data.
#' @param show Logical, whether to display the tooltip or not.
#' @param cluster Optional; used to specify which clusters in the chart should have tooltips.
Expand Down
26 changes: 13 additions & 13 deletions docs/articles/BLASTP.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/GC_tooltip.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 3 additions & 6 deletions man/protein_blast.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 9 additions & 9 deletions vignettes/BLASTP.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ library(knitr)
library(geneviewer)
library(parallel)
library(dplyr)
library(Biostrings)
library(pwalign)
knitr::opts_chunk$set(
collapse = TRUE,
Expand All @@ -25,20 +27,18 @@ This tutorial describes how we can use `geneviewer` to identify and visualize ho

### Materials

The .gbk files and the additional gene info can be downloaded from the [geneviewer-tutorials](https://github.com/nvelden/geneviewer-tutorials/tree/main/Cluster%20comparison%20using%20BlastP) repository. For visualization, the `geneviewer` package is required. Sequence alignment is performed using either the `pwalign` or `Biostrings` package, depending on the Bioconductor version in use. Optionally, the `parallel` package can be utilized to increase processing times.
The .gbk files and the additional gene info can be downloaded from the [geneviewer-tutorials](https://github.com/nvelden/geneviewer-tutorials/tree/main/Cluster%20comparison%20using%20BlastP) repository. For visualization, the `geneviewer` package is required. Sequence alignment is performed using the `Biostrings` and `pwalign` packages that can be downloaded from Bioconductor. Optionally, the `parallel` package can be utilized to increase processing times.

```{r eval=FALSE, results='hide'}
install.packages("devtools")
install.packages("parallel") # Optional but recommended for speeding up processing
devtools::install_github("nvelden/geneviewer")
# For BiocManager versions > 3.19
BiocManager::install("Biostrings")
BiocManager::install("pwalign")
# For BiocManager versions < 3.19
BiocManager::install("Biostrings")
# Optional but recommended for speeding up processing
install.packages("parallel")
library(geneviewer)
library(Biostrings)
library(pwalign)
library(parallel)
```

Expand Down Expand Up @@ -67,7 +67,7 @@ kable(head(genbank_df))

### Run BlastP

In this tutorial, we will directly input the folder path into `the protein_blast()` function to load our data. We'll select BGC0000055 as our query cluster and conduct a BlastP analysis to find the homologous in the other clusters. We use 30 as the minimum identity threshold. Performing the BlastP analysis can take several minutes so we set parallel processing to TRUE.
In this tutorial, we will directly input the folder path into `the protein_blast()` function to load our data. We'll select BGC0000055 as our query cluster and conduct a BlastP analysis to find the homologous in the other clusters. We use 30 as the minimum identity threshold. Performing the BlastP analysis with this dataset can take several minutes so we set parallel processing to TRUE. For smaller datasets or if the `parallel` package is not installed, set parallel processing to `FALSE`.

```{r echo=TRUE, results='hide'}
BlastP_results <- geneviewer::protein_blast(
Expand Down

0 comments on commit d56df6a

Please sign in to comment.