update protein_blast to use Biostrings and pwalign for alignment issue …

…#6
nvelden · May 13, 2024 · d56df6a · d56df6a
1 parent 360c379
commit d56df6a
Show file tree

Hide file tree

Showing 7 changed files with 47 additions and 66 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -64,5 +64,4 @@ importFrom(stats,setNames)
 importFrom(tidyr,pivot_wider)
 importFrom(tools,file_path_sans_ext)
 importFrom(utils,modifyList)
-importFrom(utils,packageVersion)
 importFrom(utils,read.table)
diff --git a/R/cluster_blast.R b/R/cluster_blast.R
@@ -219,38 +219,23 @@ synteny_score <- function(order1, order2, identity, i = 0.5) {
 #'   cur_data arrange desc
 #' @importFrom stats setNames ave
 #' @importFrom rlang .data
-#' @importFrom utils packageVersion
 #' @importFrom parallel detectCores makeCluster clusterExport clusterEvalQ
 #'   parLapply stopCluster
-#' @note This function selects the appropriate alignment package based on the
-#'   Bioconductor version. For versions earlier than 3.19, it relies on the
-#'   Biostrings package for sequence alignment. For versions 3.19 and higher, it
-#'   uses the pwalign package. Ensure these packages are installed using
-#'   BiocManager and loaded into your R session. The dplyr package is also used
-#'   for data manipulation.
+#' @note This function relies on the Biostrings and pwalign package for sequence
+#' alignment and the dplyr package for data manipulation. Ensure these packages
+#' are installed and loaded into your R session.
 #'
 #' @export
 protein_blast <- function(data, query, id = "protein_id", start = "start", end = "end", cluster = "cluster", genes = NULL, identity = 30, parallel = TRUE) {
 
-  # Check Bioconductor version and use either Biostrings or pwalign
-  if (requireNamespace("BiocVersion", quietly = TRUE)) {
-    if (utils::packageVersion("BiocVersion") < "3.19") {
-      if (!requireNamespace("Biostrings", quietly = TRUE)) {
-        stop('Biostrings package is not installed. Please install it using BiocManager::install("Biostrings").')
-      }
-      pairwise_alignment_fun <- Biostrings::pairwiseAlignment
-      get_pattern <- Biostrings::pattern
-      get_subject <- Biostrings::subject
-    } else {
-      if (!requireNamespace("pwalign", quietly = TRUE)) {
-        stop('pwalign package is not installed. Please install it using BiocManager::install("pwalign").')
-      }
-      pairwise_alignment_fun <- pwalign::pairwiseAlignment
-      get_pattern <- pwalign::pattern
-      get_subject <- pwalign::subject
-    }
-  } else {
-    stop("BiocManager is not installed. Please install it using install.packages('BiocManager').")
+  # Check if Biostrings package is installed
+  if (!requireNamespace("Biostrings", quietly = TRUE) || !"Biostrings" %in% loadedNamespaces()) {
+    stop('Biostrings package is not installed or not loaded. Install with BiocManager::install("Biostrings").')
+  }
+
+  # Check if pwalign package is installed and loaded
+  if (!requireNamespace("pwalign", quietly = TRUE) || !"pwalign" %in% loadedNamespaces()) {
+    stop('pwalign package is not installed or not loaded. Install with BiocManager::install("pwalign").')
   }
 
   # Load from .gbk files
@@ -337,9 +322,9 @@ protein_blast <- function(data, query, id = "protein_id", start = "start", end =
   protein_combinations_alignment <- protein_combinations_all[protein_combinations_all$cluster1 != protein_combinations_all$cluster2, ]
   protein_combinations_query <- protein_combinations_all[protein_combinations_all$rowID.x == protein_combinations_all$rowID.y, ]
 
-  alignments <- pairwise_alignment_fun(pattern = protein_combinations_alignment$translation1,
-                                             subject = protein_combinations_alignment$translation2,
-                                             scoreOnly = FALSE)
+  alignments <- pwalign::pairwiseAlignment(pattern = protein_combinations_alignment$translation1,
+                                              subject = protein_combinations_alignment$translation2,
+                                              scoreOnly = FALSE)
 
   # Decide on parallel or sequential processing based on the number of rows
   if (parallel && nrow(protein_combinations_alignment) > 1000) {
@@ -351,15 +336,15 @@ protein_blast <- function(data, query, id = "protein_id", start = "start", end =
     no_cores <- parallel::detectCores() - 1
     cl <- parallel::makeCluster(no_cores)
 
-    # Export the alignments list and the Biostrings library to each cluster node
+    # Export the alignments list and the pwalign library to each cluster node
     parallel::clusterExport(cl, varlist = c("alignments"), envir = environment())
-    # parallel::clusterEvalQ(cl, library(Biostrings))
+    parallel::clusterEvalQ(cl, library(pwalign))
 
     # Use parLapply for parallel execution
     alignment_list <- parallel::parLapply(cl, alignments, function(alignment) {
       list(
-        pattern = as.character(get_pattern(alignment)),
-        subject = as.character(get_subject(alignment))
+        pattern = as.character(pwalign::pattern(alignment)),
+        subject = as.character(pwalign::subject(alignment))
       )
     })
 
@@ -369,8 +354,8 @@ protein_blast <- function(data, query, id = "protein_id", start = "start", end =
     # Sequential processing
     alignment_list <- lapply(alignments, function(alignment) {
       list(
-        pattern = as.character(get_pattern(alignment)),
-        subject = as.character(get_subject(alignment))
+        pattern = as.character(pwalign::pattern(alignment)),
+        subject = as.character(pwalign::subject(alignment))
       )
     })
   }

diff --git a/R/geneviewer.R b/R/geneviewer.R
@@ -1882,7 +1882,7 @@ GC_trackMouse <- function(
 #'
 #' @param GC_chart The gene chart object to be modified.
 #' @param formatter A character string defining the HTML content of the tooltip. It can
-#'   include placeholders like `{start}` and `{end}` which will be replaced by actual
+#'   include placeholders like \code{\{start\}} and \code{\{end\}} which will be replaced by actual
 #'   data values. The default value shows start and end data.
 #' @param show Logical, whether to display the tooltip or not.
 #' @param cluster Optional; used to specify which clusters in the chart should have tooltips.

diff --git a/docs/articles/BLASTP.html b/docs/articles/BLASTP.html
diff --git a/man/GC_tooltip.Rd b/man/GC_tooltip.Rd
diff --git a/man/protein_blast.Rd b/man/protein_blast.Rd
diff --git a/vignettes/BLASTP.Rmd b/vignettes/BLASTP.Rmd
@@ -12,6 +12,8 @@ library(knitr)
 library(geneviewer)
 library(parallel)
 library(dplyr)
+library(Biostrings)
+library(pwalign)
 
 knitr::opts_chunk$set(
   collapse = TRUE,
@@ -25,20 +27,18 @@ This tutorial describes how we can use `geneviewer` to identify and visualize ho
 
 ### Materials
 
-The .gbk files and the additional gene info can be downloaded from the [geneviewer-tutorials](https://github.com/nvelden/geneviewer-tutorials/tree/main/Cluster%20comparison%20using%20BlastP) repository. For visualization, the `geneviewer` package is required. Sequence alignment is performed using either the `pwalign` or `Biostrings` package, depending on the Bioconductor version in use. Optionally, the `parallel` package can be utilized to increase processing times.
+The .gbk files and the additional gene info can be downloaded from the [geneviewer-tutorials](https://github.com/nvelden/geneviewer-tutorials/tree/main/Cluster%20comparison%20using%20BlastP) repository. For visualization, the `geneviewer` package is required. Sequence alignment is performed using the `Biostrings` and `pwalign` packages that can be downloaded from Bioconductor. Optionally, the `parallel` package can be utilized to increase processing times.
 
 ```{r eval=FALSE, results='hide'}
-install.packages("devtools")
-install.packages("parallel") # Optional but recommended for speeding up processing
-
 devtools::install_github("nvelden/geneviewer")
-
-# For BiocManager versions > 3.19 
+BiocManager::install("Biostrings")
 BiocManager::install("pwalign")
-# For BiocManager versions < 3.19 
-BiocManager::install("Biostrings") 
+# Optional but recommended for speeding up processing
+install.packages("parallel") 
 
 library(geneviewer)
+library(Biostrings)
+library(pwalign)
 library(parallel) 
 ```
 
@@ -67,7 +67,7 @@ kable(head(genbank_df))
 
 ### Run BlastP
 
-In this tutorial, we will directly input the folder path into `the protein_blast()` function to load our data. We'll select BGC0000055 as our query cluster and conduct a BlastP analysis to find the homologous in the other clusters. We use 30 as the minimum identity threshold. Performing the BlastP analysis can take several minutes so we set parallel processing to TRUE.
+In this tutorial, we will directly input the folder path into `the protein_blast()` function to load our data. We'll select BGC0000055 as our query cluster and conduct a BlastP analysis to find the homologous in the other clusters. We use 30 as the minimum identity threshold. Performing the BlastP analysis with this dataset can take several minutes so we set parallel processing to TRUE. For smaller datasets or if the `parallel` package is not installed, set parallel processing to `FALSE`.
 
 ```{r echo=TRUE, results='hide'}
 BlastP_results <- geneviewer::protein_blast(