diff --git a/DESCRIPTION b/DESCRIPTION index 3bb0c74..19b6b85 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,9 +2,11 @@ Package: pato Type: Package Title: Pangenome Analysis Toolkit Version: 1.0.2 -Author: Alba Talavera Rodríguez - Miguel Diez Fernandez de Bobadilla +Author: Miguel Diez Fernandez de Bobadilla + Alba Talavera Rodríguez Lucia Chacon Vargas + Fernando Baquero + Teresa M. Coque Val F. Lanza Maintainer: Val F. Lanza Description: PATO is a R package designed to analyze pangenomes (set of genomes) diff --git a/NAMESPACE b/NAMESPACE index dd10836..ad60625 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -40,9 +40,7 @@ export(snps_map) export(snps_pairwaise) export(twins) export(umap_plot) -import(Biostrings) import(ape) -import(data.table) import(dbscan) import(doParallel) import(dplyr) @@ -64,4 +62,5 @@ import(threejs) import(tibble) import(tidyr) import(uwot) +importFrom(data.table,fread) importFrom(randomcoloR,distinctColorPalette) diff --git a/R/accnet.R b/R/accnet.R index e884c74..65038d8 100644 --- a/R/accnet.R +++ b/R/accnet.R @@ -30,9 +30,8 @@ #' #' @import dplyr #' @import tidyr -#' @import tibble #' @import dtplyr -#' @import data.table +#' #' accnet <- function(mmseqs,threshold = 0.8, singles = TRUE) { diff --git a/R/accnet_enrichment_analysis.R b/R/accnet_enrichment_analysis.R index 3509462..eca7cd6 100644 --- a/R/accnet_enrichment_analysis.R +++ b/R/accnet_enrichment_analysis.R @@ -37,8 +37,6 @@ #' @import dplyr #' @import tidyr #' @import tibble -#' @import dtplyr -#' @import data.table #' accnet_enrichment_analysis <- function(data,cluster, padj_method = "BY") { diff --git a/R/accnet_with_padj.R b/R/accnet_with_padj.R index 91bada1..457be05 100644 --- a/R/accnet_with_padj.R +++ b/R/accnet_with_padj.R @@ -14,8 +14,6 @@ #' @import dplyr #' @import tidyr #' @import tibble -#' @import dtplyr -#' @import data.table #' accnet_with_padj <- function(data) { diff --git a/R/annotate.R b/R/annotate.R index bb3a90b..fff4323 100644 --- a/R/annotate.R +++ 
b/R/annotate.R @@ -50,8 +50,6 @@ #' @import dplyr #' @import tidyr #' @import tibble -#' @import dtplyr -#' @import data.table #' annotate <- function(data, type = "nucl", database =c("AbR","VF_A","VF_B"), query = "all") { diff --git a/R/as.data.frame.nr_list.R b/R/as.data.frame.nr_list.R index 2efbf11..be3e70c 100644 --- a/R/as.data.frame.nr_list.R +++ b/R/as.data.frame.nr_list.R @@ -5,7 +5,7 @@ #' @return A `data.frame` #' @export #' -#' @examples +#' as.data.frame.nr_list <- function(data) { return(data.frame( diff --git a/R/classifier.R b/R/classifier.R index bb9d3c5..c031e0e 100644 --- a/R/classifier.R +++ b/R/classifier.R @@ -13,12 +13,12 @@ #' @return Classifier returns a data.frame with the best hit for each input genome. #' @export #' -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table +#' @importFrom data.table fread classifier <- function(file_list, n_cores, type ="nucl", max_dist = 0.06) { diff --git a/R/cluster_accnet.R b/R/cluster_accnet.R index eda21bb..c130047 100644 --- a/R/cluster_accnet.R +++ b/R/cluster_accnet.R @@ -13,9 +13,11 @@ #' @import tibble #' @import uwot #' @import mclust -#' @import data.table +#' @importFrom data.table fread #' @import dbscan #' @import parallelDist +#' +#' cluster_accnet <- function(data, method, n_cluster, d_reduction, ...) 
{ Dist <- diff --git a/R/cluster_files_from_distance.R b/R/cluster_files_from_distance.R index 0f1a270..c795dfd 100644 --- a/R/cluster_files_from_distance.R +++ b/R/cluster_files_from_distance.R @@ -13,7 +13,7 @@ #' @import dtplyr #' @import tidyr #' @import tibble -#' @import data.table +#' @importFrom data.table fread #' @import igraph #' cluster_files_from_distance <- function(files, file_type, distance, n_cores, folder) diff --git a/R/cluster_knnn.R b/R/cluster_knnn.R index 83042d3..02923e9 100644 --- a/R/cluster_knnn.R +++ b/R/cluster_knnn.R @@ -9,7 +9,7 @@ #' @import dplyr #' @import tidyr #' @import tibble -#' @import data.table +#' @importFrom data.table fread cluster_knnn <- function(data,method) { diff --git a/R/cluster_mash.R b/R/cluster_mash.R index ec8f0ed..8657f9d 100644 --- a/R/cluster_mash.R +++ b/R/cluster_mash.R @@ -13,7 +13,7 @@ #' @import tibble #' @import uwot #' @import mclust -#' @import data.table +#' @importFrom data.table fread #' @import dbscan #' cluster_mash <- function(data, method, n_cluster,d_reduction,...) 
diff --git a/R/coincidents.R b/R/coincidents.R index 6f49cbb..9fdf16f 100644 --- a/R/coincidents.R +++ b/R/coincidents.R @@ -12,12 +12,12 @@ #' @return A data.frame with the protein/gene ID ("Target") and the membership (cluster number) #' @export #' -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table +#' @importFrom data.table fread #' @import uwot #' @import dbscan #' diff --git a/R/core_genome.R b/R/core_genome.R index 5e9ca40..79d857e 100644 --- a/R/core_genome.R +++ b/R/core_genome.R @@ -20,12 +20,12 @@ #' @return A core_genome object (a data.frame with two columns: fasta header and sequence) #' @export #' -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table +#' @importFrom data.table fread #' @import foreach #' @import doParallel #' @import parallel diff --git a/R/core_plots.R b/R/core_plots.R index d57a237..78a346d 100644 --- a/R/core_plots.R +++ b/R/core_plots.R @@ -26,7 +26,7 @@ #' @import tibble #' @import dtplyr #' @import ggplot2 -#' @import data.table +#' @importFrom data.table fread #' @import doParallel #' @import foreach #' diff --git a/R/core_snp_genome.R b/R/core_snp_genome.R index d1d9606..fdb8b08 100644 --- a/R/core_snp_genome.R +++ b/R/core_snp_genome.R @@ -28,7 +28,7 @@ #' @return core_snp_genome object #' @export #' -#' @examples +#' core_snp_genome <- function(file_list, n_cores, ref, type) { if(grepl('linux',Sys.getenv("R_PLATFORM"))) ## Linux diff --git a/R/core_snps_matrix.R b/R/core_snps_matrix.R index d060432..52d2ee2 100644 --- a/R/core_snps_matrix.R +++ b/R/core_snps_matrix.R @@ -9,7 +9,7 @@ #' @return A square matrix #' @export #' -#' @examples +#' #' @import stringdist #' @import magrittr #' diff --git a/R/dn_ds.R b/R/dn_ds.R index b2e5c95..4c5f10f 100644 --- a/R/dn_ds.R +++ b/R/dn_ds.R @@ -17,7 +17,7 @@ #' @return #' @export #' -#' @examples +#' #' #' dn_ds <- function(mmseq,accnet,min_size =5 ,n_cores,mode = "fast") 
diff --git a/R/export_accnet_aln.R b/R/export_accnet_aln.R index f0fe390..061b15b 100644 --- a/R/export_accnet_aln.R +++ b/R/export_accnet_aln.R @@ -12,7 +12,7 @@ #' #' @export #' -#' @examples +#' #' export_accnet_aln <- function(accnet,file, min_freq =3) { diff --git a/R/export_core_to_fasta.R b/R/export_core_to_fasta.R index 604bfe6..1970f51 100644 --- a/R/export_core_to_fasta.R +++ b/R/export_core_to_fasta.R @@ -5,7 +5,7 @@ #' #' @export #' -#' @examples +#' export_core_to_fasta <- function(core_data,file) { diff --git a/R/export_to_gephi.R b/R/export_to_gephi.R index d48ff41..ae623d4 100644 --- a/R/export_to_gephi.R +++ b/R/export_to_gephi.R @@ -22,7 +22,7 @@ #' @import tibble #' @import dtplyr #' @import igraph -#' @import data.table +#' @importFrom data.table fread export_to_gephi <- function(data, file, cluster) { if(is(data,"accnet")) diff --git a/R/extract_non_redundant.R b/R/extract_non_redundant.R index 157eeb2..5f88175 100644 --- a/R/extract_non_redundant.R +++ b/R/extract_non_redundant.R @@ -17,7 +17,7 @@ #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table +#' @importFrom data.table fread #' extract_non_redundant <- function(data, nr_list) { diff --git a/R/heatmap_of_annotation.R b/R/heatmap_of_annotation.R index ba21948..1c5f421 100644 --- a/R/heatmap_of_annotation.R +++ b/R/heatmap_of_annotation.R @@ -11,7 +11,7 @@ #' #' @export #' -#' @examples +#' #' #' @import dplyr #' @import tidyr diff --git a/R/knnn.R b/R/knnn.R index ce8fecf..ff46853 100644 --- a/R/knnn.R +++ b/R/knnn.R @@ -17,12 +17,12 @@ #' @return Returns an \emph{igraph} object. 
#' @export #' -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table +#' @importFrom data.table fread #' @import igraph #' @import parallelDist diff --git a/R/load_gff_list.R b/R/load_gff_list.R index 374f3de..a45b38a 100644 --- a/R/load_gff_list.R +++ b/R/load_gff_list.R @@ -13,13 +13,12 @@ #' The function returns a \code{gff_list} object the can be used as input for other functions (mmseqs, mash) #' @export #' -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table -#' @import Biostrings +#' @importFrom data.table fread #' @import openssl #' @import microseq #' @import foreach @@ -36,6 +35,9 @@ load_gff_list <- function(input_files, n_cores) cl <- makeCluster(n_cores) registerDoParallel(cl) + on.exit(file.remove(list.files(pattern = "gffTMP"))) + on.exit(stopCluster(cl), add = T) + input_files <- as_tibble(input_files) %>% rename(File = 1) folderName <- paste(getwd(),"/",md5(paste(input_files$File, sep = "",collapse = "")),"_gffList",sep = "",collapse = "") @@ -99,13 +101,12 @@ load_gff_list <- function(input_files, n_cores) ffn_faa <- inner_join(ffn_faa,gff) %>% select(Annot,Sequence) %>% rename(Header = Annot) writeFasta(ffn_faa, paste(folderName,"/ffn/",pathName,".ffn",sep = "",collapse = "")) ## Write the ffn file - ffn_faa <- ffn_faa %>% mutate(Sequence = translate(Sequence)) + ffn_faa <- ffn_faa %>% mutate(Sequence = microseq::translate(Sequence)) writeFasta(ffn_faa , paste(folderName,"/faa/",pathName,".faa",sep = "",collapse = "")) ## Write the faa file } - on.exit(file.remove(list.files(pattern = "gffTMP"))) - on.exit(stopCluster(cl), add = T) + results <- list(path = folderName, files = input_files) class(results) <- append(class(results),"gff_list") diff --git a/R/mash.R b/R/mash.R index bcd69ed..fe9a2a3 100644 --- a/R/mash.R +++ b/R/mash.R @@ -26,12 +26,12 @@ #' #' @references Mash: fast genome and metagenome distance estimation using MinHash. 
Ondov BD, Treangen TJ, Melsted P, Mallonee AB, Bergman NH, Koren S, Phillippy AM. Genome Biol. 2016 Jun 20;17(1):132. doi: 10.1186/s13059-016-0997-x. #' @references Mash Screen: High-throughput sequence containment estimation for genome discovery. Ondov BD, Starrett GJ, Sappington A, Kostic A, Koren S, Buck CB, Phillippy AM. BioRxiv. 2019 Mar. doi: 10.1101/557314 -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table +#' @importFrom data.table fread diff --git a/R/mmseqs.R b/R/mmseqs.R index 1c6e35d..3ee83c5 100644 --- a/R/mmseqs.R +++ b/R/mmseqs.R @@ -45,12 +45,12 @@ #' @references Steinegger M and Soeding J. MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nature Biotechnology, doi: 10.1038/nbt.3988 (2017). #' @references Steinegger M and Soeding J. Clustering huge protein sequence sets in linear time. Nature Communications, doi: 10.1038/s41467-018-04964-5 (2018). #' @references Mirdita M, Steinegger M and Soeding J. MMseqs2 desktop and local web server app for fast, interactive sequence searches. 
Bioinformatics, doi: 10.1093/bioinformatics/bty1057 (2019) -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table +#' @importFrom data.table fread #' @import foreach #' @import doParallel #' @import openssl diff --git a/R/network_of_annotation.R b/R/network_of_annotation.R index 02cfa08..72185ee 100644 --- a/R/network_of_annotation.R +++ b/R/network_of_annotation.R @@ -10,7 +10,7 @@ #' @return \emph{igraph} object #' @export #' -#' @examples +#' #' @seealso \code{\link{annotate}} #' #' @import dplyr @@ -18,7 +18,7 @@ #' @import tibble #' @import dtplyr #' @import igraph -#' @import data.table +#' @importFrom data.table fread #' network_of_annotation <- function(data, min_identity = 0.95, max_evalue = 1e-25) { diff --git a/R/non_redundant.R b/R/non_redundant.R index aff4791..fa474ed 100644 --- a/R/non_redundant.R +++ b/R/non_redundant.R @@ -21,13 +21,13 @@ #' #' @seealso \code{\link{extract_non_redundant}} #' -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr #' @import igraph -#' @import data.table +#' @importFrom data.table fread #' @import parallelDist #' non_redundant <- function(data, number, fraction, distance, tolerance = 0.05, max_iter = 10000, fast =TRUE, snps) diff --git a/R/non_redundant_hier.R b/R/non_redundant_hier.R index 7eaba3c..c3b5a98 100644 --- a/R/non_redundant_hier.R +++ b/R/non_redundant_hier.R @@ -24,13 +24,13 @@ #' #' @seealso \code{\link{extract_non_redundant}} #' -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr #' @import igraph -#' @import data.table +#' @importFrom data.table fread #' @import parallelDist #' non_redundant_hier <- function(data, number, fraction, distance, tolerance = 0.05, partitions = 10,max_iter = 10000, fast =FALSE, snps) diff --git a/R/non_redundant_pangenomes.R b/R/non_redundant_pangenomes.R index cc9d4f0..a0e211e 100644 --- a/R/non_redundant_pangenomes.R +++ 
b/R/non_redundant_pangenomes.R @@ -17,13 +17,13 @@ #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table +#' @importFrom data.table fread #' @import igraph # #' @return #' @export #' -#' @examples +#' non_redundant_pangenomes <- function(file_list, distance, type = "prot", n_cores,sketch = 1000, kmer = 21) { diff --git a/R/outliers.R b/R/outliers.R index c34eb8a..7d9e169 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -13,14 +13,14 @@ #' @export #' #' @seealso \code{\link{remove_outliers}} -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr #' @import ggplot2 #' @import magrittr -#' @import data.table +#' @importFrom data.table fread #' @import parallelDist #' #' diff --git a/R/pagenomes_from_mmseqs.R b/R/pagenomes_from_mmseqs.R index 51ce0ad..a94ab0e 100644 --- a/R/pagenomes_from_mmseqs.R +++ b/R/pagenomes_from_mmseqs.R @@ -21,12 +21,12 @@ #' #' @seealso \code{\link{clustering}} #' -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table +#' @importFrom data.table fread #' pangenomes_from_mmseqs <-function(data,cluster,min_freq = 0,max_freq = 1,min_pangenome_size = 1) diff --git a/R/pangenomes_from_files.R b/R/pangenomes_from_files.R index 75ea3d4..7fb75fa 100644 --- a/R/pangenomes_from_files.R +++ b/R/pangenomes_from_files.R @@ -34,7 +34,7 @@ #' @return An \emph{accnet} with an extra membership table. #' @export #' -#' @examples +#' pangenomes_from_files <- function(files, min_pange_size = 10, min_prot_freq = 2, file_type = 'prot', distance, cluster, coverage = 0.8, identity = 0.8, evalue = 1e-6, n_cores, cov_mode = 0, cluster_mode = 0) { diff --git a/R/pangenomes_from_files_mmseqs.R b/R/pangenomes_from_files_mmseqs.R index 234f9fb..f8ad427 100644 --- a/R/pangenomes_from_files_mmseqs.R +++ b/R/pangenomes_from_files_mmseqs.R @@ -15,12 +15,12 @@ #' @return A \emph{list} with two tables, the membership of the pangenome, #' and the gene/protein frequency. 
#' -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table +#' @importFrom data.table fread #' pangenomes_from_files_mmseqs <- function(file_list,i, coverage, identity, evalue, n_cores, cov_mode,cluster_mode,folder) { diff --git a/R/pangenomes_mmseq.R b/R/pangenomes_mmseq.R index c0e1a14..96827aa 100644 --- a/R/pangenomes_mmseq.R +++ b/R/pangenomes_mmseq.R @@ -39,12 +39,12 @@ #' @references Steinegger M and Soeding J. MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nature Biotechnology, doi: 10.1038/nbt.3988 (2017). #' @references Steinegger M and Soeding J. Clustering huge protein sequence sets in linear time. Nature Communications, doi: 10.1038/s41467-018-04964-5 (2018). #' @references Mirdita M, Steinegger M and Soeding J. MMseqs2 desktop and local web server app for fast, interactive sequence searches. Bioinformatics, doi: 10.1093/bioinformatics/bty1057 (2019) -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table +#' @importFrom data.table fread #' pangenomes_mmseqs <- function(file_list, coverage, identity, evalue, n_cores, cov_mode, cluster_mode,folder) { diff --git a/R/plasmidome.R b/R/plasmidome.R index 4bab447..1772678 100644 --- a/R/plasmidome.R +++ b/R/plasmidome.R @@ -35,8 +35,6 @@ #' @return #' @export #' -#' @examples -#' #' #' @references Arredondo-Alonso et al., Microbial Genomics 2018;4 DOI 10.1099/mgen.0.000224 #' @@ -44,10 +42,11 @@ #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table -#' @import Biostrings +#' @importFrom data.table fread #' @import openssl #' @import microseq +#' +#' @return A \emph{gff_list} object #' @@ -120,7 +119,7 @@ plasmidome <- function(gff_list, specie) ffn_faa <- gff2fasta(gff %>% filter(Type == "CDS"),fasta) writeFasta(ffn_faa, paste(folderName,"/ffn/",pathName,".ffn",sep = "",collapse = "")) ## Write the ffn file - ffn_faa <- ffn_faa 
%>% mutate(Sequence = translate(Sequence)) + ffn_faa <- ffn_faa %>% mutate(Sequence = microseq::translate(Sequence)) writeFasta(ffn_faa , paste(folderName,"/faa/",pathName,".faa",sep = "",collapse = "")) ## Write the faa file } } diff --git a/R/plot_knnn_network.R b/R/plot_knnn_network.R index 78a22f4..b3c9d11 100644 --- a/R/plot_knnn_network.R +++ b/R/plot_knnn_network.R @@ -30,13 +30,13 @@ #' #' @seealso \code{\link{igraph}} #' -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr #' @import threejs -#' @import data.table +#' @importFrom data.table fread #' @import manipulateWidget #' @import htmltools #' @importFrom randomcoloR distinctColorPalette diff --git a/R/remove_outliers.R b/R/remove_outliers.R index c7d3978..9152969 100644 --- a/R/remove_outliers.R +++ b/R/remove_outliers.R @@ -10,12 +10,12 @@ #' @export #' #' @seealso \code{\link{outliers}} -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table +#' @importFrom data.table fread #' remove_outliers <- function(data,outliers) { diff --git a/R/sharedness.R b/R/sharedness.R index 0bd5653..9059b2c 100644 --- a/R/sharedness.R +++ b/R/sharedness.R @@ -8,7 +8,7 @@ #' each pair of samples. 
#' @export #' -#' @examples +#' sharedness <- function(data) { if(is(data,"accnet")) diff --git a/R/similarity_network.R b/R/similarity_network.R index c1aaf08..fbff540 100644 --- a/R/similarity_network.R +++ b/R/similarity_network.R @@ -17,7 +17,7 @@ #' @import igraph #' @import parallelDist #' -#' @examples +#' similarity_network <- function(data, threshold) { if(is(data,"mash")) diff --git a/R/similarity_tree.R b/R/similarity_tree.R index 9dcc5c8..e723a9f 100644 --- a/R/similarity_tree.R +++ b/R/similarity_tree.R @@ -21,11 +21,11 @@ #' @return An object of class \emph{phylo} #' @export #' -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble -#' @import data.table +#' @importFrom data.table fread #' @import ape #' @import parallelDist diff --git a/R/singles.R b/R/singles.R index 97dcd52..5182d28 100644 --- a/R/singles.R +++ b/R/singles.R @@ -13,12 +13,12 @@ #' Genome/Cluster and its annotation. #' @export #' -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr -#' @import data.table +#' @importFrom data.table fread #' singles <- function(data, cluster) { diff --git a/R/snps_map.R b/R/snps_map.R index cee5ba8..ac1c462 100644 --- a/R/snps_map.R +++ b/R/snps_map.R @@ -13,7 +13,7 @@ #' @import dplyr #' @import tidyr #' -#' @examples +#' snps_map <- function(core_snp_genome) { diff --git a/R/snps_pairwaise.R b/R/snps_pairwaise.R index 53fcb2f..cb24b51 100644 --- a/R/snps_pairwaise.R +++ b/R/snps_pairwaise.R @@ -14,14 +14,14 @@ #' @return #' @export #' -#' @examples +#' #' #' @import foreach #' @import doParallel #' @import dplyr #' @import tidyr #' @import stringr -#' @import data.table +#' @importFrom data.table fread #' snps_pairwaise <- function(file_list,type,n_cores, norm =T) { diff --git a/R/twins.R b/R/twins.R index 202c1d1..8d01dc3 100644 --- a/R/twins.R +++ b/R/twins.R @@ -14,12 +14,12 @@ #' } #' @export #' -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import dtplyr -#' @import 
data.table +#' @importFrom data.table fread #' twins <- function(data) { diff --git a/R/umap_plot.R b/R/umap_plot.R index dacdfe7..8d8a68d 100644 --- a/R/umap_plot.R +++ b/R/umap_plot.R @@ -18,12 +18,12 @@ #' @references McInnes, L, Healy, J, UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction, ArXiv e-prints 1802.03426, 2018 #' @references James Melville (2019). uwot: The Uniform Manifold Approximation and Projection (UMAP) Method for #' Dimensionality Reduction. R package version 0.1.4. https://CRAN.R-project.org/package=uwot -#' @examples +#' #' @import dplyr #' @import tidyr #' @import tibble #' @import ggplot2 -#' @import data.table +#' @importFrom data.table fread #' @importFrom randomcoloR distinctColorPalette umap_plot <- function(data, plot = TRUE, cluster,...) { diff --git a/man/dn_ds.Rd b/man/dn_ds.Rd index 72ef8f2..dba8d80 100644 --- a/man/dn_ds.Rd +++ b/man/dn_ds.Rd @@ -29,7 +29,3 @@ dn_ds(mmseq, accnet, min_size = 5, n_cores, mode = "fast") This function needs \emph{mafft} and/or \emph{blastn} intalled in you system and available in the PATH. } -\examples{ - - -} diff --git a/man/export_accnet_aln.Rd b/man/export_accnet_aln.Rd index ba53fcf..1af09fb 100644 --- a/man/export_accnet_aln.Rd +++ b/man/export_accnet_aln.Rd @@ -21,6 +21,3 @@ that can be use with RaxML or iqtree to build a phylogenetic tree. Some phylogenetic tree inference tool allow as input binary aligments. We recommend to use RAxML-NG or iQtree to inference an accessory genome tree. } -\examples{ - -} diff --git a/man/heatmap_of_annotation.Rd b/man/heatmap_of_annotation.Rd index 16f95d6..ac85f2d 100644 --- a/man/heatmap_of_annotation.Rd +++ b/man/heatmap_of_annotation.Rd @@ -19,6 +19,3 @@ filter the results by identity and/or evalue. This function uses \emph{pheatmap} instead of \emph{heatmap} funtion if \emph{pheatmap} is installed. 
} -\examples{ - -} diff --git a/man/plasmidome.Rd b/man/plasmidome.Rd index 8f93ce4..4afdde9 100644 --- a/man/plasmidome.Rd +++ b/man/plasmidome.Rd @@ -45,10 +45,6 @@ To install \emph{mlplasmids} you must have installed \emph{devtools} (usually you have because PATO requires that you have devtools installed too). Then type: \code{devtools::install_git("https://gitlab.com/sirarredondo/mlplasmids")} -} -\examples{ - - } \references{ Arredondo-Alonso et al., Microbial Genomics 2018;4 DOI 10.1099/mgen.0.000224 diff --git a/man/snps_pairwaise.Rd b/man/snps_pairwaise.Rd index 947d013..f73c02a 100644 --- a/man/snps_pairwaise.Rd +++ b/man/snps_pairwaise.Rd @@ -25,6 +25,3 @@ calculates the raw number of SNPs among each pair of sequences avoiding the bias reference. However, this function is an \emph{O(N²)} so can be very slow for large datasets. } -\examples{ - -} diff --git a/vignettes/cov_mode.png b/vignettes/cov_mode.png index 76f844e..76bf6f6 100644 Binary files a/vignettes/cov_mode.png and b/vignettes/cov_mode.png differ diff --git a/vignettes/vignette.Rmd b/vignettes/vignette.Rmd index b490421..04ed2ed 100644 --- a/vignettes/vignette.Rmd +++ b/vignettes/vignette.Rmd @@ -1,10 +1,12 @@ --- title: 'PATO: Pangenome Analysis Toolkit' -author: "Val F. Lanza" +author: "Miguel D. Fernández-de-Bobadilla & Val F. Lanza" date: "`r Sys.Date()`" output: pdf_document: toc: yes + toc_depth: 3 + fig_crop: false github_document: null html_document: number_section: no @@ -244,7 +246,7 @@ PATO includes internal functions and other wrap functions that calls external binaries. We split the functions into 4 categories: *Main Functions*, *Analysis Functions*, *Quality Control* and *Visualization*. 
-![](./FunctionsTable.png){height=70%} +![Table of the PATO functions](./FunctionsTable.png){height=70%} ### Main Functions @@ -269,10 +271,8 @@ gffs <- load_gff_list(gff_files) #### `mash()` -`mash()` functions is a wrapper of [Mash](https://github.com/marbl/Mash) -Mash is a method for "Fast genome and metagenome distance estimation using -MinHash"(). MASH accept nucleotide or aminoacid fasta files and estimates a +It is a wrapper of [Mash](https://github.com/marbl/Mash). Mash is a method for "Fast genome and metagenome distance estimation using MinHash". MASH accepts nucleotide or amino acid fasta files and estimates a similarity distance. To extend the information we recommend to read the main paper *Mash: fast genome and metagenome distance estimation using MinHash. Ondov BD, Treangen TJ, Melsted P, Mallonee AB, Bergman NH, Koren S, Phillippy @@ -297,19 +297,18 @@ sketch = 1000, kmer = 21, type = "prot") #### `mmseqs()` -`mmseqs()` is a wrapper of [MMSeqs2](https://github.com/soedinglab/MMseqs2) +It is a wrapper of [MMSeqs2](https://github.com/soedinglab/MMseqs2) software. - Creators of MMSeqs define their software as: ->MMseqs2 (Many-against-Many sequence searching) is a software suite to search and +>_MMseqs2 (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets. MMseqs2 is an open source GPL-licensed software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows. The software is designed to run on multiple cores and servers and exhibits a very good scalability. MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed achieves almost the same sensitivity. It can perform profile searches with the same sensitivity as -PSI-BLAST at over 400 times its speed. +PSI-BLAST at over 400 times its speed._ The algorithm renames all the files first, creating a new unique header for each gene/protein. 
Then the algorithm clusters all the sequences using the `mmseqs linclust` pipeline. Then the clustering results are loaded in R and parse to create the `mmseq` object. @@ -343,7 +342,8 @@ bidirectional, (1) target coverage, (2) query coverage and (3) target-in-query length coverage. In the context of cluster or linclust, the query is seen representative sequence and target is a member sequence. The `-cov-mode` flag also automatically sets the `cluster_mode`. -![Coverage Mode](./cov_mode.png){width=100%} + +![Coverage Mode](./cov_mode.png) ##### Clustering modes All clustering modes transform the alignment results into an undirected graph. @@ -942,7 +942,7 @@ We can inspect the Pangenome composition of our dataset with: cp <- core_plots(ecoli_mm,reps = 10, threshold = 0.95, steps = 10) ``` -![Core Plots](./core_plot.png) +![Plots of the pangenome composition](./core_plot.png) @@ -1027,7 +1027,7 @@ the genome (in magabases) or can be a raw number of variants. var_matrix = core_snps_matrix(core, norm = TRUE) pheatmap::pheatmap(var_matrix,show_rownames = F, show_colnames = F) ``` -![SNPs matrix heatmap](./snp_heatmap.png) +![HeatMap of the _core-snp-genome_ SNP matrix](./snp_heatmap.png) ### Accessory Genome Analysis @@ -1167,7 +1167,7 @@ The output has a `phylo` format, so can be visualized with external packages as mash_tree_fastme %>% midpoint %>% ggtree(layout = "circular") %<+% annot_tree + geom_tippoint(aes(color = organism_name)) ``` -![Neighbor Joining tree](./tree_mash.png) +![Neighbor Joining tree of MASH distances](./tree_mash.png) Using other external packages, we can compare the arrangement of the pangenome (`mash` data) against the accessory genome (`accnet`) @@ -1178,7 +1178,7 @@ tanglegram(ladderize(mash_tree_upgma), ladderize(accnet_tree_upgma), ``` -![](./tanglegram.png) +![Tanglegram between Mash and AccNET dendrograms](./tanglegram.png) Some Maximum Likelihood inference trees software accept, as input, binary data (0-1) alignments. 
So, we can use accessory data (*accnet*) to infer a tree @@ -1570,7 +1570,7 @@ pheatmap::pheatmap(tmp, ) ``` -![](./SnpMatrixRoaryLike.png) +![Matrix of SNPs of the _core-genome_](./SnpMatrixRoaryLike.png) We can manually inspect the matrix in the console. ```{r eval=F, include=T, tidy=T} tmp %>% @@ -1664,7 +1664,7 @@ pheatmap(tmp, ) ``` -![SNPs matrix](./SnpMatrixRoaryLike.png) +![HeatMap of the _core-snp-genome_ SNPs matrix](./SnpMatrixSnippyLike.png) And a manual inspection ```{r eval = F, include=T, tidy=TRUE} @@ -1735,7 +1735,7 @@ pheatmap(tmp, ``` -![](./SnpMatrixPairwise.png) +![HeatMap of the pairwise SNP Matrix](./SnpMatrixPairwise.png) And the manual inspection reveals that ```{r eval=F, include=T, tidy=T} @@ -1995,7 +1995,7 @@ CompareAll(trees,RobinsonFoulds)%>% pheatmap::pheatmap(.,display_numbers = T) ``` -![](./TreeComparison.png) +![HeatMap of the Robinson-Foulds distances](./TreeComparison.png) We compare the trees using [_TreeDist_](https://doi.org/10.1093/bioinformatics/btaa614) diff --git a/vignettes/vignette.pdf b/vignettes/vignette.pdf deleted file mode 100644 index 8aa8b9b..0000000 Binary files a/vignettes/vignette.pdf and /dev/null differ