diff --git a/.Rbuildignore b/.Rbuildignore
index acafa50..0932fc1 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -2,6 +2,7 @@
 ^fasta_files$
 ^genbank_files$
 ^gff_files$
+^bed_files$
 ^renv$
 ^renv\.lock$
 ^.*\.Rproj$
diff --git a/.gitignore b/.gitignore
index c4aba76..ee72a43 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@
 .DS_store
 renv/
 fasta_files/
+bed_files/
 genbank_files/
 gff_files/
 renv.lock
diff --git a/NAMESPACE b/NAMESPACE
index e0e1b27..7480fd7 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -24,6 +24,7 @@ export(GC_tooltip)
 export(GC_trackMouse)
 export(gbk_features_to_df)
 export(protein_blast)
+export(read_bed)
 export(read_fasta)
 export(read_gbk)
 export(read_gff)
diff --git a/R/read_bed.R b/R/read_bed.R
new file mode 100644
index 0000000..c3d2c49
--- /dev/null
+++ b/R/read_bed.R
@@ -0,0 +1,159 @@
+#' Read BED Files
+#'
+#' This function reads BED files from a specified directory or file path and
+#' combines them into a single data frame. BED files use 0-based, half-open
+#' coordinates, while this function transforms the coordinates to 1-based,
+#' closed coordinates during import.
+#'
+#' @param path A character string specifying the directory containing BED files
+#' or the file path to a single BED file.
+#'
+#' @return A data frame combining data from the BED files.
+#'
+#' @details This function can read multiple BED files from a directory or a
+#' single BED file from a specified path. When reading from a directory, it
+#' adds a 'filename' column with the name of the file, and combines the data
+#' frames from all files into one. Start positions ('chromStart',
+#' 'thickStart') are incremented by one, while end positions ('chromEnd',
+#' 'thickEnd') are left unchanged because BED end positions are exclusive.
+#' 'blockSizes' and 'blockStarts' are returned as list columns of numeric
+#' vectors, with 'blockStarts' converted to 1-based offsets.
+#'
+#' @examples
+#' \dontrun{
+#' # Read BED files from a directory
+#' bed_data <- read_bed("path/to/directory")
+#'
+#' # Read a single BED file
+#' bed_data <- read_bed("path/to/file.bed")
+#' }
+#' @importFrom dplyr bind_rows
+#' @export
+read_bed <- function(path) {
+
+  if (!is.character(path) || length(path) != 1L || is.na(path)) {
+    stop("`path` must be a single character string.", call. = FALSE)
+  }
+
+  if (dir.exists(path)) {
+    # It's a directory
+    files <- list.files(path, pattern = "\\.bed[0-9]*$", full.names = TRUE)
+
+    # Check if there are any .bed files in the directory
+    if (length(files) == 0) {
+      stop("No .bed files found in the specified directory.", call. = FALSE)
+    }
+
+    # Process each file into its own list slot. Indexing by position (not by
+    # the stripped filename) keeps both "x.bed" and "x.bed12" in the result.
+    data_list <- lapply(files, function(file) {
+      data <- process_bed(file)
+      filename <- sub("\\.bed[0-9]*$", "", basename(file))
+      # rep() keeps the assignment valid for files without any data rows
+      data$filename <- rep(filename, nrow(data))
+      data
+    })
+
+    # Combine data frames from all files into one
+    return(dplyr::bind_rows(data_list))
+  }
+
+  if (file.exists(path)) {
+    return(process_bed(path))
+  }
+
+  stop("The specified path does not exist.", call. = FALSE)
+}
+
+#' Convert a comma-separated BED block field to a numeric vector
+#'
+#' @param x A single comma-separated value such as "0,150,". A trailing comma
+#' is allowed.
+#' @param offset Number added to every element. The default of 1 converts
+#' 0-based values to 1-based; use 0 for lengths such as 'blockSizes'.
+#'
+#' @return A numeric vector.
+#' @noRd
+block_to_numeric <- function(x, offset = 1) {
+  # as.character() guards against fields that were parsed as numbers
+  as.numeric(unlist(strsplit(as.character(x), ",", fixed = TRUE))) + offset
+}
+
+#' Parse a single BED file into a data frame
+#'
+#' @param path File path to a single BED file.
+#'
+#' @return A data frame with one row per feature. Columns are named after the
+#' standard BED fields, based on the number of fields in the first data line.
+#' @noRd
+process_bed <- function(path) {
+
+  field_names <- c("chrom", "chromStart", "chromEnd", "name", "score",
+                   "strand", "thickStart", "thickEnd", "itemRgb", "blockCount",
+                   "blockSizes", "blockStarts")
+  # Fields that must stay text even when their content looks numeric
+  text_fields <- c("chrom", "name", "strand", "itemRgb", "blockSizes",
+                   "blockStarts")
+
+  if (!file.exists(path)) {
+    stop("The specified path does not exist.", call. = FALSE)
+  }
+
+  lines <- readLines(path, warn = FALSE)
+
+  # Drop 'browser'/'track' header lines, comment lines and blank lines
+  is_header <- grepl("^(browser|track)([[:space:]]|$)|^#", lines)
+  lines <- lines[!is_header & nzchar(trimws(lines))]
+
+  # A file without data lines yields an empty BED3 data frame
+  if (length(lines) == 0) {
+    return(data.frame(chrom = character(0), chromStart = numeric(0),
+                      chromEnd = numeric(0), stringsAsFactors = FALSE))
+  }
+
+  # Determine the number of columns from the first data line
+  num_cols <- length(strsplit(lines[1], "\t", fixed = TRUE)[[1]])
+  used_fields <- field_names[seq_len(min(num_cols, length(field_names)))]
+  if (num_cols > length(field_names)) {
+    # Name additional (BED12+) fields V13, V14, ...
+    extra_fields <- paste0("V", seq(length(field_names) + 1, num_cols))
+    used_fields <- c(used_fields, extra_fields)
+  }
+
+  col_classes <- rep(NA_character_, length(used_fields))
+  col_classes[used_fields %in% text_fields] <- "character"
+
+  data <- read.table(
+    text = lines,
+    header = FALSE,
+    quote = "",
+    sep = "\t",
+    fill = TRUE,
+    comment.char = "",
+    stringsAsFactors = FALSE,
+    col.names = used_fields,
+    colClasses = col_classes
+  )
+
+  # BED intervals are 0-based and half-open: shifting only the start by one
+  # gives 1-based, closed coordinates. The exclusive end already equals the
+  # 1-based inclusive end, so it is only coerced to numeric.
+  for (field in intersect(c("chromStart", "thickStart"), used_fields)) {
+    data[[field]] <- data[[field]] + 1
+  }
+  for (field in intersect(c("chromEnd", "thickEnd"), used_fields)) {
+    data[[field]] <- as.numeric(data[[field]])
+  }
+
+  # Convert blocks to list columns holding one numeric vector per feature.
+  # lapply() (unlike sapply()) never simplifies to a matrix or plain vector.
+  if ("blockSizes" %in% used_fields) {
+    # Sizes are lengths, not coordinates, so no offset is applied
+    data$blockSizes <- lapply(data$blockSizes, block_to_numeric, offset = 0)
+  }
+  if ("blockStarts" %in% used_fields) {
+    data$blockStarts <- lapply(data$blockStarts, block_to_numeric)
+  }
+
+  data
+}
diff --git a/man/read_bed.Rd b/man/read_bed.Rd
new file mode 100644
index 0000000..72c780b
--- /dev/null
+++ b/man/read_bed.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/read_bed.R
+\name{read_bed}
+\alias{read_bed}
+\title{Read BED Files}
+\usage{
+read_bed(path)
+}
+\arguments{
+\item{path}{A character string specifying the directory containing BED files
+or the file path to a single BED file.}
+}
+\value{
+A data frame combining data from the BED files.
+}
+\description{
+This function reads BED files from a specified directory or file path and
+combines them into a single data frame. BED files use 0-based, half-open
+coordinates, while this function transforms the coordinates to 1-based,
+closed coordinates during import.
+}
+\details{
+This function can read multiple BED files from a directory or a
+single BED file from a specified path. When reading from a directory, it
+adds a 'filename' column with the name of the file, and combines the data
+frames from all files into one. Start positions ('chromStart',
+'thickStart') are incremented by one, while end positions ('chromEnd',
+'thickEnd') are left unchanged because BED end positions are exclusive.
+'blockSizes' and 'blockStarts' are returned as list columns of numeric
+vectors, with 'blockStarts' converted to 1-based offsets.
+}
+\examples{
+\dontrun{
+# Read BED files from a directory
+bed_data <- read_bed("path/to/directory")
+
+# Read a single BED file
+bed_data <- read_bed("path/to/file.bed")
+}
+}