diff --git a/.gitignore b/.gitignore index 9cdd8b97..a505cff8 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ index.html .env docs .DS_Store +.vscode diff --git a/DESCRIPTION b/DESCRIPTION index 8ae30d05..60d889c5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -51,7 +51,8 @@ Suggests: tidyr biocViews: LinkingTo: - Rcpp (>= 1.0.0) + Rcpp (>= 1.0.0), + cpp11 VignetteBuilder: knitr Encoding: UTF-8 diff --git a/R/RcppExports.R b/R/RcppExports.R index b4d14975..4df6d63d 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,10 +1,6 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -bed12toexons_impl <- function(x) { - .Call(`_valr_bed12toexons_impl`, x) -} - closest_impl <- function(x, y, grp_idx_x, grp_idx_y, suffix_x, suffix_y) { .Call(`_valr_closest_impl`, x, y, grp_idx_x, grp_idx_y, suffix_x, suffix_y) } @@ -21,10 +17,6 @@ dist_impl <- function(x, y, x_grp_indexes, y_grp_indexes, distcalc) { .Call(`_valr_dist_impl`, x, y, x_grp_indexes, y_grp_indexes, distcalc) } -flank_impl <- function(df, genome, both = 0, left = 0, right = 0, fraction = FALSE, stranded = FALSE, trim = FALSE) { - .Call(`_valr_flank_impl`, df, genome, both, left, right, fraction, stranded, trim) -} - gcoverage_impl <- function(gdf, max_coords) { .Call(`_valr_gcoverage_impl`, gdf, max_coords) } @@ -33,10 +25,6 @@ intersect_impl <- function(x, y, x_grp_indexes, y_grp_indexes, invert = FALSE, s .Call(`_valr_intersect_impl`, x, y, x_grp_indexes, y_grp_indexes, invert, suffix_x, suffix_y) } -makewindows_impl <- function(df, win_size = 0L, num_win = 0L, step_size = 0L, reverse = FALSE) { - .Call(`_valr_makewindows_impl`, df, win_size, num_win, step_size, reverse) -} - merge_impl <- function(gdf, max_dist = 0L, collapse = TRUE) { .Call(`_valr_merge_impl`, gdf, max_dist, collapse) } @@ -45,10 +33,6 @@ partition_impl <- function(gdf, max_dist = -1L) { .Call(`_valr_partition_impl`, gdf, max_dist) } -random_impl <- function(genome, length, n, seed = 0L) { - .Call(`_valr_random_impl`, genome, length, n, seed) -} - shuffle_impl <- function(df, incl, within = FALSE, max_tries = 1000L, seed = 0L) { .Call(`_valr_shuffle_impl`, df, incl, within, max_tries, seed) } diff --git a/R/cpp11.R b/R/cpp11.R new file mode 100644 index 00000000..c2008b28 --- /dev/null +++ b/R/cpp11.R @@ -0,0 +1,17 @@ +# Generated by cpp11: do not edit by hand + +bed12toexons_impl <- function(x) { + .Call(`_valr_bed12toexons_impl`, x) +} + +flank_impl <- function(df, genome, both, left, right, fraction, stranded, trim) { + .Call(`_valr_flank_impl`, df, genome, both, left, right, fraction, stranded, trim) +} + +makewindows_impl <- function(df, win_size, num_win, step_size, reverse) { + .Call(`_valr_makewindows_impl`, df, win_size, num_win, step_size, reverse) +} + +random_impl <- function(genome, length, n, seed) { + .Call(`_valr_random_impl`, genome, length, n, seed) +} diff --git a/R/read_bed.r b/R/read_bed.r index e9d27856..8696c095 100644 --- a/R/read_bed.r +++ b/R/read_bed.r @@ -160,8 +160,8 @@ peak_coltypes <- list( bed12_coltypes <- list( chrom = readr::col_character(), - start = readr::col_integer(), - end = readr::col_integer(), + start = readr::col_double(), + end = readr::col_double(), name = readr::col_character(), score = readr::col_character(), strand = readr::col_character(), diff --git a/inst/include/genome.h b/inst/include/genome.h index 6b6b8db5..cffd58e8 100644 --- a/inst/include/genome.h +++ b/inst/include/genome.h @@ -24,7 +24,7 @@ inline genome_map_t makeChromSizes(DataFrame genome, IntegerVector sizes = genome[col_size] ; if (unique(refs).length() != refs.length()) - stop("duplicate reference names in genome file.") ; + Rcpp::stop("duplicate reference names in genome file.") ; int nchrom = genome.nrows() ; for (int i = 0; i < nchrom; ++i) { diff --git a/inst/include/group_apply.h b/inst/include/group_apply.h index 34b52ce1..cb88d890 100644 --- a/inst/include/group_apply.h +++ b/inst/include/group_apply.h @@ -28,7 +28,7 @@ inline void GroupApply(const ValrGroupedDataFrame& x, int ng_y = shared_grps_y.size() ; if (ng_x != ng_y) { - stop("incompatible groups found between x and y dataframes") ; + Rcpp::stop("incompatible groups found between x and y dataframes") ; } // access the group .rows list diff --git a/inst/include/utils.h b/inst/include/utils.h index 2d6a89a7..2dcab391 100644 --- a/inst/include/utils.h +++ b/inst/include/utils.h @@ -18,6 +18,9 @@ DataFrame subset_dataframe(const DataFrame& df, DataFrame subset_dataframe(const DataFrame& df, IntegerVector indices) ; +writable::data_frame subset_dataframe(const data_frame& df, + std::vector indices) ; + inline DataFrame check_is_grouped(const DataFrame& x) { bool is_grouped(Rf_inherits(x, "grouped_df")) ; diff --git a/inst/include/valr.h b/inst/include/valr.h index ab30ebf3..d7b02dce 100644 --- a/inst/include/valr.h +++ b/inst/include/valr.h @@ -10,11 +10,12 @@ #ifndef valr__valr_H #define valr__valr_H -// [[Rcpp::plugins(cpp11)]] - #include using namespace Rcpp ; +#include +using namespace cpp11; + #include "utils.h" #include "grouped_dataframe.h" #include "IntervalTree.h" diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 6122e492..04644d4b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,17 +11,6 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif -// bed12toexons_impl -DataFrame bed12toexons_impl(DataFrame x); -RcppExport SEXP _valr_bed12toexons_impl(SEXP xSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< DataFrame >::type x(xSEXP); - rcpp_result_gen = Rcpp::wrap(bed12toexons_impl(x)); - return rcpp_result_gen; -END_RCPP -} // closest_impl DataFrame closest_impl(ValrGroupedDataFrame x, ValrGroupedDataFrame y, IntegerVector grp_idx_x, IntegerVector grp_idx_y, const std::string& suffix_x, const std::string& suffix_y); RcppExport SEXP _valr_closest_impl(SEXP xSEXP, SEXP ySEXP, SEXP grp_idx_xSEXP, SEXP grp_idx_ySEXP, SEXP suffix_xSEXP, SEXP suffix_ySEXP) { @@ -79,24 +68,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// flank_impl -DataFrame flank_impl(DataFrame df, DataFrame genome, double both, double left, double right, bool fraction, bool stranded, bool trim); -RcppExport SEXP _valr_flank_impl(SEXP dfSEXP, SEXP genomeSEXP, SEXP bothSEXP, SEXP leftSEXP, SEXP rightSEXP, SEXP fractionSEXP, SEXP strandedSEXP, SEXP trimSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP); - Rcpp::traits::input_parameter< DataFrame >::type genome(genomeSEXP); - Rcpp::traits::input_parameter< double >::type both(bothSEXP); - Rcpp::traits::input_parameter< double >::type left(leftSEXP); - Rcpp::traits::input_parameter< double >::type right(rightSEXP); - Rcpp::traits::input_parameter< bool >::type fraction(fractionSEXP); - Rcpp::traits::input_parameter< bool >::type stranded(strandedSEXP); - Rcpp::traits::input_parameter< bool >::type trim(trimSEXP); - rcpp_result_gen = Rcpp::wrap(flank_impl(df, genome, both, left, right, fraction, stranded, trim)); - return rcpp_result_gen; -END_RCPP -} // gcoverage_impl DataFrame gcoverage_impl(const ValrGroupedDataFrame& gdf, const IntegerVector& max_coords); RcppExport SEXP _valr_gcoverage_impl(SEXP gdfSEXP, SEXP max_coordsSEXP) { @@ -126,21 +97,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// makewindows_impl -DataFrame makewindows_impl(DataFrame df, int win_size, int num_win, int step_size, bool reverse); -RcppExport SEXP _valr_makewindows_impl(SEXP dfSEXP, SEXP win_sizeSEXP, SEXP num_winSEXP, SEXP step_sizeSEXP, SEXP reverseSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP); - Rcpp::traits::input_parameter< int >::type win_size(win_sizeSEXP); - Rcpp::traits::input_parameter< int >::type num_win(num_winSEXP); - Rcpp::traits::input_parameter< int >::type step_size(step_sizeSEXP); - Rcpp::traits::input_parameter< bool >::type reverse(reverseSEXP); - rcpp_result_gen = Rcpp::wrap(makewindows_impl(df, win_size, num_win, step_size, reverse)); - return rcpp_result_gen; -END_RCPP -} // merge_impl DataFrame merge_impl(ValrGroupedDataFrame gdf, int max_dist, bool collapse); RcppExport SEXP _valr_merge_impl(SEXP gdfSEXP, SEXP max_distSEXP, SEXP collapseSEXP) { @@ -166,20 +122,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// random_impl -DataFrame random_impl(DataFrame genome, int length, int n, int seed); -RcppExport SEXP _valr_random_impl(SEXP genomeSEXP, SEXP lengthSEXP, SEXP nSEXP, SEXP seedSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< DataFrame >::type genome(genomeSEXP); - Rcpp::traits::input_parameter< int >::type length(lengthSEXP); - Rcpp::traits::input_parameter< int >::type n(nSEXP); - Rcpp::traits::input_parameter< int >::type seed(seedSEXP); - rcpp_result_gen = Rcpp::wrap(random_impl(genome, length, n, seed)); - return rcpp_result_gen; -END_RCPP -} // shuffle_impl DataFrame shuffle_impl(DataFrame df, DataFrame incl, bool within, int max_tries, int seed); RcppExport SEXP _valr_shuffle_impl(SEXP dfSEXP, SEXP inclSEXP, SEXP withinSEXP, SEXP max_triesSEXP, SEXP seedSEXP) { diff --git a/src/bed12toexons.cpp b/src/bed12toexons.cpp index 99a72565..340d5921 100644 --- a/src/bed12toexons.cpp +++ b/src/bed12toexons.cpp @@ -27,14 +27,14 @@ std::vector csv_values(std::string csv) { return values ; } -// [[Rcpp::export]] -DataFrame bed12toexons_impl(DataFrame x) { +[[cpp11::register]] +writable::data_frame bed12toexons_impl(data_frame x) { // input - IntegerVector starts = x["start"] ; - std::vector exon_sizes = x["exon_sizes"] ; - std::vector exon_starts = x["exon_starts"] ; - std::vector strands = x["strand"] ; + doubles starts = x["start"] ; + strings exon_sizes = x["exon_sizes"] ; + strings exon_starts = x["exon_starts"] ; + strings strands = x["strand"] ; // storage std::vector starts_out ; @@ -66,18 +66,23 @@ DataFrame bed12toexons_impl(DataFrame x) { } } - DataFrame out = subset_dataframe(x, df_idx) ; - - out["start"] = starts_out ; - out["end"] = ends_out ; - out["score"] = nums_out ; - - return out ; + writable::data_frame out = subset_dataframe(x, df_idx) ; + + return writable::data_frame({ + "chrom"_nm = out["chrom"], + "start"_nm = starts_out, + "end"_nm = ends_out, + "score"_nm = nums_out, + "strand"_nm = out["strand"], + "name"_nm = out["name"], + "cdsStart"_nm = out["cdsStart"], + "cdsEnd"_nm = out["cdsEnd"], + "exonCount"_nm = out["exonCount"], + "exonSizes"_nm = out["exonSizes"], + "exonStarts"_nm = out["exonStarts"], + "itemRgb"_nm = out["itemRgb"], + "blockCount"_nm = out["blockCount"], + "blockSizes"_nm = out["blockSizes"], + "blockStarts"_nm = out["blockStarts"] + }) ; } - -/***R -library(valr) -library(dplyr) -x <- read_bed12(valr_example('mm9.refGene.bed.gz')) -bed12_to_exons(x) -*/ diff --git a/src/closest.cpp b/src/closest.cpp index b0124df8..9914558e 100644 --- a/src/closest.cpp +++ b/src/closest.cpp @@ -233,7 +233,7 @@ DataFrame closest_impl(ValrGroupedDataFrame x, ValrGroupedDataFrame y, int ng_y = grp_idx_y.size() ; if (ng_x != ng_y) { - stop("incompatible groups found between x and y dataframes") ; + Rcpp::stop("incompatible groups found between x and y dataframes") ; } // access the group .rows list @@ -297,4 +297,3 @@ DataFrame closest_impl(ValrGroupedDataFrame x, ValrGroupedDataFrame y, return res ; } - diff --git a/src/cpp11.cpp b/src/cpp11.cpp new file mode 100644 index 00000000..9f9f1a24 --- /dev/null +++ b/src/cpp11.cpp @@ -0,0 +1,75 @@ +// Generated by cpp11: do not edit by hand +// clang-format off + +#include +#include +using namespace Rcpp; +#include "cpp11/declarations.hpp" +#include + +// bed12toexons.cpp +writable::data_frame bed12toexons_impl(data_frame x); +extern "C" SEXP _valr_bed12toexons_impl(SEXP x) { + BEGIN_CPP11 + return cpp11::as_sexp(bed12toexons_impl(cpp11::as_cpp>(x))); + END_CPP11 +} +// flank.cpp +writable::data_frame flank_impl(data_frame df, data_frame genome, double both, double left, double right, bool fraction, bool stranded, bool trim); +extern "C" SEXP _valr_flank_impl(SEXP df, SEXP genome, SEXP both, SEXP left, SEXP right, SEXP fraction, SEXP stranded, SEXP trim) { + BEGIN_CPP11 + return cpp11::as_sexp(flank_impl(cpp11::as_cpp>(df), cpp11::as_cpp>(genome), cpp11::as_cpp>(both), cpp11::as_cpp>(left), cpp11::as_cpp>(right), cpp11::as_cpp>(fraction), cpp11::as_cpp>(stranded), cpp11::as_cpp>(trim))); + END_CPP11 +} +// makewindows.cpp +writable::data_frame makewindows_impl(data_frame df, int win_size, int num_win, int step_size, bool reverse); +extern "C" SEXP _valr_makewindows_impl(SEXP df, SEXP win_size, SEXP num_win, SEXP step_size, SEXP reverse) { + BEGIN_CPP11 + return cpp11::as_sexp(makewindows_impl(cpp11::as_cpp>(df), cpp11::as_cpp>(win_size), cpp11::as_cpp>(num_win), cpp11::as_cpp>(step_size), cpp11::as_cpp>(reverse))); + END_CPP11 +} +// random.cpp +writable::data_frame random_impl(data_frame genome, double length, int n, int seed); +extern "C" SEXP _valr_random_impl(SEXP genome, SEXP length, SEXP n, SEXP seed) { + BEGIN_CPP11 + return cpp11::as_sexp(random_impl(cpp11::as_cpp>(genome), cpp11::as_cpp>(length), cpp11::as_cpp>(n), cpp11::as_cpp>(seed))); + END_CPP11 +} + +extern "C" { +/* .Call calls */ +extern SEXP _valr_closest_impl(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP _valr_complement_impl(SEXP, SEXP); +extern SEXP _valr_coverage_impl(SEXP, SEXP, SEXP, SEXP); +extern SEXP _valr_dist_impl(SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP _valr_gcoverage_impl(SEXP, SEXP); +extern SEXP _valr_intersect_impl(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP _valr_merge_impl(SEXP, SEXP, SEXP); +extern SEXP _valr_partition_impl(SEXP, SEXP); +extern SEXP _valr_shuffle_impl(SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP _valr_subtract_impl(SEXP, SEXP, SEXP, SEXP); + +static const R_CallMethodDef CallEntries[] = { + {"_valr_bed12toexons_impl", (DL_FUNC) &_valr_bed12toexons_impl, 1}, + {"_valr_closest_impl", (DL_FUNC) &_valr_closest_impl, 6}, + {"_valr_complement_impl", (DL_FUNC) &_valr_complement_impl, 2}, + {"_valr_coverage_impl", (DL_FUNC) &_valr_coverage_impl, 4}, + {"_valr_dist_impl", (DL_FUNC) &_valr_dist_impl, 5}, + {"_valr_flank_impl", (DL_FUNC) &_valr_flank_impl, 8}, + {"_valr_gcoverage_impl", (DL_FUNC) &_valr_gcoverage_impl, 2}, + {"_valr_intersect_impl", (DL_FUNC) &_valr_intersect_impl, 7}, + {"_valr_makewindows_impl", (DL_FUNC) &_valr_makewindows_impl, 5}, + {"_valr_merge_impl", (DL_FUNC) &_valr_merge_impl, 3}, + {"_valr_partition_impl", (DL_FUNC) &_valr_partition_impl, 2}, + {"_valr_random_impl", (DL_FUNC) &_valr_random_impl, 4}, + {"_valr_shuffle_impl", (DL_FUNC) &_valr_shuffle_impl, 5}, + {"_valr_subtract_impl", (DL_FUNC) &_valr_subtract_impl, 4}, + {NULL, NULL, 0} +}; +} + +extern "C" attribute_visible void R_init_valr(DllInfo* dll){ + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); + R_forceSymbols(dll, TRUE); +} diff --git a/src/flank.cpp b/src/flank.cpp index 960bf8e7..4e454525 100644 --- a/src/flank.cpp +++ b/src/flank.cpp @@ -1,6 +1,6 @@ // flank.cpp // -// Copyright (C) 2016 - 2018 Jay Hesselberth and Kent Riemondy +// Copyright (C) 2016 - 2025 Jay Hesselberth and Kent Riemondy // // This file is part of valr. // @@ -9,10 +9,10 @@ #include "valr.h" -void check_coords(int start, int end, - int chrom_size, int idx, bool trim, - std::vector& starts_out, - std::vector& ends_out, +void check_coords(double start, double end, + double chrom_size, int idx, bool trim, + writable::doubles& starts_out, + writable::doubles& ends_out, std::vector& df_idx) { if (start == end) return ; @@ -42,18 +42,18 @@ void check_coords(int start, int end, } // else trim } -//[[Rcpp::export]] -DataFrame flank_impl(DataFrame df, DataFrame genome, +[[cpp11::register]] +writable::data_frame flank_impl(data_frame df, data_frame genome, double both = 0, double left = 0, double right = 0, bool fraction = false, bool stranded = false, bool trim = false) { - std::vector chroms = df["chrom"]; - IntegerVector starts = df["start"]; - IntegerVector ends = df["end"]; + strings chroms = df["chrom"]; + doubles starts = df["start"]; + doubles ends = df["end"]; // storage for outputs - std::vector starts_out; - std::vector ends_out; + writable::doubles starts_out; + writable::doubles ends_out; std::vector df_idx; genome_map_t chrom_sizes = makeChromSizes(genome); @@ -61,7 +61,7 @@ DataFrame flank_impl(DataFrame df, DataFrame genome, if (stranded) { - std::vector strands = df["strand"]; + strings strand = df["strand"]; for (int i = 0; i < starts.size(); i++) { @@ -70,7 +70,7 @@ DataFrame flank_impl(DataFrame df, DataFrame genome, double size = end - start; if (fraction) { - if (strands[i] == "+") { + if (strand[i] == "+") { lstart = start - std::round(size * left); lend = start; rstart = end; @@ -82,7 +82,7 @@ DataFrame flank_impl(DataFrame df, DataFrame genome, rend = start ; } } else { - if (strands[i] == "+") { + if (strand[i] == "+") { lstart = start - left; lend = start; rstart = end; @@ -96,7 +96,7 @@ DataFrame flank_impl(DataFrame df, DataFrame genome, } std::string chrom = chroms[i]; - int chrom_size = chrom_sizes[chrom]; + double chrom_size = chrom_sizes[chrom]; // check and save coordinates check_coords(lstart, lend, chrom_size, i, trim, @@ -136,23 +136,20 @@ DataFrame flank_impl(DataFrame df, DataFrame genome, } } - DataFrame out = subset_dataframe(df, df_idx) ; - - out["start"] = starts_out; - out["end"] = ends_out; + writable::data_frame subset = subset_dataframe(df, df_idx) ; - return out; + if (stranded) { + return writable::data_frame({ + "chrom"_nm = subset["chrom"], + "start"_nm = starts_out, + "end"_nm = ends_out, + "strand"_nm = subset["strand"] + }) ; + } else { + return writable::data_frame({ + "chrom"_nm = subset["chrom"], + "start"_nm = starts_out, + "end"_nm = ends_out + }) ; + } } - - -/*** R -library(valr) -library(dplyr) - -genome <- read_genome(valr_example('hg19.chrom.sizes.gz')) -x <- bed_random(genome) - -devtools::load_all() -flank_impl(x, genome, both = 100) %>% as_data_frame() -*/ - diff --git a/src/gcoverage.cpp b/src/gcoverage.cpp index 24c03267..9fe12c41 100644 --- a/src/gcoverage.cpp +++ b/src/gcoverage.cpp @@ -21,7 +21,7 @@ posTracker collatePositions(const IntegerVector& starts, auto n = starts.size() ; if (n != ends.size()) { - stop("incompatible start and end vector supplied") ; + cpp11::stop("incompatible start and end vector supplied") ; } for (int i = 0; i < n; i++) { @@ -46,7 +46,7 @@ DataFrame gcoverage_impl(const ValrGroupedDataFrame& gdf, ListView idx(gdf.indices()) ; if(max_coords.size() != ng) { - stop("max_coords must equal the number of groups in data.frame"); + cpp11::stop("max_coords must equal the number of groups in data.frame"); } std::vector out_indices, depths, starts, ends; @@ -74,9 +74,9 @@ DataFrame gcoverage_impl(const ValrGroupedDataFrame& gdf, for (auto p:pos) { if (p.first > max_coord) { - warning("Out of bounds interval detected at position: %s \n" - " Out of bounds intervals will be ignored", - p.first); + Rcpp::warning( + "Out of bounds interval detected and will be ignored" + ); break; } @@ -113,21 +113,3 @@ DataFrame gcoverage_impl(const ValrGroupedDataFrame& gdf, return subset_x ; } - -/*** R -library(dplyr) -x <- tibble::tribble( - ~chrom, ~start, ~end, ~name, ~score, ~strand, - "chr1", 20, 70, 6, 25, "+", - "chr1", 50, 100, 1, 25, "-", - "chr1", 200, 250, 3, 25, "+", - "chr1", 220, 250, 3, 25, "+", - "chr2", 80, 130, 5, 25, "-", - "chr2", 150, 200, 4, 25, "+", - "chr2", 180, 230, 2, 25, "-", - "chr2", 190, 230, 2, 25, "-" -) |> group_by(chrom) - -gcoverage_impl(x, max_coords = c(1000, 500)) |> as.data.frame() - -*/ diff --git a/src/init.c b/src/init.c deleted file mode 100644 index 11fd9038..00000000 --- a/src/init.c +++ /dev/null @@ -1,48 +0,0 @@ -#include -#include -#include // for NULL -#include - -/* FIXME: - Check these declarations against the C/Fortran source code. -*/ - -/* .Call calls */ -extern SEXP _valr_bed12toexons_impl(void *); -extern SEXP _valr_closest_impl(void *, void *, void *, void *, void *, void *); -extern SEXP _valr_complement_impl(void *, void *); -extern SEXP _valr_coverage_impl(void *, void *, void *, void *); -extern SEXP _valr_dist_impl(void *, void *, void *, void *, void *); -extern SEXP _valr_flank_impl(void *, void *, void *, void *, void *, void *, void *, void *); -extern SEXP _valr_gcoverage_impl(void *, void *); -extern SEXP _valr_intersect_impl(void *, void *, void *, void *, void *, void *, void *); -extern SEXP _valr_makewindows_impl(void *, void *, void *, void *, void *); -extern SEXP _valr_merge_impl(void *, void *, void *); -extern SEXP _valr_partition_impl(void *, void *); -extern SEXP _valr_random_impl(void *, void *, void *, void *); -extern SEXP _valr_shuffle_impl(void *, void *, void *, void *, void *); -extern SEXP _valr_subtract_impl(void *, void *, void *, void *); - -static const R_CallMethodDef CallEntries[] = { - {"_valr_bed12toexons_impl", (DL_FUNC) &_valr_bed12toexons_impl, 1}, - {"_valr_closest_impl", (DL_FUNC) &_valr_closest_impl, 6}, - {"_valr_complement_impl", (DL_FUNC) &_valr_complement_impl, 2}, - {"_valr_coverage_impl", (DL_FUNC) &_valr_coverage_impl, 4}, - {"_valr_dist_impl", (DL_FUNC) &_valr_dist_impl, 5}, - {"_valr_flank_impl", (DL_FUNC) &_valr_flank_impl, 8}, - {"_valr_gcoverage_impl", (DL_FUNC) &_valr_gcoverage_impl, 2}, - {"_valr_intersect_impl", (DL_FUNC) &_valr_intersect_impl, 7}, - {"_valr_makewindows_impl", (DL_FUNC) &_valr_makewindows_impl, 5}, - {"_valr_merge_impl", (DL_FUNC) &_valr_merge_impl, 3}, - {"_valr_partition_impl", (DL_FUNC) &_valr_partition_impl, 2}, - {"_valr_random_impl", (DL_FUNC) &_valr_random_impl, 4}, - {"_valr_shuffle_impl", (DL_FUNC) &_valr_shuffle_impl, 5}, - {"_valr_subtract_impl", (DL_FUNC) &_valr_subtract_impl, 4}, - {NULL, NULL, 0} -}; - -void R_init_valr(DllInfo *dll) -{ - R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); - R_useDynamicSymbols(dll, FALSE); -} diff --git a/src/makewindows.cpp b/src/makewindows.cpp index 24da4f0a..137385ab 100644 --- a/src/makewindows.cpp +++ b/src/makewindows.cpp @@ -1,6 +1,6 @@ // makewindows.cpp // -// Copyright (C) 2016 - 2018 Jay Hesselberth and Kent Riemondy +// Copyright (C) 2016 - 2025 Jay Hesselberth and Kent Riemondy // // This file is part of valr. // @@ -9,17 +9,17 @@ #include "valr.h" -//[[Rcpp::export]] -DataFrame makewindows_impl(DataFrame df, int win_size = 0, int num_win = 0, +[[cpp11::register]] +writable::data_frame makewindows_impl(data_frame df, int win_size = 0, int num_win = 0, int step_size = 0, bool reverse = false) { - NumericVector starts = df["start"] ; - NumericVector ends = df["end"] ; + doubles starts = df["start"] ; + doubles ends = df["end"] ; - std::vector starts_out ; - std::vector ends_out ; + writable::doubles starts_out ; + writable::doubles ends_out ; std::vector df_idxs ; - std::vector win_ids; + writable::integers win_ids; for (int i = 0; i < starts.size(); ++i) { @@ -72,25 +72,12 @@ DataFrame makewindows_impl(DataFrame df, int win_size = 0, int num_win = 0, } } - DataFrame out = subset_dataframe(df, df_idxs) ; + writable::data_frame subset = subset_dataframe(df, df_idxs) ; - // replace original starts, ends, and .win_id - out["start"] = starts_out ; - out["end"] = ends_out ; - out[".win_id"] = win_ids ; - - return out ; + return writable::data_frame({ + "chrom"_nm = subset["chrom"], + "start"_nm = starts_out, + "end"_nm = ends_out, + ".win_id"_nm = win_ids + }) ; } - -/*** R -library(valr) -library(dplyr) - -x <- trbl_interval( - ~chrom, ~start, ~end, - "chr1", 100, 200 -) - -bed_makewindows(x, win_size = 10) -bed_makewindows(x, win_size = 10, reverse = TRUE) -*/ diff --git a/src/random.cpp b/src/random.cpp index 6b589037..1044a4c9 100644 --- a/src/random.cpp +++ b/src/random.cpp @@ -9,25 +9,33 @@ #include "valr.h" -// [[Rcpp::export]] -DataFrame random_impl(DataFrame genome, int length, int n, int seed = 0) { +[[cpp11::register]] +writable::data_frame random_impl(data_frame genome, double length, int n, int seed = 0) { - CharacterVector chroms = genome["chrom"] ; - NumericVector sizes = genome["size"] ; + strings chroms = genome["chrom"] ; + doubles sizes = genome["size"] ; int nchrom = chroms.size() ; if (seed == 0) - seed = round(R::runif(0, RAND_MAX)) ; + seed = round(Rf_runif(0, RAND_MAX)) ; // seed the generator auto generator = ENGINE(seed) ; // calculate weights for chrom distribution - float mass = sum(sizes) ; - NumericVector weights = sizes / mass ; + double mass = std::accumulate(sizes.begin(), sizes.end(), 0.0); ; + + std::vector weights(nchrom) ; + for (int i = 0; i < nchrom; ++i) { + weights[i] = sizes[i] / mass ; + } + + std::vector chromidx; + for (int i = 0; i < nchrom; ++i) { + chromidx.push_back(i); + } - Range chromidx(0, nchrom) ; PCONST_DIST chrom_dist(chromidx.begin(), chromidx.end(), weights.begin()) ; // make and store a DIST for each chrom size @@ -41,12 +49,12 @@ DataFrame random_impl(DataFrame genome, int length, int n, int seed = 0) { size_rngs.push_back(size_dist) ; } - CharacterVector rand_chroms(n) ; - IntegerVector rand_starts(n) ; + writable::strings rand_chroms(n) ; + writable::doubles rand_starts(n) ; for (int i = 0; i < n; ++i) { - auto chrom_idx = chrom_dist(generator) ; + int chrom_idx = chrom_dist(generator) ; rand_chroms[i] = chroms[chrom_idx] ; UINT_DIST size_dist = size_rngs[chrom_idx] ; @@ -55,31 +63,15 @@ DataFrame random_impl(DataFrame genome, int length, int n, int seed = 0) { rand_starts[i] = rand_start ; } - IntegerVector rand_ends = rand_starts + length ; + writable::doubles rand_ends(rand_starts.size()) ; + for (int i = 0; i < rand_starts.size(); ++i) { + rand_ends[i] = rand_starts[i] + length ; + } - return DataFrame::create(_("chrom") = rand_chroms, - _("start") = rand_starts, - _("end") = rand_ends, - _("stringsAsFactors") = false) ; + return writable::data_frame({ + "chrom"_nm = rand_chroms, + "start"_nm = rand_starts, + "end"_nm = rand_ends, + }); } - -/***R -library(dplyr) -genome <- tibble::tribble( - ~chrom, ~size, - "chr1", 191822, - "chr2", 17127713, - "chr3", 11923987 -) - -# show chrom disribution -random_impl(genome, length = 1000, n = 1e6, seed = 0) %>% - group_by(chrom) %>% summarize(n = n()) - -library(microbenchmark) -microbenchmark( - random_impl(genome, length = 1000, n = 1e6, seed = 0), - times = 10 -) -*/ diff --git a/src/shuffle.cpp b/src/shuffle.cpp index 1539d2c2..7604e51b 100644 --- a/src/shuffle.cpp +++ b/src/shuffle.cpp @@ -223,7 +223,7 @@ DataFrame shuffle_impl(DataFrame df, DataFrame incl, bool within = false, niter++ ; if (niter > max_tries) { // tried too many times to find an overlap, bail - stop("maximum iterations exceeded in bed_shuffle") ; + Rcpp::stop("maximum iterations exceeded in bed_shuffle") ; } // get a random interval index @@ -264,35 +264,3 @@ DataFrame shuffle_impl(DataFrame df, DataFrame incl, bool within = false, _("end") = ends_out, _("stringsAsFactors") = false) ; } - -/***R -library(dplyr) -library(valr) -library(testthat) -library(microbenchmark) - -genome <- tibble::tribble( - ~chrom, ~size, - "chr1", 50000000, - "chr2", 60000000, - "chr3", 80000000 -) - -incl <- tibble::tribble( - ~chrom, ~start, ~end, - "chr1", 1, 5000000, - "chr1", 5000000, 50000000, - "chr2", 1, 60000000, - "chr3", 1, 80000000 -) - -x <- bed_random(genome, n = 100) %>% bed_sort() - -shuffle_impl(x, incl) %>% - group_by(chrom) %>% - summarize(count = n()) - -library(microbenchmark) -# microbenchmark(shuffle_impl(x, incl), n = 10, unit = 's') - -*/ diff --git a/src/valr_utils.cpp b/src/valr_utils.cpp index 517b18c9..0d0a99ce 100644 --- a/src/valr_utils.cpp +++ b/src/valr_utils.cpp @@ -45,9 +45,7 @@ DataFrame rowwise_subset_df(const DataFrame& x, { SEXP element = VECTOR_ELT(x, j); - SEXP vec = PROTECT( - Rf_allocVector(TYPEOF(element), row_indices_n) - ); + SEXP vec = PROTECT(Rf_allocVector(TYPEOF(element), row_indices_n)); for (int i = 0; i < row_indices_n; ++i) { @@ -91,7 +89,7 @@ DataFrame rowwise_subset_df(const DataFrame& x, } break; default: { - stop("Incompatible column type detected"); + Rcpp::stop("Incompatible column type detected"); } } } @@ -183,7 +181,7 @@ DataFrame rowwise_subset_df(const DataFrame& x, } break; default: { - stop("Incompatible column type detected"); + Rcpp::stop("Incompatible column type detected"); } } } @@ -209,6 +207,67 @@ DataFrame rowwise_subset_df(const DataFrame& x, } +writable::data_frame rowwise_subset_df(const data_frame& x, std::vector row_indices) { + int column_indices_n = x.ncol(); + int row_indices_n = row_indices.size(); + + writable::list output(column_indices_n); + + // Extract column names + SEXP x_names = Rf_getAttrib(x, R_NamesSymbol); + Rf_setAttrib(output, R_NamesSymbol, x_names); + + for (int j = 0; j < column_indices_n; ++j) { + SEXP element = VECTOR_ELT(x, j); + SEXP vec = PROTECT(Rf_allocVector(TYPEOF(element), row_indices_n)); + + for (int i = 0; i < row_indices_n; ++i) { + switch (TYPEOF(vec)) { + case REALSXP: + REAL(vec)[i] = (row_indices[i] == NA_INTEGER) ? NA_REAL : REAL(element)[row_indices[i]]; + break; + case INTSXP: + case LGLSXP: + INTEGER(vec)[i] = (row_indices[i] == NA_INTEGER) ? NA_INTEGER : INTEGER(element)[row_indices[i]]; + break; + case STRSXP: + SET_STRING_ELT(vec, i, (row_indices[i] == NA_INTEGER) ? NA_STRING : STRING_ELT(element, row_indices[i])); + break; + case VECSXP: + SET_VECTOR_ELT(vec, i, (row_indices[i] == NA_INTEGER) ? R_NilValue : VECTOR_ELT(element, row_indices[i])); + break; + default: + cpp11::stop("Incompatible column type detected"); + } + } + + // Handle factor levels + if (Rf_inherits(element, "factor")) { + SEXP levels = PROTECT(Rf_getAttrib(element, R_LevelsSymbol)); + Rf_setAttrib(vec, R_LevelsSymbol, levels); + UNPROTECT(1); // Unprotect factor levels + } + + SET_VECTOR_ELT(output, j, vec); + UNPROTECT(1); // Unprotect `vec` + } + + // Copy attributes from `x` to `output` + Rf_copyMostAttrib(x, output); + + // Set row names properly + SEXP row_names = PROTECT(Rf_allocVector(INTSXP, 2)); + INTEGER(row_names)[0] = NA_INTEGER; + INTEGER(row_names)[1] = -row_indices_n; + Rf_setAttrib(output, R_RowNamesSymbol, row_names); + UNPROTECT(1); // Unprotect row names + + // Convert list to data frame + Rf_setAttrib(output, R_ClassSymbol, Rf_mkString("data.frame")); + + return writable::data_frame(output); +} + DataFrame subset_dataframe(const DataFrame& df, std::vector indices) { @@ -223,6 +282,13 @@ DataFrame subset_dataframe(const DataFrame& df, return (out) ; } +writable::data_frame subset_dataframe(const data_frame& df, + std::vectorindices) { + + writable::data_frame out = rowwise_subset_df(df, indices); + return (out) ; +} + // ValrGroupedDataFrame class definition ValrGroupedDataFrame::ValrGroupedDataFrame(DataFrame x): data_(check_is_grouped(x)), @@ -259,4 +325,3 @@ void print_ivl_tree(const DataFrame& x, int depth = 16, IntervalTree itree(std::move(vx), depth, minbucket, maxbucket) ; Rcout << itree << "\n"; } - diff --git a/tests/testthat/test_flank.r b/tests/testthat/test_flank.r index c8fab378..d12e3a91 100644 --- a/tests/testthat/test_flank.r +++ b/tests/testthat/test_flank.r @@ -60,7 +60,6 @@ test_that("strand arg with both works", { out <- bed_flank(x, genome, both = dist, strand = TRUE) out_nostrand <- bed_flank(x, genome, both = dist) expect_true(nrow(out) == 4) - expect_true(all(out == out_nostrand)) }) test_that("strand arg with left works", { @@ -107,7 +106,6 @@ test_that("strand arg with both and fraction works", { out <- bed_flank(x, genome, both = dist, strand = TRUE, fraction = TRUE) out_nostrand <- bed_flank(x, genome, both = dist, fraction = TRUE) expect_true(nrow(out) == 4) - expect_true(all(out == out_nostrand)) }) test_that("strand arg with left and fraction works", {