Skip to content

Commit

Permalink
Merge pull request #268 from waveygang/high-sens
Browse files Browse the repository at this point in the history
High sensitivity defaults
  • Loading branch information
ekg authored Sep 2, 2024
2 parents 9c401b3 + 803ffbd commit 4521c10
Showing 5 changed files with 18 additions and 18 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -54,6 +54,7 @@ Specifying `-m, --approx-map` lets us stop before alignment and obtain the appro

Together, these settings allow us to precisely define an alignment space to consider.
During all-to-all mapping, `-X` can additionally help us by removing self mappings from the reported set, and `-Y` extends this capability to prevent mapping between sequences with the same name prefix.
When working with large sequence collections we frequently use [PanSN](https://github.com/pangenome/PanSN-spec) naming convention and `-Y'#'` to specify that we want to group mappings by prefix, which in this context means genome or haplotype groupings.


## input indexing
2 changes: 1 addition & 1 deletion src/common/wflign/src/wflign.cpp
Original file line number Diff line number Diff line change
@@ -13,7 +13,7 @@ namespace wavefront {
/*
* Configuration
*/
#define MAX_LEN_FOR_STANDARD_WFA 25000 // default --block-length. Only for low-divergence, otherwise disabled
#define MAX_LEN_FOR_STANDARD_WFA 1000
#define MIN_WF_LENGTH 256

/*
29 changes: 14 additions & 15 deletions src/interface/parse_args.hpp
Original file line number Diff line number Diff line change
@@ -68,12 +68,12 @@ void parse_args(int argc,
args::Positional<std::string> query_sequence_file(io_opts, "query", "query sequence file (optional)");

args::Group mapping_opts(parser, "[ Mapping Options ]");
args::ValueFlag<float> map_pct_identity(mapping_opts, "%", "percent identity in the mashmap step [default: 90]", {'p', "map-pct-id"});
args::ValueFlag<std::string> segment_length(mapping_opts, "N", "segment seed length for mapping [default: 5k]", {'s', "segment-length"});
args::ValueFlag<std::string> block_length(mapping_opts, "N", "keep merged mappings supported by homologies of this total length [default: 5*segment-length]", {'l', "block-length"});
args::ValueFlag<float> map_pct_identity(mapping_opts, "%", "percent identity in the mashmap step [default: 70]", {'p', "map-pct-id"});
args::ValueFlag<std::string> segment_length(mapping_opts, "N", "segment seed length for mapping [default: 1k]", {'s', "segment-length"});
args::ValueFlag<std::string> block_length(mapping_opts, "N", "keep merged mappings supported by homologies of this total length [default: 3*segment-length]", {'l', "block-length"});
args::ValueFlag<uint32_t> num_mappings_for_segments(mapping_opts, "N", "number of mappings to retain for each query/reference pair [default: 1]", {'n', "num-mappings-for-segment"});
args::ValueFlag<uint32_t> num_mappings_for_short_seq(mapping_opts, "N", "number of mappings to retain for each query/reference pair where the query sequence is shorter than segment length [default: 1]", {'S', "num-mappings-for-short-seq"});
args::ValueFlag<int> kmer_size(mapping_opts, "N", "kmer size [default: 19]", {'k', "kmer"});
args::ValueFlag<int> kmer_size(mapping_opts, "N", "kmer size [default: 15]", {'k', "kmer"});
args::ValueFlag<float> kmer_pct_threshold(mapping_opts, "%", "ignore the top % most-frequent kmers [default: 0.001]", {'H', "kmer-threshold"});
args::Flag lower_triangular(mapping_opts, "", "only map shorter sequences against longer", {'L', "lower-triangular"});
args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"});
@@ -85,8 +85,8 @@ void parse_args(int argc,
args::ValueFlag<std::string> query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"});
args::Flag approx_mapping(mapping_opts, "approx-map", "skip base-level alignment, producing an approximate mapping in PAF", {'m',"approx-map"});
args::Flag no_split(mapping_opts, "no-split", "disable splitting of input sequences during mapping [default: enabled]", {'N',"no-split"});
args::ValueFlag<std::string> chain_gap(mapping_opts, "N", "chain mappings closer than this distance in query and target, sets approximate maximum variant length detectable in alignment [default: 6*segment_length, up to 30k]", {'c', "chain-gap"});
args::ValueFlag<std::string> max_mapping_length(mapping_opts, "N", "maximum length of a single mapping before breaking [default: inf]", {'P', "max-mapping-length"});
args::ValueFlag<std::string> chain_gap(mapping_opts, "N", "chain mappings closer than this distance in query and target, sets approximate maximum variant length detectable in alignment [default: 30k]", {'c', "chain-gap"});
args::ValueFlag<std::string> max_mapping_length(mapping_opts, "N", "maximum length of a single mapping before breaking (inf to unset) [default: 50k]", {'P', "max-mapping-length"});
args::Flag drop_low_map_pct_identity(mapping_opts, "K", "drop mappings with estimated identity below --map-pct-id=%", {'K', "drop-low-map-id"});
args::ValueFlag<double> overlap_threshold(mapping_opts, "F", "drop mappings overlapping more than fraction F with a higher scoring mapping [default: 0.5]", {'O', "overlap-threshold"});
args::Flag no_filter(mapping_opts, "MODE", "disable mapping filtering", {'f', "no-filter"});
@@ -390,7 +390,7 @@ void parse_args(int argc,
}
map_parameters.segLength = s;
} else {
map_parameters.segLength = 5000;
map_parameters.segLength = 1000;
}

if (map_pct_identity) {
@@ -413,9 +413,7 @@ void parse_args(int argc,

map_parameters.block_length = l;
} else {
// n.b. we map-merge across gaps up to 3x segment length
// and then filter for things that are at least block_length long
map_parameters.block_length = 5 * map_parameters.segLength;
map_parameters.block_length = 3 * map_parameters.segLength;
}

if (chain_gap) {
@@ -427,19 +425,20 @@ void parse_args(int argc,
map_parameters.chain_gap = l;
align_parameters.chain_gap = l;
} else {
map_parameters.chain_gap = std::min((int64_t)30000, 6*map_parameters.segLength);
align_parameters.chain_gap = std::min((int64_t)30000, 6*map_parameters.segLength);
map_parameters.chain_gap = 30000;
align_parameters.chain_gap = 30000;
}

if (max_mapping_length) {
const int64_t l = wfmash::handy_parameter(args::get(max_mapping_length));
const int64_t l = args::get(max_mapping_length) == "inf" ? std::numeric_limits<int64_t>::max()
: wfmash::handy_parameter(args::get(max_mapping_length));
if (l <= 0) {
std::cerr << "[wfmash] ERROR: max mapping length must be greater than 0." << std::endl;
exit(1);
}
map_parameters.max_mapping_length = l;
} else {
map_parameters.max_mapping_length = std::numeric_limits<int64_t>::max();
map_parameters.max_mapping_length = 50000;
}

if (drop_low_map_pct_identity) {
@@ -464,7 +463,7 @@ void parse_args(int argc,
map_parameters.kmerSize = (map_parameters.percentageIdentity >= 0.97 ? 18 :
(map_parameters.percentageIdentity >= 0.9 ? 17 : 15));
*/
map_parameters.kmerSize = 19;
map_parameters.kmerSize = 15;
}

if (kmer_pct_threshold) {
2 changes: 1 addition & 1 deletion src/map/include/computeMap.hpp
Original file line number Diff line number Diff line change
@@ -1650,7 +1650,7 @@ namespace skch
dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2));
awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9);
}
if (awed < max_dist) {
if (dist < max_dist) {
distances.push_back(std::make_tuple(awed, dist, it2->splitMappingId));
}
}
2 changes: 1 addition & 1 deletion src/map/include/map_parameters.hpp
Original file line number Diff line number Diff line change
@@ -101,7 +101,7 @@ namespace fixed
double ss_table_max = 1000.0; // Maximum size of dp table for filtering
double pval_cutoff = 1e-3; // p-value cutoff for determining window size
float confidence_interval = 0.95; // Confidence interval to relax jaccard cutoff for mapping (0-1)
float percentage_identity = 0.90; // Percent identity in the mapping step
float percentage_identity = 0.70; // Percent identity in the mapping step
float ANIDiff = 0.0; // Stage 1 ANI diff threshold
float ANIDiffConf = 0.999; // ANI diff confidence
std::string VERSION = "3.1.1"; // Version of MashMap

0 comments on commit 4521c10

Please sign in to comment.