diff --git a/Cargo.toml b/Cargo.toml
index a84a884..8222d5c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "ska"
-version = "0.3.10"
+version = "0.3.11"
 authors = [
     "John Lees <jlees@ebi.ac.uk>",
     "Simon Harris <simon.harris@gatesfoundation.org>",
diff --git a/README.md b/README.md
index 77dd9de..978941d 100644
--- a/README.md
+++ b/README.md
@@ -26,11 +26,18 @@ Split k-mer analysis (version 2) uses exact matching of split k-mer sequences to
 
 SKA can only align SNPs further than the k-mer length apart, and does not use a gap penalty approach or give alignment scores. But the advantages are speed and flexibility, particularly the ability to run on a reference-free manner (i.e. including accessory genome variation) on both assemblies and reads.
 
+### Citation
+
+Romain Derelle, Johanna von Wachsmann, Tommi M&auml;klin, Joel Hellewell, Timothy Russell, Ajit Lalvani, Leonid Chindelevitch, Nicholas J. Croucher, Simon R. Harris, John A. Lees (2024).
+**Seamless, rapid and accurate analyses of outbreak genomic data using Split _k_-mer Analysis**
+*Genome Research*
+
 ## Documentation
 
 Can be found at https://docs.rs/ska. We also have some tutorials available:
 
 - [From genomes to trees](https://www.bacpop.org/guides/building_trees_with_ska/).
+- [Filtering options](https://www.bacpop.org/guides/snp_alignment_with_ska/).
 
 ## Installation
 
diff --git a/src/cli.rs b/src/cli.rs
index dcdf0c8..fea883a 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -11,6 +11,8 @@ pub const DEFAULT_KMER: usize = 17;
 pub const DEFAULT_PROPORTION_READS: Option<f64> = None;
 /// Default single strand (which is equivalent to !rc)
 pub const DEFAULT_STRAND: bool = false;
+/// Default minimum frequency filter threshold
+pub const DEFAULT_MINFREQ: f64 = 0.9;
 /// Default behaviour when min-freq counting ambig sites
 pub const DEFAULT_AMBIGMISSING: bool = false;
 /// Default repeat masking behaviour
@@ -191,7 +193,7 @@ pub enum Commands {
         output: Option<String>,
 
         /// Minimum fraction of samples a k-mer has to appear in
-        #[arg(short, long, value_parser = zero_to_one, default_value_t = 0.9)]
+        #[arg(short, long, value_parser = zero_to_one, default_value_t = DEFAULT_MINFREQ)]
         min_freq: f64,
 
         /// With min_freq, only count non-ambiguous sites
@@ -255,9 +257,9 @@ pub enum Commands {
         #[arg(short, long, value_parser = zero_to_one, default_value_t = 0.0)]
         min_freq: f64,
 
-        /// Filter for ambiguous bases
+        /// Filter out ambiguous bases ('N' still a mismatch)
         #[arg(long, default_value_t = false)]
-        filter_ambiguous: bool,
+        allow_ambiguous: bool,
 
         /// Number of CPU threads
         #[arg(long, value_parser = valid_cpus, default_value_t = 1)]
@@ -312,7 +314,7 @@ pub enum Commands {
         reverse: bool,
 
         /// Minimum fraction of samples a k-mer has to appear in
-        #[arg(short, long, value_parser = zero_to_one, default_value_t = 0.0)]
+        #[arg(short, long, value_parser = zero_to_one, default_value_t = DEFAULT_MINFREQ)]
         min_freq: f64,
 
         /// With min_freq, only count non-ambiguous sites
diff --git a/src/generic_modes.rs b/src/generic_modes.rs
index 9b53be8..b6ddb44 100644
--- a/src/generic_modes.rs
+++ b/src/generic_modes.rs
@@ -152,6 +152,8 @@ pub fn distance<IntT: for<'a> UInt<'a>>(
     );
     if filt_ambig || (min_freq * ska_array.nsamples() as f64 >= 1.0) {
         if filt_ambig {
+            let filter_ambig_as_missing = true;
+            let mask_ambig = true;
             apply_filters(
                 ska_array,
                 min_freq,
diff --git a/src/lib.rs b/src/lib.rs
index 6601751..988725f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -35,6 +35,7 @@
 //!
 //! Tutorials:
 //! - [From genomes to trees](https://www.bacpop.org/guides/building_trees_with_ska/).
+//! - [Filtering options](https://www.bacpop.org/guides/snp_alignment_with_ska/).
 //!
 //! Command line usage follows. For API documentation and usage, see the [end of this section](#api-usage).
 //!
@@ -208,9 +209,12 @@
 //! ska distance -o distances.txt seqs.skf
 //! ```
 //!
-//! Ignore ambiguous bases by adding `--filter-ambiguous` flag, and `--min-freq` to
-//! ignore k-mers only found in some samples. Multiple threads
-//! can be used, but this will only be effective with large numbers of samples.
+//! Consider ambiguous bases by adding `--allow-ambiguous` flag, and change `--min-freq` to
+//! ignore more/less k-mers only found in some samples (default = 0.9). Note that ambiguous bases may overestimate
+//! distances due to repeat k-mers. For finer control over filtering, first run `ska weed`
+//! on the input .skf.
+//!
+//! Multiple threads can be used, but this will only be effective with large numbers of samples.
 //!
 //! The companion script in `scripts/cluster_dists.py` (requires `networkx`) can
 //! be used to make single linkage clusters from these distances at given thresholds,
@@ -619,10 +623,11 @@ pub fn main() {
             skf_file,
             output,
             min_freq,
-            filter_ambiguous,
+            allow_ambiguous,
             threads,
         } => {
             check_threads(*threads);
+            let filter_ambiguous = !*allow_ambiguous;
             if let Ok(mut ska_array) = MergeSkaArray::<u64>::load(skf_file) {
                 // In debug mode (cannot be set from CLI, give details)
                 log::debug!("{ska_array}");
@@ -630,7 +635,7 @@ pub fn main() {
                     &mut ska_array,
                     output,
                     *min_freq,
-                    *filter_ambiguous,
+                    filter_ambiguous,
                     *threads,
                 );
             } else if let Ok(mut ska_array) = MergeSkaArray::<u128>::load(skf_file) {
@@ -640,7 +645,7 @@ pub fn main() {
                     &mut ska_array,
                     output,
                     *min_freq,
-                    *filter_ambiguous,
+                    filter_ambiguous,
                     *threads,
                 );
             } else {
diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs
index ff472a8..ce413f8 100644
--- a/src/merge_ska_array.rs
+++ b/src/merge_ska_array.rs
@@ -342,6 +342,8 @@ where
                 } else {
                     removed += 1;
                 }
+            } else {
+                removed += 1;
             }
         }
         self.variants = filtered_variants;
diff --git a/src/ska_dict/bit_encoding.rs b/src/ska_dict/bit_encoding.rs
index 790ab3e..7cb9da6 100644
--- a/src/ska_dict/bit_encoding.rs
+++ b/src/ska_dict/bit_encoding.rs
@@ -76,7 +76,7 @@ pub fn base_to_prob(base: u8) -> [f64; 4] {
         b'D' => [1.0 / 3.0, 0.0, 1.0 / 3.0, 1.0 / 3.0],
         b'H' => [1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0, 0.0],
         b'V' => [1.0 / 3.0, 1.0 / 3.0, 0.0, 1.0 / 3.0],
-        b'N' => [0.25, 0.25, 0.25, 0.25],
+        b'N' => [0.0, 0.0, 0.0, 0.0], // This gives more expected behaviour than 0.25 in each entry
         _ => [0.0, 0.0, 0.0, 0.0],
     }
 }
@@ -494,7 +494,7 @@ mod tests {
 
         assert_eq!(overlap(&k, &b), 1.0 / 3.0);
         assert_eq!(overlap(&d, &h), 2.0 / 9.0);
-        assert_eq!(overlap(&v, &n), 0.25);
+        assert_eq!(overlap(&v, &n), 0.0);
 
         assert_eq!(overlap(&n, &empty), 0.0);
     }
diff --git a/tests/distance.rs b/tests/distance.rs
index 1fb6265..a4ed150 100644
--- a/tests/distance.rs
+++ b/tests/distance.rs
@@ -50,6 +50,7 @@ fn dist_filter() {
         .current_dir(sandbox.get_wd())
         .arg("distance")
         .arg(sandbox.file_string("merge_k9.skf", TestDir::Input))
+        .arg("--allow-ambiguous")
         .arg("-v")
         .args(&["--threads", "2"])
         .assert()
@@ -60,7 +61,6 @@ fn dist_filter() {
         .current_dir(sandbox.get_wd())
         .arg("distance")
         .arg(sandbox.file_string("merge_k9.skf", TestDir::Input))
-        .arg("--filter-ambiguous")
         .arg("-v")
         .assert()
         .stdout_eq_path(sandbox.file_string("merge_k9_no_ambig.dist.stdout", TestDir::Correct));
@@ -97,21 +97,24 @@ fn multisample_dists() {
         .assert()
         .success();
 
+    // Test with filters off
     Command::new(cargo_bin("ska"))
         .current_dir(sandbox.get_wd())
         .arg("distance")
         .arg("multidist.skf")
         .arg("-v")
+        .args(&["--min-freq", "0"])
+        .arg("--allow-ambiguous")
         .args(&["--threads", "2"])
         .assert()
         .stdout_eq_path(sandbox.file_string("multidist.stdout", TestDir::Correct));
 
+    // Test with default filters
     Command::new(cargo_bin("ska"))
         .current_dir(sandbox.get_wd())
         .arg("distance")
         .arg("multidist.skf")
         .arg("-v")
-        .args(&["--min-freq", "0.5"])
         .assert()
         .stdout_eq_path(sandbox.file_string("multidist.filter.stdout", TestDir::Correct));
 }
diff --git a/tests/test_results_correct/merge_k9_min_freq.dist.stdout b/tests/test_results_correct/merge_k9_min_freq.dist.stdout
index 6bef4ce..5556e85 100644
--- a/tests/test_results_correct/merge_k9_min_freq.dist.stdout
+++ b/tests/test_results_correct/merge_k9_min_freq.dist.stdout
@@ -1,2 +1,2 @@
 Sample1	Sample2	Distance	Mismatches
-test_1	test_2	2.50	0.00000
+test_1	test_2	2.00	0.00000
diff --git a/tests/test_results_correct/multidist.filter.stdout b/tests/test_results_correct/multidist.filter.stdout
index ff50dc8..2f24536 100644
--- a/tests/test_results_correct/multidist.filter.stdout
+++ b/tests/test_results_correct/multidist.filter.stdout
@@ -1,16 +1,16 @@
 Sample1	Sample2	Distance	Mismatches
-N_test_1	N_test_2	2.00	0.22222
+N_test_1	N_test_2	2.00	0.51724
 N_test_1	ambig_test_1	0.00	1.00000
 N_test_1	ambig_test_2	0.00	1.00000
-N_test_1	test_1	0.50	0.08333
-N_test_1	test_2	1.00	0.05556
+N_test_1	test_1	1.00	0.25455
+N_test_1	test_2	1.00	0.42623
 N_test_2	ambig_test_1	0.00	1.00000
 N_test_2	ambig_test_2	0.00	1.00000
-N_test_2	test_1	1.50	0.19444
-N_test_2	test_2	1.00	0.16667
-ambig_test_1	ambig_test_2	0.00	0.00000
+N_test_2	test_1	2.00	0.56716
+N_test_2	test_2	1.00	0.28571
+ambig_test_1	ambig_test_2	1.00	0.44444
 ambig_test_1	test_1	0.00	1.00000
 ambig_test_1	test_2	0.00	1.00000
 ambig_test_2	test_1	0.00	1.00000
 ambig_test_2	test_2	0.00	1.00000
-test_1	test_2	1.50	0.02778
+test_1	test_2	3.00	0.44118