diff --git a/test/test_misc.py b/test/test_misc.py
new file mode 100644
index 0000000..6159fc8
--- /dev/null
+++ b/test/test_misc.py
@@ -0,0 +1,338 @@
+"""
+This module contains some tests for Trycycler. To run them, execute `pytest` from the root
+Trycycler directory.
+
+Copyright 2020 Ryan Wick (rrwick@gmail.com)
+https://github.com/rrwick/Trycycler
+
+This file is part of Trycycler. Trycycler is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by the Free Software Foundation,
+either version 3 of the License, or (at your option) any later version. Trycycler is distributed
+in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+details. You should have received a copy of the GNU General Public License along with Trycycler.
+If not, see .
+"""
+
+import gzip
+import pathlib
+import pytest
+import sys
+import tempfile
+import unittest.mock
+
+import trycycler.misc
+
+
+def test_get_compression_type_1():
+ assert trycycler.misc.get_compression_type('test/test_misc/test.txt') == 'plain'
+
+
+def test_get_compression_type_2():
+ assert trycycler.misc.get_compression_type('test/test_misc/test.gz') == 'gz'
+
+
+def test_get_compression_type_3():
+ with pytest.raises(SystemExit) as e:
+ trycycler.misc.get_compression_type('test/test_misc/test.bz2')
+ assert 'cannot use bzip2' in str(e.value)
+
+
+def test_get_compression_type_4():
+ with pytest.raises(SystemExit) as e:
+ trycycler.misc.get_compression_type('test/test_misc/test.zip')
+ assert 'cannot use zip' in str(e.value)
+
+
+def test_get_open_func_1():
+ assert trycycler.misc.get_open_func('test/test_misc/test.txt') == open
+
+
+def test_get_open_func_2():
+ assert trycycler.misc.get_open_func('test/test_misc/test.gz') == gzip.open
+
+
+def test_get_sequence_file_type_1():
+ assert trycycler.misc.get_sequence_file_type('test/test_misc/test.fasta') == 'FASTA'
+
+
+def test_get_sequence_file_type_2():
+ assert trycycler.misc.get_sequence_file_type('test/test_misc/test.fastq') == 'FASTQ'
+
+
+def test_get_sequence_file_type_3():
+ assert trycycler.misc.get_sequence_file_type('test/test_misc/test.txt') == 'neither'
+
+
+def test_get_sequence_file_type_4():
+ assert trycycler.misc.get_sequence_file_type('test/test_misc/empty') == 'neither'
+
+
+def test_get_sequence_file_type_5():
+ assert trycycler.misc.get_sequence_file_type('test/test_misc/test.fasta.gz') == 'FASTA'
+
+
+def test_get_sequence_file_type_6():
+ assert trycycler.misc.get_sequence_file_type('test/test_misc/test.fastq.gz') == 'FASTQ'
+
+
+def test_get_sequence_file_type_7():
+ assert trycycler.misc.get_sequence_file_type('test/test_misc/test.gz') == 'neither'
+
+
+def test_get_sequence_file_type_8():
+ assert trycycler.misc.get_sequence_file_type('test/test_misc/not_unicode') == 'neither'
+
+
+def test_iterate_fastq_1():
+ seqs = list(trycycler.misc.iterate_fastq('test/test_misc/test.fastq'))
+ assert len(seqs) == 2
+ assert seqs[0][0] == 'A'
+ assert seqs[0][1] == '@A info'
+ assert seqs[0][2].startswith('TTGCCTGTAGTCGGGACC')
+ assert seqs[0][3].startswith('##$#%#%&++3*&&&-.7')
+ assert seqs[1][0] == 'B'
+ assert seqs[1][1] == '@B stuff'
+ assert seqs[1][2].startswith('ATTCTCAGAATGGCGTAG')
+ assert seqs[1][3].startswith(':;@@AHD98/.5C*-CEC')
+
+
+def test_iterate_fastq_2():
+ # Tests a FASTQ with extra line breaks.
+ seqs = list(trycycler.misc.iterate_fastq('test/test_misc/bad_1.fastq'))
+ assert len(seqs) == 2
+ assert seqs[0][0] == 'A'
+ assert seqs[0][1] == '@A info'
+ assert seqs[0][2].startswith('TTGCCTGTAGTCGGGACC')
+ assert seqs[0][3].startswith('##$#%#%&++3*&&&-.7')
+ assert seqs[1][0] == 'B'
+ assert seqs[1][1] == '@B stuff'
+ assert seqs[1][2].startswith('ATTCTCAGAATGGCGTAG')
+ assert seqs[1][3].startswith(':;@@AHD98/.5C*-CEC')
+
+
+def test_iterate_fastq_3():
+ # Tests a FASTQ with an extra line of text.
+ seqs = list(trycycler.misc.iterate_fastq('test/test_misc/bad_2.fastq'))
+ assert len(seqs) == 2
+ assert seqs[0][0] == 'A'
+ assert seqs[0][1] == '@A info'
+ assert seqs[0][2].startswith('TTGCCTGTAGTCGGGACC')
+ assert seqs[0][3].startswith('##$#%#%&++3*&&&-.7')
+ assert seqs[1][0] == 'B'
+ assert seqs[1][1] == '@B stuff'
+ assert seqs[1][2].startswith('ATTCTCAGAATGGCGTAG')
+ assert seqs[1][3].startswith(':;@@AHD98/.5C*-CEC')
+
+
+def test_iterate_fastq_4():
+ with pytest.raises(SystemExit) as e:
+ _ = list(trycycler.misc.iterate_fastq('test/test_misc/test.fasta'))
+ assert 'not FASTQ format' in str(e.value)
+
+
+def test_load_fastq_as_dict():
+ seqs = trycycler.misc.load_fastq_as_dict('test/test_misc/test.fastq')
+ assert len(seqs) == 2
+ assert seqs['A'][0] == '@A info'
+ assert seqs['A'][1].startswith('TTGCCTGTAGTCGGGACC')
+ assert seqs['A'][2].startswith('##$#%#%&++3*&&&-.7')
+ assert seqs['B'][0] == '@B stuff'
+ assert seqs['B'][1].startswith('ATTCTCAGAATGGCGTAG')
+ assert seqs['B'][2].startswith(':;@@AHD98/.5C*-CEC')
+
+
+def test_get_fastq_stats():
+ read_count, total_size, n50 = trycycler.misc.get_fastq_stats('test/test_misc/test.fastq')
+ assert read_count == 2
+ assert total_size == 200
+ assert n50 == 100
+
+
+def test_get_n50_1():
+ assert trycycler.misc.get_n50([1, 2, 3, 4, 1000]) == 1000
+
+
+def test_get_n50_2():
+ assert trycycler.misc.get_n50([12, 23455, 15, 12433, 15343, 9, 10]) == 15343
+
+
+def test_get_n50_3():
+ assert trycycler.misc.get_n50([]) == 0
+
+
+def test_load_fasta_1():
+ seqs = trycycler.misc.load_fasta('test/test_misc/test.fasta')
+ assert len(seqs) == 2
+ assert seqs[0][0] == 'A'
+ assert seqs[0][1].startswith('TTGCCTGTAGTCGGGACC')
+ assert seqs[1][0] == 'B'
+ assert seqs[1][1].startswith('ATTCTCAGAATGGCGTAG')
+
+
+def test_load_fasta_2():
+ seqs = trycycler.misc.load_fasta('test/test_misc/test.fasta', include_full_header=True)
+ assert len(seqs) == 2
+ assert seqs[0][0] == 'A'
+ assert seqs[0][1] == 'A info'
+ assert seqs[0][2].startswith('TTGCCTGTAGTCGGGACC')
+ assert seqs[1][0] == 'B'
+ assert seqs[1][1] == 'B stuff'
+ assert seqs[1][2].startswith('ATTCTCAGAATGGCGTAG')
+
+
+def test_load_fasta_3():
+ seqs = trycycler.misc.load_fasta('test/test_misc/test.fasta.gz')
+ assert len(seqs) == 2
+ assert seqs[0][0] == 'A'
+ assert seqs[0][1].startswith('TTGCCTGTAGTCGGGACC')
+ assert seqs[1][0] == 'B'
+ assert seqs[1][1].startswith('ATTCTCAGAATGGCGTAG')
+
+
+def test_load_fasta_4():
+ seqs = trycycler.misc.load_fasta('test/test_misc/bad_1.fasta')
+ assert len(seqs) == 2
+ assert seqs[0][0] == 'A'
+ assert seqs[0][1].startswith('TTGCCTGTAGTCGGGACC')
+ assert seqs[1][0] == 'B'
+ assert seqs[1][1].startswith('ATTCTCAGAATGGCGTAG')
+
+
+def test_get_default_thread_count():
+ assert 1 <= trycycler.misc.get_default_thread_count() <= 16
+
+
+def test_write_seq_to_fasta():
+ with tempfile.TemporaryDirectory() as temp_dir:
+ filename = pathlib.Path(temp_dir) / 'temp.fasta'
+ trycycler.misc.write_seq_to_fasta('CAGAATGGCGT', 'name', filename)
+ seqs = trycycler.misc.load_fasta(filename)
+ assert len(seqs) == 1
+ assert seqs[0][0] == 'name'
+ assert seqs[0][1] == 'CAGAATGGCGT'
+
+
+def test_reverse_complement_1():
+ assert trycycler.misc.reverse_complement('GGGGaaaaaaaatttatatat') == 'atatataaattttttttCCCC'
+
+
+def test_reverse_complement_2():
+ assert trycycler.misc.reverse_complement('atatataaattttttttCCCC') == 'GGGGaaaaaaaatttatatat'
+
+
+def test_reverse_complement_3():
+ assert trycycler.misc.reverse_complement('ACGT123') == 'NNNACGT'
+
+
+def test_remove_duplicates_1():
+ assert trycycler.misc.remove_duplicates([1, 4, 3, 4, 2]) == [1, 4, 3, 2]
+
+
+def test_remove_duplicates_2():
+ assert trycycler.misc.remove_duplicates(['a', 'a', 'a', 'b', 'a']) == ['a', 'b']
+
+
+def test_check_python_version_1():
+ with unittest.mock.patch.object(sys, 'version_info') as v_info:
+ v_info.major = 3
+ v_info.minor = 6
+ trycycler.misc.check_python_version()
+
+
+def test_check_python_version_2():
+ with unittest.mock.patch.object(sys, 'version_info') as v_info:
+ v_info.major = 3
+ v_info.minor = 8
+ trycycler.misc.check_python_version()
+
+
+def test_check_python_version_3():
+ with pytest.raises(SystemExit) as e:
+ with unittest.mock.patch.object(sys, 'version_info') as v_info:
+ v_info.major = 3
+ v_info.minor = 5
+ trycycler.misc.check_python_version()
+ assert 'requires Python 3.6 or later' in str(e.value)
+
+
+def test_check_python_version_4():
+ with pytest.raises(SystemExit) as e:
+ with unittest.mock.patch.object(sys, 'version_info') as v_info:
+ v_info.major = 2
+ v_info.minor = 7
+ trycycler.misc.check_python_version()
+ assert 'requires Python 3.6 or later' in str(e.value)
+
+
+def test_check_output_directory_1():
+ with pytest.raises(SystemExit) as e:
+ trycycler.misc.check_output_directory(pathlib.Path('test/test_misc/test.fasta'))
+ assert 'already exists as a file' in str(e.value)
+
+
+def test_check_output_directory_2():
+ with tempfile.TemporaryDirectory() as temp_dir:
+ out_dir = pathlib.Path(temp_dir) / 'output'
+ trycycler.misc.check_output_directory(out_dir)
+ assert out_dir.is_dir()
+ trycycler.misc.check_output_directory(out_dir)
+ assert out_dir.is_dir()
+ temp_file = out_dir / 'temp'
+ open(temp_file, 'a').close()
+ trycycler.misc.check_output_directory(out_dir)
+ assert out_dir.is_dir()
+
+
+def test_count_substrings_1():
+ assert trycycler.misc.count_substrings('000123000123', '123') == 2
+
+
+def test_count_substrings_2():
+ assert trycycler.misc.count_substrings('000123000123', 'abc') == 0
+
+
+def test_range_overlap_1():
+ assert trycycler.misc.range_overlap(0, 10, 5, 20)
+
+
+def test_range_overlap_2():
+ assert trycycler.misc.range_overlap(0, 10, 9, 20)
+
+
+def test_range_overlap_3():
+ assert not trycycler.misc.range_overlap(0, 10, 10, 20)
+
+
+def test_range_overlap_4():
+ assert not trycycler.misc.range_overlap(0, 10, 11, 20)
+
+
+def test_check_input_reads_1():
+ read_count, total_size = trycycler.misc.check_input_reads('test/test_misc/test.fastq.gz')
+ assert read_count == 2
+ assert total_size == 200
+
+
+def test_check_input_reads_2():
+ file_size = trycycler.misc.check_input_reads('test/test_misc/test.fastq.gz',
+ file_size_only=True)
+ assert file_size > 100
+
+
+def test_check_input_reads_3():
+ with pytest.raises(SystemExit) as e:
+ trycycler.misc.check_input_reads('test/test_misc/test.fasta')
+ assert 'not in FASTQ format' in str(e.value)
+
+
+def test_get_ascii_art():
+ assert "| || '__|| | | | / __|| | | |" in trycycler.misc.get_ascii_art()
+
+
+def test_count_lines_1():
+ assert trycycler.misc.count_lines('test/test_misc/test.fasta') == 4
+
+
+def test_count_lines_2():
+ assert trycycler.misc.count_lines('test/test_misc/test.fastq.gz') == 8
diff --git a/test/test_misc/bad_1.fasta b/test/test_misc/bad_1.fasta
new file mode 100644
index 0000000..075f1c1
--- /dev/null
+++ b/test/test_misc/bad_1.fasta
@@ -0,0 +1,7 @@
+>A
+TTGCCTGTAGTCGGGACCCCGTGACTAGGAAAGCAATCAGCGACTAACAGGCGGAGACCGTCTATAGCGCACGGGGTGTAGTTGGCTATTACTGATCTCT
+
+
+
+>B
+ATTCTCAGAATGGCGTAGTATTCATATTTGTTCGTAGCCCGCCTCCGTACATGTTATTGTGCTCATCGGTGGCCTGCGCCGTGGGGAGTGCAAAACGTGG
diff --git a/test/test_misc/bad_1.fastq b/test/test_misc/bad_1.fastq
new file mode 100644
index 0000000..e25086f
--- /dev/null
+++ b/test/test_misc/bad_1.fastq
@@ -0,0 +1,9 @@
+@A info
+TTGCCTGTAGTCGGGACCCCGTGACTAGGAAAGCAATCAGCGACTAACAGGCGGAGACCGTCTATAGCGCACGGGGTGTAGTTGGCTATTACTGATCTCT
++
+##$#%#%&++3*&&&-.72:789>;:<74362%&&(%()%$&$$(%*'&$%&$%*##$'/-&'&&'%%%'$%#"$#'#$$)##%((#%$('*'$'($'
+
+@B stuff
+ATTCTCAGAATGGCGTAGTATTCATATTTGTTCGTAGCCCGCCTCCGTACATGTTATTGTGCTCATCGGTGGCCTGCGCCGTGGGGAGTGCAAAACGTGG
++
+:;@@AHD98/.5C*-CEC68BHJD/>:@>9CA=@??DEIF835<1.*+)<8++1--5?;629;%3))2/@=BC6651:65.?@>EFBBFNJ@BJK
diff --git a/test/test_misc/bad_2.fastq b/test/test_misc/bad_2.fastq
new file mode 100644
index 0000000..da0aea8
--- /dev/null
+++ b/test/test_misc/bad_2.fastq
@@ -0,0 +1,9 @@
+@A info
+TTGCCTGTAGTCGGGACCCCGTGACTAGGAAAGCAATCAGCGACTAACAGGCGGAGACCGTCTATAGCGCACGGGGTGTAGTTGGCTATTACTGATCTCT
++
+##$#%#%&++3*&&&-.72:789>;:<74362%&&(%()%$&$$(%*'&$%&$%*##$'/-&'&&'%%%'$%#"$#'#$$)##%((#%$('*'$'($'
+EXTRA LINE
+@B stuff
+ATTCTCAGAATGGCGTAGTATTCATATTTGTTCGTAGCCCGCCTCCGTACATGTTATTGTGCTCATCGGTGGCCTGCGCCGTGGGGAGTGCAAAACGTGG
++
+:;@@AHD98/.5C*-CEC68BHJD/>:@>9CA=@??DEIF835<1.*+)<8++1--5?;629;%3))2/@=BC6651:65.?@>EFBBFNJ@BJK
diff --git a/test/test_misc/empty b/test/test_misc/empty
new file mode 100644
index 0000000..e69de29
diff --git a/test/test_misc/not_unicode b/test/test_misc/not_unicode
new file mode 100644
index 0000000..8dcf8b4
--- /dev/null
+++ b/test/test_misc/not_unicode
@@ -0,0 +1 @@
+ÊvWŠl
\ No newline at end of file
diff --git a/test/test_misc/test.bz2 b/test/test_misc/test.bz2
new file mode 100644
index 0000000..ec38fc2
Binary files /dev/null and b/test/test_misc/test.bz2 differ
diff --git a/test/test_misc/test.fasta b/test/test_misc/test.fasta
new file mode 100644
index 0000000..8804f54
--- /dev/null
+++ b/test/test_misc/test.fasta
@@ -0,0 +1,4 @@
+>A info
+TTGCCTGTAGTCGGGACCCCGTGACTAGGAAAGCAATCAGCGACTAACAGGCGGAGACCGTCTATAGCGCACGGGGTGTAGTTGGCTATTACTGATCTCT
+>B stuff
+ATTCTCAGAATGGCGTAGTATTCATATTTGTTCGTAGCCCGCCTCCGTACATGTTATTGTGCTCATCGGTGGCCTGCGCCGTGGGGAGTGCAAAACGTGG
diff --git a/test/test_misc/test.fasta.gz b/test/test_misc/test.fasta.gz
new file mode 100644
index 0000000..8a1ce99
Binary files /dev/null and b/test/test_misc/test.fasta.gz differ
diff --git a/test/test_misc/test.fastq b/test/test_misc/test.fastq
new file mode 100644
index 0000000..98547c5
--- /dev/null
+++ b/test/test_misc/test.fastq
@@ -0,0 +1,8 @@
+@A info
+TTGCCTGTAGTCGGGACCCCGTGACTAGGAAAGCAATCAGCGACTAACAGGCGGAGACCGTCTATAGCGCACGGGGTGTAGTTGGCTATTACTGATCTCT
++
+##$#%#%&++3*&&&-.72:789>;:<74362%&&(%()%$&$$(%*'&$%&$%*##$'/-&'&&'%%%'$%#"$#'#$$)##%((#%$('*'$'($'
+@B stuff
+ATTCTCAGAATGGCGTAGTATTCATATTTGTTCGTAGCCCGCCTCCGTACATGTTATTGTGCTCATCGGTGGCCTGCGCCGTGGGGAGTGCAAAACGTGG
++
+:;@@AHD98/.5C*-CEC68BHJD/>:@>9CA=@??DEIF835<1.*+)<8++1--5?;629;%3))2/@=BC6651:65.?@>EFBBFNJ@BJK
diff --git a/test/test_misc/test.fastq.gz b/test/test_misc/test.fastq.gz
new file mode 100644
index 0000000..82f70c6
Binary files /dev/null and b/test/test_misc/test.fastq.gz differ
diff --git a/test/test_misc/test.gz b/test/test_misc/test.gz
new file mode 100644
index 0000000..cfc8d92
Binary files /dev/null and b/test/test_misc/test.gz differ
diff --git a/test/test_misc/test.txt b/test/test_misc/test.txt
new file mode 100644
index 0000000..b5da95d
--- /dev/null
+++ b/test/test_misc/test.txt
@@ -0,0 +1 @@
+This is a plain text file.
diff --git a/test/test_misc/test.zip b/test/test_misc/test.zip
new file mode 100644
index 0000000..12ef577
Binary files /dev/null and b/test/test_misc/test.zip differ
diff --git a/trycycler/consensus.py b/trycycler/consensus.py
index 2cdcfbc..d2c0fcc 100644
--- a/trycycler/consensus.py
+++ b/trycycler/consensus.py
@@ -224,7 +224,7 @@ def choose_best_chunk_options(chunks, cluster_dir, threads, verbose, circular):
log('No chunks need read-based assessment. Skipping this step.\n')
return
- reads = load_fastq_as_dict(cluster_dir)
+ reads = load_fastq_as_dict(cluster_dir / '4_reads.fastq')
new_best_seqs = {}
completed, kept, changed = 0, 0, 0
diff --git a/trycycler/misc.py b/trycycler/misc.py
index 015b5e6..50a5700 100755
--- a/trycycler/misc.py
+++ b/trycycler/misc.py
@@ -91,8 +91,7 @@ def iterate_fastq(filename):
yield name, header, sequence, qualities
-def load_fastq_as_dict(cluster_dir):
- read_filename = cluster_dir / '4_reads.fastq'
+def load_fastq_as_dict(read_filename):
reads = {name: (header, seq, qual) for name, header, seq, qual in iterate_fastq(read_filename)}
return reads
@@ -178,7 +177,7 @@ def reverse_complement(seq):
def remove_duplicates(lst):
"""
- https://stackoverflow.com/questions/480214/how-do-you-remove-duplicates-from-a-list-whilst-preserving-order
+ https://stackoverflow.com/questions/480214
"""
seen = set()
seen_add = seen.add