From 12824c9d4ac5ce661cc07dc295205d8c992b2792 Mon Sep 17 00:00:00 2001 From: Marcus Fedarko Date: Tue, 21 Mar 2023 23:37:32 -0700 Subject: [PATCH] ENH: first-pass LJA+Flye DOT parser; custom errors needs testing, and this will crash downstream b/c the layout code/etc expects nodes to have lengths. But overall, this is a good start. from there -- need to allow other parsers to create multidigraphs (not just this one) -- #202. and, for DOT graphs, draw nodes as circles w/ uniform sizes (#211). once these (and the requisite other hurdles i haven't thought of r/n) are passed, we've got something going. --- metagenomescope/assembly_graph_parser.py | 286 +++++++++++++++--- metagenomescope/errors.py | 2 + .../test_auxiliary_functions.py | 5 + .../assembly_graph_parser/test_parse_gfa.py | 5 +- .../test_parse_lastgraph.py | 19 +- .../test_parse_metacarvel_gml.py | 57 ++-- .../test_validate_lastgraph.py | 7 +- 7 files changed, 302 insertions(+), 79 deletions(-) create mode 100644 metagenomescope/errors.py diff --git a/metagenomescope/assembly_graph_parser.py b/metagenomescope/assembly_graph_parser.py index 405e9410..04ecd246 100644 --- a/metagenomescope/assembly_graph_parser.py +++ b/metagenomescope/assembly_graph_parser.py @@ -38,10 +38,15 @@ # # 3. Add tests for your parser in metagenomescope/tests/assembly_graph_parser/ +import re import networkx as nx import gfapy import pyfastg from .input_node_utils import gc_content, negate_node_id +from .errors import GraphParsingError + + +LJA_LFL_PATT = re.compile(r"([ACGT]) ?([0-9]+)\(([0-9\.]+)\)") def is_not_pos_int(number_string): @@ -97,7 +102,7 @@ def validate_lastgraph_file(graph_file): Raises ------ - ValueError + GraphParsingError If any of the following conditions are observed: General: @@ -140,7 +145,7 @@ def validate_lastgraph_file(graph_file): if line_num == 1: header_num_nodes_str = line.split()[0] if is_not_pos_int(header_num_nodes_str): - raise ValueError( + raise GraphParsingError( "Line 1: $NUMBER_OF_NODES must be a positive integer." "Currently, it's {}.".format(header_num_nodes_str) ) @@ -148,27 +153,27 @@ def validate_lastgraph_file(graph_file): if line.startswith("NODE\t"): if in_node_block: - raise ValueError( + raise GraphParsingError( "Line {}: Node block ends too early.".format(line_num) ) split_line = line.split() if len(split_line) < 4: - raise ValueError( + raise GraphParsingError( "Line {}: Node declaration doesn't include enough " "fields.".format(line_num) ) if is_not_pos_int(split_line[2]) or is_not_pos_int(split_line[3]): - raise ValueError( + raise GraphParsingError( "Line {}: The $COV_SHORT1 and $O_COV_SHORT1 values " "must be positive integers.".format(line_num) ) if split_line[1][0] == "-": - raise ValueError( + raise GraphParsingError( "Line {}: Node IDs can't start with " "'-'.".format(line_num) ) if split_line[1] in seen_nodes: - raise ValueError( + raise GraphParsingError( "Line {}: Node ID {} declared multiple times.".format( line_num, split_line[1] ) @@ -181,17 +186,17 @@ def validate_lastgraph_file(graph_file): elif line.startswith("ARC\t"): if in_node_block: - raise ValueError( + raise GraphParsingError( "Line {}: Node block ends too early.".format(line_num) ) split_line = line.split() if len(split_line) < 4: - raise ValueError( + raise GraphParsingError( "Line {}: Arc declaration doesn't include enough " "fields.".format(line_num) ) if is_not_pos_int(split_line[3]): - raise ValueError( + raise GraphParsingError( "Line {}: The $MULTIPLICITY value of an arc must be " "a positive integer.".format(line_num) ) @@ -201,7 +206,7 @@ def validate_lastgraph_file(graph_file): # split_line, referring to the source and target node of this # edge) if node_id not in seen_nodes: - raise ValueError( + raise GraphParsingError( "Line {}: Unseen node {} referred to in an " "arc.".format(line_num, node_id) ) @@ -213,7 +218,7 @@ def validate_lastgraph_file(graph_file): # If rev_ids is in seen_edges, then so is fwd_ids. No need to check # both here. if fwd_ids in seen_edges: - raise ValueError( + raise GraphParsingError( "Line {}: Edge from {} to {} somehow declared multiple " "times.".format(line_num, split_line[1], split_line[2]) ) @@ -223,14 +228,14 @@ def validate_lastgraph_file(graph_file): if curr_node_fwdseq is None: curr_node_fwdseq = line.strip() if curr_node_length != len(curr_node_fwdseq): - raise ValueError( + raise GraphParsingError( "Line {}: Node sequence length doesn't match " "$COV_SHORT1.".format(line_num) ) else: # The current line is the reverse sequence of this node. if len(curr_node_fwdseq) != len(line.strip()): - raise ValueError( + raise GraphParsingError( "Line {}: Node sequences have unequal " "lengths.".format(line_num) ) @@ -250,12 +255,12 @@ def validate_lastgraph_file(graph_file): # block, then that means that the file ended before a given node's # declaration did. That's a problem! if in_node_block: - raise ValueError("Node block ended too early at end-of-file.") + raise GraphParsingError("Node block ended too early at end-of-file.") if len(seen_nodes) != (header_num_nodes * 2): # seen_nodes should always be divisible by 2 (since every time we # record a node we add it and its complement), so dividing # len(seen_nodes) by 2 is ok - raise ValueError( + raise GraphParsingError( "The file's header indicated that there were {} node(s), but " "we identified {} node(s).".format( header_num_nodes, int(len(seen_nodes) / 2) @@ -266,16 +271,16 @@ def validate_lastgraph_file(graph_file): def validate_nx_digraph(g, required_node_fields, required_edge_fields): # Verify that the graph is directed and doesn't have duplicate edges if not g.is_directed(): - raise ValueError("The input graph should be directed.") + raise GraphParsingError("The input graph should be directed.") if g.is_multigraph(): - raise ValueError("Multigraphs are unsupported in MetagenomeScope.") + raise GraphParsingError("Multigraphs are unsupported in MetagenomeScope.") # Verify that all nodes have the properties we expect nodes to have num_nodes = len(g.nodes) for required_field in required_node_fields: num_nodes_with_field = len(nx.get_node_attributes(g, required_field)) if num_nodes_with_field < num_nodes: - raise ValueError( + raise GraphParsingError( 'Only {} / {} nodes have "{}" given.'.format( num_nodes_with_field, num_nodes, required_field ) @@ -286,7 +291,7 @@ def validate_nx_digraph(g, required_node_fields, required_edge_fields): for required_field in required_edge_fields: num_edges_with_field = len(nx.get_edge_attributes(g, required_field)) if num_edges_with_field < num_edges: - raise ValueError( + raise GraphParsingError( 'Only {} / {} edges have "{}" given.'.format( num_edges_with_field, num_edges, required_field ) @@ -322,12 +327,12 @@ def parse_metacarvel_gml(filename): for n in g.nodes: orientation = g.nodes[n]["orientation"] if type(orientation) != str or orientation not in ("FOW", "REV"): - raise ValueError( + raise GraphParsingError( 'Node {} has unsupported orientation "{}". Should be either ' '"FOW" or "REV".'.format(n, orientation) ) if is_not_pos_int(g.nodes[n]["length"]): - raise ValueError( + raise GraphParsingError( 'Node {} has non-positive-integer length "{}".'.format( n, g.nodes[n]["length"] ) @@ -338,14 +343,14 @@ def parse_metacarvel_gml(filename): # Verify that edge attributes are good for e in g.edges: if g.edges[e]["orientation"] not in ("EE", "EB", "BE", "BB"): - raise ValueError( + raise GraphParsingError( 'Edge {} has unsupported orientation "{}". Should be one of ' '"EE", "EB", "BE", or "BB".'.format( e, g.edges[e]["orientation"] ) ) if is_not_pos_int(g.edges[e]["bsize"]): - raise ValueError( + raise GraphParsingError( 'Edge {} has non-positive-integer bsize "{}".'.format( e, g.edges[e]["bsize"] ) @@ -367,7 +372,7 @@ def parse_metacarvel_gml(filename): # Rationale for using except Exception is analogous to what's # discussed in this comment thread: # https://github.com/biocore/LabControl/pull/585#discussion_r323413268 - raise ValueError( + raise GraphParsingError( 'Edge {} has non-numeric {} "{}".'.format( e, field, g.edges[e][field] ) @@ -390,12 +395,12 @@ def parse_gfa(filename): # Add nodes ("segments") to the DiGraph for node in gfa_graph.segments: if node.length is None: - raise ValueError( + raise GraphParsingError( "Found a node without a specified length: " "{}".format(node.name) ) if node.name[0] == "-": - raise ValueError( + raise GraphParsingError( "Node IDs in the input assembly graph cannot " 'start with the "-" character.' ) @@ -456,7 +461,7 @@ def parse_fastg(filename): elif suffix == "-": g.nodes[n]["orientation"] = "-" else: - raise ValueError( + raise GraphParsingError( ( "Node {} in parsed FASTG file doesn't have an " "orientation?" @@ -572,11 +577,212 @@ def parse_lastgraph(filename): return digraph +def parse_dot(filename): + """Returns a nx.MultiDiGraph representation of a DOT (Graphviz) file. + + Like GML files, DOT is a file format for representing arbitrary graphs (not + just assembly graphs). Furthermore, multiple assemblers output DOT files + (and may encode biological information like sequence lengths in different + ways in this file). + + Here, I limit our scope to visualizing DOT files produced by LJA / jumboDBG + or Flye. I'm sure there are other assemblers that can create DOT files, + but -- to keep the scope of this parser reasonable for the time being -- I + will just focus on these specific assemblers' output DOT files. + """ + # This function is recommended by NetworkX over the nx.nx_pydot version; + # see https://networkx.org/documentation/stable/_modules/networkx/drawing/nx_pydot.html#read_dot + # (Also, the nx_agraph version only requires pygraphviz, which we should + # already have installed.) + g = nx.nx_agraph.read_dot(filename) + + # NOTE: both LJA and Flye DOT files, as of writing, have some cosmetic + # node-specific attributes (e.g. "style" = "filled", "fillcolor" = "grey" + # for every node [?] in Flye's DOT files), which we don't really care + # about. These attributes could take up a nontrivial amount of storage for + # large graphs; I'm not sure it's worth it to go through here and manually + # delete these attributes from these nodes, so I'm just documenting this + # here in case this becomes a bottleneck at some point. + + # For both LJA and Flye DOT files, the main attributes we care about are + # stored in the edge labels. + # + # LJA's edge labels look like "A1235(5)", which indicates that this edge + # starts with an A nucleotide, has a length of 1,235 nt, and has a + # multiplicity (average coverage of k-mers in this edge by reads) of 5x. + # + # And Flye's edge labels look like "id 5678\l50k 5x", which indicates that + # this edge has an ID of "5678", a length of ~50kbp, and a coverage of 5x. + # (The lengths are approximate -- see + # https://github.com/fenderglass/Flye/blob/3f0154aed48d006b1387cf39918c7c05236cdcdd/src/repeat_graph/output_generator.cpp#L303-L312 + # for the exact logic, as of writing. I don't think this is a huge deal, + # but maybe we should add a warning somewhere.) + + # One -- and ONLY one of these -- should be True, after considering all + # edges in the graph. + lja_vibes = False + flye_vibes = False + seen_one_edge = False + + for e in g.edges(data=True, keys=True): + seen_one_edge = True + err_prefix = f"Edge {e[0]} -> {e[1]}" + + if "label" not in e[3]: + raise GraphParsingError( + f"{err_prefix} has no label. Note that we currently only " + "accept Graphviz files from Flye or LJA / jumboDBG." + ) + + # Both Flye and LJA DOT files have edge labels. Use this label to + # figure out which type of graph this is. + label = e[3]["label"] + if label.startswith("id"): + if lja_vibes: + raise GraphParsingError( + f"{err_prefix} looks like it came from Flye, but other " + "edges in the same file look like they came from LJA?" + ) + if "\\l" not in label: + raise GraphParsingError( + f"{err_prefix} has a label that looks like it came from " + "Flye, but doesn't include a '\\l' separator." + ) + parts = label.split("\\l") + if len(parts) != 2: + # This is kind of untrue -- there are other newline separators + # that Graphviz accepts + # (https://graphviz.org/docs/attr-types/escString/) -- but we + # can assume that only \l will be used since that's what Flye + # uses (at the moment). + raise GraphParsingError( + f"{err_prefix} has a label that seems like it spans != 2 " + "lines?" + ) + + # Get the edge ID + id_part = parts[0] + id_parts = id_part.split() + if len(id_parts) != 2: + raise GraphParsingError( + f"{err_prefix} has a label where the first line is " + f"{id_part}. There should be a single space character " + "between the word 'id' and the actual ID, and the ID " + "shouldn't contain any whitespace." + ) + eid = id_parts[1] + + # Get the length and coverage + lencov_part = parts[1] + lencov_parts = lencov_part.split() + if len(lencov_parts) != 2: + raise GraphParsingError( + f"{err_prefix} has a label where the second line is " + f"{lencov_part}. It should contain just the edge length " + "and coverage, separated by a single space character." + ) + elen_str, ecov_str = lencov_parts + # Get the length + if elen_str.isdigit(): + # I don't think Flye outputs lengths in this style, but we + # might as well be nice and accept lengths that are formatted + # like this if this ever changes + elen = int(elen_str) + else: + if elen_str[-1] == "k": + # This will fail if the first part of the length string + # (before the "k") isn't a valid float. I think letting + # this error go up to the end user is fine -- this should + # never happen anyway, since if we've made it this far this + # is almost certainly a real Flye assembly graph. + elen_thousands = float(elen_str[:-1]) + # Of course, this is just an approximation of the *true* + # edge length. Something we may want to warn about, as + # discussed above. + elen = elen_thousands * 1000 + else: + raise GraphParsingError( + f"{err_prefix} has a confusing length of {elen_str}?" + ) + + # Get the coverage -- this should just be a rounded integer ending + # in "x", so it's easier for us to parse than the length. + if ecov_str.isdigit(): + # Again, I don't think Flye should output coverages that lack + # the "x", but it's easy to account for this case and parse 'em + # anyway + ecov = int(ecov_str) + else: + if ecov_str[-1] == "x": + # This can fail if the coverage isn't a valid int, but + # that's fine -- just let the error go up to the user + ecov = int(ecov_str[:-1]) + else: + raise GraphParsingError( + f"{err_prefix} has a confusing coverage of {ecov_str}?" + ) + + # If we've made it here, then we've successfully read the ID, + # length, and coverage of this edge. Save it, and (if we haven't + # already) record that this is probably a Flye assembly graph. + g.edges[e[:3]]["id"] = eid + g.edges[e[:3]]["approx-length"] = elen + g.edges[e[:3]]["cov"] = ecov + flye_vibes = True + else: + if flye_vibes: + raise GraphParsingError( + f"{err_prefix} looks like it came from LJA, but other " + "edges in the same file look like they came from Flye?" + ) + label_first_line = label.splitlines()[0] + lfl_match = re.match(LJA_LFL_PATT, label_first_line) + if lfl_match is None: + raise GraphParsingError( + f"{err_prefix} has a label with a confusing first line, " + "at least for LJA edge labels." + ) + + groups = lfl_match.groups() + + # The length and kmer-cov lines could cause errors if these aren't + # valid numbers. That's fine -- like in the Flye parser, let the + # errors propagate up to the user. + g.edges[e[:3]]["first-nt"] = groups[0] + g.edges[e[:3]]["length"] = int(groups[1]) + g.edges[e[:3]]["kmer-cov"] = float(groups[2]) + # The full label could be useful for the visualization. LJA can + # generate labels spanning multiple lines (although jumboDBG's + # labels seem mostly to span single lines). + g.edges[e[:3]]["label"] = label + lja_vibes = True + + if not seen_one_edge: + # For "node-major" graphs I guess this is ok. For de Bruijn graphs, + # though, we are going to have problems. Probably. + raise GraphParsingError("DOT-format graph contains 0 edges.") + + # Only one of these should be True. ("^" is the XOR operator in Python.) + # I thiiiink this case will only get triggered if, uh, there aren't any + # edges in the graph. But I'm keeping this here just in case I messed + # something up. + if not (lja_vibes ^ flye_vibes): + raise GraphParsingError("Call a priest. Something went wrong.") + + return g + + +# Multiple suffixes can point to the same parser -- for example, both .gv and +# .dot are often used for Graphviz files. (See +# https://en.wikipedia.org/wiki/DOT_(graph_description_language) and its +# references for some details.) SUPPORTED_FILETYPE_TO_PARSER = { "lastgraph": parse_lastgraph, "gml": parse_metacarvel_gml, "gfa": parse_gfa, "fastg": parse_fastg, + "gv": parse_dot, + "dot": parse_dot, } @@ -584,23 +790,29 @@ def sniff_filetype(filename): """Attempts to determine the filetype of the file specified by a filename. Currently, this just returns the extension of the filename (after - converting the filename to lowercase). If the extension isn't one of - "lastgraph", "gfa", "fastg", or "gml", this throws a + converting the filename to lowercase). If the lowercase'd extension isn't + listed as a key in SUPPORTED_FILETYPE_TO_PARSER, then this will throw a NotImplementedError. - It might be worth extending this in the future to try sniffing via a - more sophisticated method, but this seems fine for the time being. + It might be worth extending this in the future to try sniffing via more + sophisticated methods (i.e. actually checking the first few characters in + these files), but this is a fine (albeit inelegant) solution for now. """ lowercase_fn = filename.lower() for suffix in SUPPORTED_FILETYPE_TO_PARSER: if lowercase_fn.endswith(suffix): return suffix + allowed_suffixes = ( + "{" + + ", ".join(f'"{s}"' for s in SUPPORTED_FILETYPE_TO_PARSER.keys()) + + "}" + ) raise NotImplementedError( - "The input filename ({}) doesn't end with one of the following " - "supported filetypes: {}. Please provide an assembly graph that " - "follows one of these filetypes and is named accordingly.".format( - filename, tuple(SUPPORTED_FILETYPE_TO_PARSER.keys()) - ) + f"The input filename ({filename}) doesn't end with one of the " + f"following supported filetype suffixes: {allowed_suffixes}. Please " + "provide an assembly graph that (i) follows one of these file formats " + "and (ii) has a filename that ends with the corresponding filetype " + "suffix." ) diff --git a/metagenomescope/errors.py b/metagenomescope/errors.py new file mode 100644 index 00000000..f8600f5b --- /dev/null +++ b/metagenomescope/errors.py @@ -0,0 +1,2 @@ +class GraphParsingError(Exception): + pass diff --git a/metagenomescope/tests/assembly_graph_parser/test_auxiliary_functions.py b/metagenomescope/tests/assembly_graph_parser/test_auxiliary_functions.py index 3796ecf9..e92f3cba 100644 --- a/metagenomescope/tests/assembly_graph_parser/test_auxiliary_functions.py +++ b/metagenomescope/tests/assembly_graph_parser/test_auxiliary_functions.py @@ -76,6 +76,11 @@ def test_sniff_filetype(): assert sniff_filetype("aSdF.FaStG") == "fastg" assert sniff_filetype("LastGraphfastg") == "fastg" + assert sniff_filetype("asdf.dot") == "dot" + assert sniff_filetype("ASDF.DOT") == "dot" + assert sniff_filetype("asdf.gv") == "gv" + assert sniff_filetype("ASDF.GV") == "gv" + with pytest.raises(NotImplementedError): sniff_filetype("asdf.asdf") with pytest.raises(NotImplementedError): diff --git a/metagenomescope/tests/assembly_graph_parser/test_parse_gfa.py b/metagenomescope/tests/assembly_graph_parser/test_parse_gfa.py index 85dffe5b..feb10d75 100644 --- a/metagenomescope/tests/assembly_graph_parser/test_parse_gfa.py +++ b/metagenomescope/tests/assembly_graph_parser/test_parse_gfa.py @@ -1,6 +1,7 @@ # from .utils import run_tempfile_test from metagenomescope.input_node_utils import negate_node_id from metagenomescope.assembly_graph_parser import parse_gfa +from metagenomescope.errors import GraphParsingError from .utils import run_tempfile_test from gfapy.error import InconsistencyError @@ -134,7 +135,7 @@ def test_parse_no_length_node(): s1.pop(1) s1.insert(1, "S\t1\t*") run_tempfile_test( - "gfa", s1, ValueError, "Found a node without a specified length: 1" + "gfa", s1, GraphParsingError, "Found a node without a specified length: 1" ) # Manually assigning node 1 a sequence should fix the problem @@ -177,7 +178,7 @@ def test_parse_invalid_id_node(): run_tempfile_test( "gfa", s1, - ValueError, + GraphParsingError, "Node IDs in the input assembly graph cannot " 'start with the "-" character.', ) diff --git a/metagenomescope/tests/assembly_graph_parser/test_parse_lastgraph.py b/metagenomescope/tests/assembly_graph_parser/test_parse_lastgraph.py index fabb743e..b067748f 100644 --- a/metagenomescope/tests/assembly_graph_parser/test_parse_lastgraph.py +++ b/metagenomescope/tests/assembly_graph_parser/test_parse_lastgraph.py @@ -1,4 +1,5 @@ from .utils import run_tempfile_test +from metagenomescope.errors import GraphParsingError from metagenomescope.assembly_graph_parser import parse_lastgraph from metagenomescope.tests.assembly_graph_parser.test_validate_lastgraph import ( reset_glines, @@ -64,13 +65,13 @@ def test_parse_lastgraph_node_interrupted(): glines = reset_glines() glines.pop(3) run_tempfile_test( - "LastGraph", glines, ValueError, "Line 4: Node block ends too early." + "LastGraph", glines, GraphParsingError, "Line 4: Node block ends too early." ) glines = reset_glines() glines[2] = "ARC\t1\t1\t5" run_tempfile_test( - "LastGraph", glines, ValueError, "Line 3: Node block ends too early." + "LastGraph", glines, GraphParsingError, "Line 3: Node block ends too early." ) @@ -79,22 +80,22 @@ def test_parse_lastgraph_invalid_node_count(): exp_msg = "Line 1: $NUMBER_OF_NODES must be a positive integer" glines[0] = "3.5\t10\t1\t1" - run_tempfile_test("LastGraph", glines, ValueError, exp_msg) + run_tempfile_test("LastGraph", glines, GraphParsingError, exp_msg) glines[0] = "-3.5\t10\t1\t1" - run_tempfile_test("LastGraph", glines, ValueError, exp_msg) + run_tempfile_test("LastGraph", glines, GraphParsingError, exp_msg) glines[0] = "-2\t10\t1\t1" - run_tempfile_test("LastGraph", glines, ValueError, exp_msg) + run_tempfile_test("LastGraph", glines, GraphParsingError, exp_msg) glines[0] = "2.0\t10\t1\t1" - run_tempfile_test("LastGraph", glines, ValueError, exp_msg) + run_tempfile_test("LastGraph", glines, GraphParsingError, exp_msg) glines[0] = "ABC\t10\t1\t1" - run_tempfile_test("LastGraph", glines, ValueError, exp_msg) + run_tempfile_test("LastGraph", glines, GraphParsingError, exp_msg) glines[0] = "0x123\t10\t1\t1" - run_tempfile_test("LastGraph", glines, ValueError, exp_msg) + run_tempfile_test("LastGraph", glines, GraphParsingError, exp_msg) glines[0] = "0\t10\t1\t1" - run_tempfile_test("LastGraph", glines, ValueError, exp_msg) + run_tempfile_test("LastGraph", glines, GraphParsingError, exp_msg) diff --git a/metagenomescope/tests/assembly_graph_parser/test_parse_metacarvel_gml.py b/metagenomescope/tests/assembly_graph_parser/test_parse_metacarvel_gml.py index 6ac5a175..d1068cf2 100644 --- a/metagenomescope/tests/assembly_graph_parser/test_parse_metacarvel_gml.py +++ b/metagenomescope/tests/assembly_graph_parser/test_parse_metacarvel_gml.py @@ -1,5 +1,6 @@ from networkx import NetworkXError from .utils import run_tempfile_test +from metagenomescope.errors import GraphParsingError from metagenomescope.assembly_graph_parser import parse_metacarvel_gml @@ -92,14 +93,14 @@ def test_parse_metacarvel_gml_insufficient_node_metadata(): # Remove orientation from node 10 mg.pop(5) exp_msg = 'Only 11 / 12 nodes have "orientation" given.' - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # Remove length from node 10 (it's the line after the previous line we # removed) mg.pop(5) # due to "precedence" (just the order of iteration in the for loops), we # expect orientation to take priority over length in these error messages # -- but this doesn't really matter - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # Restore mg mg = get_marygold_gml() @@ -107,19 +108,19 @@ def test_parse_metacarvel_gml_insufficient_node_metadata(): # length, not orientation, now. mg.pop(6) exp_msg = 'Only 11 / 12 nodes have "length" given.' - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # For fun, let's remove all of the lines with length and make sure this # updates the error msg accordingly mg = [line for line in mg if "length" not in line] exp_msg = 'Only 0 / 12 nodes have "length" given.' - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # ... And let's try that same thing with orientation, which as we've # established takes priority in error messages (again, doesn't actually # matter, but we might as well test that this behavior remains consistent) mg = [line for line in mg if "orientation" not in line] exp_msg = 'Only 0 / 12 nodes have "orientation" given.' - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") def test_parse_metacarvel_gml_insufficient_edge_metadata(): @@ -128,7 +129,7 @@ def test_parse_metacarvel_gml_insufficient_edge_metadata(): # Remove orientation from edge 8 -> 9 (line 190 in the file) mg.pop(189) exp_msg = 'Only 15 / 16 edges have "orientation" given.' - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # Remove orientation from all edges in the file mg = [ line @@ -136,27 +137,27 @@ def test_parse_metacarvel_gml_insufficient_edge_metadata(): if 'orientation "E' not in line and 'orientation "B' not in line ] exp_msg = 'Only 0 / 16 edges have "orientation" given.' - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # Restore mg mg = get_marygold_gml() # Remove mean from edge 12 -> 8 mg.pop(182) exp_msg = 'Only 15 / 16 edges have "mean" given.' - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # Also remove mean from edge 8 -> 9 # This is actually line 191 of the file, but remember 0-indexing + already # popped one line above in the file mg.pop(189) exp_msg = 'Only 14 / 16 edges have "mean" given.' - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # Restore mg mg = get_marygold_gml() # Remove bsize from edge 7 -> 12 mg.pop(168) exp_msg = 'Only 15 / 16 edges have "bsize" given.' - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # Remove stdev from edge 7 -> 9 # Note that we haven't restored mg from above yet! This tests the whole @@ -167,11 +168,11 @@ def test_parse_metacarvel_gml_insufficient_edge_metadata(): # breaking this function. mg.pop(174) exp_msg = 'Only 15 / 16 edges have "stdev" given.' - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # Remove bsize from *all* lines -- stdev error should still show up mg = [line for line in mg if "bsize" not in line] - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") def test_parse_metacarvel_gml_undirected_graph(): @@ -182,10 +183,10 @@ def test_parse_metacarvel_gml_undirected_graph(): # Try two things: 1) the choice of directed/undirected isn't specified # (defaults to undirected), 2) explicitly specified as not directed mg.pop(1) - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") mg.insert(1, " directed 0\n") - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") def test_parse_metacarvel_gml_duplicate_edges(): @@ -221,7 +222,7 @@ def test_parse_metacarvel_gml_duplicate_edges(): # detect that the input graph is a multigraph and be all like "nuh uh # you didn't get that from MetaCarvel, now did you" (something like that) run_tempfile_test( - "gml", mg, ValueError, "Multigraphs are unsupported", join_char="" + "gml", mg, GraphParsingError, "Multigraphs are unsupported", join_char="" ) @@ -294,7 +295,7 @@ def test_parse_metacarvel_gml_invalid_node_metadata(): # angry if you put a string like FOW outside of quotes in a GML file.) p = "".join([c for c in o if c != '"']) exp_msg = 'Node NODE_10 has unsupported orientation "{}".'.format(p) - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # Test invalid lengths lengths = [ @@ -315,7 +316,7 @@ def test_parse_metacarvel_gml_invalid_node_metadata(): exp_msg = 'Node NODE_10 has non-positive-integer length "{}".'.format( length_to_test ) - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") def test_parse_metacarvel_gml_invalid_edge_metadata(): @@ -344,12 +345,12 @@ def test_parse_metacarvel_gml_invalid_edge_metadata(): # (See test_parse_metacarvel_gml_invalid_node_metadata() above) p = "".join([c for c in val if c != '"']) exp_msg = 'has unsupported orientation "{}".'.format(p) - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") mg.pop(189) mg.insert(189, ' orientation "REV"\n') exp_msg = 'has unsupported orientation "REV".' - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # Restoring the orientation to something normal (say, BB) should work mg.pop(189) @@ -376,21 +377,21 @@ def test_parse_metacarvel_gml_invalid_edge_metadata(): mg.insert(192, " bsize {}\n".format(val)) p = "".join([c for c in val if c != '"']) exp_msg = 'has non-positive-integer bsize "{}".'.format(p) - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # 3. Test mean mg = get_marygold_gml() mg.pop(198) mg.insert(198, ' mean "ABC"\n') exp_msg = 'has non-numeric mean "ABC".' - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # 4. Test stdev mg = get_marygold_gml() mg.pop(199) mg.insert(199, ' stdev "ABC"\n') exp_msg = 'has non-numeric stdev "ABC".' - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") # TODO test bsize, mean, stdev more thoroughly # Test that NaN/infinity/zero/negative values work ok (but only with @@ -461,14 +462,14 @@ def test_parse_metacarvel_gml_repeated_node_attrs(): mg = get_marygold_gml() mg.insert(5, ' orientation "REV"\n') exp_msg = "Node NODE_10 has unsupported orientation \"['REV', 'FOW']\"." - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") mg = get_marygold_gml() mg.insert(5, ' length "200"\n') exp_msg = ( "Node NODE_10 has non-positive-integer length \"['200', '100']\"." ) - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") def test_parse_metacarvel_gml_repeated_edge_attrs(): @@ -478,7 +479,7 @@ def test_parse_metacarvel_gml_repeated_edge_attrs(): "Edge ('NODE_7', 'NODE_12') has unsupported orientation " "\"['EB', 'EB']\"." ) - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") mg = get_marygold_gml() mg.insert(166, ' mean "123.45"\n') @@ -486,7 +487,7 @@ def test_parse_metacarvel_gml_repeated_edge_attrs(): "Edge ('NODE_7', 'NODE_12') has non-numeric mean " "\"['123.45', '-200.00']\"." ) - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") mg = get_marygold_gml() mg.insert(166, ' stdev "123.45"\n') @@ -494,7 +495,7 @@ def test_parse_metacarvel_gml_repeated_edge_attrs(): "Edge ('NODE_7', 'NODE_12') has non-numeric stdev " "\"['123.45', 25.1234]\"." ) - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") mg = get_marygold_gml() mg.insert(167, " bsize 15\n") @@ -502,7 +503,7 @@ def test_parse_metacarvel_gml_repeated_edge_attrs(): "Edge ('NODE_7', 'NODE_12') has non-positive-integer bsize " '"[15, 30]".' ) - run_tempfile_test("gml", mg, ValueError, exp_msg, join_char="") + run_tempfile_test("gml", mg, GraphParsingError, exp_msg, join_char="") def test_parse_metacarvel_gml_repeated_edge_source_or_target(): diff --git a/metagenomescope/tests/assembly_graph_parser/test_validate_lastgraph.py b/metagenomescope/tests/assembly_graph_parser/test_validate_lastgraph.py index 7b559947..93e38c9f 100644 --- a/metagenomescope/tests/assembly_graph_parser/test_validate_lastgraph.py +++ b/metagenomescope/tests/assembly_graph_parser/test_validate_lastgraph.py @@ -1,5 +1,6 @@ import pytest from io import StringIO +from metagenomescope.errors import GraphParsingError from metagenomescope.assembly_graph_parser import validate_lastgraph_file @@ -9,9 +10,9 @@ def get_validate_err(glines): bad_lg = StringIO("\n".join(glines)) # Assume that the LastGraph file represented by bad_lg will fail # validation, and return the accompanying error message. - # (If a ValueError *isn't* raised, this'll throw an error saying DID NOT - # RAISE or something.) - with pytest.raises(ValueError) as ei: + # (If a GraphParsingError *isn't* raised, this'll throw an error saying + # DID NOT RAISE or something.) + with pytest.raises(GraphParsingError) as ei: validate_lastgraph_file(bad_lg) return str(ei.value)