From 9884ea35d42effd609332b3ed170ffdb61c4bd3c Mon Sep 17 00:00:00 2001
From: Josiah Seaman
Date: Tue, 20 Aug 2019 17:24:24 +0100
Subject: [PATCH 01/15] #23 WIP: First round of conversions moving
 haplonetwork to Graph.Node. Getters for .specimens, .upstream, .downstream.
 Now comes the harder work of thinking through the algorithm in database
 terms.

---
 .travis.yml | 1 -
 Graph/migrations/0001_initial.py | 3 +-
 Graph/models.py | 91 ++++++++++++++++++++-
 Graph/test.py | 39 ++-------
 Graph/views.py | 31 ++++++++
 HaploBlocker/haplonetwork.py | 132 +++++++------------------------
 HaploBlocker/models.py | 15 ----
 HaploBlocker/tests.py | 70 +++++++---------
 HaploBlocker/views.py | 3 +-
 9 files changed, 183 insertions(+), 202 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 05d839f..ce36042 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,7 +11,6 @@ install:
   - python manage.py migrate

 env:
-  - DJANGO_VERSION=2.2.1
   - DJANGO_SETTINGS_MODULE=vgbrowser.settings

 # Command to run tests, e.g. python setup.py test
diff --git a/Graph/migrations/0001_initial.py b/Graph/migrations/0001_initial.py
index 0df4016..f90df1c 100644
--- a/Graph/migrations/0001_initial.py
+++ b/Graph/migrations/0001_initial.py
@@ -22,8 +22,9 @@ class Migration(migrations.Migration):
         migrations.CreateModel(
             name='Node',
             fields=[
+                ('id', models.AutoField(auto_created=True, default=0, primary_key=True, serialize=False, verbose_name='ID')),
                 ('seq', models.CharField(blank=True, max_length=255)),
-                ('name', models.CharField(max_length=15, primary_key=True, serialize=False)),
+                ('name', models.CharField(max_length=15, serialize=False)),
                 ('graph', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='Graph.GraphGenome')),
             ],
             options={
diff --git a/Graph/models.py b/Graph/models.py
index cbf06ca..95257a9 100644
--- a/Graph/models.py
+++ b/Graph/models.py
@@ -1,4 +1,4 @@
-from typing import List, Iterable
+from typing import List, Iterable, Set
 from django.db import models
 from Utils.models import CustomSaveManager
@@ -64,10 +64,13 @@ def append_node_to_path(self, node_id, strand, path_name) -> None:
             raise ValueError("Provide the id of the node, not", node_id)
         Path.objects.get(name=path_name).append_node(Node.objects.get(name=node_id), strand)

+    def node(self, node_name):
+        return Node.objects.get(name=node_name, graph=self)
+

 class Node(models.Model):
     seq = models.CharField(max_length=255, blank=True)
-    name = models.CharField(primary_key=True, max_length=15)
+    name = models.CharField(max_length=15)
     graph = models.ForeignKey(GraphGenome, on_delete=models.CASCADE)

     class Meta:
@@ -93,6 +96,59 @@ def __hash__(self):
     def to_gfa(self, segment_id: int):
         return '\t'.join(['S', str(segment_id), self.seq])

+    @property
+    def specimens(self):
+        return self.nodetraversal_set
+
+    @property
+    def upstream(self) -> Set[int]:
+        traverses = self.nodetraversal_set.all()  # values_list('node__id', flat=True)
+        # Node.objects.filter(id__in=traverses).values_list('id', flat=True)
+        return set(t.upstream_id() for t in traverses)
+
+    @property
+    def downstream(self) -> Set[int]:
+        traverses = self.nodetraversal_set.all()
+        return set(t.downstream_id() for t in traverses)
+
+    def __repr__(self):
+        return "N%s(%s)" % (str(self.name), self.seq)
+
+    def details(self):
+        return f"""Node{self.name}: {self.seq}
+        upstream: {dict((key, value) for key, value in self.upstream.items())}
+        downstream: {dict((key, value) for key, value in self.downstream.items())}
+        {len(self.specimens)} specimens: {self.specimens}"""
+
+    def is_nothing(self):
+        """Useful in the Node class definition to check for Node.NOTHING"""
+        return self.name == '-1'
+
+    def validate(self):
+        """Returns true if the Node has specimens and does not have any negative
+        transition values, raises an AssertionError otherwise."""
+        if not self.specimens:
+            assert self.specimens, "Specimens are empty" + self.details()
+        for node, weight in self.upstream.items():
+            if not node.is_nothing() and weight < 0:
+                print(self.details())
+                assert weight > -1, node.details()
+
+        for node, weight in self.downstream.items():
+            if not node.is_nothing() and weight < 0:
+                print(self.details())
+                assert weight > -1, node.details()
+        return True
+
+
+# Node.NOTHING is essentially "Not Applicable" when used to track transition rates between nodes.
+# Node.NOTHING is an important concept to HaploBlocker, used to track upstream and downstream
+# transitions to an unknown or untracked state. As neglect_nodes removes minority
+# allele nodes, there will be specimens downstream that "come from" Node.NOTHING, meaning their
+# full history is no longer tracked. Node.NOTHING is a regular exception case for missing data,
+# the ends of chromosomes, and the gaps between haplotype blocks.
+Node.NOTHING = Node(name='-1')
+

 class Path(models.Model):
     """Paths represent the linear order of one particular individual (accession) as its genome
@@ -146,8 +202,6 @@ def to_gfa(self):
         return '\t'.join(['P', self.accession,
                           "+,".join([x.node.name + x.strand for x in self.nodes]) + "+",
                           ",".join(['*' for x in self.nodes])])

-
-
 class NodeTraversal(models.Model):
     """Link from a Path to a Node it is currently traversing. Includes strand"""
     node = models.ForeignKey(Node, db_index=True, on_delete=models.CASCADE)
@@ -176,3 +230,32 @@ def save(self, **kwargs):
         self.order = 0 if not last_traversal else last_traversal.order + 1
         super(NodeTraversal, self).save(**kwargs)

+    def fetch_neighbor(self, target_index):
+        query = NodeTraversal.objects.filter(path=self.path, order=target_index).values_list('node__id', flat=True)
+        if query:
+            return query[0]
+        return -1
+
+    def upstream_id(self):
+        target_index = self.order - 1
+        return self.fetch_neighbor(target_index)
+
+    def downstream_id(self):
+        target_index = self.order + 1
+        return self.fetch_neighbor(target_index)
+
+    def neighbor(self, target_index):
+        try:
+            return NodeTraversal.objects.get(path=self.path, order=target_index)
+        except NodeTraversal.DoesNotExist:
+            return None
+
+    def upstream(self):
+        target_index = self.order - 1
+        return self.neighbor(target_index)
+
+    def downstream(self):
+        target_index = self.order + 1
+        return self.neighbor(target_index)
+
diff --git a/Graph/test.py b/Graph/test.py
index 9a44150..8dec1d3 100644
--- a/Graph/test.py
+++ b/Graph/test.py
@@ -1,15 +1,14 @@
 import unittest
-from datetime import datetime
 from django.test import TestCase
-from typing import List
 import os
 from os.path import join
 from Graph.gfa import GFA
-from Graph.models import Node, GraphGenome, Path
+from Graph.models import Node, Path
 from Graph.sort import DAGify

 # Define the working directory
+from Graph.views import build_from_test_slices
 from vgbrowser.settings import BASE_DIR
 PATH_TO_TEST_DATA = join(BASE_DIR, "test_data")
 location_of_xg = join(BASE_DIR, "test_data","xg")
@@ -19,34 +18,6 @@
 x, y, z = 'x', 'y', 'z'


def build_from_test_slices(cmd: List):
    """This factory uses test data shorthand for linear graph slices to build
    a database GraphGenome with all the necessary Paths and Nodes. Path order is populated in the order
    that they are mentioned in the slices.
Currently, this is + only and does not support non-linear - orderings. Use Path.append_node() to build non-linear graphs.""" - if isinstance(cmd, str): - cmd = eval(cmd) - # preemptively grab all the path names from every odd list entry - graph = GraphGenome.objects.get_or_create(name='test_data')[0] # + str(datetime.now()) - node_count = 0 - paths = {key for sl in cmd for i in range(0, len(sl), 2) for key in sl[i + 1]} - path_objs = {} - for path_name in paths: - path_objs[path_name] = Path.objects.get_or_create(graph=graph, accession=path_name)[0] - for sl in cmd: - try: - for i in range(0, len(sl), 2): - paths_mentioned = [path_objs[key] for key in sl[i + 1]] - node, is_new = Node.objects.get_or_create(seq=sl[i], name=graph.name + str(node_count), graph=graph) - node_count += 1 - for path in paths_mentioned: - path.append_node(node, '+') - except IndexError: - raise IndexError("Expecting two terms: ", sl[0]) # sl[i:i+2]) - - return graph - - class GraphTest(TestCase): """Constructing a node with an existing Path object will modify that Path object (doubly linked) which means care must be taken when constructing Graphs. From factory_input we have an example of @@ -214,9 +185,9 @@ def test_load_gfa_via_xg(self): graph = graph2.to_graph() x,y,z = 'x','y','z' self.assertEqual(graph, build_from_test_slices([['CAAATAAG', {x, y, z}], ['A', {y, z}, 'G', {x}], - ['C', {x, y, z}], ['TTG', {x, y, z}], - ['A', {z}, 'G', {x, y}], ['AAATTTTCTGGAGTTCTAT', {x, y, z}], ['T', {x, y, z}], - ['ATAT', {x, y, z}], ['T', {x, y, z}], ['CCAACTCTCTG', {x, y, z}]])) + ['C', {x, y, z}], ['TTG', {x, y, z}], + ['A', {z}, 'G', {x, y}], ['AAATTTTCTGGAGTTCTAT', {x, y, z}], ['T', {x, y, z}], + ['ATAT', {x, y, z}], ['T', {x, y, z}], ['CCAACTCTCTG', {x, y, z}]])) @staticmethod def is_different(gfa1, gfa2): diff --git a/Graph/views.py b/Graph/views.py index 71c2249..d2ece86 100644 --- a/Graph/views.py +++ b/Graph/views.py @@ -1,3 +1,34 @@ +from typing import List + from django.shortcuts import render # View contains the endpoints on the server for the browser to fetch data +from Graph.models import GraphGenome, Path, Node + + +def build_from_test_slices(cmd: List, graph_name='test_data') -> GraphGenome: + """This factory uses test data shorthand for linear graph slices to build + a database GraphGenome with all the necessary Paths and Nodes. Path order populated in the order + that they are mentioned in the slices. Currently, this is + only and does not support non-linear + orderings. 
Use Path.append_node() to build non-linear graphs.""" + if isinstance(cmd, str): + cmd = eval(cmd) + # preemptively grab all the path names from every odd list entry + graph = GraphGenome.objects.get_or_create(name=graph_name)[0] # + str(datetime.now()) + node_count = 0 + paths = {key for sl in cmd for i in range(0, len(sl), 2) for key in sl[i + 1]} + path_objs = {} + for path_name in paths: + path_objs[path_name] = Path.objects.get_or_create(graph=graph, accession=path_name)[0] + for sl in cmd: + try: + for i in range(0, len(sl), 2): + paths_mentioned = [path_objs[key] for key in sl[i + 1]] + node, is_new = Node.objects.get_or_create(seq=sl[i], name=graph.name + str(node_count), graph=graph) + node_count += 1 + for path in paths_mentioned: + path.append_node(node, '+') + except IndexError: + raise IndexError("Expecting two terms: ", sl[0]) # sl[i:i+2]) + + return graph \ No newline at end of file diff --git a/HaploBlocker/haplonetwork.py b/HaploBlocker/haplonetwork.py index 7f062f6..15355aa 100644 --- a/HaploBlocker/haplonetwork.py +++ b/HaploBlocker/haplonetwork.py @@ -7,6 +7,7 @@ import numpy as np from collections import defaultdict from copy import copy +from Graph.models import Node BLOCK_SIZE = 20 FILTER_THRESHOLD = 4 @@ -15,80 +16,6 @@ def first(iterable): return next(iter(iterable)) - -class Node: - """This definition of Node is designed to be equivalent to the R code HaploBlocker Nodes. - This will be combined with the VG definition of Graph.models.Node and extended to support the - concept of summarization layers. - - Currently, It has no concept of strand, CNV. It uses an absolute Start and End position, but those - are not referenced very often. - Critically, it needs Node.NOTHING which is used frequently to mark specimens whose - upstream or downstream nodes have been pruned. The usage of Node.NOTHING is equivalent to - our sequence mismatch penalties. In both cases information is being discarded for the - purpose of summarization.""" - def __init__(self, ident, start, end, specimens=None, upstream=None, downstream=None): - self.ident = ident - self.start = start # bp, arbitrary coordinates, used for debugging - self.end = end # bp, arbitrary coordinates, used for debugging - self.specimens = set() if specimens is None else specimens - self.upstream = defaultdict(lambda: 0) if not upstream else upstream - # E.g. {Node.NOTHING:501, Node: 38, Node: 201, Node: 3} - self.downstream = defaultdict(lambda: 0) if not downstream else downstream - # E.g. 
{Node: 38, Node: 201, Node: 3} - assert not self.is_nothing() or (self.end is None and self.start is None), self.details() - - def __len__(self): - return len(self.specimens) - - def __repr__(self): - return "N%s(%s, %s)" % (str(self.ident), str(self.start), str(self.end)) - - def __hash__(self): - return hash(self.ident + 1) * hash(self.start) * hash(self.end) - - def details(self): - return f"""Node{self.ident}: {self.start} - {self.end} - upstream: {dict((key, value) for key, value in self.upstream.items())} - downstream: {dict((key, value) for key, value in self.downstream.items())} - {len(self.specimens)} specimens: {self.specimens}""" - - def is_nothing(self): - """Useful in Node class definition to check for Node.NOTHING""" - return self.ident == -1 and self.start is None and self.end is None - - def validate(self): - """Returns true if the Node has specimens and does not have any negative - transition values, raises an AssertionError otherwise.""" - if not self.specimens: - assert self.specimens, "Specimens are empty" + self.details() - for node, weight in self.upstream.items(): - if not node.is_nothing() and weight < 0: - print(self.details()) - assert weight > -1, node.details() - - for node, weight in self.downstream.items(): - if not node.is_nothing() and weight < 0: - print(self.details()) - assert weight > -1, node.details() - return True - - def is_beginning(self) -> bool: - return self.start == 0 - - def is_end(self) -> bool: - return len(self.downstream) == 1 and first(self.downstream).is_nothing() - - -# Node.NOTHING is essential "Not Applicable" when used to track transition rates between nodes. -# Node.NOTHING is an important concept to Haploblocker, used to track upstream and downstream -# that transitions to an unknown or untracked state. As neglect_nodes removes minority -# allele nodes, there will be specimens downstream that "come from" Node.NOTHING, meaning their -# full history is no longer tracked. Node.NOTHING is a regular exception case for missing data, -# the ends of chromosomes, and the gaps between haplotype blocks. -Node.NOTHING = Node(-1, None, None) - - def read_data(file_path): """Reads one of Torsten's SNP files. In the file, Individuals are columns, not rows. This method returns both loci (all 501 individual alleles at one loci) and the Transposed @@ -106,7 +33,7 @@ def signature(individual, start_locus): return tuple(individual[start_locus: start_locus + BLOCK_SIZE]) -def get_unique_signatures(individuals, start_locus): +def get_unique_signatures(individuals, start_locus, current_graph): """A signature is a series of BLOCK_SIZE SNPs inside of a locus. We want to know how many unique signatures are present inside of one locus. A Node is created for each unique signature found. 
@@ -117,16 +44,16 @@ def get_unique_signatures(individuals, start_locus): for individual in individuals: sig = signature(individual, start_locus) if sig not in unique_blocks: - unique_blocks[sig] = Node(len(unique_blocks), - start_locus // BLOCK_SIZE, - start_locus // BLOCK_SIZE) # Inclusive end + unique_blocks[sig] = Node(name=f'{len(unique_blocks)}:{start_locus // BLOCK_SIZE}-{start_locus // BLOCK_SIZE}', + seq=''.join(str(x) for x in sig), + graph=current_graph) return unique_blocks -def get_all_signatures(alleles, individuals): +def get_all_signatures(alleles, individuals, current_graph): unique_signatures = [] for locus_start in range(0, len(alleles) - BLOCK_SIZE, BLOCK_SIZE): # discards remainder - sig = get_unique_signatures(individuals, locus_start) + sig = get_unique_signatures(individuals, locus_start, current_graph) unique_signatures.append(sig) return unique_signatures @@ -160,17 +87,17 @@ def populate_transitions(simplified_individuals): node.specimens.add(i) if x + 1 < len(indiv): node.downstream[indiv[x + 1]] += 1 - else: - node.downstream[Node.NOTHING] += 1 + # else: + # node.downstream[Node.NOTHING] += 1 if x - 1 >= 0: node.upstream[indiv[x - 1]] += 1 - else: - node.upstream[Node.NOTHING] += 1 + # else: + # node.upstream[Node.NOTHING] += 1 def update_transition(node): """Only transition values for nodes already listed in upstream and downstream will be calculated.""" - if node is not Node.NOTHING: + if not node.is_nothing(): update_stream_transitions(node, 'upstream') update_stream_transitions(node, 'downstream') @@ -185,7 +112,7 @@ def update_stream_transitions(node, stream): running = g(node, stream).keys() setattr(node, stream, defaultdict(lambda: 0)) for n in running: - if n is not Node.NOTHING: + if not n.is_nothing(): g(node, stream)[n] = len(node.specimens.intersection(n.specimens)) accounted_upstream = sum(g(node, stream).values()) - g(node, stream)[Node.NOTHING] g(node, stream)[Node.NOTHING] = len(node.specimens) - accounted_upstream @@ -213,7 +140,7 @@ def simple_merge(full_graph): next_node.start = node.start # prepare to delete node by removing references for parent in node.upstream.keys(): - if parent is not Node.NOTHING: + if not parent.is_nothing(): count = parent.downstream[node] del parent.downstream[node] # updating pointer parent.downstream[next_node] = count @@ -254,31 +181,28 @@ def split_one_group(prev_node, anchor, next_node): Comment: That is actually the case we want to split up to obtain longer blocks later Extension of full windows will take care of potential loss of information later""" my_specimens = copy(anchor.specimens) # important to copy or side effects occur - if prev_node is not Node.NOTHING: # normal case + if not prev_node.is_nothing(): # normal case my_specimens = my_specimens.intersection(prev_node.specimens) - if next_node is not Node.NOTHING: # normal case + if not next_node.is_nothing(): # normal case my_specimens = my_specimens.intersection(next_node.specimens) - if prev_node is Node.NOTHING and next_node is Node.NOTHING: # exceptional: both are nothing node + if prev_node.is_nothing() and next_node.is_nothing(): # exceptional: both are nothing node my_specimens = copy(anchor.specimens) # removing all specimens that transition to nothing for n in anchor.downstream.keys(): - if n is Node.NOTHING: # remove dead leads + if n.is_nothing(): # remove dead leads my_specimens -= n.specimens for n in anchor.upstream.keys(): - if n is Node.NOTHING: # remove dead leads + if n.is_nothing(): # remove dead leads my_specimens -= n.specimens - 
my_start, my_end = prev_node.start, next_node.end my_upstream, my_downstream = copy(prev_node.upstream), copy(next_node.downstream) - if Node.NOTHING is prev_node: # Rare case - my_start = anchor.start + if prev_node.is_nothing(): # Rare case my_upstream = copy(anchor.upstream) - if Node.NOTHING is next_node: # Rare case - my_end = anchor.end + if next_node.is_nothing(): # Rare case my_downstream = copy(anchor.downstream) # TODO: what about case where more content is joining downstream? - new = Node(777, my_start, my_end, my_specimens, my_upstream, my_downstream) + new = Node(777, my_specimens, my_upstream, my_downstream) # Update Specimens in prev_node, anchor, next_node anchor.specimens -= new.specimens @@ -298,10 +222,10 @@ def update_neighbor_pointers(new_node): """Ensure that my new upstream pointers have matching downstream pointers in neighbors, and vice versa. This does not set correct transition rates, it only makes the nodes connected.""" for n in new_node.upstream.keys(): - if n is not Node.NOTHING: + if not n.is_nothing(): n.downstream[new_node] = 1 for n in new_node.downstream.keys(): - if n is not Node.NOTHING: + if not n.is_nothing(): n.upstream[new_node] = 1 @@ -324,15 +248,15 @@ def split_groups(all_nodes: List[Node]): for down in tuple(node.downstream.keys()): set1 = copy(up.specimens) set2 = copy(down.specimens) - if up is Node.NOTHING: + if up.is_nothing(): set1 = copy(node.specimens) for index in tuple(node.upstream.keys()): - if index is not Node.NOTHING: + if not index.is_nothing(): set1.difference_update(index.specimens) - if down is Node.NOTHING: + if down.is_nothing(): set2 = copy(node.specimens) for index in tuple(node.downstream.keys()): - if index is not Node.NOTHING: + if not index.is_nothing(): set2.difference_update(index.specimens) if set1 == set2 and len(set1) > 0: diff --git a/HaploBlocker/models.py b/HaploBlocker/models.py index 3796ddd..c8945cd 100644 --- a/HaploBlocker/models.py +++ b/HaploBlocker/models.py @@ -3,18 +3,3 @@ # Create your models here. from django.db import models - -class Node(models.Model): - seq = models.CharField(max_length=1000) - - -class Path(models.Model): - accession = models.CharField(unique=True, max_length=1000) - - -class Edge(models.Model): - node = models.ForeignKey(Node, on_delete=models.CASCADE) - path = models.ForeignKey(Path, on_delete=models.CASCADE) - - class Meta: - ordering = ['path','id'] \ No newline at end of file diff --git a/HaploBlocker/tests.py b/HaploBlocker/tests.py index 500785c..a8eb46e 100644 --- a/HaploBlocker/tests.py +++ b/HaploBlocker/tests.py @@ -1,4 +1,7 @@ from django.test import TestCase + +import Graph.views +from Graph.models import GraphGenome from vgbrowser.settings import BASE_DIR import unittest import os @@ -27,12 +30,14 @@ def setUpClass(self) -> None: print(os.getcwd()) self.alleles, self.individuals = read_data(os.path.join(BASE_DIR, "test_data/KE_chromo10.txt")) - def create_nodes(self): + def create_nodes(self, graph_name): """Tests that need a fresh graph must call create_nodes() FIRST! Graph summarization works by side effecting Node objects. Tests can not run independently with order dependent side effects. This method is slow, so don't use it unless you - need it.""" - self.unique_signatures = get_all_signatures(self.alleles, self.individuals) + need it. 
+ :param graph_name: """ + graph = GraphGenome.objects.create(name=graph_name) + self.unique_signatures = get_all_signatures(self.alleles, self.individuals, graph) self.simplified_individuals = build_individuals(self.individuals, self.unique_signatures) # G = build_graph(simplified_individuals) populate_transitions(self.simplified_individuals) @@ -49,12 +54,13 @@ def test_build_individuals(self): def internal_build_individuals(self, alleles, individuals): - unique_signatures = get_all_signatures(alleles, individuals) - assert repr(unique_signatures[ - 21]) == '{(0, 0, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2): N0(21, 21), (0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2): N1(21, 21), (0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N2(21, 21), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N3(21, 21), (0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N4(21, 21), (0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2, 0, 0, 0, 2): N5(21, 21), (0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N6(21, 21), (0, 0, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2): N7(21, 21)}' + graph = GraphGenome.objects.create(name='internal_build_individuals') + unique_signatures = get_all_signatures(alleles, individuals, graph) + peek = repr(list(unique_signatures[21].values())) + assert peek == '[N0:21-21(00002202022222220202), N1:21-21(00202202022222220202), N2:21-21(00022200000000000000), N3:21-21(00000000000000000000), N4:21-21(00002200000000000000), N5:21-21(00022202022220020002), N6:21-21(02000000000000000000), N7:21-21(00002202222220020022)]', peek simplified_individuals = build_individuals(individuals, unique_signatures) - assert repr(simplified_individuals[500][ - :100]) == '[N2(0, 0), N2(1, 1), N2(2, 2), N2(3, 3), N2(4, 4), N2(5, 5), N3(6, 6), N3(7, 7), N3(8, 8), N2(9, 9), N0(10, 10), N1(11, 11), N2(12, 12), N2(13, 13), N2(14, 14), N2(15, 15), N3(16, 16), N3(17, 17), N4(18, 18), N3(19, 19), N5(20, 20), N3(21, 21), N3(22, 22), N10(23, 23), N4(24, 24), N3(25, 25), N4(26, 26), N3(27, 27), N1(28, 28), N1(29, 29), N4(30, 30), N3(31, 31), N21(32, 32), N1(33, 33), N1(34, 34), N1(35, 35), N1(36, 36), N1(37, 37), N1(38, 38), N1(39, 39), N1(40, 40), N1(41, 41), N1(42, 42), N1(43, 43), N1(44, 44), N1(45, 45), N1(46, 46), N1(47, 47), N1(48, 48), N1(49, 49), N1(50, 50), N1(51, 51), N1(52, 52), N1(53, 53), N1(54, 54), N1(55, 55), N1(56, 56), N1(57, 57), N1(58, 58), N1(59, 59), N1(60, 60), N1(61, 61), N1(62, 62), N1(63, 63), N1(64, 64), N1(65, 65), N1(66, 66), N1(67, 67), N1(68, 68), N1(69, 69), N1(70, 70), N1(71, 71), N1(72, 72), N1(73, 73), N1(74, 74), N1(75, 75), N1(76, 76), N1(77, 77), N0(78, 78), N0(79, 79), N1(80, 80), N1(81, 81), N1(82, 82), N1(83, 83), N1(84, 84), N1(85, 85), N1(86, 86), N1(87, 87), N1(88, 88), N1(89, 89), N1(90, 90), N1(91, 91), N1(92, 92), N1(93, 93), N1(94, 94), N1(95, 95), N1(96, 96), N1(97, 97), N0(98, 98), N0(99, 99)]' + peek = repr(simplified_individuals[500][:100]) + assert peek == '[N2:0-0(00000000000000000000), N2:1-1(00000000000000000000), N2:2-2(00000000000000000000), N2:3-3(00000000000000000000), N2:4-4(00000000000000000000), N2:5-5(00000000000000000000), N3:6-6(00000000000000000000), N3:7-7(00000000000000000000), N3:8-8(00000000000000000000), N2:9-9(00000000000000000000), N0:10-10(00000000000000000000), N1:11-11(00000000000000000000), N2:12-12(00000000000000000000), N2:13-13(00000000000000000000), N2:14-14(00000000000000000000), N2:15-15(00000000000000000000), N3:16-16(00000000000000000000), 
N3:17-17(00000000000000000000), N4:18-18(00000000000000000000), N3:19-19(00000000000000000000), N5:20-20(00000000000000000000), N3:21-21(00000000000000000000), N3:22-22(00000000000000000000), N10:23-23(00200000000000000000), N4:24-24(00002200222220002000), N3:25-25(02000222220002020222), N4:26-26(20022000002002220002), N3:27-27(22222202020222220000), N1:28-28(00000000000000000000), N1:29-29(00000000000000000022), N4:30-30(00002222222000002200), N3:31-31(00022222202000000000), N21:32-32(00000020202200022020), N1:33-33(02202220022020222000), N1:34-34(00020002000202222002), N1:35-35(22220002220022200022), N1:36-36(22222200000000000000), N1:37-37(00202002222220000200), N1:38-38(00000200000202022200), N1:39-39(02202000202202220000), N1:40-40(00020222200020000020), N1:41-41(20220020022200022200), N1:42-42(00000000000000000000), N1:43-43(00000000000000000000), N1:44-44(00000000000000000000), N1:45-45(00000000000000000000), N1:46-46(00000002220220020020), N1:47-47(00202220222220222202), N1:48-48(00000000000000000002), N1:49-49(20002200000002220022), N1:50-50(22020002002020202022), N1:51-51(02202222220222202000), N1:52-52(20000020000000000000), N1:53-53(00000000000000000000), N1:54-54(00000000000000000000), N1:55-55(00220220000200000220), N1:56-56(20000000202022022020), N1:57-57(20222022022202222220), N1:58-58(22022202222222020200), N1:59-59(22202200202220202220), N1:60-60(22020022220200022022), N1:61-61(20202220000220000220), N1:62-62(00022002000000000000), N1:63-63(00000220000000000000), N1:64-64(00000000000220200000), N1:65-65(00022020200000020022), N1:66-66(20020222222020200020), N1:67-67(00000000000000202222), N1:68-68(22222222000202222202), N1:69-69(22022222020020000022), N1:70-70(00002002220022222200), N1:71-71(22002020020202000000), N1:72-72(00022202000202220020), N1:73-73(22000000000000200020), N1:74-74(22220222220200202202), N1:75-75(00022202222200000000), N1:76-76(00000220220200200022), N1:77-77(02200202020020200000), N0:78-78(00002000000000000000), N0:79-79(00000000000000000000), N1:80-80(00000000000022220000), N1:81-81(00000000000000000000), N1:82-82(00022220200202202202), N1:83-83(20202222200202202202), N1:84-84(00000020000000000000), N1:85-85(00222022020000000002), N1:86-86(22020222020222222000), N1:87-87(00022222002020222022), N1:88-88(00002222000000000200), N1:89-89(00000000000000220022), N1:90-90(22020202200020222220), N1:91-91(00002000002220002222), N1:92-92(22200000000000000000), N1:93-93(00000000000000000000), N1:94-94(00202022200202222222), N1:95-95(22222202202020222222), N1:96-96(00222220200202222020), N1:97-97(22002202220222222022), N0:98-98(20222222222222020220), N0:99-99(20222222220222222002)]', peek assert len(simplified_individuals) == 501 and len(simplified_individuals[60]) == 1638 @@ -68,7 +74,7 @@ def test_get_unique_signatures(self): @unittest.skip def test_no_duplicate_nodes(self): - self.create_nodes() + self.create_nodes('test') unique_nodes = set() duplicates_found = 0 for locus in self.simplified_individuals: @@ -103,39 +109,21 @@ def test_split_one_group(self): ['C', {a, b, d}, '', {c}], # [3] repeated from [1] SNP """ nodes = [ - Node(91, 1, 1, {1, 2, 4}), - Node(92, 1, 1, {3}), - Node(93, 2, 2, {1, 2, 3, 4}), # [2] anchor - Node(94, 3, 3, {1, 2, 4}), - Node(95, 3, 3, {3}), - # additional bracketing to anchor - Node(90, 0, 0, {1, 2, 3, 4}), - Node(96, 4, 4, {1, 2, 3, 4}) + ['90', {1, 2, 3, 4}], #[5] + ['91', {1, 2, 4}, '92', {3}], + ['93', {1, 2, 3, 4}], # [2] anchor + ['94', {1, 2, 4}, '95', {3}], #[3] [4] + ['96', {1, 2, 3, 4}] #[6] ] - # connections - 
nodes[5].downstream[nodes[0]] = 3 - nodes[5].downstream[nodes[1]] = 1 - nodes[0].downstream[nodes[2]] = 3 - nodes[1].downstream[nodes[2]] = 1 - nodes[2].downstream[nodes[3]] = 3 - nodes[2].downstream[nodes[4]] = 1 - nodes[3].downstream[nodes[6]] = 3 - nodes[4].downstream[nodes[6]] = 1 - nodes[0].upstream[nodes[5]] = 3 - nodes[1].upstream[nodes[5]] = 1 - nodes[2].upstream[nodes[0]] = 3 - nodes[2].upstream[nodes[1]] = 1 - nodes[3].upstream[nodes[2]] = 3 - nodes[4].upstream[nodes[2]] = 1 - nodes[6].upstream[nodes[3]] = 3 - nodes[6].upstream[nodes[4]] = 1 - new_node = split_one_group(nodes[0], nodes[2], nodes[3]) # no mentions of minorities [1] or [4] + g = Graph.views.build_from_test_slices(nodes, '9') + first, anchor, third = g.node('91'), g.node('93'), g.node('94') + new_node = split_one_group(first, anchor, third) # no mentions of minorities [1] or [4] print(new_node.details()) - assert new_node in nodes[5].downstream and nodes[1] in nodes[5].downstream - assert nodes[0] not in nodes[5].downstream - assert nodes[5] in new_node.upstream and nodes[6] in new_node.downstream - assert new_node in nodes[6].upstream and nodes[4] in nodes[6].upstream - assert nodes[3] not in nodes[6].upstream + assert new_node in g.node('90').downstream and g.node('92') in g.node('90').downstream + assert g.node('91') not in g.node('90').downstream + assert g.node('90') in new_node.upstream and g.node('96') in new_node.downstream + assert new_node in g.node('96').upstream and g.node('95') in g.node('96').upstream + assert g.node('94') not in g.node('96').upstream def _test_split_groups(self, all_nodes): summary3 = split_groups(all_nodes) @@ -144,7 +132,7 @@ def _test_split_groups(self, all_nodes): def test_workflow(self): - self.create_nodes() + self.create_nodes('test') all_nodes = [node for window in self.unique_signatures for node in window.values()] # think about referencing and deletion summary1 = self._test_simple_merge(all_nodes) summary2 = self._test_neglect_nodes(summary1) diff --git a/HaploBlocker/views.py b/HaploBlocker/views.py index a283b04..a47de89 100644 --- a/HaploBlocker/views.py +++ b/HaploBlocker/views.py @@ -2,10 +2,9 @@ from django.http import HttpResponse -from HaploBlocker.models import Edge def index(request): - return HttpResponse("Index of Edges.\n" + str(Edge.objects.all())) + return HttpResponse() #"Index of Edges.\n" + str(Edge.objects.all())) # Create your views here. From a2bbcefdefcee1197d59ad846663e229556d4145 Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Thu, 22 Aug 2019 15:23:42 +0100 Subject: [PATCH 02/15] #23 #3 #4 MAJOR: Added Zoom layer concept by giving Path.zoom so Paths can visit Nodes from any zoom layer. There is one path per accession per zoom layer. Paths and nodes contain links to their parent/child summaries. --- .travis.yml | 2 +- Graph/gfa.py | 2 +- Graph/migrations/0003_add_zoom_levels.py | 49 ++++++++++++++++++++++++ Graph/models.py | 19 ++++++++- Graph/test.py | 36 +++++++++++------ Graph/utils.py | 33 +++++++++++++++- Graph/views.py | 31 --------------- HaploBlocker/migrations/0001_initial.py | 40 ------------------- HaploBlocker/tests.py | 3 +- 9 files changed, 127 insertions(+), 88 deletions(-) create mode 100644 Graph/migrations/0003_add_zoom_levels.py delete mode 100644 HaploBlocker/migrations/0001_initial.py diff --git a/.travis.yml b/.travis.yml index ce36042..42ba70b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,7 @@ python: # Command to install dependencies, e.g. 
pip install -r requirements_dev.txt --use-mirrors install: - pip install -r requirements_dev.txt - - python manage.py migrate + - python manage.py migrate Graph env: - DJANGO_SETTINGS_MODULE=vgbrowser.settings diff --git a/Graph/gfa.py b/Graph/gfa.py index 671d1a9..f03c926 100644 --- a/Graph/gfa.py +++ b/Graph/gfa.py @@ -6,7 +6,7 @@ import io import os import tempfile -from Graph.models import * +from Graph.models import Node, Path, GraphGenome def pairwise(iterable): diff --git a/Graph/migrations/0003_add_zoom_levels.py b/Graph/migrations/0003_add_zoom_levels.py new file mode 100644 index 0000000..46a8a15 --- /dev/null +++ b/Graph/migrations/0003_add_zoom_levels.py @@ -0,0 +1,49 @@ +# Generated by Django 2.2.1 on 2019-08-22 13:40 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('Graph', '0002_Path_unique_together'), + ] + + operations = [ + migrations.AddField( + model_name='node', + name='summarized_by', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='children', to='Graph.Node'), + ), + migrations.AddField( + model_name='path', + name='zoom', + field=models.IntegerField(default=0), + preserve_default=False, + ), + migrations.AlterField( + model_name='node', + name='id', + field=models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'), + ), + migrations.AlterField( + model_name='node', + name='name', + field=models.CharField(max_length=15), + ), + migrations.AlterField( + model_name='nodetraversal', + name='order', + field=models.IntegerField(help_text='Defines the order a path lists traversals. The scale of order is not preserved between zoom levels.'), + ), + migrations.AlterUniqueTogether( + name='path', + unique_together={('graph', 'accession', 'zoom')}, + ), + migrations.AddField( + model_name='path', + name='summarized_by', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='summary_child', to='Graph.Path'), + ), + ] diff --git a/Graph/models.py b/Graph/models.py index 95257a9..70ae0b6 100644 --- a/Graph/models.py +++ b/Graph/models.py @@ -16,6 +16,8 @@ class NodeMissingError(ValueError): class GraphGenome(models.Model): name = models.CharField(max_length=1000) + #zoom = models.IntegerField() Zoom level is defined in Paths only. + # Nodes can be shared between zoom levels. @property def paths(self): @@ -69,9 +71,16 @@ def node(self, node_name): class Node(models.Model): + """Nodes are the fundamental content carriers for sequence. A Path + can traverse nodes in any order, on both + and - strands. + Nodes can be utilized by Paths in multiple zoom levels. 
+ Only first order Nodes (zoom=0) have sequences, but all have names which can be used + to fetch the node from GraphGenome.node().""" seq = models.CharField(max_length=255, blank=True) name = models.CharField(max_length=15) graph = models.ForeignKey(GraphGenome, on_delete=models.CASCADE) + summarized_by = models.ForeignKey('Node', null=True, blank=True, on_delete=models.SET_NULL, + related_name='children') class Meta: unique_together = ['graph', 'name'] @@ -157,9 +166,13 @@ class Path(models.Model): them to Nodes to link together.""" accession = models.CharField(max_length=1000) # one path per accession graph = models.ForeignKey(GraphGenome, on_delete=models.CASCADE) + zoom = models.IntegerField(default=0) # Zoom level starts at 0 for nucleotide level and moves up + # Nodes can be shared between zoom levels. + summarized_by = models.ForeignKey('Path', related_name='summary_child', + blank=True, null=True, on_delete=models.SET_NULL) class Meta: - unique_together = ['graph', 'accession'] + unique_together = ['graph', 'accession', 'zoom'] def __getitem__(self, path_index): return self.nodes[path_index] @@ -207,7 +220,9 @@ class NodeTraversal(models.Model): node = models.ForeignKey(Node, db_index=True, on_delete=models.CASCADE) path = models.ForeignKey(Path, db_index=True, on_delete=models.CASCADE, help_text='') strand = models.CharField(choices=[('+', '+'),('-', '-')], default='+', max_length=1) - order = models.IntegerField(help_text='Defines the order a path lists traversals') # set automatically + # order is set automatically in the CustomSaveManager + order = models.IntegerField(help_text='Defines the order a path lists traversals. ' + 'The scale of order is not preserved between zoom levels.') objects = CustomSaveManager() diff --git a/Graph/test.py b/Graph/test.py index 8dec1d3..e0e794e 100644 --- a/Graph/test.py +++ b/Graph/test.py @@ -4,11 +4,11 @@ import os from os.path import join from Graph.gfa import GFA -from Graph.models import Node, Path +from Graph.models import Node, Path, NodeTraversal from Graph.sort import DAGify # Define the working directory -from Graph.views import build_from_test_slices +from Graph.utils import build_graph_from_slices from vgbrowser.settings import BASE_DIR PATH_TO_TEST_DATA = join(BASE_DIR, "test_data") location_of_xg = join(BASE_DIR, "test_data","xg") @@ -36,23 +36,37 @@ def test_equalities(self): def test_graph_factory(self): original_test = [['ACGT', {a, b, c, d}], - ['C', {a, b, d}, 'T', {c}], # SNP + ['C', {a, b, d}, 'T', {c}], # SNP ['GGA', {a, b, c, d}], # anchor - ['C', {a, b, d}], # [3] repeated from [1] SNP + ['C', {a, b, d}], # [3] repeated from [1] SNP ['AGTACG', {a, b, c}, 'CGTACT', {d}], # [4] different membership from [3] - ['TTG', {a, b, c, d}], # [5] anchor + ['TTG', {a, b, c, d}], # [5] anchor ['A', {a, b}, 'C', {d, e}, 'T', {c}], # [6] third allele ['GG', {a, b}, 'TT', {c, d}], # [7] equal size nodes ['C', {a, b, c, e}, 'T', {d}], # [8] path slip ['C', {a, b, e}, 'T', {c, d}], # [9] path slip ['C', {a, b, c}, 'T', {d}], # [10]path slip ['TATA', {a, b, c, d}]] # [11] anchor - g1, g2 = build_from_test_slices(original_test), build_from_test_slices(original_test) + g1, g2 = build_graph_from_slices(original_test), build_graph_from_slices(original_test) assert g1 == g2, \ ('\n' + repr(g1) + '\n' + repr(g2)) g_from_GFA = self.test_example_graph() # comes from matching assert g1 == g_from_GFA, repr(g1) + '\n' + repr(g_from_GFA) - + # TODO: stricter equality checks: currently just counting nodes + + def test_summary_storage(self): + graph = 
self.test_example_graph() + parent = Path.objects.get(graph=graph, accession='a', zoom=0) + path = Path.objects.create(graph=graph, accession='a', zoom=1) + new_node = Node.objects.create(seq='ACGTCGGA', name='2*2', graph=graph) + Node.objects.filter(name__in=['1','2','4']).update(summarized_by=new_node) + assert new_node.children.count() == 3 + for node in Node.objects.filter(name__in=['1','2','4']): + assert node.summarized_by_id == new_node.id + for i, node_name in enumerate(['2*2', '5', '6']): + current = graph.node(node_name) + NodeTraversal(node=current, path=path, strand='+', order=i).save() + assert NodeTraversal.objects.get(order=0, path=path).downstream().downstream().node.name == '6' @unittest.skip # DAGify has not been converted to databases yet. class DAGifyTest(TestCase): @@ -184,10 +198,10 @@ def test_load_gfa_via_xg(self): graph2 = GFA.load_from_xg(join(PATH_TO_TEST_DATA, "test.xg"), location_of_xg) graph = graph2.to_graph() x,y,z = 'x','y','z' - self.assertEqual(graph, build_from_test_slices([['CAAATAAG', {x, y, z}], ['A', {y, z}, 'G', {x}], - ['C', {x, y, z}], ['TTG', {x, y, z}], - ['A', {z}, 'G', {x, y}], ['AAATTTTCTGGAGTTCTAT', {x, y, z}], ['T', {x, y, z}], - ['ATAT', {x, y, z}], ['T', {x, y, z}], ['CCAACTCTCTG', {x, y, z}]])) + self.assertEqual(graph, build_graph_from_slices([['CAAATAAG', {x, y, z}], ['A', {y, z}, 'G', {x}], + ['C', {x, y, z}], ['TTG', {x, y, z}], + ['A', {z}, 'G', {x, y}], ['AAATTTTCTGGAGTTCTAT', {x, y, z}], ['T', {x, y, z}], + ['ATAT', {x, y, z}], ['T', {x, y, z}], ['CCAACTCTCTG', {x, y, z}]])) @staticmethod def is_different(gfa1, gfa2): diff --git a/Graph/utils.py b/Graph/utils.py index df3e172..d750c60 100644 --- a/Graph/utils.py +++ b/Graph/utils.py @@ -1,4 +1,7 @@ from collections import defaultdict +from typing import List + +from Graph.models import GraphGenome, Path, Node class keydefaultdict(defaultdict): @@ -7,4 +10,32 @@ def __missing__(self, key): raise KeyError( key ) else: ret = self[key] = self.default_factory(key) - return ret \ No newline at end of file + return ret + + +def build_graph_from_slices(cmd: List, graph_name='test_data') -> GraphGenome: + """This factory uses test data shorthand for linear graph slices to build + a database GraphGenome with all the necessary Paths and Nodes. Path order populated in the order + that they are mentioned in the slices. Currently, this is + only and does not support non-linear + orderings. 
Use Path.append_node() to build non-linear graphs.""" + if isinstance(cmd, str): + cmd = eval(cmd) + # preemptively grab all the path names from every odd list entry + graph = GraphGenome.objects.get_or_create(name=graph_name)[0] # + str(datetime.now()) + node_count = 0 + paths = {key for sl in cmd for i in range(0, len(sl), 2) for key in sl[i + 1]} + path_objs = {} + for path_name in paths: + path_objs[path_name] = Path.objects.get_or_create(graph=graph, accession=path_name, zoom=0)[0] + for sl in cmd: + try: + for i in range(0, len(sl), 2): + paths_mentioned = [path_objs[key] for key in sl[i + 1]] + node, is_new = Node.objects.get_or_create(seq=sl[i], name=graph.name + str(node_count), graph=graph) + node_count += 1 + for path in paths_mentioned: + path.append_node(node, '+') + except IndexError: + raise IndexError("Expecting two terms: ", sl[0]) # sl[i:i+2]) + + return graph \ No newline at end of file diff --git a/Graph/views.py b/Graph/views.py index d2ece86..adee7d1 100644 --- a/Graph/views.py +++ b/Graph/views.py @@ -1,34 +1,3 @@ -from typing import List - -from django.shortcuts import render - # View contains the endpoints on the server for the browser to fetch data -from Graph.models import GraphGenome, Path, Node - -def build_from_test_slices(cmd: List, graph_name='test_data') -> GraphGenome: - """This factory uses test data shorthand for linear graph slices to build - a database GraphGenome with all the necessary Paths and Nodes. Path order populated in the order - that they are mentioned in the slices. Currently, this is + only and does not support non-linear - orderings. Use Path.append_node() to build non-linear graphs.""" - if isinstance(cmd, str): - cmd = eval(cmd) - # preemptively grab all the path names from every odd list entry - graph = GraphGenome.objects.get_or_create(name=graph_name)[0] # + str(datetime.now()) - node_count = 0 - paths = {key for sl in cmd for i in range(0, len(sl), 2) for key in sl[i + 1]} - path_objs = {} - for path_name in paths: - path_objs[path_name] = Path.objects.get_or_create(graph=graph, accession=path_name)[0] - for sl in cmd: - try: - for i in range(0, len(sl), 2): - paths_mentioned = [path_objs[key] for key in sl[i + 1]] - node, is_new = Node.objects.get_or_create(seq=sl[i], name=graph.name + str(node_count), graph=graph) - node_count += 1 - for path in paths_mentioned: - path.append_node(node, '+') - except IndexError: - raise IndexError("Expecting two terms: ", sl[0]) # sl[i:i+2]) - return graph \ No newline at end of file diff --git a/HaploBlocker/migrations/0001_initial.py b/HaploBlocker/migrations/0001_initial.py deleted file mode 100644 index a7b3890..0000000 --- a/HaploBlocker/migrations/0001_initial.py +++ /dev/null @@ -1,40 +0,0 @@ -# Generated by Django 2.2.1 on 2019-07-23 21:25 - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - initial = True - - dependencies = [ - ] - - operations = [ - migrations.CreateModel( - name='Node', - fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('seq', models.CharField(max_length=1000)), - ], - ), - migrations.CreateModel( - name='Path', - fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('accession', models.CharField(max_length=1000, unique=True)), - ], - ), - migrations.CreateModel( - name='Edge', - fields=[ - ('node', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, 
to='HaploBlocker.Node')),
-                ('path', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='HaploBlocker.Path')),
-                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-            ],
-            options={
-                'ordering': ['path', 'id'],
-            },
-        ),
-    ]
diff --git a/HaploBlocker/tests.py b/HaploBlocker/tests.py
index a8eb46e..665d84c 100644
--- a/HaploBlocker/tests.py
+++ b/HaploBlocker/tests.py
@@ -1,5 +1,6 @@
 from django.test import TestCase

+import Graph.utils
 import Graph.views
 from Graph.models import GraphGenome
 from vgbrowser.settings import BASE_DIR
@@ -115,7 +116,7 @@ def test_split_one_group(self):
             ['94', {1, 2, 4}, '95', {3}],  # [3] [4]
             ['96', {1, 2, 3, 4}]  # [6]
         ]
-        g = Graph.views.build_from_test_slices(nodes, '9')
+        g = Graph.utils.build_graph_from_slices(nodes, '9')
         first, anchor, third = g.node('91'), g.node('93'), g.node('94')
         new_node = split_one_group(first, anchor, third)  # no mentions of minorities [1] or [4]
         print(new_node.details())

From c02b7f1ce0a2d4c0e124bb5126e5604363b90296 Mon Sep 17 00:00:00 2001
From: Josiah Seaman
Date: Thu, 22 Aug 2019 16:37:57 +0100
Subject: [PATCH 03/15] #23 WIP: Converting HaploBlocker; automating
 transitions simplifies the code, but zoom filtering is becoming cumbersome.

---
 Graph/models.py | 15 +++----
 HaploBlocker/haplonetwork.py | 41 ++++++++++++---------
 HaploBlocker/tests.py | 71 +++++++++++++++++++-----------------
 3 files changed, 66 insertions(+), 61 deletions(-)

diff --git a/Graph/models.py b/Graph/models.py
index 70ae0b6..6e18101 100644
--- a/Graph/models.py
+++ b/Graph/models.py
@@ -105,19 +105,16 @@ def __hash__(self):
     def to_gfa(self, segment_id: int):
         return '\t'.join(['S', str(segment_id), self.seq])

-    @property
-    def specimens(self):
-        return self.nodetraversal_set
+    def specimens(self, zoom_level) -> Set[int]:
+        return self.nodetraversal_set.filter(path__zoom=zoom_level).values_list('path_id', flat=True)

-    @property
-    def upstream(self) -> Set[int]:
-        traverses = self.nodetraversal_set.all()  # values_list('node__id', flat=True)
+    def upstream(self, zoom_level) -> Set[int]:
+        traverses = self.nodetraversal_set.filter(path__zoom=zoom_level)  # .values_list('node_id', flat=True)
         # Node.objects.filter(id__in=traverses).values_list('id', flat=True)
         return set(t.upstream_id() for t in traverses)

-    @property
-    def downstream(self) -> Set[int]:
-        traverses = self.nodetraversal_set.all()
+    def downstream(self, zoom_level) -> Set[int]:
+        traverses = self.nodetraversal_set.filter(path__zoom=zoom_level).all()
         return set(t.downstream_id() for t in traverses)

     def __repr__(self):
diff --git a/HaploBlocker/haplonetwork.py b/HaploBlocker/haplonetwork.py
index 15355aa..a536648 100644
--- a/HaploBlocker/haplonetwork.py
+++ b/HaploBlocker/haplonetwork.py
@@ -7,7 +7,7 @@
 import numpy as np
 from collections import defaultdict
 from copy import copy
-from Graph.models import Node
+from Graph.models import Node, Path

 BLOCK_SIZE = 20
 FILTER_THRESHOLD = 4
@@ -33,7 +33,7 @@ def signature(individual, start_locus):
     return tuple(individual[start_locus: start_locus + BLOCK_SIZE])

-def get_unique_signatures(individuals, start_locus, current_graph):
+def nodes_from_unique_signatures(individuals, start_locus, current_graph):
     """A signature is a series of BLOCK_SIZE SNPs inside of a locus. We want to know
     how many unique signatures are present inside of one locus. A Node is created for
     each unique signature found.
@@ -44,41 +44,46 @@ def get_unique_signatures(individuals, start_locus, current_graph): for individual in individuals: sig = signature(individual, start_locus) if sig not in unique_blocks: - unique_blocks[sig] = Node(name=f'{len(unique_blocks)}:{start_locus // BLOCK_SIZE}-{start_locus // BLOCK_SIZE}', + unique_blocks[sig] = Node.objects.create( # saves to Database + name=f'{len(unique_blocks)}:{start_locus // BLOCK_SIZE}-{start_locus // BLOCK_SIZE}', seq=''.join(str(x) for x in sig), graph=current_graph) return unique_blocks -def get_all_signatures(alleles, individuals, current_graph): - unique_signatures = [] - for locus_start in range(0, len(alleles) - BLOCK_SIZE, BLOCK_SIZE): # discards remainder - sig = get_unique_signatures(individuals, locus_start, current_graph) - unique_signatures.append(sig) - return unique_signatures +def build_all_slices(alleles, individuals, current_graph): + """Each item in this list is a slice, representing all the possible states for one locus. + Inside a slice is a set of Nodes, one for each unique 'signature' or sequence state. + Paths that all have the same state in this slice all reference the same Node object.""" + slices = [] + for slice_start in range(0, len(alleles) - BLOCK_SIZE, BLOCK_SIZE): # discards remainder + nodes = nodes_from_unique_signatures(individuals, slice_start, current_graph) + slices.append(nodes) + return slices -def build_individuals(individuals, unique_signatures): - """Describes an individual as a list of Nodes that individual visits. - simplified_individuals is a list of loci which contain a list of Nodes which each contain specimen +def build_paths(individuals, unique_signatures): + """Describes an individual as a Path (list of Nodes) that the individual visits (NodeTraversals). + accessions is a list of loci which contain a list of Nodes which each contain specimen build nodes: [0] first 4 are the 4 starting signatures in window 0. Nodes represent a collection of individuals with the same signature at that locus For each node list which individuals are present at that node""" - simplified_individuals = [] + # TODO: It may be more performant to merge build_all_slices and build_paths so that global lists are never stored + accessions = [] for i_specimen, specimen in enumerate(individuals): - my_simplification = [] + my_path = Path.objects.create(accession=str(i_specimen)) for w, window in enumerate(unique_signatures): # the length of the genome sig = signature(specimen, w * BLOCK_SIZE) - my_simplification.append(unique_signatures[w][sig]) - simplified_individuals.append(my_simplification) - return simplified_individuals + my_path.append_node(unique_signatures[w][sig], '+') + accessions.append(my_path) + return accessions def populate_transitions(simplified_individuals): """ List transition rates from one node to all other upstream and downstream. This method populates Node.specimens and begins the process of side-effecting Nodes. - To rebuild a fresh Graph copy, you must start at get_all_signatures() + To rebuild a fresh Graph copy, you must start at build_all_slices() :param simplified_individuals: """ for i, indiv in enumerate(simplified_individuals): diff --git a/HaploBlocker/tests.py b/HaploBlocker/tests.py index 665d84c..f023385 100644 --- a/HaploBlocker/tests.py +++ b/HaploBlocker/tests.py @@ -2,14 +2,14 @@ import Graph.utils import Graph.views -from Graph.models import GraphGenome +from Graph.models import GraphGenome, Path from vgbrowser.settings import BASE_DIR import unittest import os # Create your tests here. 
# from HaploBlocker.models import Node, Path, Edge from HaploBlocker.haplonetwork import Node, split_one_group -from HaploBlocker.haplonetwork import read_data, get_all_signatures, build_individuals, get_unique_signatures, \ +from HaploBlocker.haplonetwork import read_data, build_all_slices, build_paths, nodes_from_unique_signatures, \ populate_transitions, simple_merge, neglect_nodes, split_groups # @@ -27,21 +27,20 @@ class HaploTest(unittest.TestCase): @classmethod def setUpClass(self) -> None: """Reads the input data file once. Tests that need a fresh graph must - call create_nodes()""" + call create_graph()""" print(os.getcwd()) self.alleles, self.individuals = read_data(os.path.join(BASE_DIR, "test_data/KE_chromo10.txt")) - def create_nodes(self, graph_name): - """Tests that need a fresh graph must call create_nodes() FIRST! + def create_graph(self, graph_name): + """Tests that need a fresh graph must call create_graph() FIRST! Graph summarization works by side effecting Node objects. Tests can not run independently with order dependent side effects. This method is slow, so don't use it unless you need it. :param graph_name: """ graph = GraphGenome.objects.create(name=graph_name) - self.unique_signatures = get_all_signatures(self.alleles, self.individuals, graph) - self.simplified_individuals = build_individuals(self.individuals, self.unique_signatures) - # G = build_graph(simplified_individuals) - populate_transitions(self.simplified_individuals) + slices = build_all_slices(self.alleles, self.individuals, graph) + self.paths = build_paths(self.individuals, slices) + return graph def test_read(self): @@ -56,29 +55,30 @@ def test_build_individuals(self): def internal_build_individuals(self, alleles, individuals): graph = GraphGenome.objects.create(name='internal_build_individuals') - unique_signatures = get_all_signatures(alleles, individuals, graph) + unique_signatures = build_all_slices(alleles, individuals, graph) peek = repr(list(unique_signatures[21].values())) assert peek == '[N0:21-21(00002202022222220202), N1:21-21(00202202022222220202), N2:21-21(00022200000000000000), N3:21-21(00000000000000000000), N4:21-21(00002200000000000000), N5:21-21(00022202022220020002), N6:21-21(02000000000000000000), N7:21-21(00002202222220020022)]', peek - simplified_individuals = build_individuals(individuals, unique_signatures) + simplified_individuals = build_paths(individuals, unique_signatures) peek = repr(simplified_individuals[500][:100]) assert peek == '[N2:0-0(00000000000000000000), N2:1-1(00000000000000000000), N2:2-2(00000000000000000000), N2:3-3(00000000000000000000), N2:4-4(00000000000000000000), N2:5-5(00000000000000000000), N3:6-6(00000000000000000000), N3:7-7(00000000000000000000), N3:8-8(00000000000000000000), N2:9-9(00000000000000000000), N0:10-10(00000000000000000000), N1:11-11(00000000000000000000), N2:12-12(00000000000000000000), N2:13-13(00000000000000000000), N2:14-14(00000000000000000000), N2:15-15(00000000000000000000), N3:16-16(00000000000000000000), N3:17-17(00000000000000000000), N4:18-18(00000000000000000000), N3:19-19(00000000000000000000), N5:20-20(00000000000000000000), N3:21-21(00000000000000000000), N3:22-22(00000000000000000000), N10:23-23(00200000000000000000), N4:24-24(00002200222220002000), N3:25-25(02000222220002020222), N4:26-26(20022000002002220002), N3:27-27(22222202020222220000), N1:28-28(00000000000000000000), N1:29-29(00000000000000000022), N4:30-30(00002222222000002200), N3:31-31(00022222202000000000), N21:32-32(00000020202200022020), 
N1:33-33(02202220022020222000), N1:34-34(00020002000202222002), N1:35-35(22220002220022200022), N1:36-36(22222200000000000000), N1:37-37(00202002222220000200), N1:38-38(00000200000202022200), N1:39-39(02202000202202220000), N1:40-40(00020222200020000020), N1:41-41(20220020022200022200), N1:42-42(00000000000000000000), N1:43-43(00000000000000000000), N1:44-44(00000000000000000000), N1:45-45(00000000000000000000), N1:46-46(00000002220220020020), N1:47-47(00202220222220222202), N1:48-48(00000000000000000002), N1:49-49(20002200000002220022), N1:50-50(22020002002020202022), N1:51-51(02202222220222202000), N1:52-52(20000020000000000000), N1:53-53(00000000000000000000), N1:54-54(00000000000000000000), N1:55-55(00220220000200000220), N1:56-56(20000000202022022020), N1:57-57(20222022022202222220), N1:58-58(22022202222222020200), N1:59-59(22202200202220202220), N1:60-60(22020022220200022022), N1:61-61(20202220000220000220), N1:62-62(00022002000000000000), N1:63-63(00000220000000000000), N1:64-64(00000000000220200000), N1:65-65(00022020200000020022), N1:66-66(20020222222020200020), N1:67-67(00000000000000202222), N1:68-68(22222222000202222202), N1:69-69(22022222020020000022), N1:70-70(00002002220022222200), N1:71-71(22002020020202000000), N1:72-72(00022202000202220020), N1:73-73(22000000000000200020), N1:74-74(22220222220200202202), N1:75-75(00022202222200000000), N1:76-76(00000220220200200022), N1:77-77(02200202020020200000), N0:78-78(00002000000000000000), N0:79-79(00000000000000000000), N1:80-80(00000000000022220000), N1:81-81(00000000000000000000), N1:82-82(00022220200202202202), N1:83-83(20202222200202202202), N1:84-84(00000020000000000000), N1:85-85(00222022020000000002), N1:86-86(22020222020222222000), N1:87-87(00022222002020222022), N1:88-88(00002222000000000200), N1:89-89(00000000000000220022), N1:90-90(22020202200020222220), N1:91-91(00002000002220002222), N1:92-92(22200000000000000000), N1:93-93(00000000000000000000), N1:94-94(00202022200202222222), N1:95-95(22222202202020222222), N1:96-96(00222220200202222020), N1:97-97(22002202220222222022), N0:98-98(20222222222222020220), N0:99-99(20222222220222222002)]', peek assert len(simplified_individuals) == 501 and len(simplified_individuals[60]) == 1638 def test_get_unique_signatures(self): - unique_blocks = get_unique_signatures(self.individuals, 0) + graph = GraphGenome.objects.create(name='test_get_unique_signatures') + unique_blocks = nodes_from_unique_signatures(self.individuals, 0, graph) assert len(unique_blocks) == 4 - assert unique_blocks.__repr__() == '{(0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0): N0(0, 0), ' \ - '(0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): N1(0, 0), ' \ - '(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N2(0, 0), ' \ - '(2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): N3(0, 0)}' + peek = repr(list(unique_blocks.values())) + assert peek == \ + '[N0:0-0(02002020002000220000), N1:0-0(00220202220222002222), ' \ + 'N2:0-0(00000000000000000000), N3:0-0(20220202220222002222)]', peek @unittest.skip def test_no_duplicate_nodes(self): - self.create_nodes('test') + graph = self.create_graph('test') unique_nodes = set() duplicates_found = 0 - for locus in self.simplified_individuals: + for locus in self.paths: for node in locus: # assert isinstance(node, Node) if node in unique_nodes: # If two nodes have the same __hash__ they'll be "in" @@ -89,13 +89,17 @@ def test_no_duplicate_nodes(self): assert duplicates_found == 0, f"Found {duplicates_found} duplicated nodes in 
the graph" - def _test_simple_merge(self, all_nodes): - assert len(all_nodes) == 7180 - summary1 = simple_merge(all_nodes) - assert len(summary1) == 3690 - return summary1 + def _test_simple_merge(self, graph, zoom_level): + # these tests could be made independent of test_workflow, but it would be slower + assert graph.highest_zoom_level() == zoom_level + assert len(graph) == 7180 + status = simple_merge(graph) + assert Path.objects.filter(graph=graph, zoom=zoom_level + 1).count() == \ + Path.objects.filter(graph=graph, zoom=zoom_level + 0).count() + assert Path.objects.filter(graph=graph, zoom=zoom_level+1) == 3690 + return status - def _test_neglect_nodes(self, all_nodes): + def _test_neglect_nodes(self, all_nodes, zoom_level): summary2 = neglect_nodes(all_nodes) assert len(summary2) == 2854 unchanged = neglect_nodes(summary2, 0) @@ -126,25 +130,24 @@ def test_split_one_group(self): assert new_node in g.node('96').upstream and g.node('95') in g.node('96').upstream assert g.node('94') not in g.node('96').upstream - def _test_split_groups(self, all_nodes): - summary3 = split_groups(all_nodes) + def _test_split_groups(self, graph, zoom_level): + summary3 = split_groups(graph) assert len(summary3) > 10 return summary3 def test_workflow(self): - self.create_nodes('test') - all_nodes = [node for window in self.unique_signatures for node in window.values()] # think about referencing and deletion - summary1 = self._test_simple_merge(all_nodes) - summary2 = self._test_neglect_nodes(summary1) - summary3 = self._test_split_groups(summary2) + graph = self.create_graph('test') + summary1 = self._test_simple_merge(graph, 0) + summary2 = self._test_neglect_nodes(graph, 1) + summary3 = self._test_split_groups(graph, 2) assert len(summary1) > len(summary2) > len(summary3), "Each summarization should result in less nodes" - summary4 = simple_merge(summary3) + summary4 = simple_merge(summary3, 3) bad = summary3[2] print(bad.details()) - # test_signatures = get_all_signatures(alleles, individuals) - # test_individuals = build_individuals(individuals, test_signatures) + # test_signatures = build_all_slices(alleles, individuals) + # test_individuals = build_paths(individuals, test_signatures) # populate_transitions(test_individuals) # no return val # # test1 = test_simple_merge(test_signatures) From 52c5c0f8494ba28db58464fe21f347199ea6dd2d Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Fri, 23 Aug 2019 18:10:42 +0100 Subject: [PATCH 04/15] #23 ZoomLevel objects, custom create() methods for consistency and convenience. WIP: Converting all tests and code to database queries. GraphTest now passing. 
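A minimal usage sketch of the convention these managers establish (model names are taken
from the diff below; the sketch itself is illustrative and not part of the commit):

    # GraphManager.create() provisions the nucleotide-level ZoomLevel automatically.
    graph = GraphGenome.objects.create(name='example')
    assert graph.zoomlevel_set.filter(zoom=0).exists()
    # PathManager.create() links the new Path to the requested ZoomLevel in one call.
    path = Path.objects.create(accession='accession_1', graph=graph, zoom=0)
    assert path in graph.paths.all()  # graph.paths delegates to the zoom=0 level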
--- Graph/gfa.py | 17 +++-- Graph/migrations/0004_ZoomLevel_objects.py | 40 +++++++++++ Graph/models.py | 84 ++++++++++++++++++---- Graph/test.py | 11 ++- Graph/utils.py | 10 +-- 5 files changed, 135 insertions(+), 27 deletions(-) create mode 100644 Graph/migrations/0004_ZoomLevel_objects.py diff --git a/Graph/gfa.py b/Graph/gfa.py index f03c926..b7debbd 100644 --- a/Graph/gfa.py +++ b/Graph/gfa.py @@ -6,7 +6,7 @@ import io import os import tempfile -from Graph.models import Node, Path, GraphGenome +from Graph.models import Node, Path, GraphGenome, ZoomLevel def pairwise(iterable): @@ -107,8 +107,11 @@ def from_graph(cls, graph: GraphGenome): """Constructs the lines of a GFA file listing paths, then sequence nodes in arbitrary order.""" gfa = gfapy.Gfa() for path in graph.paths: - node_series = ",".join([traverse.node.name + traverse.strand for traverse in path.nodes]) - gfa.add_line('\t'.join(['P', path.accession, node_series, ",".join(['*' for _ in path.nodes])])) + visits = [traverse.node_name + traverse.strand for traverse in + path.nodes.values_list('node_name', 'strand', named=True)] + node_series = ",".join(visits) + connections = ",".join(['*'] * path.nodes.count()) # count -1? + gfa.add_line('\t'.join(['P', path.accession, node_series, connections])) for node in graph.nodes: # in no particular order gfa.add_line('\t'.join(['S', str(node.name), node.seq])) return cls(gfa, "from Graph") @@ -120,13 +123,13 @@ def to_paths(self) -> GraphGenome: def to_graph(self) -> GraphGenome: """Create parent object for this genome and save it in the database. This can create duplicates appended in Paths if it is called twice.""" - gdb = GraphGenome.objects.get_or_create(name=self.source_path)[0] + gdb = GraphGenome.objects.create(name=self.source_path) + # sequence_level = ZoomLevel.objects.create(graph=gdb, zoom=0) for segment in self.gfa.segments: - Node.objects.get_or_create(seq=segment.sequence, name=(segment.name), graph=gdb) + Node.objects.get_or_create(seq=segment.sequence, name=segment.name, graph=gdb) for path in self.gfa.paths: - p = Path(accession=path.name, graph=gdb) - p.save() + p = Path.objects.create(accession=path.name, graph=gdb, zoom=0) p.append_gfa_nodes(path.segment_names) return gdb diff --git a/Graph/migrations/0004_ZoomLevel_objects.py b/Graph/migrations/0004_ZoomLevel_objects.py new file mode 100644 index 0000000..c6256f6 --- /dev/null +++ b/Graph/migrations/0004_ZoomLevel_objects.py @@ -0,0 +1,40 @@ +# Generated by Django 2.2.1 on 2019-08-23 16:23 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + replaces = [('Graph', '0004_ZoomLevel_objects'), ('Graph', '0005_auto_20190823_1603')] + + dependencies = [ + ('Graph', '0003_add_zoom_levels'), + ] + + operations = [ + migrations.AlterUniqueTogether( + name='path', + unique_together=set(), + ), + migrations.RemoveField( + model_name='path', + name='graph', + ), + migrations.RemoveField( + model_name='path', + name='zoom', + ), + migrations.CreateModel( + name='ZoomLevel', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('zoom', models.IntegerField(default=0)), + ('graph', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='Graph.GraphGenome')), + ('paths', models.ManyToManyField(blank=True, help_text='One path for each accession in a zoom level. 
Paths can be reused unmodified in multiple levels.', related_name='zoom_levels', to='Graph.Path')),
+            ],
+            options={
+                'unique_together': {('graph', 'zoom')},
+            },
+        ),
+    ]
diff --git a/Graph/models.py b/Graph/models.py
index 6e18101..1045c81 100644
--- a/Graph/models.py
+++ b/Graph/models.py
@@ -1,5 +1,7 @@
 from typing import List, Iterable, Set
 from django.db import models
+from django.db.models import QuerySet
+
 from Utils.models import CustomSaveManager
 
 
@@ -14,15 +16,54 @@ class NodeMissingError(ValueError):
     pass
 
 
+class ZoomLevel(models.Model):
+    """Each Graph Genome has multiple Zoom levels. A zoom level is a collection of Paths
+    at that zoom level. When a Path is unmodified from a summarization step, it can reuse
+    a previous Path without duplicating it.
+    Zoom was originally a single integer in Path, but it turned out a helper class for queries
+    would result in more robust code and allow Path reuse."""
+    graph = models.ForeignKey('GraphGenome', on_delete=models.CASCADE)
+    zoom = models.IntegerField(default=0)  # Zoom level starts at 0 for nucleotide level and moves up
+    # Nodes can be shared between zoom levels.
+    paths = models.ManyToManyField('Path', related_name='zoom_levels', blank=True,
+                                   help_text='One path for each accession in a zoom level. '
+                                             'Paths can be reused unmodified in multiple levels.')
+
+    class Meta:
+        unique_together = ['graph', 'zoom']  # one ZoomLevel at each integer
+
+
+class GraphManager(models.Manager):
+    """custom create() methods for consistency and convenience"""
+    def create(self, **kwargs):
+        """When a graph is created, it should automatically create the first ZoomLevel as well."""
+        graph = super(GraphManager, self).create(**kwargs)
+        ZoomLevel.objects.create(graph=graph, zoom=0)  # sequence_level created
+        return graph
+
+
 class GraphGenome(models.Model):
+    """Graph Genomes in general are defined as a collection of unordered Nodes which contain sequence,
+    and one Path per individual. A Path visits Nodes in any order, on either strand. This directed
+    graph then contains the relationships of all individuals to each other through shared sequence.
+    GraphGenomes in this database contain an extra concept not found in GFA or VG: Summarization.
+    GraphGenome has multiple ZoomLevels which each contain a full set of Paths. Paths at higher
+    zoom levels are shorter and visit summary nodes that explain or discard trivial variation to
+    allow the user to focus on the larger graph structure."""
     name = models.CharField(max_length=1000)
-    #zoom = models.IntegerField()  Zoom level is defined in Paths only.
+    # Zoom level is now defined by ZoomLevel objects, not in Paths.
     # Nodes can be shared between zoom levels.
+    objects = GraphManager()
+
+    @property
+    def sequence_level(self) -> 'ZoomLevel':
+        return self.zoomlevel_set.filter(zoom=0).first()
+
     @property
-    def paths(self):
+    def paths(self) -> QuerySet:
         """Getter only.  Shortcut for DB."""
-        return self.path_set.all()
+        return self.sequence_level.paths
 
     @property
     def nodes(self):
@@ -31,12 +72,12 @@ def nodes(self):
 
     def __repr__(self):
         """Warning: the representation strings are very sensitive to whitespace"""
-        return f"Graph: {self.name}\n{self.path_set.count()} paths  {self.node_set.count()} nodes."
+        return f"Graph: {self.name}\n{self.paths.count()} paths  {self.node_set.count()} nodes."
def __eq__(self, other): if isinstance(other, GraphGenome): return other.node_set.count() == self.node_set.count() and \ - other.path_set.count() == self.path_set.count() # other.name == self.name and \ + other.paths.count() == self.paths.count() # other.name == self.name and \ return False @classmethod @@ -70,6 +111,7 @@ def node(self, node_name): return Node.objects.get(name=node_name, graph=self) + class Node(models.Model): """Nodes are the fundamental content carriers for sequence. A Path can traverse nodes in any order, on both + and - strands. @@ -81,6 +123,8 @@ class Node(models.Model): graph = models.ForeignKey(GraphGenome, on_delete=models.CASCADE) summarized_by = models.ForeignKey('Node', null=True, blank=True, on_delete=models.SET_NULL, related_name='children') + # Break points for haploblocks - Erik Garrison - service for coordinates + # Start and stop positions for a node class Meta: unique_together = ['graph', 'name'] @@ -156,23 +200,32 @@ def validate(self): Node.NOTHING = Node(-1) +class PathManager(models.Manager): + """custom create() methods for consistency and convenience""" + def create(self, accession, graph, zoom=0): + """Fetches the appropriate ZoomLevel object and creates a link when the Path is first + created. """ + path = super(PathManager, self).create(accession=accession) + ZoomLevel.objects.get_or_create(graph=graph, zoom=zoom)[0].paths.add(path) + return path + + class Path(models.Model): """Paths represent the linear order of on particular individual (accession) as its genome was sequenced. A path visits a series of nodes and the ordered concatenation of the node sequences is the accession's genome. Create Paths first from accession names, then append them to Nodes to link together.""" accession = models.CharField(max_length=1000) # one path per accession - graph = models.ForeignKey(GraphGenome, on_delete=models.CASCADE) - zoom = models.IntegerField(default=0) # Zoom level starts at 0 for nucleotide level and moves up - # Nodes can be shared between zoom levels. 
summarized_by = models.ForeignKey('Path', related_name='summary_child', blank=True, null=True, on_delete=models.SET_NULL) - class Meta: - unique_together = ['graph', 'accession', 'zoom'] + objects = PathManager() + + # class Meta: we need a database check where each accession name only occurs once per zoom level + # unique_together = ['accession', 'zoom_levels'] def __getitem__(self, path_index): - return self.nodes[path_index] + return self.nodes.filter(order=path_index) def __repr__(self): """Warning: the representation strings are very sensitive to whitespace""" @@ -185,12 +238,17 @@ def __hash__(self): return hash(self.accession) @property - def nodes(self) -> Iterable['NodeTraversal']: - return NodeTraversal.objects.filter(path=self).order_by('order').all() + def nodes(self) -> QuerySet: + return NodeTraversal.objects.filter(path=self).order_by('order') + + @property + def graph(self) -> GraphGenome: + return self.zoom_levels.first().graph def append_gfa_nodes(self, nodes): assert hasattr(nodes[0], 'orient') and hasattr(nodes[0], 'name'), 'Expecting gfapy.Gfa.path' for node in nodes: + # TODO: could be sped up with bulk_create after checking and setting order NodeTraversal(node=Node.objects.get(name=node.name), path=self, strand=node.orient).save() diff --git a/Graph/test.py b/Graph/test.py index e0e794e..a2afb2d 100644 --- a/Graph/test.py +++ b/Graph/test.py @@ -56,7 +56,6 @@ def test_graph_factory(self): def test_summary_storage(self): graph = self.test_example_graph() - parent = Path.objects.get(graph=graph, accession='a', zoom=0) path = Path.objects.create(graph=graph, accession='a', zoom=1) new_node = Node.objects.create(seq='ACGTCGGA', name='2*2', graph=graph) Node.objects.filter(name__in=['1','2','4']).update(summarized_by=new_node) @@ -67,8 +66,14 @@ def test_summary_storage(self): current = graph.node(node_name) NodeTraversal(node=current, path=path, strand='+', order=i).save() assert NodeTraversal.objects.get(order=0, path=path).downstream().downstream().node.name == '6' - -@unittest.skip # DAGify has not been converted to databases yet. + child = graph.paths.get(accession='a') + child.update(summarized_by=path) + assert graph.zoomlevel_set.count() == 2 + path_pointers = graph.zoomlevel_set.filter(zoom=1).first().paths + print('path_pointers', type(path_pointers)) + assert path_pointers.count() == 1 + +# @unittest.skip # DAGify has not been converted to databases yet. class DAGifyTest(TestCase): """ test class of sort.py """ diff --git a/Graph/utils.py b/Graph/utils.py index d750c60..b08a089 100644 --- a/Graph/utils.py +++ b/Graph/utils.py @@ -20,18 +20,20 @@ def build_graph_from_slices(cmd: List, graph_name='test_data') -> GraphGenome: orderings. 
Use Path.append_node() to build non-linear graphs."""
     if isinstance(cmd, str):
         cmd = eval(cmd)
-    # preemptively grab all the path names from every odd list entry
-    graph = GraphGenome.objects.get_or_create(name=graph_name)[0]  # + str(datetime.now())
+    # preemptively grab all the path names from every odd list entry
+    graph = GraphGenome.objects.create(name=graph_name)
     node_count = 0
     paths = {key for sl in cmd for i in range(0, len(sl), 2) for key in sl[i + 1]}
     path_objs = {}
     for path_name in paths:
-        path_objs[path_name] = Path.objects.get_or_create(graph=graph, accession=path_name, zoom=0)[0]
+        if graph.paths.filter(accession=path_name).count() == 0:
+            path_objs[path_name] = Path.objects.create(graph=graph, accession=path_name, zoom=0)
     for sl in cmd:
         try:
             for i in range(0, len(sl), 2):
                 paths_mentioned = [path_objs[key] for key in sl[i + 1]]
-                node, is_new = Node.objects.get_or_create(seq=sl[i], name=graph.name + str(node_count), graph=graph)
+                node, is_new = Node.objects.get_or_create(
+                    seq=sl[i], name=graph.name + str(node_count), graph=graph)
                 node_count += 1
                 for path in paths_mentioned:
                     path.append_node(node, '+')

From 005d1d874052a0f3dd29591055520cb85d97f3e9 Mon Sep 17 00:00:00 2001
From: Josiah Seaman
Date: Fri, 23 Aug 2019 19:46:03 +0100
Subject: [PATCH 05/15] #23 test_export_as_gfa now working with example fast queries for fetching the whole graph.

--- 
 Graph/gfa.py                               | 12 ++++++++----
 Graph/migrations/0004_ZoomLevel_objects.py |  2 --
 Graph/test.py                              |  7 ++++---
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/Graph/gfa.py b/Graph/gfa.py
index b7debbd..1c81d98 100644
--- a/Graph/gfa.py
+++ b/Graph/gfa.py
@@ -103,12 +103,16 @@ def save_as_gfa(self, file: str):
         self.gfa.to_file(file)
 
     @classmethod
-    def from_graph(cls, graph: GraphGenome):
+    def from_graph(cls, graph: GraphGenome):  # TODO: should be given ZoomLevel instead
         """Constructs the lines of a GFA file listing paths, then sequence nodes in arbitrary order."""
         gfa = gfapy.Gfa()
-        for path in graph.paths:
-            visits = [traverse.node_name + traverse.strand for traverse in
-                      path.nodes.values_list('node_name', 'strand', named=True)]
+        for path in graph.paths.all():
+            visits = []
+            # example of using lazy queries and values_list for fast lookup
+            node_infos = path.nodes.values_list('node_id', 'strand', named=True)
+            for traverse in node_infos:
+                name = Node.objects.values_list('name', flat=True).get(id=traverse.node_id)  # fast lookup
+                visits.append(name + traverse.strand)
             node_series = ",".join(visits)
             connections = ",".join(['*'] * path.nodes.count())  # count -1? 
gfa.add_line('\t'.join(['P', path.accession, node_series, connections])) diff --git a/Graph/migrations/0004_ZoomLevel_objects.py b/Graph/migrations/0004_ZoomLevel_objects.py index c6256f6..8c05de2 100644 --- a/Graph/migrations/0004_ZoomLevel_objects.py +++ b/Graph/migrations/0004_ZoomLevel_objects.py @@ -6,8 +6,6 @@ class Migration(migrations.Migration): - replaces = [('Graph', '0004_ZoomLevel_objects'), ('Graph', '0005_auto_20190823_1603')] - dependencies = [ ('Graph', '0003_add_zoom_levels'), ] diff --git a/Graph/test.py b/Graph/test.py index a2afb2d..6f8d4b1 100644 --- a/Graph/test.py +++ b/Graph/test.py @@ -66,14 +66,15 @@ def test_summary_storage(self): current = graph.node(node_name) NodeTraversal(node=current, path=path, strand='+', order=i).save() assert NodeTraversal.objects.get(order=0, path=path).downstream().downstream().node.name == '6' - child = graph.paths.get(accession='a') - child.update(summarized_by=path) assert graph.zoomlevel_set.count() == 2 + graph.paths.filter(accession='a').update(summarized_by=path) + assert bool(path.summary_child), "Path should be linked to its child." path_pointers = graph.zoomlevel_set.filter(zoom=1).first().paths print('path_pointers', type(path_pointers)) assert path_pointers.count() == 1 -# @unittest.skip # DAGify has not been converted to databases yet. + +#@unittest.skip # DAGify has not been converted to databases yet. class DAGifyTest(TestCase): """ test class of sort.py """ From 44a3bd154e73bb37fa91cca052fe06c9fd5a017e Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Fri, 23 Aug 2019 20:04:17 +0100 Subject: [PATCH 06/15] Updated some of DAGify to use GraphGenomes. Unfortunately, I don't understand Profiles well enough to update generate_profiles(). This may be the same concept as Haploblocker identifying the set of unique signatures at a locus. --- Graph/gfa.py | 4 ---- Graph/sort.py | 20 +++++++++++--------- Graph/test.py | 36 ++++++++++++++++++------------------ 3 files changed, 29 insertions(+), 31 deletions(-) diff --git a/Graph/gfa.py b/Graph/gfa.py index 1c81d98..ee65be7 100644 --- a/Graph/gfa.py +++ b/Graph/gfa.py @@ -120,10 +120,6 @@ def from_graph(cls, graph: GraphGenome): # TODO: should be given ZoomLevel inst gfa.add_line('\t'.join(['S', str(node.name), node.seq])) return cls(gfa, "from Graph") - def to_paths(self) -> GraphGenome: - graph = self.to_graph() - return graph.paths - def to_graph(self) -> GraphGenome: """Create parent object for this genome and save it in the database. This can create duplicates appended in Paths if it is called twice.""" diff --git a/Graph/sort.py b/Graph/sort.py index dbea358..83687a9 100644 --- a/Graph/sort.py +++ b/Graph/sort.py @@ -2,7 +2,7 @@ import dataclasses from typing import List, Set -from Graph.models import NodeTraversal, Path, Node +from Graph.models import NodeTraversal, Path, Node, GraphGenome @dataclasses.dataclass @@ -32,22 +32,21 @@ class DAGify: """ DAGify accepts a set of paths, and """ - def __init__(self, paths: List[Path], nodes={}): + def __init__(self, graph: GraphGenome): """ :type paths: List[Path], nodes: Set[Node] """ - self.paths = paths - self.nodes = nodes + self.graph_genome = graph def generate_profiles_with_minimizing_replications(self) -> (List[Profile], int): """ - Generate profiles with minimizing the number of node repication by trying to assign each path as a primary path one by one. + Generate profiles with minimizing the number of node replication by trying to assign each path as a primary path one by one. 
It returns profiles and whose number of replication. :return: Profiles and the number of replicated nodes. """ min_rep = sys.maxsize profile = [] - for i, _ in enumerate(self.paths): + for i in range(self.graph_genome.paths.count()): profile_candidate = self.generate_profiles(i) if min_rep > len([x.duplicate for x in profile_candidate if x.duplicate]): min_rep = len([x.duplicate for x in profile_candidate if x.duplicate]) @@ -61,12 +60,15 @@ def generate_profiles(self, primary_path_index: int = 0) -> List[Profile]: :return: a list of profiles """ profile = [] - for node_index in self.paths[primary_path_index].nodes: + # TODO: replace this with database queries. Possibly pass in path, rather than an primary_path_index. + raise NotImplementedError() + primary_path = self.paths[primary_path_index] + for node_index in primary_path.nodes: if node_index.strand == "+": - profile.append(Profile(node_index, [self.paths[primary_path_index]], [], {self.paths[primary_path_index]}, False)) + profile.append(Profile(node_index, [primary_path], [], {primary_path}, False)) else: profile.append( - Profile(node_index, [], [self.paths[primary_path_index]], {self.paths[primary_path_index]}, False)) + Profile(node_index, [], [primary_path], {primary_path}, False)) for i, path in enumerate(self.paths): if i == primary_path_index: continue diff --git a/Graph/test.py b/Graph/test.py index 6f8d4b1..790c1ac 100644 --- a/Graph/test.py +++ b/Graph/test.py @@ -74,7 +74,7 @@ def test_summary_storage(self): assert path_pointers.count() == 1 -#@unittest.skip # DAGify has not been converted to databases yet. +@unittest.skip # DAGify has not been converted to databases yet. class DAGifyTest(TestCase): """ test class of sort.py """ @@ -84,16 +84,16 @@ class DAGifyTest(TestCase): def test_dagify(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "test.gfa")) - paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile = dagify.generate_profiles(0) # self.assertEqual([['CAAATAAG', {x,y,z}], ['A', {y,z}, 'G', {x}], ['C', {x,y,z}], ['TTG', {x,y,z}], ['A', {z}, 'G', {x,y}], ['AAATTTTCTGGAGTTCTAT', {x,y,z}], ['T', {x,y,z}], ['ATAT', {x,y,z}], ['T', {x,y,z}], ['CCAACTCTCTG', {x,y,z}]], graph) def test_dagify2(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "test2.gfa")) - paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile = dagify.generate_profiles(0) # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) # x,y,z,a = 'x', 'y', 'z', 'a' @@ -101,8 +101,8 @@ def test_dagify2(self): def test_dagify3(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "test3.gfa")) - paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() self.assertEqual(rep_count, 1) # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) @@ -110,8 +110,8 @@ def test_dagify3(self): def test_dagify_altpath(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "alternate_paths.gfa")) - paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() self.assertEqual(rep_count, 1) # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) @@ -119,8 +119,8 @@ def test_dagify_altpath(self): def test_dagify_dup(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "duplicate.gfa")) - 
paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() self.assertEqual(rep_count, 2) # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) @@ -129,8 +129,8 @@ def test_dagify_dup(self): def test_unresolved_repreat(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "unresolved_repeat.gfa")) - paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) # self.assertEqual([['CAAATAAG', {'x'}, 'T', {'y'}], ['A', {'y', 'x'}], ['G', {'x'}, 'C', {'y'}]], graph) @@ -138,8 +138,8 @@ def test_unresolved_repreat(self): @unittest.skip("Inversion is unsupported") def test_inversion(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "inversion.gfa")) - paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) # self.assertEqual(graph, []) @@ -147,8 +147,8 @@ def test_inversion(self): @unittest.skip("Inversion is unsupported") def test_nested_inversion(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "nested_inv.gfa")) - paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) # self.assertEqual(graph, []) @@ -157,7 +157,7 @@ def test_nested_inversion(self): def test_simple_inversion(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "simple_inv.gfa")) graph = gfa.to_graph() - dagify = DAGify(graph.paths) + dagify = DAGify(graph) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) # self.assertEqual(graph, [['CAAATAAG', {x,y}], ['AC', {x}, 'AC', {y}], ['G', {x, y}]]) From bc492539b027dbf2fd064d27d85c26452a9303ac Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Mon, 26 Aug 2019 16:36:11 +0100 Subject: [PATCH 07/15] #26 Simple_merge converted by copying the entire GraphGenome, Paths, and NodeTraversals to a new ZoomLevel. I'm committing here because I want to save a working test state. Using pycharm, make sure you use Django Tests (manage.py) with no Target. This will run all tests in the project. I added bulk_create for NodeTraversals because the KE_chromo test was ridiculously slow. 
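For background on the bulk_create change, a sketch of the pattern (model names are from
this repo; 'nodes' stands in for any list of Node objects):

    # Build unsaved NodeTraversal instances in memory, then insert them in batches.
    # bulk_create() bypasses save() and any custom Manager.create(), so the 'order'
    # field must be computed explicitly up front.
    traversals = [NodeTraversal(node=n, path=my_path, strand='+', order=i)
                  for i, n in enumerate(nodes)]
    NodeTraversal.objects.bulk_create(traversals, 100)  # one INSERT per 100 rows

One multi-row INSERT per batch replaces thousands of row-at-a-time INSERTs, which is
where the KE_chromo test was spending its time.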
--- Graph/gfa.py | 2 +-
 Graph/models.py              | 151 +++++++++++++++++++++++------------
 Graph/test.py                |  18 ++++-
 HaploBlocker/haplonetwork.py |  70 +++++++++-------
 HaploBlocker/tests.py        |  58 +++++++++-----
 vgbrowser/settings.py        |   4 +-
 6 files changed, 196 insertions(+), 107 deletions(-)

diff --git a/Graph/gfa.py b/Graph/gfa.py
index ee65be7..c8d2d82 100644
--- a/Graph/gfa.py
+++ b/Graph/gfa.py
@@ -6,7 +6,7 @@
 import io
 import os
 import tempfile
-from Graph.models import Node, Path, GraphGenome, ZoomLevel
+from Graph.models import Node, Path, GraphGenome
 
 
 def pairwise(iterable):
diff --git a/Graph/models.py b/Graph/models.py
index 1045c81..d7a26eb 100644
--- a/Graph/models.py
+++ b/Graph/models.py
@@ -16,6 +16,31 @@ class NodeMissingError(ValueError):
     pass
 
 
+class ZoomLevelManager(models.Manager):
+    def create(self, graph, zoom) -> 'ZoomLevel':
+        """Creates a full collection of Paths for the new ZoomLevel that match the
+        previous level's path names, but start out blank."""
+        me = super(ZoomLevelManager, self).create(graph=graph, zoom=zoom)  # now saved to DB
+        if zoom == 0:
+            return me  # We've done all the necessary work
+        # Copy previous level in entirety
+        previous_level = graph.zoomlevel_set.get(zoom=zoom - 1)
+        path_names = previous_level.paths.values_list('accession', flat=True)
+        # TODO: This loop can be sped up by bulk_create and bulk_update
+        for name in path_names:
+            p = Path(accession=name)  # new Path for a new level
+            p.save()
+            me.paths.add(p)  # linking to current level
+            nucleotide_path = previous_level.paths.get(accession=name)
+            nucleotide_path.summarized_by = p  # links between levels
+            nucleotide_path.save()
+            #TODO: this is REALLY SLOW AND WASTEFUL!
+            # Makes a full copy of every traversal in the Path so new copies can be edited
+            copies = [NodeTraversal(path=p, node=traverse.node, strand=traverse.strand, order=traverse.order)
+                      for traverse in nucleotide_path.nodetraversal_set.all()]
+            NodeTraversal.objects.bulk_create(copies, 500)
+        return me
+
+
 class ZoomLevel(models.Model):
     """Each Graph Genome has multiple Zoom levels. A zoom level is a collection of Paths
     at that zoom level. When a Path is unmodified from a summarization step, it can reuse
NodeTraversal is appended to Path (order dependent) and PathIndex is added to Node (order independent).""" @@ -105,7 +146,8 @@ def append_node_to_path(self, node_id, strand, path_name) -> None: self.nodes[node_id] = Node('', [], node_id) else: raise ValueError("Provide the id of the node, not", node_id) - Path.objects.get(name=path_name).append_node(Node.objects.get(name=node_id), strand) + level = self.zoomlevel_set.get(zoom=zoom) + level.paths.get(name=path_name).append_node(Node.objects.get(name=node_id), strand) def node(self, node_name): return Node.objects.get(name=node_name, graph=self) @@ -149,26 +191,28 @@ def __hash__(self): def to_gfa(self, segment_id: int): return '\t'.join(['S', str(segment_id), self.seq]) - def specimens(self, zoom_level) -> Set[int]: + def specimens(self, zoom_level) -> List[int]: return self.nodetraversal_set.filter(path_zoom=zoom_level).value_list('path_id', flat=True) - def upstream(self, zoom_level) -> Set[int]: + def upstream_ids(self, zoom_level) -> Set[int]: + """Returns the node ids that are upstream of this node.""" traverses = self.nodetraversal_set.filter(path_zoom=zoom_level) #.value_list('node_id', flat=True) # Node.objects.filter(id__in=traverses).values_list('id', flat=True) return set(t.upstream_id() for t in traverses) - def downstream(self, zoom_level) -> Set[int]: - traverses = self.nodetraversal_set.filter(path_zoom=zoom_level).all() + def downstream_ids(self, zoom_level) -> Set[int]: + """Returns the node ids that are downstream of this node.""" + traverses = self.nodetraversal_set.filter(path_zoom=zoom_level).all() # TODO: optimize more return set(t.downstream_id() for t in traverses) def __repr__(self): return "N%s(%s)" % (str(self.name), self.seq) - def details(self): + def details(self, zoom=0): return f"""Node{self.name}: {self.seq} - upstream: {dict((key, value) for key, value in self.upstream.items())} - downstream: {dict((key, value) for key, value in self.downstream.items())} - {len(self.specimens)} specimens: {self.specimens}""" + upstream: {self.upstream_ids(zoom)} + downstream: {self.downstream_ids(zoom)} + {len(self.specimens(zoom))} specimens: {self.specimens}""" def is_nothing(self): """Useful in Node class definition to check for Node.NOTHING""" @@ -177,17 +221,8 @@ def is_nothing(self): def validate(self): """Returns true if the Node has specimens and does not have any negative transition values, raises an AssertionError otherwise.""" - if not self.specimens: - assert self.specimens, "Specimens are empty" + self.details() - for node, weight in self.upstream.items(): - if not node.is_nothing() and weight < 0: - print(self.details()) - assert weight > -1, node.details() - - for node, weight in self.downstream.items(): - if not node.is_nothing() and weight < 0: - print(self.details()) - assert weight > -1, node.details() + if not self.nodetraversal_set.count(): + assert False, "Orphaned: No NodeTraversals are referencing this Node." + self.details() return True @@ -249,19 +284,14 @@ def append_gfa_nodes(self, nodes): assert hasattr(nodes[0], 'orient') and hasattr(nodes[0], 'name'), 'Expecting gfapy.Gfa.path' for node in nodes: # TODO: could be sped up with bulk_create after checking and setting order - NodeTraversal(node=Node.objects.get(name=node.name), - path=self, strand=node.orient).save() + NodeTraversal.objects.create(node=Node.objects.get(name=node.name), + path=self, strand=node.orient) def append_node(self, node: Node, strand: str): """This is the preferred way to build a graph in a truly non-linear way. 
+        NodeTraversal.order is guaranteed to be contiguous, making upstream() and downstream() work properly.
         NodeTraversal is appended to Path (order dependent) and PathIndex is added to Node
         (order independent)."""
-        NodeTraversal(node=node, path=self, strand=strand).save()
-
-    # @classmethod
-    # def build(cls, name: str, seq_of_nodes: List[str]):
-    #     node = Node.objects.create(seq)
-    #     for p in paths:
-    #         NodeTraversal.objects.create(node, path)
+        NodeTraversal.objects.create(node=node, path=self, strand=strand)  # calculates order
 
     def name(self):
         return self.accession
@@ -270,6 +300,17 @@ def to_gfa(self):
         return '\t'.join(['P', self.accession, "+,".join([x.node.name + x.strand for x in self.nodes]) + "+", ",".join(['*' for x in self.nodes])])
 
 
+class NodeTraversalManager(models.Manager):
+    def create(self, node, path, strand, order=None):
+        """Checks the largest 'order' value in the current path and increments by 1.
+        IMPORTANT NOTE: save() does not get called if you do NodeTraversal.objects.create
+        or get_or_create"""
+        if order is None:
+            last_traversal = path.nodetraversal_set.all().order_by('-order').first()
+            order = 0 if not last_traversal else last_traversal.order + 1
+        return super(NodeTraversalManager, self).create(node=node, path=path, strand=strand, order=order)
+
+
 class NodeTraversal(models.Model):
     """Link from a Path to a Node it is currently traversing. Includes strand"""
     node = models.ForeignKey(Node, db_index=True, on_delete=models.CASCADE)
@@ -279,7 +320,7 @@ class NodeTraversal(models.Model):
     order = models.IntegerField(help_text='Defines the order a path lists traversals. '
                                           'The scale of order is not preserved between zoom levels.')
 
-    objects = CustomSaveManager()
+    objects = NodeTraversalManager()
 
     def __repr__(self):
         if self.strand == '+':
@@ -291,15 +332,6 @@ def __repr__(self):
     def __eq__(self, other):
         return self.node.id == other.node.id and self.strand == other.strand
 
-    def save(self, **kwargs):
-        """Checks the largest 'order' value in the current path and increments by 1.
-        IMPORTANT NOTE: save() does not get called if you do NodeTraverseal.objects.create
-        or get_or_create"""
-        if self.order is None:
-            last_traversal = self.path.nodetraversal_set.all().order_by('-order').first()
-            self.order = 0 if not last_traversal else last_traversal.order + 1
-        super(NodeTraversal, self).save(**kwargs)
-
     def fetch_neighbor(self, target_index):
         query = NodeTraversal.objects.filter \
             (path=self.path, order=target_index).values_list('node__id', flat=True)
@@ -309,11 +341,31 @@ def fetch_neighbor(self, target_index):
 
     def upstream_id(self):
         target_index = self.order - 1
-        return self.fetch_neighbor(target_index)
+        return self.neighbor_id(target_index)
+        # try:  This query version can tolerate non-contiguous Path sequences
+        #     return NodeTraversal.objects.\
+        #         filter(path=self.path, order__lte=target_index).\
+        #         order_by('-order').first().values_list('node_id', flat=True)[0]
+        # except (NodeTraversal.DoesNotExist, IndexError):
+        #     return None
 
     def downstream_id(self):
         target_index = self.order + 1
-        return self.fetch_neighbor(target_index)
+        return self.neighbor_id(target_index)
+        # try:  This query version can tolerate non-contiguous Path sequences
+        #     return NodeTraversal.objects.\
+        #         filter(path=self.path, order__gte=target_index).\
+        #         order_by('order').first().values_list('node_id', flat=True)[0]
+        # except (NodeTraversal.DoesNotExist, IndexError):
+        #     return None
+
+    def neighbor_id(self, target_index):
+        """Faster query that just returns the node_id, not the NodeTraversal."""
+        try:
+            return NodeTraversal.objects.\
+                filter(path=self.path, order=target_index).values_list('node_id', flat=True)[0]
+        except (NodeTraversal.DoesNotExist, IndexError):
+            return None
 
     def neighbor(self, target_index):
         try:
             return NodeTraversal.objects.get(path=self.path, order=target_index)
         except NodeTraversal.DoesNotExist:
             return None
 
-    def upstream(self):
-        target_index = self.order - 1
-        return self.neighbor(target_index)
-
-    def downstream(self):
-        target_index = self.order + 1
-        return self.neighbor(target_index)
+    def upstream(self) -> 'NodeTraversal':
+        """Slower queries that return the neighboring NodeTraversal. This can be chained."""
+        return self.neighbor(self.order - 1)
 
+    def downstream(self) -> 'NodeTraversal':
+        """Slower queries that return the neighboring NodeTraversal. This can be chained."""
+        return self.neighbor(self.order + 1)
diff --git a/Graph/test.py b/Graph/test.py
index 790c1ac..599ac32 100644
--- a/Graph/test.py
+++ b/Graph/test.py
@@ -69,10 +69,22 @@ def test_summary_storage(self):
             current = graph.node(node_name)
             NodeTraversal(node=current, path=path, strand='+', order=i).save()
         assert NodeTraversal.objects.get(order=0, path=path).downstream().downstream().node.name == '6'
         assert graph.zoomlevel_set.count() == 2
         graph.paths.filter(accession='a').update(summarized_by=path)
         assert bool(path.summary_child), "Path should be linked to its child."
-        path_pointers = graph.zoomlevel_set.filter(zoom=1).first().paths
-        print('path_pointers', type(path_pointers))
+        zoom1 = graph.zoomlevel_set.get(zoom=1)
+        path_pointers = zoom1.paths
         assert path_pointers.count() == 1
-
+        # ZoomLevel
+        self.assertEqual(len(zoom1), 3)
+        self.assertEqual(zoom1.node_ids(), {5, 21, 6}, zoom1.node_ids())
+        self.assertEqual(graph.zoomlevel_set.get(zoom=0).node_ids(), set(range(1, 21)))
+        names = [x.name for x in zoom1.nodes()]
+        self.assertEqual(names, ['5', '6', '2*2'])
+        sequences = [x.seq for x in zoom1.nodes()]
+        self.assertEqual(sequences, ['C', 'AGTACG', 'ACGTCGGA'])
+
+
+    # def test_zoom_level(self):
+    #     graph = self.test_example_graph()
+    #     node_ids

 @unittest.skip  # DAGify has not been converted to databases yet. 
class DAGifyTest(TestCase):
diff --git a/HaploBlocker/haplonetwork.py b/HaploBlocker/haplonetwork.py
index a536648..0881b15 100644
--- a/HaploBlocker/haplonetwork.py
+++ b/HaploBlocker/haplonetwork.py
@@ -3,11 +3,11 @@
 Graph Summarization: https://github.com/graph-genome/vgbrowser/issues/3
 HaploBlocker: https://github.com/graph-genome/vgbrowser/issues/19
 """
-from typing import List
+from typing import List, Iterable
 import numpy as np
 from collections import defaultdict
 from copy import copy
-from Graph.models import Node, Path
+from Graph.models import Node, Path, ZoomLevel, NodeTraversal
 
 BLOCK_SIZE = 20
 FILTER_THRESHOLD = 4
@@ -62,20 +62,26 @@ def build_all_slices(alleles, individuals, current_graph):
     return slices
 
 
-def build_paths(individuals, unique_signatures):
+def build_paths(individuals, unique_signatures, graph):
     """Describes an individual as a Path (list of Nodes) that the individual visits (NodeTraversals).
     accessions is a list of loci which contain a list of Nodes which each contain specimen
     build nodes:  [0] first 4 are the 4 starting signatures in window 0.
     Nodes represent a collection of individuals with the same signature at that locus
-    For each node list which individuals are present at that node"""
+    For each node list which individuals are present at that node
+    :param graph: the GraphGenome that will contain the new Paths"""
     # TODO: It may be more performant to merge build_all_slices and build_paths so that global lists are never stored
+    print(f"Building paths from {len(individuals)} individuals and {len(unique_signatures)} loci")
     accessions = []
     for i_specimen, specimen in enumerate(individuals):
-        my_path = Path.objects.create(accession=str(i_specimen))
-        for w, window in enumerate(unique_signatures):  # the length of the genome
-            sig = signature(specimen, w * BLOCK_SIZE)
-            my_path.append_node(unique_signatures[w][sig], '+')
+        my_path = Path.objects.create(accession=str(i_specimen), graph=graph)
+        my_sigs = [unique_signatures[w][signature(specimen, w * BLOCK_SIZE)] for w in range(len(unique_signatures))]
+        traverses = [NodeTraversal(node=sig, path=my_path, strand='+') for sig in my_sigs]
+        NodeTraversal.objects.bulk_create(traverses, 1000)
+        # for w, window in enumerate(unique_signatures):  # the length of the genome
+        #     sig = signature(specimen, w * BLOCK_SIZE)
+        #     my_path.append_node(unique_signatures[w][sig], '+')
         accessions.append(my_path)
+    print(f"Done building {len(accessions)} Paths")
     return accessions
 
 
@@ -128,31 +134,37 @@ def update_stream_transitions(node, stream):
         g(node, stream).pop(key, None)
 
 
-def simple_merge(full_graph):
+def simple_merge(current_level: ZoomLevel) -> ZoomLevel:
     """ Side effects the current ZoomLevel by merging any consecutive nodes that have
-    identical specimens and removing the redundant node from full_graph.
+    identical specimens and removing the redundant my_node from the level. 
    :param current_level: the ZoomLevel to be summarized
    :return: the newly created ZoomLevel at zoom + 1
    """
    #TODO: Paths start fully populated with redundant NodeTraversals. Editing NodeTraversals,
    # moves to newly created Nodes. Global bool for whether or not a particular path was modified.
-    n = 0
-    while n < len(full_graph):  # size of global_nodes changes, necessitating this weird loop
-        node = full_graph[n]
-        if len(node.downstream) == 1:
-            next_node = first(node.downstream.keys())
-            if len(node.specimens) == len(next_node.specimens):
-                # Torsten deletes nodeA and modifies next_node
-                next_node.upstream = node.upstream
-                next_node.start = node.start
-                # prepare to delete node by removing references
-                for parent in node.upstream.keys():
-                    if not parent.is_nothing():
-                        count = parent.downstream[node]
-                        del parent.downstream[node]  # updating pointer
-                        parent.downstream[next_node] = count
-                full_graph.remove(node)  # delete node
-                n -= 1
-        n += 1
-    return full_graph
+    zoom = current_level.zoom
+    next_level = ZoomLevel.objects.create(graph=current_level.graph, zoom=zoom + 1)
+    for my_node in current_level.nodes():
+        # only one Node Downstream, no matter the number of specimens
+        if len(my_node.downstream_ids(zoom)) == 1:
+            next_node = my_node.nodetraversal_set.first().downstream().node  # fetched from DB
+            if my_node.nodetraversal_set.count() == next_node.nodetraversal_set.count():  # Not a complete guarantee...
+                # Torsten deletes my_node and modifies next_node
+                merged_node = Node.objects.create(name=f'{my_node.name}*{next_level.zoom}',
+                                                  graph=current_level.graph)
+                for x in [my_node, next_node]:
+                    x.summarized_by = merged_node
+                    x.save()
+
+                # edit existing traversals
+                NodeTraversal.objects.filter(node=next_node, path__in=next_level.paths).update(node_id=merged_node.id)
+                # next_node.nodetraversal_set.filter(zoom=zoom).bulk_update(node_id=merged_node.id)
+
+                # delete my_node and all associates
+                query = NodeTraversal.objects.filter(node=my_node, path__in=next_level.paths)
+                query._raw_delete(query.db)  # https://www.nickang.com/fastest-delete-django/
+    # TODO: merged_node.start = my_node.start
+    return next_level
 
 
 def delete_node(node, cutoff):
diff --git a/HaploBlocker/tests.py b/HaploBlocker/tests.py
index f023385..80f751b 100644
--- a/HaploBlocker/tests.py
+++ b/HaploBlocker/tests.py
@@ -1,10 +1,9 @@
+from unittest import skip, skipIf
 from django.test import TestCase
-
 import Graph.utils
 import Graph.views
-from Graph.models import GraphGenome, Path
+from Graph.models import GraphGenome, Path, Node, ZoomLevel, NodeTraversal
 from vgbrowser.settings import BASE_DIR
-import unittest
 import os
 # Create your tests here. 
# from HaploBlocker.models import Node, Path, Edge @@ -12,24 +11,29 @@ from HaploBlocker.haplonetwork import read_data, build_all_slices, build_paths, nodes_from_unique_signatures, \ populate_transitions, simple_merge, neglect_nodes, split_groups -# -# class ModelTest(TestCase): -# def test_creation(self): -# p = Path.objects.create(accession='watermelon') -# n = Node.objects.create(seq='ACGT') -# Edge.objects.create(node=n, path=p) -# Edge.objects.create(node=Node.objects.create(seq='AAAA'), path=p) -# Edge.objects.create(node=Node.objects.create(seq='TTT'), path=p) -# print(Edge.objects.all()) + +class ModelTest(TestCase): + def test_creation(self): + g = GraphGenome.objects.create(name='test_creation') + p = Path.objects.create(accession='watermelon', graph=g) + n = Node.objects.create(seq='ACGT', graph=g) + NodeTraversal.objects.create(node=n, path=p, strand='+') + NodeTraversal.objects.create(node=Node.objects.create(seq='AAAA', name='-9', graph=g), + path=p, strand='+') + NodeTraversal.objects.create(node=Node.objects.create(seq='TTT', name='-8', graph=g), + path=p, strand='+') + print(NodeTraversal.objects.all()) -class HaploTest(unittest.TestCase): +class HaploTest(TestCase): @classmethod def setUpClass(self) -> None: """Reads the input data file once. Tests that need a fresh graph must call create_graph()""" - print(os.getcwd()) + print("Setting up HaploTest:", os.getcwd()) self.alleles, self.individuals = read_data(os.path.join(BASE_DIR, "test_data/KE_chromo10.txt")) + print("Finished reading SNP file") + super(HaploTest, self).setUpClass() def create_graph(self, graph_name): """Tests that need a fresh graph must call create_graph() FIRST! @@ -37,9 +41,11 @@ def create_graph(self, graph_name): with order dependent side effects. This method is slow, so don't use it unless you need it. 
:param graph_name: """ + print("Creating test Graph") graph = GraphGenome.objects.create(name=graph_name) slices = build_all_slices(self.alleles, self.individuals, graph) - self.paths = build_paths(self.individuals, slices) + self.paths = build_paths(self.individuals, slices, graph) + print("Finished test graph.") return graph @@ -58,7 +64,7 @@ def internal_build_individuals(self, alleles, individuals): unique_signatures = build_all_slices(alleles, individuals, graph) peek = repr(list(unique_signatures[21].values())) assert peek == '[N0:21-21(00002202022222220202), N1:21-21(00202202022222220202), N2:21-21(00022200000000000000), N3:21-21(00000000000000000000), N4:21-21(00002200000000000000), N5:21-21(00022202022220020002), N6:21-21(02000000000000000000), N7:21-21(00002202222220020022)]', peek - simplified_individuals = build_paths(individuals, unique_signatures) + simplified_individuals = build_paths(individuals, unique_signatures, graph) peek = repr(simplified_individuals[500][:100]) assert peek == '[N2:0-0(00000000000000000000), N2:1-1(00000000000000000000), N2:2-2(00000000000000000000), N2:3-3(00000000000000000000), N2:4-4(00000000000000000000), N2:5-5(00000000000000000000), N3:6-6(00000000000000000000), N3:7-7(00000000000000000000), N3:8-8(00000000000000000000), N2:9-9(00000000000000000000), N0:10-10(00000000000000000000), N1:11-11(00000000000000000000), N2:12-12(00000000000000000000), N2:13-13(00000000000000000000), N2:14-14(00000000000000000000), N2:15-15(00000000000000000000), N3:16-16(00000000000000000000), N3:17-17(00000000000000000000), N4:18-18(00000000000000000000), N3:19-19(00000000000000000000), N5:20-20(00000000000000000000), N3:21-21(00000000000000000000), N3:22-22(00000000000000000000), N10:23-23(00200000000000000000), N4:24-24(00002200222220002000), N3:25-25(02000222220002020222), N4:26-26(20022000002002220002), N3:27-27(22222202020222220000), N1:28-28(00000000000000000000), N1:29-29(00000000000000000022), N4:30-30(00002222222000002200), N3:31-31(00022222202000000000), N21:32-32(00000020202200022020), N1:33-33(02202220022020222000), N1:34-34(00020002000202222002), N1:35-35(22220002220022200022), N1:36-36(22222200000000000000), N1:37-37(00202002222220000200), N1:38-38(00000200000202022200), N1:39-39(02202000202202220000), N1:40-40(00020222200020000020), N1:41-41(20220020022200022200), N1:42-42(00000000000000000000), N1:43-43(00000000000000000000), N1:44-44(00000000000000000000), N1:45-45(00000000000000000000), N1:46-46(00000002220220020020), N1:47-47(00202220222220222202), N1:48-48(00000000000000000002), N1:49-49(20002200000002220022), N1:50-50(22020002002020202022), N1:51-51(02202222220222202000), N1:52-52(20000020000000000000), N1:53-53(00000000000000000000), N1:54-54(00000000000000000000), N1:55-55(00220220000200000220), N1:56-56(20000000202022022020), N1:57-57(20222022022202222220), N1:58-58(22022202222222020200), N1:59-59(22202200202220202220), N1:60-60(22020022220200022022), N1:61-61(20202220000220000220), N1:62-62(00022002000000000000), N1:63-63(00000220000000000000), N1:64-64(00000000000220200000), N1:65-65(00022020200000020022), N1:66-66(20020222222020200020), N1:67-67(00000000000000202222), N1:68-68(22222222000202222202), N1:69-69(22022222020020000022), N1:70-70(00002002220022222200), N1:71-71(22002020020202000000), N1:72-72(00022202000202220020), N1:73-73(22000000000000200020), N1:74-74(22220222220200202202), N1:75-75(00022202222200000000), N1:76-76(00000220220200200022), N1:77-77(02200202020020200000), N0:78-78(00002000000000000000), 
N0:79-79(00000000000000000000), N1:80-80(00000000000022220000), N1:81-81(00000000000000000000), N1:82-82(00022220200202202202), N1:83-83(20202222200202202202), N1:84-84(00000020000000000000), N1:85-85(00222022020000000002), N1:86-86(22020222020222222000), N1:87-87(00022222002020222022), N1:88-88(00002222000000000200), N1:89-89(00000000000000220022), N1:90-90(22020202200020222220), N1:91-91(00002000002220002222), N1:92-92(22200000000000000000), N1:93-93(00000000000000000000), N1:94-94(00202022200202222222), N1:95-95(22222202202020222222), N1:96-96(00222220200202222020), N1:97-97(22002202220222222022), N0:98-98(20222222222222020220), N0:99-99(20222222220222222002)]', peek assert len(simplified_individuals) == 501 and len(simplified_individuals[60]) == 1638 @@ -73,7 +79,7 @@ def test_get_unique_signatures(self): '[N0:0-0(02002020002000220000), N1:0-0(00220202220222002222), ' \ 'N2:0-0(00000000000000000000), N3:0-0(20220202220222002222)]', peek - @unittest.skip + @skip def test_no_duplicate_nodes(self): graph = self.create_graph('test') unique_nodes = set() @@ -89,15 +95,23 @@ def test_no_duplicate_nodes(self): assert duplicates_found == 0, f"Found {duplicates_found} duplicated nodes in the graph" - def _test_simple_merge(self, graph, zoom_level): + def _test_simple_merge(self, graph, zoom_level: int) -> ZoomLevel: # these tests could be made independent of test_workflow, but it would be slower assert graph.highest_zoom_level() == zoom_level - assert len(graph) == 7180 - status = simple_merge(graph) + starting_level = ZoomLevel.objects.get(graph=graph, zoom=zoom_level) + assert len(starting_level) == 7180 + next_level = simple_merge(starting_level) + #Test every Path has a representative in this ZoomLevel assert Path.objects.filter(graph=graph, zoom=zoom_level + 1).count() == \ Path.objects.filter(graph=graph, zoom=zoom_level + 0).count() - assert Path.objects.filter(graph=graph, zoom=zoom_level+1) == 3690 - return status + assert NodeTraversal.objects.filter(graph=graph, zoom=zoom_level+1) == 3690 + return next_level + + @skip + def test_simple_merge(self): + graph = self.create_graph('test') + summary1 = self._test_simple_merge(graph, 0) + def _test_neglect_nodes(self, all_nodes, zoom_level): summary2 = neglect_nodes(all_nodes) diff --git a/vgbrowser/settings.py b/vgbrowser/settings.py index ef7bf9b..af17561 100644 --- a/vgbrowser/settings.py +++ b/vgbrowser/settings.py @@ -31,14 +31,14 @@ # Application definition INSTALLED_APPS = [ - 'HaploBlocker.apps.HaploblockerConfig', - 'Graph.apps.GraphConfig', 'django.contrib.admin', 'django.contrib.auth', 'django.contrib.contenttypes', 'django.contrib.sessions', 'django.contrib.messages', 'django.contrib.staticfiles', + 'Graph.apps.GraphConfig', + 'HaploBlocker.apps.HaploblockerConfig', ] MIDDLEWARE = [ From 59c18fd26e83e321eef0bb5592ebe1380b45d478 Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Tue, 27 Aug 2019 19:32:57 +0100 Subject: [PATCH 08/15] #26 Simple_merge test cases improved. internal_build_individuals() is working. 
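One way to read the reworked tests below: every summarization step must preserve the set
of accessions while shrinking the set of unique Nodes. A sketch of those invariants using
the API from patch 07 (exact counts such as 7180 come from the KE_chromo10 test data):

    level0 = ZoomLevel.objects.get(graph=graph, zoom=0)
    level1 = simple_merge(level0)                        # returns the new ZoomLevel
    assert level1.paths.count() == level0.paths.count()  # every accession still has a Path
    assert len(level1) <= len(level0)                    # merging never adds unique Nodes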
--- Graph/models.py | 4 +++-
 HaploBlocker/haplonetwork.py | 36 +++++++++++++++++++++++-------------
 HaploBlocker/tests.py        | 20 ++++++++++++--------
 3 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/Graph/models.py b/Graph/models.py
index d7a26eb..b5a7191 100644
--- a/Graph/models.py
+++ b/Graph/models.py
@@ -152,6 +152,8 @@ def append_node_to_path(self, node_id, strand, path_name, zoom) -> None:
     def node(self, node_name):
         return Node.objects.get(name=node_name, graph=self)
 
+    def highest_zoom_level(self):
+        return self.zoomlevel_set.all().order_by('-zoom').first().zoom
 
 class Node(models.Model):
@@ -192,7 +194,7 @@ def to_gfa(self, segment_id: int):
 
     def specimens(self, zoom_level) -> List[int]:
-        return self.nodetraversal_set.filter(path_zoom=zoom_level).value_list('path_id', flat=True)
+        return list(self.nodetraversal_set.filter(path_zoom=zoom_level).values_list('path_id', flat=True))
 
diff --git a/HaploBlocker/haplonetwork.py b/HaploBlocker/haplonetwork.py
index 0881b15..066d356 100644
--- a/HaploBlocker/haplonetwork.py
+++ b/HaploBlocker/haplonetwork.py
@@ -75,11 +75,8 @@ def build_paths(individuals, unique_signatures, graph):
     for i_specimen, specimen in enumerate(individuals):
         my_path = Path.objects.create(accession=str(i_specimen), graph=graph)
         my_sigs = [unique_signatures[w][signature(specimen, w * BLOCK_SIZE)] for w in range(len(unique_signatures))]
-        traverses = [NodeTraversal(node=sig, path=my_path, strand='+') for sig in my_sigs]
-        NodeTraversal.objects.bulk_create(traverses, 1000)
-        # for w, window in enumerate(unique_signatures):  # the length of the genome
-        #     sig = signature(specimen, w * BLOCK_SIZE)
-        #     my_path.append_node(unique_signatures[w][sig], '+')
+        traverses = [NodeTraversal(node=sig, path=my_path, strand='+', order=i) for i, sig in enumerate(my_sigs)]
+        NodeTraversal.objects.bulk_create(traverses, 100)
         accessions.append(my_path)
@@ -142,8 +139,7 @@ def simple_merge(current_level: ZoomLevel) -> ZoomLevel:
     """
    #TODO: Paths start fully populated with redundant NodeTraversals. Editing NodeTraversals,
    # moves to newly created Nodes. Global bool for whether or not a particular path was modified. 
-    zoom = current_level.zoom
-    next_level = ZoomLevel.objects.create(graph=current_level.graph, zoom=zoom + 1)
+    next_level, zoom = prep_next_summary_layer(current_level)
     for my_node in current_level.nodes():
         # only one Node Downstream, no matter the number of specimens
         if len(my_node.downstream_ids(zoom)) == 1:
             next_node = my_node.nodetraversal_set.first().downstream().node  # fetched from DB
@@ -167,6 +163,14 @@ def simple_merge(current_level: ZoomLevel) -> ZoomLevel:
     return next_level
 
 
+def prep_next_summary_layer(current_level):
+    zoom = current_level.zoom
+    assert current_level.graph.highest_zoom_level() == zoom, \
+        "You should only be summarizing the topmost layer"
+    next_level = ZoomLevel.objects.create(graph=current_level.graph, zoom=zoom + 1)
+    return next_level, zoom
+
+
 def delete_node(node, cutoff):
     """Changes references to this node to add to references to Node.NOTHING"""
     if cutoff < 1:
@@ -183,6 +187,9 @@ def neglect_nodes(all_nodes, deletion_cutoff=FILTER_THRESHOLD):
     """Deletes nodes if they have too few specimens supporting them defined by
     :param deletion_cutoff
     :returns a new list of nodes lacking the pruned nodes in all_nodes"""
+
+    # next_level, zoom = prep_next_summary_layer(current_level)
+
     nodes_to_delete = set()
     for node in all_nodes:
         if len(node.specimens) <= deletion_cutoff:
@@ -197,20 +204,20 @@ def split_one_group(prev_node, anchor, next_node):
     """ Called when up.specimens == down.specimens
    Comment: That is actually the case we want to split up to obtain longer blocks later
    Extension of full windows will take care of potential loss of information later"""
-    my_specimens = copy(anchor.specimens)  # important to copy or side effects occur
+    my_specimens = anchor.specimens()  # important to copy or side effects occur
     if not prev_node.is_nothing():  # normal case
-        my_specimens = my_specimens.intersection(prev_node.specimens)
+        my_specimens = my_specimens.intersection(prev_node.specimens())
     if not next_node.is_nothing():  # normal case
-        my_specimens = my_specimens.intersection(next_node.specimens)
+        my_specimens = my_specimens.intersection(next_node.specimens())
     if prev_node.is_nothing() and next_node.is_nothing():  # exceptional: both are nothing node
-        my_specimens = copy(anchor.specimens)
+        my_specimens = copy(anchor.specimens())
         # removing all specimens that transition to nothing
         for n in anchor.downstream.keys():
             if n.is_nothing():  # remove dead leads
-                my_specimens -= n.specimens
+                my_specimens -= n.specimens()
         for n in anchor.upstream.keys():
             if n.is_nothing():  # remove dead leads
-                my_specimens -= n.specimens
+                my_specimens -= n.specimens()
 
     my_upstream, my_downstream = copy(prev_node.upstream), copy(next_node.downstream)
     if prev_node.is_nothing():  # Rare case
@@ -257,6 +264,9 @@ def split_groups(all_nodes: List[Node]):
     TODO: Ideally, the database would retain some record of how many nucleotides are shared between
    the two new haplotype nodes."""
     new_graph = list(all_nodes)
+    # next_level, zoom = prep_next_summary_layer(current_level)
+
+
     for node in all_nodes:
         # check if all transition upstream match with one of my downstream nodes
         if len(node.specimens) > 0:
diff --git a/HaploBlocker/tests.py b/HaploBlocker/tests.py
index 80f751b..b442f38 100644
--- a/HaploBlocker/tests.py
+++ b/HaploBlocker/tests.py
@@ -65,9 +65,13 @@ def internal_build_individuals(self, alleles, individuals):
         peek = repr(list(unique_signatures[21].values()))
         assert peek == '[N0:21-21(00002202022222220202), N1:21-21(00202202022222220202), N2:21-21(00022200000000000000), N3:21-21(00000000000000000000), N4:21-21(00002200000000000000), N5:21-21(00022202022220020002), 
N6:21-21(02000000000000000000), N7:21-21(00002202222220020022)]', peek simplified_individuals = build_paths(individuals, unique_signatures, graph) - peek = repr(simplified_individuals[500][:100]) - assert peek == '[N2:0-0(00000000000000000000), N2:1-1(00000000000000000000), N2:2-2(00000000000000000000), N2:3-3(00000000000000000000), N2:4-4(00000000000000000000), N2:5-5(00000000000000000000), N3:6-6(00000000000000000000), N3:7-7(00000000000000000000), N3:8-8(00000000000000000000), N2:9-9(00000000000000000000), N0:10-10(00000000000000000000), N1:11-11(00000000000000000000), N2:12-12(00000000000000000000), N2:13-13(00000000000000000000), N2:14-14(00000000000000000000), N2:15-15(00000000000000000000), N3:16-16(00000000000000000000), N3:17-17(00000000000000000000), N4:18-18(00000000000000000000), N3:19-19(00000000000000000000), N5:20-20(00000000000000000000), N3:21-21(00000000000000000000), N3:22-22(00000000000000000000), N10:23-23(00200000000000000000), N4:24-24(00002200222220002000), N3:25-25(02000222220002020222), N4:26-26(20022000002002220002), N3:27-27(22222202020222220000), N1:28-28(00000000000000000000), N1:29-29(00000000000000000022), N4:30-30(00002222222000002200), N3:31-31(00022222202000000000), N21:32-32(00000020202200022020), N1:33-33(02202220022020222000), N1:34-34(00020002000202222002), N1:35-35(22220002220022200022), N1:36-36(22222200000000000000), N1:37-37(00202002222220000200), N1:38-38(00000200000202022200), N1:39-39(02202000202202220000), N1:40-40(00020222200020000020), N1:41-41(20220020022200022200), N1:42-42(00000000000000000000), N1:43-43(00000000000000000000), N1:44-44(00000000000000000000), N1:45-45(00000000000000000000), N1:46-46(00000002220220020020), N1:47-47(00202220222220222202), N1:48-48(00000000000000000002), N1:49-49(20002200000002220022), N1:50-50(22020002002020202022), N1:51-51(02202222220222202000), N1:52-52(20000020000000000000), N1:53-53(00000000000000000000), N1:54-54(00000000000000000000), N1:55-55(00220220000200000220), N1:56-56(20000000202022022020), N1:57-57(20222022022202222220), N1:58-58(22022202222222020200), N1:59-59(22202200202220202220), N1:60-60(22020022220200022022), N1:61-61(20202220000220000220), N1:62-62(00022002000000000000), N1:63-63(00000220000000000000), N1:64-64(00000000000220200000), N1:65-65(00022020200000020022), N1:66-66(20020222222020200020), N1:67-67(00000000000000202222), N1:68-68(22222222000202222202), N1:69-69(22022222020020000022), N1:70-70(00002002220022222200), N1:71-71(22002020020202000000), N1:72-72(00022202000202220020), N1:73-73(22000000000000200020), N1:74-74(22220222220200202202), N1:75-75(00022202222200000000), N1:76-76(00000220220200200022), N1:77-77(02200202020020200000), N0:78-78(00002000000000000000), N0:79-79(00000000000000000000), N1:80-80(00000000000022220000), N1:81-81(00000000000000000000), N1:82-82(00022220200202202202), N1:83-83(20202222200202202202), N1:84-84(00000020000000000000), N1:85-85(00222022020000000002), N1:86-86(22020222020222222000), N1:87-87(00022222002020222022), N1:88-88(00002222000000000200), N1:89-89(00000000000000220022), N1:90-90(22020202200020222220), N1:91-91(00002000002220002222), N1:92-92(22200000000000000000), N1:93-93(00000000000000000000), N1:94-94(00202022200202222222), N1:95-95(22222202202020222222), N1:96-96(00222220200202222020), N1:97-97(22002202220222222022), N0:98-98(20222222222222020220), N0:99-99(20222222220222222002)]', peek - assert len(simplified_individuals) == 501 and len(simplified_individuals[60]) == 1638 + traverses = 
simplified_individuals[500].nodes.filter(order__lt=100) # [:100] + nodes = [t.node for t in traverses.prefetch_related('node').all()] + peek = repr(nodes) + self.maxDiff = None # tells the debugger to show the whole thing + self.assertEqual(peek, '[N2:0-0(00000000000000000000), N2:1-1(00000000000000000000), N2:2-2(00000000000000000000), N2:3-3(00000000000000000000), N2:4-4(00000000000000000000), N2:5-5(00000000000000000000), N3:6-6(00000000000000000000), N3:7-7(00000000000000000000), N3:8-8(00000000000000000000), N2:9-9(00000000000000000000), N0:10-10(00000000000000000000), N1:11-11(00000000000000000000), N2:12-12(00000000000000000000), N2:13-13(00000000000000000000), N2:14-14(00000000000000000000), N2:15-15(00000000000000000000), N3:16-16(00000000000000000000), N3:17-17(00000000000000000000), N4:18-18(00000000000000000000), N3:19-19(00000000000000000000), N5:20-20(00000000000000000000), N3:21-21(00000000000000000000), N3:22-22(00000000000000000000), N10:23-23(00200000000000000000), N4:24-24(00002200222220002000), N3:25-25(02000222220002020222), N4:26-26(20022000002002220002), N3:27-27(22222202020222220000), N1:28-28(00000000000000000000), N1:29-29(00000000000000000022), N4:30-30(00002222222000002200), N3:31-31(00022222202000000000), N21:32-32(00000020202200022020), N1:33-33(02202220022020222000), N1:34-34(00020002000202222002), N1:35-35(22220002220022200022), N1:36-36(22222200000000000000), N1:37-37(00202002222220000200), N1:38-38(00000200000202022200), N1:39-39(02202000202202220000), N1:40-40(00020222200020000020), N1:41-41(20220020022200022200), N1:42-42(00000000000000000000), N1:43-43(00000000000000000000), N1:44-44(00000000000000000000), N1:45-45(00000000000000000000), N1:46-46(00000002220220020020), N1:47-47(00202220222220222202), N1:48-48(00000000000000000002), N1:49-49(20002200000002220022), N1:50-50(22020002002020202022), N1:51-51(02202222220222202000), N1:52-52(20000020000000000000), N1:53-53(00000000000000000000), N1:54-54(00000000000000000000), N1:55-55(00220220000200000220), N1:56-56(20000000202022022020), N1:57-57(20222022022202222220), N1:58-58(22022202222222020200), N1:59-59(22202200202220202220), N1:60-60(22020022220200022022), N1:61-61(20202220000220000220), N1:62-62(00022002000000000000), N1:63-63(00000220000000000000), N1:64-64(00000000000220200000), N1:65-65(00022020200000020022), N1:66-66(20020222222020200020), N1:67-67(00000000000000202222), N1:68-68(22222222000202222202), N1:69-69(22022222020020000022), N1:70-70(00002002220022222200), N1:71-71(22002020020202000000), N1:72-72(00022202000202220020), N1:73-73(22000000000000200020), N1:74-74(22220222220200202202), N1:75-75(00022202222200000000), N1:76-76(00000220220200200022), N1:77-77(02200202020020200000), N0:78-78(00002000000000000000), N0:79-79(00000000000000000000), N1:80-80(00000000000022220000), N1:81-81(00000000000000000000), N1:82-82(00022220200202202202), N1:83-83(20202222200202202202), N1:84-84(00000020000000000000), N1:85-85(00222022020000000002), N1:86-86(22020222020222222000), N1:87-87(00022222002020222022), N1:88-88(00002222000000000200), N1:89-89(00000000000000220022), N1:90-90(22020202200020222220), N1:91-91(00002000002220002222), N1:92-92(22200000000000000000), N1:93-93(00000000000000000000), N1:94-94(00202022200202222222), N1:95-95(22222202202020222222), N1:96-96(00222220200202222020), N1:97-97(22002202220222222022), N0:98-98(20222222222222020220), N0:99-99(20222222220222222002)]') + self.assertEqual(len(simplified_individuals),501) + self.assertEqual(simplified_individuals[60].nodes.count(), 1638) 
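# The explicit order column written by build_paths is what makes the slice
# above one indexed query; the same lookup as a reusable form (a hypothetical
# helper that mirrors the calls in this test):
def first_n_nodes(path, n=100):
    traverses = path.nodes.filter(order__lt=n)
    return [t.node for t in traverses.prefetch_related('node')]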
def test_get_unique_signatures(self): @@ -95,16 +99,16 @@ def test_no_duplicate_nodes(self): assert duplicates_found == 0, f"Found {duplicates_found} duplicated nodes in the graph" - def _test_simple_merge(self, graph, zoom_level: int) -> ZoomLevel: + def _test_simple_merge(self, graph: GraphGenome, zoom_level: int) -> ZoomLevel: # these tests could be made independent of test_workflow, but it would be slower assert graph.highest_zoom_level() == zoom_level starting_level = ZoomLevel.objects.get(graph=graph, zoom=zoom_level) - assert len(starting_level) == 7180 + self.assertEqual(len(starting_level), 7180) next_level = simple_merge(starting_level) #Test every Path has a representative in this ZoomLevel - assert Path.objects.filter(graph=graph, zoom=zoom_level + 1).count() == \ - Path.objects.filter(graph=graph, zoom=zoom_level + 0).count() - assert NodeTraversal.objects.filter(graph=graph, zoom=zoom_level+1) == 3690 + self.assertEqual(Path.objects.filter(graph=graph, zoom=zoom_level + 1).count(), + Path.objects.filter(graph=graph, zoom=zoom_level + 0).count()) + self.assertEqual(NodeTraversal.objects.filter(graph=graph, zoom=zoom_level+1).count(), 3690) #*501? return next_level @skip From 2fba2d897ab42ec76eb795feec2639ec2be9304b Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Tue, 3 Sep 2019 12:00:34 +0900 Subject: [PATCH 09/15] #26 Split_one_group() updated for Database but Path - NodeTraversal - ZoomLevel relationship is over complicated. --- Graph/models.py | 4 +-- HaploBlocker/haplonetwork.py | 62 +++++++++++++++--------------------- HaploBlocker/tests.py | 3 +- 3 files changed, 29 insertions(+), 40 deletions(-) diff --git a/Graph/models.py b/Graph/models.py index b5a7191..787bf13 100644 --- a/Graph/models.py +++ b/Graph/models.py @@ -193,8 +193,8 @@ def __hash__(self): def to_gfa(self, segment_id: int): return '\t'.join(['S', str(segment_id), self.seq]) - def specimens(self, zoom_level) -> List[int]: - return list(self.nodetraversal_set.filter(path_zoom=zoom_level).value_list('path_id', flat=True)) + def specimens(self, zoom_level) -> Set[int]: + return set(self.nodetraversal_set.filter(path__zoomlevel_set__zoom=zoom_level).value_list('path_id', flat=True)) def upstream_ids(self, zoom_level) -> Set[int]: """Returns the node ids that are upstream of this node.""" diff --git a/HaploBlocker/haplonetwork.py b/HaploBlocker/haplonetwork.py index 066d356..4bd9219 100644 --- a/HaploBlocker/haplonetwork.py +++ b/HaploBlocker/haplonetwork.py @@ -140,6 +140,15 @@ def simple_merge(current_level: ZoomLevel) -> ZoomLevel: #TODO: Paths start fully populated with redundant NodeTraversals. Editing NodeTraversals, # moves to newly created Nodes. Global bool for whether or not a particular path was modified. 
next_level, zoom = prep_next_summary_layer(current_level) + # TODO: Iterate an optimized query or Remove this nonsense comment + # Node.objects.filter() + # NodeTraversal.objects.filter(node_id) + # traverses = Node.nodetraversal_set.filter(path_zoom=zoom)\ + # .distinct()\ + # .filter(count=1) + # downstream_ids = set(t.downstream_id() for t in traverses) + # a.path_set == b.path_set + # Node.objects.filter(path_set == ) for my_node in current_level.nodes(): # only one Node Downstream, no matter the number of specimens if len(my_node.downstream_ids(zoom)) == 1: @@ -200,46 +209,25 @@ def neglect_nodes(all_nodes, deletion_cutoff=FILTER_THRESHOLD): return filtered_nodes -def split_one_group(prev_node, anchor, next_node): +def split_one_group(prev_node, anchor, next_node, level: ZoomLevel): """ Called when up.specimens == down.specimens Comment: That is actually the case we want to split up to obtain longer blocks later Extension of full windows will take care of potential loss of information later""" - my_specimens = anchor.specimens() # important to copy or side effects occur - if not prev_node.is_nothing(): # normal case - my_specimens = my_specimens.intersection(prev_node.specimens()) - if not next_node.is_nothing(): # normal case - my_specimens = my_specimens.intersection(next_node.specimens()) - if prev_node.is_nothing() and next_node.is_nothing(): # exceptional: both are nothing node - my_specimens = copy(anchor.specimens()) - # removing all specimens that transition to nothing - for n in anchor.downstream.keys(): - if n.is_nothing(): # remove dead leads - my_specimens -= n.specimens() - for n in anchor.upstream.keys(): - if n.is_nothing(): # remove dead leads - my_specimens -= n.specimens() - - my_upstream, my_downstream = copy(prev_node.upstream), copy(next_node.downstream) - if prev_node.is_nothing(): # Rare case - my_upstream = copy(anchor.upstream) - if next_node.is_nothing(): # Rare case - my_downstream = copy(anchor.downstream) - - # TODO: what about case where more content is joining downstream? 
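# Worked example of the specimen intersection above, using the fixture from
# test_split_one_group (illustrative values only):
#   anchor.specimens()    == {1, 2, 3, 4}   # every path crosses the anchor
#   prev_node.specimens() == {1, 2, 4}
#   next_node.specimens() == {1, 2, 4}
#   my_specimens == {1, 2, 3, 4} & {1, 2, 4} & {1, 2, 4} == {1, 2, 4}
# Paths 1, 2 and 4 are rerouted through the one new Node; path 3 keeps its
# own traversals through prev_node, anchor and next_node.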
- new = Node(777, my_specimens, my_upstream, my_downstream) - - # Update Specimens in prev_node, anchor, next_node - anchor.specimens -= new.specimens - prev_node.specimens -= new.specimens - next_node.specimens -= new.specimens - - # Update upstream/downstream - update_neighbor_pointers(new) - suspects = {new, prev_node, anchor, next_node}.union(set(new.upstream.keys()), set(new.downstream.keys())) - for n in suspects: - update_transition(n) - new.validate() - return new + + my_specimens = anchor.specimens(level.zoom) # list of path_ids + my_specimens = my_specimens.intersection(prev_node.specimens(level.zoom)) + my_specimens = my_specimens.intersection(next_node.specimens(level.zoom)) + new_node = Node.objects.create(graph=level.graph, name=f'{anchor.name}:{level.zoom}') + for a in (prev_node, anchor, next_node): + a.summarized_by = new_node + a.save() + NodeTraversal.objects.filter(path_id__in=my_specimens, node_id=anchor).update(node_id=new_node.id) + NodeTraversal.objects.filter(path_id__in=my_specimens, node_id=prev_node.id).delete() + NodeTraversal.objects.filter(path_id__in=my_specimens, node_id=next_node.id).delete() + # TODO: if this is slow use query._raw_delete + + new_node.validate() + return new_node def update_neighbor_pointers(new_node): diff --git a/HaploBlocker/tests.py b/HaploBlocker/tests.py index b442f38..b8ca381 100644 --- a/HaploBlocker/tests.py +++ b/HaploBlocker/tests.py @@ -140,7 +140,8 @@ def test_split_one_group(self): ] g = Graph.utils.build_graph_from_slices(nodes, '9') first, anchor, third = g.node('91'), g.node('93'), g.node('94') - new_node = split_one_group(first, anchor, third) # no mentions of minorities [1] or [4] + zoom = ZoomLevel.objects.get(graph=g, zoom=0) + new_node = split_one_group(first, anchor, third, zoom) # no mentions of minorities [1] or [4] print(new_node.details()) assert new_node in g.node('90').downstream and g.node('92') in g.node('90').downstream assert g.node('91') not in g.node('90').downstream From 96df52a4171ea58b3518d2040e42747d4fde17aa Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Tue, 3 Sep 2019 16:35:43 +0900 Subject: [PATCH 10/15] #26 Split_groups Working fully. ZoomLevels are full copies of NodeTraversal, which need not be contiguous. 
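To make "need not be contiguous" concrete: once a summary level deletes traversals, a path's order values can read 0, 1, 7, 9, so neighbor lookups must scan a range instead of hitting an exact index. A minimal sketch of the tolerant query this patch adopts in neighbor()/neighbor_id() (model names from Graph/models.py; the standalone function is illustrative only):

def next_traversal(t):
    # First traversal strictly after t on the same path, skipping any gaps
    # left by deleted NodeTraversals.
    return (NodeTraversal.objects
            .filter(path=t.path, order__gte=t.order + 1)
            .order_by('order')
            .first())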
--- Graph/models.py | 96 ++++++++++++++++++----------- HaploBlocker/haplonetwork.py | 113 ++++++++++++++--------------------- HaploBlocker/tests.py | 91 ++++++++++++++++------------ 3 files changed, 160 insertions(+), 140 deletions(-) diff --git a/Graph/models.py b/Graph/models.py index 787bf13..5b6f988 100644 --- a/Graph/models.py +++ b/Graph/models.py @@ -1,4 +1,4 @@ -from typing import List, Iterable, Set +from typing import List, Iterable, Set, Optional from django.db import models from django.db.models import QuerySet @@ -25,19 +25,21 @@ def create(self, graph, zoom) -> 'ZoomLevel': return me # We've done all the necessary work # Copy previous level in entirety previous_level = graph.zoomlevel_set.get(zoom=zoom - 1) - path_names = previous_level.paths.values('accession') + paths = previous_level.paths.all() # TODO: This loop can be sped up by bulk_create and bulk_update - for name in path_names: + for path in paths: + name = path.name p = Path(accession=name) # new Path for a new level p.save() me.paths.add(p) # linking to current level - nucleotide_path = previous_level.paths.get(accession=name) - nucleotide_path.update(summarized_by=p) # links between levels + # nucleotide_path = previous_level.paths.get(accession=name) + path.summarized_by = p # links between levels + path.save() #TODO: this is REALLY SLOW AND WASTEFUL! # Makes a full copy of every traversal in the Path so new copies can be edited copies = [NodeTraversal(path=p, node=traverse.node, strand=traverse.strand, order=traverse.order) - for traverse in nucleotide_path.nodetraversal_set] - NodeTraversal.objects.create(copies, 500) + for traverse in path.nodetraversal_set.all()] + NodeTraversal.objects.bulk_create(copies, 500) return me @@ -110,6 +112,9 @@ def nodes(self): """Getter only. Shortcut for DB.""" return self.node_set.all() + def nucleotide_level(self): + return ZoomLevel.objects.get(graph=self, zoom=0) + def __repr__(self): """Warning: the representation strings are very sensitive to whitespace""" return f"Graph: {self.name}\n{self.paths.count()} paths {self.node_set.count()} nodes." 
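The copy loop above is the hotspot the TODO names: one INSERT per Path plus one bulk_create per Path. A bulk variant would batch both sides (a sketch only, assuming a backend where bulk_create returns primary keys, e.g. PostgreSQL; field names as in this patch):

old_paths = list(previous_level.paths.all())
new_paths = Path.objects.bulk_create(
    [Path(accession=old.accession) for old in old_paths], 500)
me.paths.add(*new_paths)  # link every copy to the new level in one query
copies = [NodeTraversal(path=new, node=t.node, strand=t.strand, order=t.order)
          for old, new in zip(old_paths, new_paths)
          for t in old.nodetraversal_set.all()]
NodeTraversal.objects.bulk_create(copies, 500)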
@@ -194,19 +199,41 @@ def to_gfa(self, segment_id: int): return '\t'.join(['S', str(segment_id), self.seq]) def specimens(self, zoom_level) -> Set[int]: - return set(self.nodetraversal_set.filter(path__zoomlevel_set__zoom=zoom_level).value_list('path_id', flat=True)) + if isinstance(zoom_level, ZoomLevel): + zoom_level = zoom_level.zoom + return set(self.traverses(zoom_level).values_list('path_id', flat=True)) + + def traverses(self, zoom_level): + if isinstance(zoom_level, ZoomLevel): + zoom_level = zoom_level.zoom + return self.nodetraversal_set.filter(path__zoom_levels__zoom=zoom_level) def upstream_ids(self, zoom_level) -> Set[int]: """Returns the node ids that are upstream of this node.""" - traverses = self.nodetraversal_set.filter(path_zoom=zoom_level) #.value_list('node_id', flat=True) + traverses = self.traverses(zoom_level) #.value_list('node_id', flat=True) # Node.objects.filter(id__in=traverses).values_list('id', flat=True) return set(t.upstream_id() for t in traverses) + def upstream(self, zoom_level) -> Set['Node']: + """Returns the node ids that are upstream of this node.""" + traverses = self.traverses(zoom_level) #.value_list('node_id', flat=True) + nodes = set() + for t in traverses: + n = t.upstream() # upstream may be None + if n: + nodes.add(n.node) + return nodes + def downstream_ids(self, zoom_level) -> Set[int]: """Returns the node ids that are downstream of this node.""" - traverses = self.nodetraversal_set.filter(path_zoom=zoom_level).all() # TODO: optimize more + traverses = self.traverses(zoom_level) return set(t.downstream_id() for t in traverses) + def downstream(self, zoom_level) -> Set['Node']: + """Returns the node ids that are upstream of this node.""" + traverses = self.traverses(zoom_level) #.value_list('node_id', flat=True) + return set(t.downstream().node for t in traverses if t.downstream()) + def __repr__(self): return "N%s(%s)" % (str(self.name), self.seq) @@ -214,7 +241,7 @@ def details(self, zoom=0): return f"""Node{self.name}: {self.seq} upstream: {self.upstream_ids(zoom)} downstream: {self.downstream_ids(zoom)} - {len(self.specimens(zoom))} specimens: {self.specimens}""" + {len(self.specimens(zoom))} specimens: {self.specimens(zoom)}""" def is_nothing(self): """Useful in Node class definition to check for Node.NOTHING""" @@ -343,42 +370,41 @@ def fetch_neighbor(self, target_index): def upstream_id(self): target_index = self.order - 1 - return self.neighbor_id(target_index) - # try: This query version can tolerate non-contiguous Path sequences - # return NodeTraversal.objects.\ - # filter(path=self.path, order__lte=target_index).\ - # order_by('-order').first().values_list('node_id', flat=True)[0] - # except (NodeTraversal.DoesNotExist, IndexError): - # return None + return self.neighbor_id(target_index, True) def downstream_id(self): target_index = self.order + 1 - return self.neighbor_id(target_index) - # try: This query version can tolerate non-contiguous Path sequences - # return NodeTraversal.objects.\ - # filter(path=self.path, order__gte=target_index).\ - # order_by('order').first().values_list('node_id', flat=True)[0] - # except (NodeTraversal.DoesNotExist, IndexError): - # return None - - def neighbor_id(self, target_index): + return self.neighbor_id(target_index, False) + + def neighbor_id(self, target_index: int, less_than: bool): """Faster query that just retruns the node_id, not the NodeTraversal.""" - try: + try: # This query version can tolerate non-contiguous Path sequences + if less_than: + return NodeTraversal.objects.\ + 
filter(path=self.path, order__lte=target_index).\ + order_by('-order').values_list('node_id', flat=True).first() return NodeTraversal.objects.\ - get(path=self.path, order=target_index).values_list('node_id', flat=True)[0] + filter(path=self.path, order__gte=target_index).\ + order_by('order').values_list('node_id', flat=True).first() except (NodeTraversal.DoesNotExist, IndexError): return None - def neighbor(self, target_index): - try: - return NodeTraversal.objects.get(path=self.path, order=target_index) - except NodeTraversal.DoesNotExist: + def neighbor(self, target_index: int, less_than: bool) -> Optional['NodeTraversal']: + try: # This query version can tolerate non-contiguous Path sequences + if less_than: + return NodeTraversal.objects. \ + filter(path=self.path, order__lte=target_index). \ + order_by('-order').first() + return NodeTraversal.objects. \ + filter(path=self.path, order__gte=target_index). \ + order_by('order').first() + except (NodeTraversal.DoesNotExist, IndexError): return None def upstream(self) -> 'NodeTraversal': """Slower queries that return the neighboring NodeTraversal. This can be chained.""" - return self.neighbor(self.order - 1) + return self.neighbor(self.order - 1, True) def downstream(self) -> 'NodeTraversal': """Slower queries that return the neighboring NodeTraversal. This can be chained.""" - return self.neighbor(self.order + 1) + return self.neighbor(self.order + 1, False) diff --git a/HaploBlocker/haplonetwork.py b/HaploBlocker/haplonetwork.py index 4bd9219..5b8f2fb 100644 --- a/HaploBlocker/haplonetwork.py +++ b/HaploBlocker/haplonetwork.py @@ -134,7 +134,7 @@ def update_stream_transitions(node, stream): def simple_merge(current_level: ZoomLevel) -> ZoomLevel: """ Side effects full_graph by merging any consecutive nodes that have identical specimens and removing the redundant my_node from full_graph. - :param full_graph: + :param current_level: Graph that will be read and edited :return: full_graph modified """ #TODO: Paths start fully populated with redundant NodeTraversals. Editing NodeTraversals, @@ -152,23 +152,27 @@ def simple_merge(current_level: ZoomLevel) -> ZoomLevel: for my_node in current_level.nodes(): # only one Node Downstream, no matter the number of specimens if len(my_node.downstream_ids(zoom)) == 1: - next_node = my_node.nodetraversal_set.fist().downstream().node # fetched from DB - if my_node.nodetraversal_set.count() == next_node.nodetraversal_set.count(): # Not a complete guarantee... - # Torsten deletes my_node and modifies next_node - merged_node = Node.objects.create(name=f'{my_node.name}*{next_level.zoom}', - graph=current_level.graph) - for x in [my_node, next_node]: - x.summarized_by = merged_node - x.save() - - # edit existing traversals - NodeTraversal.objects.filter(node=next_node, path__in=next_level.paths).bulk_update(node_id=merged_node.id) - # next_node.nodetraversal_set.filter(zoom=zoom).bulk_update(node_id=merged_node.id) - - # delete my_node and all associates - query = NodeTraversal.objects.filter(node=my_node, path__in=next_level.paths) - query._raw_delete(query.db) # https://www.nickang.com/fastest-delete-django/ - # TODO: merged_node.start = my_node.start + d = my_node.nodetraversal_set.first().downstream() + if d: + next_node = d.node # fetched from DB + if my_node.nodetraversal_set.count() == next_node.nodetraversal_set.count(): # Not a complete guarantee... 
+ # Torsten deletes my_node and modifies next_node + merged_node = Node.objects.create(name=f'{my_node.name}*{next_level.zoom}', + graph=current_level.graph) + for x in [my_node, next_node]: + x.summarized_by = merged_node + x.save() + + # edit existing traversals + NodeTraversal.objects.\ + filter(node=next_node, path__in=next_level.paths).\ + update(node_id=merged_node.id) + # next_node.nodetraversal_set.filter(zoom=zoom).bulk_update(node_id=merged_node.id) + + # delete my_node and all associates + query = NodeTraversal.objects.filter(node=my_node, path__in=next_level.paths) + query._raw_delete(query.db) # https://www.nickang.com/fastest-delete-django/ + # TODO: merged_node.start = my_node.start return next_level @@ -180,48 +184,40 @@ def prep_next_summary_layer(current_level): return next_level, zoom -def delete_node(node, cutoff): +def delete_node(node: Node, cutoff: int, layer: ZoomLevel): """Changes references to this node to add to references to Node.NOTHING""" if cutoff < 1: return # if cutoff is 0, then don't touch upstream and downstream - for parent, count in node.upstream.items(): - parent.downstream[Node.NOTHING] += parent.downstream[node] - del parent.downstream[node] - for descendant, count in node.downstream.items(): - descendant.upstream[Node.NOTHING] += descendant.upstream[node] - del descendant.upstream[node] + node.traverses(layer).delete() + node.validate() -def neglect_nodes(all_nodes, deletion_cutoff=FILTER_THRESHOLD): +def neglect_nodes(zoom_level : ZoomLevel, deletion_cutoff=FILTER_THRESHOLD): """Deletes nodes if they have too few specimens supporting them defined by :param deletion_cutoff :returns a new list of nodes lacking the pruned nodes in all_nodes""" # next_level, zoom = prep_next_summary_layer(current_level) - nodes_to_delete = set() - for node in all_nodes: - if len(node.specimens) <= deletion_cutoff: - delete_node(node, deletion_cutoff) # TODO: check if this will orphan - nodes_to_delete.add(node) - filtered_nodes = [x for x in all_nodes if x not in nodes_to_delete] - # TODO: remove orphaned haplotypes in a node that transition to and from zero within a 10 window length - return filtered_nodes + for node in zoom_level.nodes(): # TODO optimize distinct count + if len(node.specimens(zoom_level)) <= deletion_cutoff: + delete_node(node, deletion_cutoff, zoom_level) -def split_one_group(prev_node, anchor, next_node, level: ZoomLevel): +def split_one_group(prev_node, anchor, next_node, zoom_level: ZoomLevel): """ Called when up.specimens == down.specimens Comment: That is actually the case we want to split up to obtain longer blocks later Extension of full windows will take care of potential loss of information later""" - my_specimens = anchor.specimens(level.zoom) # list of path_ids - my_specimens = my_specimens.intersection(prev_node.specimens(level.zoom)) - my_specimens = my_specimens.intersection(next_node.specimens(level.zoom)) - new_node = Node.objects.create(graph=level.graph, name=f'{anchor.name}:{level.zoom}') + my_specimens = anchor.specimens(zoom_level.zoom) # list of path_ids + my_specimens = my_specimens.intersection(prev_node.specimens(zoom_level.zoom)) + my_specimens = my_specimens.intersection(next_node.specimens(zoom_level.zoom)) + new_node = Node.objects.create(graph=zoom_level.graph, name=f'{anchor.name}:{zoom_level.zoom}') for a in (prev_node, anchor, next_node): a.summarized_by = new_node a.save() - NodeTraversal.objects.filter(path_id__in=my_specimens, node_id=anchor).update(node_id=new_node.id) + + 
NodeTraversal.objects.filter(path_id__in=my_specimens, node_id=anchor.id).update(node_id=new_node.id) NodeTraversal.objects.filter(path_id__in=my_specimens, node_id=prev_node.id).delete() NodeTraversal.objects.filter(path_id__in=my_specimens, node_id=next_node.id).delete() # TODO: if this is slow use query._raw_delete @@ -241,7 +237,7 @@ def update_neighbor_pointers(new_node): n.upstream[new_node] = 1 -def split_groups(all_nodes: List[Node]): +def split_groups(zoom_level: ZoomLevel): """When two haplotypes have a locus in common with no variation, then the graph represents this with a single anchor node flanked by 2 haplotypes on either side. This means 5 Nodes are present where 2 would suffice. Split groups splits the anchor node and gives pieces @@ -251,32 +247,15 @@ def split_groups(all_nodes: List[Node]): Note: This is called crossmerge in the R code. TODO: Ideally, the database would retain some record of how many nucleotides are shared between the two new haplotype nodes.""" - new_graph = list(all_nodes) - # next_level, zoom = prep_next_summary_layer(current_level) - - for node in all_nodes: + for node in zoom_level.nodes(): # check if all transition upstream match with one of my downstream nodes - if len(node.specimens) > 0: + if len(node.specimens(zoom_level)) > 0: # Matchup upstream and downstream with specimen identities - for up in tuple(node.upstream.keys()): - for down in tuple(node.downstream.keys()): - set1 = copy(up.specimens) - set2 = copy(down.specimens) - if up.is_nothing(): - set1 = copy(node.specimens) - for index in tuple(node.upstream.keys()): - if not index.is_nothing(): - set1.difference_update(index.specimens) - if down.is_nothing(): - set2 = copy(node.specimens) - for index in tuple(node.downstream.keys()): - if not index.is_nothing(): - set2.difference_update(index.specimens) - - if set1 == set2 and len(set1) > 0: - new_node = split_one_group(up, node, down) - new_graph.append(new_node) - - filtered = neglect_nodes(new_graph, 0) # Delete nodes with zero specimens from the Graph? - return filtered + for up in node.upstream(zoom_level): + set1 = up.specimens(zoom_level) + if len(set1): + for down in node.downstream(zoom_level): + set2 = down.specimens(zoom_level) + if set1 == set2 and len(set2) > 0: + new_node = split_one_group(up, node, down, zoom_level) diff --git a/HaploBlocker/tests.py b/HaploBlocker/tests.py index b8ca381..6be0da9 100644 --- a/HaploBlocker/tests.py +++ b/HaploBlocker/tests.py @@ -7,7 +7,7 @@ import os # Create your tests here. # from HaploBlocker.models import Node, Path, Edge -from HaploBlocker.haplonetwork import Node, split_one_group +from HaploBlocker.haplonetwork import Node, split_one_group, prep_next_summary_layer from HaploBlocker.haplonetwork import read_data, build_all_slices, build_paths, nodes_from_unique_signatures, \ populate_transitions, simple_merge, neglect_nodes, split_groups @@ -26,14 +26,14 @@ def test_creation(self): class HaploTest(TestCase): - @classmethod - def setUpClass(self) -> None: - """Reads the input data file once. Tests that need a fresh graph must - call create_graph()""" - print("Setting up HaploTest:", os.getcwd()) - self.alleles, self.individuals = read_data(os.path.join(BASE_DIR, "test_data/KE_chromo10.txt")) - print("Finished reading SNP file") - super(HaploTest, self).setUpClass() + # @classmethod + # def setUpClass(self) -> None: + # """Reads the input data file once. 
Tests that need a fresh graph must + # call create_graph()""" + # print("Setting up HaploTest:", os.getcwd()) + # self.alleles, self.individuals = read_data(os.path.join(BASE_DIR, "test_data/KE_chromo10.txt")) + # print("Finished reading SNP file") + # super(HaploTest, self).setUpClass() def create_graph(self, graph_name): """Tests that need a fresh graph must call create_graph() FIRST! @@ -99,30 +99,31 @@ def test_no_duplicate_nodes(self): assert duplicates_found == 0, f"Found {duplicates_found} duplicated nodes in the graph" - def _test_simple_merge(self, graph: GraphGenome, zoom_level: int) -> ZoomLevel: + def _test_simple_merge(self, layer : ZoomLevel) -> ZoomLevel: # these tests could be made independent of test_workflow, but it would be slower + graph = layer.graph + zoom_level = layer.zoom assert graph.highest_zoom_level() == zoom_level - starting_level = ZoomLevel.objects.get(graph=graph, zoom=zoom_level) - self.assertEqual(len(starting_level), 7180) - next_level = simple_merge(starting_level) + self.assertEqual(len(layer), 7180) + next_level = simple_merge(layer) #Test every Path has a representative in this ZoomLevel self.assertEqual(Path.objects.filter(graph=graph, zoom=zoom_level + 1).count(), Path.objects.filter(graph=graph, zoom=zoom_level + 0).count()) self.assertEqual(NodeTraversal.objects.filter(graph=graph, zoom=zoom_level+1).count(), 3690) #*501? - return next_level @skip def test_simple_merge(self): graph = self.create_graph('test') - summary1 = self._test_simple_merge(graph, 0) + self._test_simple_merge(graph.nucleotide_level()) - def _test_neglect_nodes(self, all_nodes, zoom_level): - summary2 = neglect_nodes(all_nodes) - assert len(summary2) == 2854 - unchanged = neglect_nodes(summary2, 0) - assert len([n for n in unchanged if len(n.specimens) == 0]) == 0 - return summary2 + def _test_neglect_nodes(self, zoom_level : ZoomLevel): + neglect_nodes(zoom_level) + assert len(zoom_level) == 2854 + unchanged = neglect_nodes(zoom_level, 0) + assert len(zoom_level) == 2854 + # TODO: assert no orphans + # assert len([n for n in unchanged if len(n.specimens(zoom_level)) == 0]) == 0 def test_split_one_group(self): @@ -143,27 +144,41 @@ def test_split_one_group(self): zoom = ZoomLevel.objects.get(graph=g, zoom=0) new_node = split_one_group(first, anchor, third, zoom) # no mentions of minorities [1] or [4] print(new_node.details()) - assert new_node in g.node('90').downstream and g.node('92') in g.node('90').downstream - assert g.node('91') not in g.node('90').downstream - assert g.node('90') in new_node.upstream and g.node('96') in new_node.downstream - assert new_node in g.node('96').upstream and g.node('95') in g.node('96').upstream - assert g.node('94') not in g.node('96').upstream - - def _test_split_groups(self, graph, zoom_level): - summary3 = split_groups(graph) - assert len(summary3) > 10 - return summary3 - + self.assertIn(new_node.id, g.node('90').downstream_ids(0)) + self.assertIn(g.node('92').id, g.node('90').downstream_ids(0)) + self.assertNotIn(g.node('91').id, g.node('90').downstream_ids(0)) + self.assertIn(g.node('90').id, new_node.upstream_ids(0)) + self.assertIn(g.node('96').id, new_node.downstream_ids(0)) + self.assertIn(new_node.id, g.node('96').upstream_ids(0)) + self.assertIn(g.node('95').id, g.node('96').upstream_ids(0)) + self.assertNotIn(g.node('94').id, g.node('96').upstream_ids(0)) + + def test_split_groups(self): + nodes = [ + ['90', {1, 2, 3, 4}], # [5] + ['91', {1, 2, 4}, '92', {3}], + ['93', {1, 2, 3, 4}], # [2] anchor + ['94', {1, 2, 4}, '95', {3}], 
# [3] [4] + ['96', {1, 2, 3, 4}] # [6] + ] + g = Graph.utils.build_graph_from_slices(nodes, 'test_split_groups') + # zoom_level, zoom = prep_next_summary_layer(g.nucleotide_level()) + split_groups(g.nucleotide_level()) + self.assertEqual(len(g.nucleotide_level()), 5) + simple_merge(g.nucleotide_level()) + self.assertEqual(len(g.nucleotide_level()), 3) def test_workflow(self): graph = self.create_graph('test') - summary1 = self._test_simple_merge(graph, 0) - summary2 = self._test_neglect_nodes(graph, 1) - summary3 = self._test_split_groups(graph, 2) + summary1, zoom = prep_next_summary_layer(graph.nucleotide_level()) + self._test_simple_merge(summary1) + summary2, zoom = prep_next_summary_layer(summary1) + self._test_neglect_nodes(summary2) + summary3, zoom = prep_next_summary_layer(summary2) + split_groups(summary3) assert len(summary1) > len(summary2) > len(summary3), "Each summarization should result in less nodes" - summary4 = simple_merge(summary3, 3) - bad = summary3[2] - print(bad.details()) + summary4, zoom = prep_next_summary_layer(summary2) + simple_merge(summary4) # test_signatures = build_all_slices(alleles, individuals) # test_individuals = build_paths(individuals, test_signatures) From 5159eae93f97fe660a43d35e4501c82244276d96 Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Tue, 3 Sep 2019 17:06:20 +0900 Subject: [PATCH 11/15] #26 Fix for Graph copying operation. Still hitting "too many SQL variables". --- Graph/models.py | 2 +- HaploBlocker/haplonetwork.py | 5 +++-- HaploBlocker/tests.py | 27 +++++++++++++-------------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/Graph/models.py b/Graph/models.py index 5b6f988..3304829 100644 --- a/Graph/models.py +++ b/Graph/models.py @@ -39,7 +39,7 @@ def create(self, graph, zoom) -> 'ZoomLevel': # Makes a full copy of every traversal in the Path so new copies can be edited copies = [NodeTraversal(path=p, node=traverse.node, strand=traverse.strand, order=traverse.order) for traverse in path.nodetraversal_set.all()] - NodeTraversal.objects.bulk_create(copies, 500) + NodeTraversal.objects.bulk_create(copies, 100) return me diff --git a/HaploBlocker/haplonetwork.py b/HaploBlocker/haplonetwork.py index 5b8f2fb..dd52635 100644 --- a/HaploBlocker/haplonetwork.py +++ b/HaploBlocker/haplonetwork.py @@ -164,13 +164,14 @@ def simple_merge(current_level: ZoomLevel) -> ZoomLevel: x.save() # edit existing traversals + path_ids = next_level.paths.values_list('id', flat=True) NodeTraversal.objects.\ - filter(node=next_node, path__in=next_level.paths).\ + filter(node=next_node, path_id__in=path_ids).\ update(node_id=merged_node.id) # next_node.nodetraversal_set.filter(zoom=zoom).bulk_update(node_id=merged_node.id) # delete my_node and all associates - query = NodeTraversal.objects.filter(node=my_node, path__in=next_level.paths) + query = NodeTraversal.objects.filter(node=my_node, path_id__in=path_ids) query._raw_delete(query.db) # https://www.nickang.com/fastest-delete-django/ # TODO: merged_node.start = my_node.start return next_level diff --git a/HaploBlocker/tests.py b/HaploBlocker/tests.py index 6be0da9..5b5c79f 100644 --- a/HaploBlocker/tests.py +++ b/HaploBlocker/tests.py @@ -26,14 +26,14 @@ def test_creation(self): class HaploTest(TestCase): - # @classmethod - # def setUpClass(self) -> None: - # """Reads the input data file once. 
Tests that need a fresh graph must - # call create_graph()""" - # print("Setting up HaploTest:", os.getcwd()) - # self.alleles, self.individuals = read_data(os.path.join(BASE_DIR, "test_data/KE_chromo10.txt")) - # print("Finished reading SNP file") - # super(HaploTest, self).setUpClass() + @classmethod + def setUpClass(self) -> None: + """Reads the input data file once. Tests that need a fresh graph must + call create_graph()""" + print("Setting up HaploTest:", os.getcwd()) + self.alleles, self.individuals = read_data(os.path.join(BASE_DIR, "test_data/KE_chromo10.txt")) + print("Finished reading SNP file") + super(HaploTest, self).setUpClass() def create_graph(self, graph_name): """Tests that need a fresh graph must call create_graph() FIRST! @@ -98,20 +98,19 @@ def test_no_duplicate_nodes(self): unique_nodes.add(node) assert duplicates_found == 0, f"Found {duplicates_found} duplicated nodes in the graph" - - def _test_simple_merge(self, layer : ZoomLevel) -> ZoomLevel: + def _test_simple_merge(self, layer: ZoomLevel): # these tests could be made independent of test_workflow, but it would be slower graph = layer.graph zoom_level = layer.zoom assert graph.highest_zoom_level() == zoom_level self.assertEqual(len(layer), 7180) next_level = simple_merge(layer) - #Test every Path has a representative in this ZoomLevel + # Test every Path has a representative in this ZoomLevel self.assertEqual(Path.objects.filter(graph=graph, zoom=zoom_level + 1).count(), Path.objects.filter(graph=graph, zoom=zoom_level + 0).count()) self.assertEqual(NodeTraversal.objects.filter(graph=graph, zoom=zoom_level+1).count(), 3690) #*501? - @skip + def test_simple_merge(self): graph = self.create_graph('test') self._test_simple_merge(graph.nucleotide_level()) @@ -165,8 +164,8 @@ def test_split_groups(self): # zoom_level, zoom = prep_next_summary_layer(g.nucleotide_level()) split_groups(g.nucleotide_level()) self.assertEqual(len(g.nucleotide_level()), 5) - simple_merge(g.nucleotide_level()) - self.assertEqual(len(g.nucleotide_level()), 3) + # simple_merge(g.nucleotide_level()) + # self.assertEqual(len(g.nucleotide_level()), 3) def test_workflow(self): graph = self.create_graph('test') From 90570c4edee6f3d104335e7a3f0a15673fd2f56d Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Wed, 4 Sep 2019 02:09:38 +0900 Subject: [PATCH 12/15] #26 changes to the DB schema for ZoomLevels being directly linked with full copies of all Nodes and Paths. Data sharing removed. --- Graph/gfa.py | 8 +- Graph/migrations/0005_Node_zoomlevel.py | 28 +++ Graph/migrations/0006_Path_zoomlevel.py | 28 +++ ...007_NodeTraversal_unique_together_order.py | 17 ++ Graph/models.py | 197 +++++++++--------- Graph/test.py | 42 ++-- Graph/utils.py | 6 +- HaploBlocker/haplonetwork.py | 103 ++++----- HaploBlocker/tests.py | 39 ++-- 9 files changed, 272 insertions(+), 196 deletions(-) create mode 100644 Graph/migrations/0005_Node_zoomlevel.py create mode 100644 Graph/migrations/0006_Path_zoomlevel.py create mode 100644 Graph/migrations/0007_NodeTraversal_unique_together_order.py diff --git a/Graph/gfa.py b/Graph/gfa.py index c8d2d82..0520012 100644 --- a/Graph/gfa.py +++ b/Graph/gfa.py @@ -116,7 +116,7 @@ def from_graph(cls, graph: GraphGenome): # TODO: should be given ZoomLevel inst node_series = ",".join(visits) connections = ",".join(['*'] * path.nodes.count()) # count -1? 
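# On the "count -1?" question: a GFA1 P line is 'P <PathName> <SegmentNames> <Overlaps>',
# and the overlaps sit between consecutive segments, so a path of n segments
# carries n-1 overlap entries (a lone '*' is also accepted), e.g. (tab-separated):
#   P   accession_a   1+,2+,4+   *,*
# so ['*'] * (path.nodes.count() - 1) would be the spec-conformant count here.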
gfa.add_line('\t'.join(['P', path.accession, node_series, connections])) - for node in graph.nodes: # in no particular order + for node in graph.nucleotide_level.nodes: # in no particular order gfa.add_line('\t'.join(['S', str(node.name), node.seq])) return cls(gfa, "from Graph") @@ -124,12 +124,12 @@ def to_graph(self) -> GraphGenome: """Create parent object for this genome and save it in the database. This can create duplicates appended in Paths if it is called twice.""" gdb = GraphGenome.objects.create(name=self.source_path) - # sequence_level = ZoomLevel.objects.create(graph=gdb, zoom=0) + z = gdb.nucleotide_level for segment in self.gfa.segments: - Node.objects.get_or_create(seq=segment.sequence, name=segment.name, graph=gdb) + Node.objects.get_or_create(seq=segment.sequence, name=segment.name, zoom=z) for path in self.gfa.paths: - p = Path.objects.create(accession=path.name, graph=gdb, zoom=0) + p = Path.objects.create(accession=path.name, zoom=z) p.append_gfa_nodes(path.segment_names) return gdb diff --git a/Graph/migrations/0005_Node_zoomlevel.py b/Graph/migrations/0005_Node_zoomlevel.py new file mode 100644 index 0000000..04d7bd4 --- /dev/null +++ b/Graph/migrations/0005_Node_zoomlevel.py @@ -0,0 +1,28 @@ +# Generated by Django 2.2.1 on 2019-09-03 14:33 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('Graph', '0004_ZoomLevel_objects'), + ] + + operations = [ + migrations.AddField( + model_name='node', + name='zoom', + field=models.ForeignKey(default=1, on_delete=django.db.models.deletion.CASCADE, to='Graph.ZoomLevel'), + preserve_default=False, + ), + migrations.AlterUniqueTogether( + name='node', + unique_together={('zoom', 'name')}, + ), + migrations.RemoveField( + model_name='node', + name='graph', + ), + ] diff --git a/Graph/migrations/0006_Path_zoomlevel.py b/Graph/migrations/0006_Path_zoomlevel.py new file mode 100644 index 0000000..7e75858 --- /dev/null +++ b/Graph/migrations/0006_Path_zoomlevel.py @@ -0,0 +1,28 @@ +# Generated by Django 2.2.1 on 2019-09-03 14:53 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('Graph', '0005_Node_zoomlevel'), + ] + + operations = [ + migrations.RemoveField( + model_name='zoomlevel', + name='paths', + ), + migrations.AddField( + model_name='path', + name='zoom', + field=models.ForeignKey(default=1, on_delete=django.db.models.deletion.CASCADE, to='Graph.ZoomLevel'), + preserve_default=False, + ), + migrations.AlterUniqueTogether( + name='path', + unique_together={('accession', 'zoom')}, + ), + ] diff --git a/Graph/migrations/0007_NodeTraversal_unique_together_order.py b/Graph/migrations/0007_NodeTraversal_unique_together_order.py new file mode 100644 index 0000000..b2bfc8b --- /dev/null +++ b/Graph/migrations/0007_NodeTraversal_unique_together_order.py @@ -0,0 +1,17 @@ +# Generated by Django 2.2.1 on 2019-09-03 16:07 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('Graph', '0006_Path_zoomlevel'), + ] + + operations = [ + migrations.AlterUniqueTogether( + name='nodetraversal', + unique_together={('path', 'order')}, + ), + ] diff --git a/Graph/models.py b/Graph/models.py index 3304829..7e99afb 100644 --- a/Graph/models.py +++ b/Graph/models.py @@ -1,6 +1,6 @@ from typing import List, Iterable, Set, Optional from django.db import models -from django.db.models import QuerySet +from django.db.models 
import QuerySet, Max, Min from Utils.models import CustomSaveManager @@ -17,29 +17,29 @@ class NodeMissingError(ValueError): class ZoomLevelManager(models.Manager): - def create(self, graph, zoom) -> 'ZoomLevel': + def create(self, graph, zoom, blank_layer=False) -> 'ZoomLevel': """Creates a full collection of Paths for the new ZoomLevel that match the previous level's path names but start out blank.""" - me = super(ZoomLevelManager, self).create(graph=graph, zoom=zoom) # now saved to DB - if zoom == 0: + me = super(ZoomLevelManager, self).create(graph=graph, zoom=zoom) # now saved to DB + if zoom == 0 or blank_layer: return me # We've done all the necessary work # Copy previous level in entirety previous_level = graph.zoomlevel_set.get(zoom=zoom - 1) - paths = previous_level.paths.all() + Node.objects.bulk_create([Node(name=n.name, zoom=me) for n in previous_level.nodes], 300) # TODO: This loop can be sped up by bulk_create and bulk_update - for path in paths: - name = path.name - p = Path(accession=name) # new Path for a new level - p.save() - me.paths.add(p) # linking to current level - # nucleotide_path = previous_level.paths.get(accession=name) - path.summarized_by = p # links between levels - path.save() - #TODO: this is REALLY SLOW AND WASTEFUL! - # Makes a full copy of every traversal in the Path so new copies can be edited - copies = [NodeTraversal(path=p, node=traverse.node, strand=traverse.strand, order=traverse.order) - for traverse in path.nodetraversal_set.all()] - NodeTraversal.objects.bulk_create(copies, 100) + start = previous_level.paths.aggregate(Min('id'))['id__min'] # TODO make lazy_paths() generator method + stop = previous_level.paths.aggregate(Max('id'))['id__max'] + for path_id in range(start, stop + 1): + if Path.objects.filter(id=path_id).exists(): + path = Path.objects.get(id=path_id) + name = path.name + p = Path(accession=name, zoom=me) # new Path for a new level + p.save() + # TODO: this is REALLY SLOW AND WASTEFUL! + # Makes a full copy of every traversal in the Path so new copies can be edited + copies = [NodeTraversal(path=p, node=traverse.node, strand=traverse.strand, order=traverse.order) + for traverse in path.nodetraversal_set.all()] + NodeTraversal.objects.bulk_create(copies, 100) return me @@ -52,26 +52,41 @@ class ZoomLevel(models.Model): graph = models.ForeignKey('GraphGenome', on_delete=models.CASCADE) zoom = models.IntegerField(default=0) # Zoom level starts at 0 for nucleotide level and moves up # Nodes and Paths are full copies at each zoom level (data sharing removed) - paths = models.ManyToManyField('Path', related_name='zoom_levels', blank=True, - help_text='One path for each accession in a zoom level. ' - 'Paths can be reused unmodified in multiple levels.') objects = ZoomLevelManager() class Meta: unique_together = ['graph', 'zoom'] # one ZoomLevel at each integer + @property + def paths(self) -> QuerySet: + return self.path_set + def __len__(self): """The number of unique Nodes in this level""" - nodes = self.node_ids() - return len(nodes) + return self.node_set.count() - def node_ids(self) -> Set[int]: - path_ids = self.paths.values_list('id', flat=True) - nodes = set(NodeTraversal.objects.filter(path_id__in=path_ids).values_list('node_id', flat=True)) - return nodes + def __repr__(self): + """Warning: the representation strings are very sensitive to whitespace""" + return f"Graph: {self.graph.name}\n{self.paths.count()} paths {self.node_set.count()} nodes."
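# Sketch of the lazy_paths() generator the TODO above asks for (illustrative
# only): iterator() streams rows in chunks rather than scanning an id range
# and re-querying once per id.
def lazy_paths(level):
    for path in level.paths.all().iterator(chunk_size=500):
        yield path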
- def nodes(self) -> Iterable['Node']: - return Node.objects.filter(id__in=self.node_ids()) + def __eq__(self, other: 'ZoomLevel'): + if isinstance(other, ZoomLevel): + return other.node_set.count() == self.node_set.count() and \ + other.paths.count() == self.paths.count() # other.name == self.name and \ + return False + + def node_ids(self) -> Set[int]: # TODO: optimize + # path_ids = self.paths.values_list('id', flat=True) + # nodes = set(NodeTraversal.objects.filter(path_id__in=path_ids).values_list('node_id', flat=True)) + return set(self.node_set.values_list('id', flat=True)) + + @property + def nodes(self) -> QuerySet: + """Getter only. Shortcut for DB.""" + return self.node_set.all() + + def node(self, node_name): + return Node.objects.get(name=node_name, zoom=self) @@ -80,7 +95,7 @@ class GraphManager(models.Manager): def create(self, **kwargs): """When a graph is created. It should also automatically create the first ZoomLevel.""" graph = super(GraphManager, self).create(**kwargs) - ZoomLevel.objects.create(graph=graph, zoom=0) # sequence_level created + ZoomLevel.objects.create(graph=graph, zoom=0) # nucleotide_level created return graph @@ -99,31 +114,32 @@ class GraphGenome(models.Model): objects = GraphManager() @property - def sequence_level(self) -> 'ZoomLevel': + def nucleotide_level(self) -> 'ZoomLevel': return self.zoomlevel_set.filter(zoom=0).first() @property def paths(self) -> QuerySet: """Getter only. Shortcut for DB.""" - return self.sequence_level.paths + return self.nucleotide_level.paths - @property - def nodes(self): - """Getter only. Shortcut for DB.""" - return self.node_set.all() + # @property + # def nodes(self): + # """Getter only. Shortcut for DB.""" + # return self.nucleotide_level.nodes - def nucleotide_level(self): - return ZoomLevel.objects.get(graph=self, zoom=0) + def __eq__(self, other: 'GraphGenome'): + if isinstance(other, GraphGenome): + if other.zoomlevel_set.count() == self.zoomlevel_set.count(): + for a,b in zip(other.zoomlevel_set.all(),self.zoomlevel_set.all()): + if a != b: + return False + return True + return False + return False def __repr__(self): """Warning: the representation strings are very sensitive to whitespace""" - return f"Graph: {self.name}\n{self.paths.count()} paths {self.node_set.count()} nodes." - - def __eq__(self, other): - if isinstance(other, GraphGenome): - return other.node_set.count() == self.node_set.count() and \ - other.paths.count() == self.paths.count() # other.name == self.name and \ - return False + return f"Graph: {self.name}\n{self.paths.count()} paths {self.nucleotide_level.node_set.count()} nodes at base in {self.zoomlevel_set.count()} layers." @classmethod def load_from_xg(cls, file: str, xg_bin: str) -> 'GraphGenome': @@ -140,22 +156,6 @@ def save_as_xg(self, file: str, xg_bin: str): gfa = GFA.from_graph(self) gfa.save_as_xg(file, xg_bin) - def append_node_to_path(self, node_id, strand, path_name, zoom) -> None: - """Path.append_node() is preferred over this method, as it is faster and simpler. - This will build a graph in a truly non-linear way. - Nodes will be created if necessary. 
- NodeTraversal is appended to Path (order dependent) and PathIndex is added to Node - (order independent).""" - if node_id not in self.nodes: # hasn't been created yet, need to retrieve from dictionary of guid - if isinstance(node_id, str): - self.nodes[node_id] = Node('', [], node_id) - else: - raise ValueError("Provide the id of the node, not", node_id) - level = self.zoomlevel_set.get(zoom=zoom) - level.paths.get(name=path_name).append_node(Node.objects.get(name=node_id), strand) - - def node(self, node_name): - return Node.objects.get(name=node_name, graph=self) def highest_zoom_level(self): return self.zoomlevel_set.all().order_by('-zoom').first().zoom @@ -169,14 +169,14 @@ class Node(models.Model): to fetch the node from GraphGenome.node().""" seq = models.CharField(max_length=255, blank=True) name = models.CharField(max_length=15) - graph = models.ForeignKey(GraphGenome, on_delete=models.CASCADE) + zoom = models.ForeignKey(ZoomLevel, on_delete=models.CASCADE) summarized_by = models.ForeignKey('Node', null=True, blank=True, on_delete=models.SET_NULL, related_name='children') # Break points for haploblocks - Erik Garrison - service for coordinates # Start and stop positions for a node class Meta: - unique_together = ['graph', 'name'] + unique_together = ['zoom', 'name'] def __len__(self): return self.nodetraversal_set.count() @@ -198,25 +198,21 @@ def __hash__(self): def to_gfa(self, segment_id: int): return '\t'.join(['S', str(segment_id), self.seq]) - def specimens(self, zoom_level) -> Set[int]: - if isinstance(zoom_level, ZoomLevel): - zoom_level = zoom_level.zoom - return set(self.traverses(zoom_level).values_list('path_id', flat=True)) + def specimens(self) -> Set[int]: + return set(self.traverses().values_list('path_id', flat=True)) - def traverses(self, zoom_level): - if isinstance(zoom_level, ZoomLevel): - zoom_level = zoom_level.zoom - return self.nodetraversal_set.filter(path__zoom_levels__zoom=zoom_level) + def traverses(self) -> QuerySet: + return self.nodetraversal_set.all() - def upstream_ids(self, zoom_level) -> Set[int]: + def upstream_ids(self) -> Set[int]: """Returns the node ids that are upstream of this node.""" - traverses = self.traverses(zoom_level) #.value_list('node_id', flat=True) + traverses = self.traverses() #.value_list('node_id', flat=True) # Node.objects.filter(id__in=traverses).values_list('id', flat=True) return set(t.upstream_id() for t in traverses) - def upstream(self, zoom_level) -> Set['Node']: + def upstream(self) -> Set['Node']: """Returns the node ids that are upstream of this node.""" - traverses = self.traverses(zoom_level) #.value_list('node_id', flat=True) + traverses = self.traverses() #.value_list('node_id', flat=True) nodes = set() for t in traverses: n = t.upstream() # upstream may be None @@ -224,14 +220,14 @@ def upstream(self, zoom_level) -> Set['Node']: nodes.add(n.node) return nodes - def downstream_ids(self, zoom_level) -> Set[int]: + def downstream_ids(self) -> Set[int]: """Returns the node ids that are downstream of this node.""" - traverses = self.traverses(zoom_level) + traverses = self.traverses() return set(t.downstream_id() for t in traverses) - def downstream(self, zoom_level) -> Set['Node']: + def downstream(self) -> Set['Node']: """Returns the node ids that are upstream of this node.""" - traverses = self.traverses(zoom_level) #.value_list('node_id', flat=True) + traverses = self.traverses() #.value_list('node_id', flat=True) return set(t.downstream().node for t in traverses if t.downstream()) def __repr__(self): @@ 
-239,9 +235,9 @@ def __repr__(self): def details(self, zoom=0): return f"""Node{self.name}: {self.seq} - upstream: {self.upstream_ids(zoom)} - downstream: {self.downstream_ids(zoom)} - {len(self.specimens(zoom))} specimens: {self.specimens(zoom)}""" + upstream: {self.upstream_ids()} + downstream: {self.downstream_ids()} + {len(self.specimens())} specimens: {self.specimens()}""" def is_nothing(self): """Useful in Node class definition to check for Node.NOTHING""" @@ -264,29 +260,16 @@ def validate(self): Node.NOTHING = Node(-1) -class PathManager(models.Manager): - """custom create() methods for consistency and convenience""" - def create(self, accession, graph, zoom=0): - """Fetches the appropriate ZoomLevel object and creates a link when the Path is first - created. """ - path = super(PathManager, self).create(accession=accession) - ZoomLevel.objects.get_or_create(graph=graph, zoom=zoom)[0].paths.add(path) - return path - - class Path(models.Model): """Paths represent the linear order of one particular individual (accession) as its genome was sequenced. A path visits a series of nodes and the ordered concatenation of the node sequences is the accession's genome. Create Paths first from accession names, then append them to Nodes to link together.""" accession = models.CharField(max_length=1000) # one path per accession - summarized_by = models.ForeignKey('Path', related_name='summary_child', - blank=True, null=True, on_delete=models.SET_NULL) - - objects = PathManager() + zoom = models.ForeignKey(ZoomLevel, on_delete=models.CASCADE) - # class Meta: we need a database check where each accession name only occurs once per zoom level - # unique_together = ['accession', 'zoom_levels'] + class Meta: # we need a database check where each accession name only occurs once per zoom level + unique_together = ['accession', 'zoom'] def __getitem__(self, path_index): return self.nodes.filter(order=path_index) @@ -307,7 +290,18 @@ def nodes(self) -> QuerySet: @property def graph(self) -> GraphGenome: - return self.zoom_levels.first().graph + return self.zoom.graph + + @property + def name(self): + return self.accession + + @property + def summary_child(self): + if self.zoom.zoom == 0: + return None + previous_zoom = self.zoom.graph.zoomlevel_set.get(zoom=self.zoom.zoom - 1) + return previous_zoom.path_set.get(accession=self.accession) def append_gfa_nodes(self, nodes): assert hasattr(nodes[0], 'orient') and hasattr(nodes[0], 'name'), 'Expecting gfapy.Gfa.path' @@ -322,8 +316,6 @@ def append_node(self, node: Node, strand: str): NodeTraversal is appended to Path (order dependent) and PathIndex is added to Node (order independent).""" NodeTraversal.objects.create(node=node, path=self, strand=strand) # calculates order - def name(self): - return self.accession def to_gfa(self): return '\t'.join(['P', self.accession, "+,".join([x.node.name + x.strand for x in self.nodes]) + "+", ",".join(['*' for x in self.nodes])]) @@ -351,6 +343,9 @@ class NodeTraversal(models.Model): objects = NodeTraversalManager() + class Meta: + unique_together = ['path', 'order'] def __repr__(self): if self.strand == '+': return self.node.seq diff --git a/Graph/test.py b/Graph/test.py index 599ac32..4801adf 100644 --- a/Graph/test.py +++ b/Graph/test.py @@ -4,7 +4,7 @@ import os from os.path import join from Graph.gfa import GFA -from Graph.models import Node, Path, NodeTraversal +from Graph.models import Node, Path, NodeTraversal, ZoomLevel from Graph.sort import DAGify # Define the working directory @@ -56,29 +56,29 @@ def
test_graph_factory(self): def test_summary_storage(self): graph = self.test_example_graph() - path = Path.objects.create(graph=graph, accession='a', zoom=1) - new_node = Node.objects.create(seq='ACGTCGGA', name='2*2', graph=graph) - Node.objects.filter(name__in=['1','2','4']).update(summarized_by=new_node) + + zoom1 = ZoomLevel.objects.create(graph=graph, zoom=1, blank_layer=False) + path1 = zoom1.paths.filter(accession='a').first() + new_node = Node.objects.create(seq='ACGTCGGA', name='2*2', zoom=zoom1) + base_nodes = graph.nucleotide_level.nodes + base_nodes.filter(name__in=['1', '2', '4']).update(summarized_by=new_node) assert new_node.children.count() == 3 - for node in Node.objects.filter(name__in=['1','2','4']): + zoom1.nodes.filter(name__in=['1', '2', '4']).delete() # delete copies made redundant in next layer + for node in base_nodes.filter(name__in=['1','2','4']): assert node.summarized_by_id == new_node.id - for i, node_name in enumerate(['2*2', '5', '6']): - current = graph.node(node_name) - NodeTraversal(node=current, path=path, strand='+', order=i).save() - assert NodeTraversal.objects.get(order=0, path=path).downstream().downstream().node.name == '6' - assert graph.zoomlevel_set.count() == 2 - graph.paths.filter(accession='a').update(summarized_by=path) - assert bool(path.summary_child), "Path should be linked to its child." - zoom1 = graph.zoomlevel_set.get(zoom=1) - path_pointers = zoom1.paths - assert path_pointers.count() == 1 + print(list(zoom1.nodes.all())) + self.assertEqual(NodeTraversal.objects.get(order=0, path=path1).downstream().downstream().node.name, '4') + self.assertEqual(graph.zoomlevel_set.count(), 2) + self.assertTrue(path1.summary_child), # "Path should be linked to its child." + self.assertEqual(zoom1.paths.count(), 5) # ZoomLevel - self.assertEqual(len(zoom1), 3, ) - self.assertEqual(zoom1.node_ids(),{5, 21, 6}, zoom1.node_ids()) - self.assertEqual(graph.zoomlevel_set.get(zoom=0).node_ids(), set(range(1,21))) - names = [x.name for x in zoom1.nodes()] + zoom0 = graph.nucleotide_level + self.assertEqual(len(zoom1), len(zoom0) - 2) + self.assertEqual(zoom1.node_ids(),set(range(23, 42)),)#{23,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41} + self.assertEqual(zoom0.node_ids(), set(range(1,21))) + names = [x.name for x in zoom1.nodes] self.assertEqual(names, ['5', '6', '2*2']) - sequences = [x.seq for x in zoom1.nodes()] + sequences = [x.seq for x in zoom1.nodes] self.assertEqual(sequences, ['C', 'AGTACG', 'ACGTCGGA']) @@ -192,7 +192,7 @@ def test_gfa(self): def test_load_gfa_to_graph(self): graph, gfa = self.make_graph_from_gfa() self.assertEqual(graph.paths.count(), 3) - self.assertEqual(graph.nodes.count(), 15) + self.assertEqual(graph.nucleotide_level.nodes.count(), 15) def make_graph_from_gfa(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "test.gfa")) diff --git a/Graph/utils.py b/Graph/utils.py index b08a089..83f12e0 100644 --- a/Graph/utils.py +++ b/Graph/utils.py @@ -27,13 +27,15 @@ def build_graph_from_slices(cmd: List, graph_name='test_data') -> GraphGenome: path_objs = {} for path_name in paths: if graph.paths.filter(accession=path_name).count() == 0: - path_objs[path_name] = Path.objects.create(graph=graph, accession=path_name, zoom=0) + path_objs[path_name] = Path.objects.create(accession=path_name, zoom=graph.nucleotide_level) for sl in cmd: try: for i in range(0, len(sl), 2): paths_mentioned = [path_objs[key] for key in sl[i + 1]] node, is_new = Node.objects.get_or_create( - seq=sl[i], name=graph.name + str(node_count), graph=graph) 
+ seq=sl[i], + name=graph.name + str(node_count), + zoom=graph.nucleotide_level) node_count += 1 for path in paths_mentioned: path.append_node(node, '+') diff --git a/HaploBlocker/haplonetwork.py b/HaploBlocker/haplonetwork.py index dd52635..92744cc 100644 --- a/HaploBlocker/haplonetwork.py +++ b/HaploBlocker/haplonetwork.py @@ -7,7 +7,7 @@ import numpy as np from collections import defaultdict from copy import copy -from Graph.models import Node, Path, ZoomLevel, NodeTraversal +from Graph.models import Node, Path, ZoomLevel, NodeTraversal, GraphGenome BLOCK_SIZE = 20 FILTER_THRESHOLD = 4 @@ -33,7 +33,7 @@ def signature(individual, start_locus): return tuple(individual[start_locus: start_locus + BLOCK_SIZE]) -def nodes_from_unique_signatures(individuals, start_locus, current_graph): +def nodes_from_unique_signatures(individuals, start_locus, current_graph: GraphGenome): """A signature is a series of BLOCK_SIZE SNPs inside of a locus. We want to know how many unique signatures are present inside of one locus. A Node is created for each unique signature found. @@ -47,7 +47,7 @@ def nodes_from_unique_signatures(individuals, start_locus, current_graph): unique_blocks[sig] = Node.objects.create( # saves to Database name=f'{len(unique_blocks)}:{start_locus // BLOCK_SIZE}-{start_locus // BLOCK_SIZE}', seq=''.join(str(x) for x in sig), - graph=current_graph) + zoom=current_graph.nucleotide_level) return unique_blocks @@ -62,7 +62,7 @@ def build_all_slices(alleles, individuals, current_graph): return slices -def build_paths(individuals, unique_signatures, graph): +def build_paths(individuals, unique_signatures, graph: GraphGenome): """Describes an individual as a Path (list of Nodes) that the individual visits (NodeTraversals). accessions is a list of loci which contain a list of Nodes which each contain specimen build nodes: [0] first 4 are the 4 starting signatures in window 0. @@ -73,7 +73,7 @@ def build_paths(individuals, unique_signatures, graph): print(f"Building paths from {len(individuals)} individuals and {len(unique_signatures)} loci") accessions = [] for i_specimen, specimen in enumerate(individuals): - my_path = Path.objects.create(accession=str(i_specimen), graph=graph) + my_path = Path.objects.create(accession=str(i_specimen), zoom=graph.nucleotide_level) my_sigs = [unique_signatures[w][signature(specimen, w * BLOCK_SIZE)] for w in range(len(unique_signatures))] traverses = [NodeTraversal(node=sig, path=my_path, strand='+', order=i) for i, sig in enumerate(my_sigs)] NodeTraversal.objects.bulk_create(traverses, 100) @@ -131,15 +131,14 @@ def update_stream_transitions(node, stream): g(node, stream).pop(key, None) -def simple_merge(current_level: ZoomLevel) -> ZoomLevel: +def simple_merge(current_level: ZoomLevel) -> bool: """ Side effects full_graph by merging any consecutive nodes that have identical specimens and removing the redundant my_node from full_graph. :param current_level: Graph that will be read and edited - :return: full_graph modified + :return: true if modification occurred """ - #TODO: Paths start fully populated with redundant NodeTraversals. Editing NodeTraversals, - # moves to newly created Nodes. Global bool for whether or not a particular path was modified. - next_level, zoom = prep_next_summary_layer(current_level) + #TODO: Global bool for whether or not a particular path was modified. 
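For orientation, the criterion this merge applies can be stated outside the ORM. The sketch below is illustrative only (mergeable_pairs and its (path_id, order, node_id) input format are not part of this patch), but it captures the spirit of the test the loop below performs: a node merges into its sole downstream neighbor when both are visited by the same number of traversals.

    from collections import defaultdict

    def mergeable_pairs(traversals):
        """traversals: iterable of (path_id, order, node_id) tuples.
        Yields (a, b) where b is the only node ever seen directly after a
        and both are traversed equally often."""
        by_path = defaultdict(dict)
        for path_id, order, node_id in traversals:
            by_path[path_id][order] = node_id
        followers = defaultdict(set)  # node -> nodes seen immediately downstream
        visits = defaultdict(int)     # node -> traversal count
        for steps in by_path.values():
            for order, node_id in steps.items():
                visits[node_id] += 1
                nxt = steps.get(order + 1)
                if nxt is not None:
                    followers[node_id].add(nxt)
        for a, followers_of_a in followers.items():
            if len(followers_of_a) == 1:
                b = next(iter(followers_of_a))
                if visits[a] == visits[b]:  # same caveat as the count check below
                    yield a, b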
+    # TODO: replace the per-id fetch loop below with a single optimized query (or drop the scratch notes that follow)
 
     # Node.objects.filter()
     # NodeTraversal.objects.filter(node_id)
@@ -149,32 +148,38 @@
     # downstream_ids = set(t.downstream_id() for t in traverses)
     # a.path_set == b.path_set
     # Node.objects.filter(path_set == )
-    for my_node in current_level.nodes():
-        # only one Node Downstream, no matter the number of specimens
-        if len(my_node.downstream_ids(zoom)) == 1:
-            d = my_node.nodetraversal_set.first().downstream()
-            if d:
-                next_node = d.node  # fetched from DB
-                if my_node.nodetraversal_set.count() == next_node.nodetraversal_set.count():  # Not a complete guarantee...
-                    # Torsten deletes my_node and modifies next_node
-                    merged_node = Node.objects.create(name=f'{my_node.name}*{next_level.zoom}',
-                                                      graph=current_level.graph)
-                    for x in [my_node, next_node]:
-                        x.summarized_by = merged_node
-                        x.save()
-
-                    # edit existing traversals
-                    path_ids = next_level.paths.values_list('id', flat=True)
-                    NodeTraversal.objects.\
-                        filter(node=next_node, path_id__in=path_ids).\
-                        update(node_id=merged_node.id)
-                    # next_node.nodetraversal_set.filter(zoom=zoom).bulk_update(node_id=merged_node.id)
-
-                    # delete my_node and all associates
-                    query = NodeTraversal.objects.filter(node=my_node, path_id__in=path_ids)
-                    query._raw_delete(query.db)  # https://www.nickang.com/fastest-delete-django/
-                    # TODO: merged_node.start = my_node.start
-    return next_level
+    modification_happened = False
+    path_ids = current_level.paths.values_list('id', flat=True)
+    for node_id in range(1, current_level.node_set.order_by('-id').first().id + 1):  # + 1 so the highest node id is included
+        try:
+            my_node = Node.objects.get(id=node_id, zoom=current_level)
+
+            # only one Node Downstream, no matter the number of specimens
+            if len(my_node.downstream_ids()) == 1:
+                d = my_node.nodetraversal_set.first().downstream()
+                if d:
+                    modification_happened = True
+                    next_node = d.node  # fetched from DB
+                    if my_node.nodetraversal_set.count() == next_node.nodetraversal_set.count():  # Not a complete guarantee...
+                        # Torsten deletes my_node and modifies next_node
+                        merged_node = Node.objects.create(name=f'{my_node.name}*{current_level.zoom}',
+                                                          zoom=current_level)
+                        for x in [my_node, next_node]:
+                            x.summarized_by = merged_node
+                            x.save()  # TODO: doesn't work because reading and writing same layer. 
next_node gets deleted soon + + # edit existing traversals + next_node.nodetraversal_set.\ + filter(path_id__in=path_ids).\ + update(node_id=merged_node.id) + + # delete my_node and all associates + query = my_node.nodetraversal_set.filter(path_id__in=path_ids) + query._raw_delete(query.db) # https://www.nickang.com/fastest-delete-django/ + # TODO: merged_node.start = my_node.start, length = my_node.length + next_node.length + except Node.DoesNotExist: + pass # node ids are not entirely dense + return modification_happened def prep_next_summary_layer(current_level): @@ -189,7 +194,7 @@ def delete_node(node: Node, cutoff: int, layer: ZoomLevel): """Changes references to this node to add to references to Node.NOTHING""" if cutoff < 1: return # if cutoff is 0, then don't touch upstream and downstream - node.traverses(layer).delete() + node.traverses().delete() node.validate() @@ -200,8 +205,8 @@ def neglect_nodes(zoom_level : ZoomLevel, deletion_cutoff=FILTER_THRESHOLD): # next_level, zoom = prep_next_summary_layer(current_level) - for node in zoom_level.nodes(): # TODO optimize distinct count - if len(node.specimens(zoom_level)) <= deletion_cutoff: + for node in zoom_level.nodes: # TODO optimize distinct count + if len(node.specimens()) <= deletion_cutoff: delete_node(node, deletion_cutoff, zoom_level) @@ -210,10 +215,10 @@ def split_one_group(prev_node, anchor, next_node, zoom_level: ZoomLevel): Comment: That is actually the case we want to split up to obtain longer blocks later Extension of full windows will take care of potential loss of information later""" - my_specimens = anchor.specimens(zoom_level.zoom) # list of path_ids - my_specimens = my_specimens.intersection(prev_node.specimens(zoom_level.zoom)) - my_specimens = my_specimens.intersection(next_node.specimens(zoom_level.zoom)) - new_node = Node.objects.create(graph=zoom_level.graph, name=f'{anchor.name}:{zoom_level.zoom}') + my_specimens = anchor.specimens() # list of path_ids + my_specimens = my_specimens.intersection(prev_node.specimens()) + my_specimens = my_specimens.intersection(next_node.specimens()) + new_node = Node.objects.create(zoom=zoom_level, name=f'{anchor.name}:{zoom_level.zoom}') for a in (prev_node, anchor, next_node): a.summarized_by = new_node a.save() @@ -249,14 +254,14 @@ def split_groups(zoom_level: ZoomLevel): TODO: Ideally, the database would retain some record of how many nucleotides are shared between the two new haplotype nodes.""" - for node in zoom_level.nodes(): + for node in zoom_level.nodes: # check if all transition upstream match with one of my downstream nodes - if len(node.specimens(zoom_level)) > 0: + if len(node.specimens()) > 0: # Matchup upstream and downstream with specimen identities - for up in node.upstream(zoom_level): - set1 = up.specimens(zoom_level) + for up in node.upstream(): + set1 = up.specimens() if len(set1): - for down in node.downstream(zoom_level): - set2 = down.specimens(zoom_level) + for down in node.downstream(): + set2 = down.specimens() if set1 == set2 and len(set2) > 0: new_node = split_one_group(up, node, down, zoom_level) diff --git a/HaploBlocker/tests.py b/HaploBlocker/tests.py index 5b5c79f..3ae93d9 100644 --- a/HaploBlocker/tests.py +++ b/HaploBlocker/tests.py @@ -15,12 +15,13 @@ class ModelTest(TestCase): def test_creation(self): g = GraphGenome.objects.create(name='test_creation') - p = Path.objects.create(accession='watermelon', graph=g) - n = Node.objects.create(seq='ACGT', graph=g) + z = g.nucleotide_level + p = Path.objects.create(accession='watermelon', 
zoom=z) + n = Node.objects.create(seq='ACGT', zoom=z) NodeTraversal.objects.create(node=n, path=p, strand='+') - NodeTraversal.objects.create(node=Node.objects.create(seq='AAAA', name='-9', graph=g), + NodeTraversal.objects.create(node=Node.objects.create(seq='AAAA', name='-9', zoom=z), path=p, strand='+') - NodeTraversal.objects.create(node=Node.objects.create(seq='TTT', name='-8', graph=g), + NodeTraversal.objects.create(node=Node.objects.create(seq='TTT', name='-8', zoom=z), path=p, strand='+') print(NodeTraversal.objects.all()) @@ -113,7 +114,7 @@ def _test_simple_merge(self, layer: ZoomLevel): def test_simple_merge(self): graph = self.create_graph('test') - self._test_simple_merge(graph.nucleotide_level()) + self._test_simple_merge(graph.nucleotide_level) def _test_neglect_nodes(self, zoom_level : ZoomLevel): @@ -139,18 +140,18 @@ def test_split_one_group(self): ['96', {1, 2, 3, 4}] #[6] ] g = Graph.utils.build_graph_from_slices(nodes, '9') - first, anchor, third = g.node('91'), g.node('93'), g.node('94') - zoom = ZoomLevel.objects.get(graph=g, zoom=0) - new_node = split_one_group(first, anchor, third, zoom) # no mentions of minorities [1] or [4] + z = g.nucleotide_level + first, anchor, third = z.node('91'), z.node('93'), z.node('94') + new_node = split_one_group(first, anchor, third, z) # no mentions of minorities [1] or [4] print(new_node.details()) - self.assertIn(new_node.id, g.node('90').downstream_ids(0)) - self.assertIn(g.node('92').id, g.node('90').downstream_ids(0)) - self.assertNotIn(g.node('91').id, g.node('90').downstream_ids(0)) - self.assertIn(g.node('90').id, new_node.upstream_ids(0)) - self.assertIn(g.node('96').id, new_node.downstream_ids(0)) - self.assertIn(new_node.id, g.node('96').upstream_ids(0)) - self.assertIn(g.node('95').id, g.node('96').upstream_ids(0)) - self.assertNotIn(g.node('94').id, g.node('96').upstream_ids(0)) + self.assertIn(new_node.id, z.node('90').downstream_ids()) + self.assertIn(z.node('92').id, z.node('90').downstream_ids()) + self.assertNotIn(z.node('91').id, z.node('90').downstream_ids()) + self.assertIn(z.node('90').id, new_node.upstream_ids()) + self.assertIn(z.node('96').id, new_node.downstream_ids()) + self.assertIn(new_node.id, z.node('96').upstream_ids()) + self.assertIn(z.node('95').id, z.node('96').upstream_ids()) + self.assertNotIn(z.node('94').id, z.node('96').upstream_ids()) def test_split_groups(self): nodes = [ @@ -162,14 +163,14 @@ def test_split_groups(self): ] g = Graph.utils.build_graph_from_slices(nodes, 'test_split_groups') # zoom_level, zoom = prep_next_summary_layer(g.nucleotide_level()) - split_groups(g.nucleotide_level()) - self.assertEqual(len(g.nucleotide_level()), 5) + split_groups(g.nucleotide_level) + self.assertEqual(len(g.nucleotide_level), 5) # simple_merge(g.nucleotide_level()) # self.assertEqual(len(g.nucleotide_level()), 3) def test_workflow(self): graph = self.create_graph('test') - summary1, zoom = prep_next_summary_layer(graph.nucleotide_level()) + summary1, zoom = prep_next_summary_layer(graph.nucleotide_level) self._test_simple_merge(summary1) summary2, zoom = prep_next_summary_layer(summary1) self._test_neglect_nodes(summary2) From eba726d1b24b957b140f8af232f54765c89eebcb Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Wed, 4 Sep 2019 12:02:08 +0900 Subject: [PATCH 13/15] #26 Added xrange iterators --- Graph/gfa.py | 2 +- .../0008_remove_path_summarized_by.py | 17 ++++ Graph/models.py | 45 ++++++---- Graph/test.py | 12 +-- HaploBlocker/haplonetwork.py | 86 ++++++------------- 5 files changed, 81 
insertions(+), 81 deletions(-) create mode 100644 Graph/migrations/0008_remove_path_summarized_by.py diff --git a/Graph/gfa.py b/Graph/gfa.py index 0520012..106367a 100644 --- a/Graph/gfa.py +++ b/Graph/gfa.py @@ -116,7 +116,7 @@ def from_graph(cls, graph: GraphGenome): # TODO: should be given ZoomLevel inst node_series = ",".join(visits) connections = ",".join(['*'] * path.nodes.count()) # count -1? gfa.add_line('\t'.join(['P', path.accession, node_series, connections])) - for node in graph.nucleotide_level.nodes: # in no particular order + for node in graph.nucleotide_level.nodes_xrange(): # in no particular order gfa.add_line('\t'.join(['S', str(node.name), node.seq])) return cls(gfa, "from Graph") diff --git a/Graph/migrations/0008_remove_path_summarized_by.py b/Graph/migrations/0008_remove_path_summarized_by.py new file mode 100644 index 0000000..c0eeb2f --- /dev/null +++ b/Graph/migrations/0008_remove_path_summarized_by.py @@ -0,0 +1,17 @@ +# Generated by Django 2.2.1 on 2019-09-03 16:59 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('Graph', '0007_NodeTraversal_unique_together_order'), + ] + + operations = [ + migrations.RemoveField( + model_name='path', + name='summarized_by', + ), + ] diff --git a/Graph/models.py b/Graph/models.py index 7e99afb..cbc695c 100644 --- a/Graph/models.py +++ b/Graph/models.py @@ -25,21 +25,17 @@ def create(self, graph, zoom, blank_layer=False) -> 'ZoomLevel': return me # We've done all the necessary work # Copy previous level in entirety previous_level = graph.zoomlevel_set.get(zoom=zoom - 1) - Node.objects.bulk_create([Node(name=n.name, zoom=me) for n in previous_level.nodes], 300) + Node.objects.bulk_create([Node(name=n.name, zoom=me) for n in previous_level.nodes_xrange()], 100) # TODO: This loop can be sped up by bulk_create and bulk_update - start = previous_level.paths.aggregate(Max('id'))['rating__max'] # TODO make lazy_paths() generator method - stop = previous_level.paths.aggregate(Min('id'))['rating__min'] - for path_id in range(start, stop): - if Path.objects.exists(id=path_id): - path = Path.objects.get(id=path_id) - name = path.name - p = Path(accession=name, zoom=me) # new Path for a new level - p.save() - # TODO: this is REALLY SLOW AND WASTEFUL! - # Makes a full copy of every traversal in the Path so new copies can be edited - copies = [NodeTraversal(path=p, node=traverse.node, strand=traverse.strand, order=traverse.order) - for traverse in path.nodetraversal_set.all()] - NodeTraversal.objects.bulk_create(copies, 100) + for path in previous_level.path_xrange(): + name = path.name + p = Path(accession=name, zoom=me) # new Path for a new level + p.save() + # TODO: this is REALLY SLOW AND WASTEFUL! 
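For comparison, the copy could be batched rather than done row by row. This is only a sketch: copy_level is a hypothetical helper, it reuses the Path and NodeTraversal fields from this patch, and it assumes a backend where bulk_create returns primary keys (PostgreSQL does; SQLite does not).

    def copy_level(previous_level, me):
        """Clone every Path of previous_level, and every NodeTraversal of
        those paths, into the new ZoomLevel `me` using a handful of bulk
        queries instead of one query per row."""
        old_paths = list(previous_level.paths.all())
        new_paths = Path.objects.bulk_create(
            [Path(accession=p.accession, zoom=me) for p in old_paths])
        by_old_id = {old.id: new for old, new in zip(old_paths, new_paths)}
        copies = [NodeTraversal(path=by_old_id[t.path_id], node=t.node,
                                strand=t.strand, order=t.order)
                  for t in NodeTraversal.objects.filter(path__in=old_paths)]
        NodeTraversal.objects.bulk_create(copies, 100)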
+ # Makes a full copy of every traversal in the Path so new copies can be edited + copies = [NodeTraversal(path=p, node=traverse.node, strand=traverse.strand, order=traverse.order) + for traverse in path.nodetraversal_set.all()] + NodeTraversal.objects.bulk_create(copies, 100) return me @@ -88,6 +84,25 @@ def nodes(self) -> QuerySet: def node(self, node_name): return Node.objects.get(name=node_name, zoom=self) + def path_xrange(self): + """Lazy evaluation of Paths to ensure that we don't overrun SQL""" + start = self.paths.aggregate(Min('id'))['id__min'] + stop = self.paths.aggregate(Max('id'))['id__max'] + 1 + for path_id in range(start, stop): + try: + yield Path.objects.get(id=path_id) + except Path.DoesNotExist: + pass # path ids sometimes will have gaps + + def nodes_xrange(self): + """Lazy evaluation of Nodes to ensure that we don't overrun SQL""" + start = self.nodes.aggregate(Min('id'))['id__min'] + stop = self.nodes.aggregate(Max('id'))['id__max'] + 1 + for path_id in range(start, stop): + try: + yield Node.objects.get(id=path_id) + except Node.DoesNotExist: + pass # node ids sometimes will have gaps class GraphManager(models.Manager): @@ -301,7 +316,7 @@ def summary_child(self): if self.zoom.zoom == 0: return None previous_zoom = self.zoom.graph.zoomlevel_set.get(zoom=self.zoom.zoom - 1) - return previous_zoom.node_set.get(accession=self.accession) + return previous_zoom.path_set.get(accession=self.accession) def append_gfa_nodes(self, nodes): assert hasattr(nodes[0], 'orient') and hasattr(nodes[0], 'name'), 'Expecting gfapy.Gfa.path' diff --git a/Graph/test.py b/Graph/test.py index 4801adf..8af2213 100644 --- a/Graph/test.py +++ b/Graph/test.py @@ -56,9 +56,10 @@ def test_graph_factory(self): def test_summary_storage(self): graph = self.test_example_graph() - + zoom0 = graph.nucleotide_level zoom1 = ZoomLevel.objects.create(graph=graph, zoom=1, blank_layer=False) - path1 = zoom1.paths.filter(accession='a').first() + path1 = zoom1.paths.get(accession='a') + self.assertEqual(zoom1.paths.count(), 5) new_node = Node.objects.create(seq='ACGTCGGA', name='2*2', zoom=zoom1) base_nodes = graph.nucleotide_level.nodes base_nodes.filter(name__in=['1', '2', '4']).update(summarized_by=new_node) @@ -72,11 +73,10 @@ def test_summary_storage(self): self.assertTrue(path1.summary_child), # "Path should be linked to its child." 
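For reference, the path_xrange/nodes_xrange helpers added above walk a raw id range and absorb the gaps with try/except; Django's QuerySet.iterator() gives the same lazy, memory-bounded evaluation server-side without the per-id queries, and the final commit in this series converges on it. A sketch of the equivalent:

    def nodes_xrange(self):
        """Lazy evaluation of Nodes to ensure that we don't overrun SQL."""
        return self.nodes.all().iterator(chunk_size=100)

iterator() streams rows in chunks and skips QuerySet caching, so gaps in the id sequence never come up.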
self.assertEqual(zoom1.paths.count(), 5) # ZoomLevel - zoom0 = graph.nucleotide_level - self.assertEqual(len(zoom1), len(zoom0) - 2) - self.assertEqual(zoom1.node_ids(),set(range(23, 42)),)#{23,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41} + self.assertEqual(len(zoom1), len(zoom0) - 3 + 1) + self.assertEqual(zoom1.node_ids(),set(range(23, 42)) - {24},) # 24 deleted self.assertEqual(zoom0.node_ids(), set(range(1,21))) - names = [x.name for x in zoom1.nodes] + names = zoom1.nodes.values_list('name', flat=True) self.assertEqual(names, ['5', '6', '2*2']) sequences = [x.seq for x in zoom1.nodes] self.assertEqual(sequences, ['C', 'AGTACG', 'ACGTCGGA']) diff --git a/HaploBlocker/haplonetwork.py b/HaploBlocker/haplonetwork.py index 92744cc..853785c 100644 --- a/HaploBlocker/haplonetwork.py +++ b/HaploBlocker/haplonetwork.py @@ -103,34 +103,6 @@ def populate_transitions(simplified_individuals): # node.upstream[Node.NOTHING] += 1 -def update_transition(node): - """Only transition values for nodes already listed in upstream and downstream will be calculated.""" - if not node.is_nothing(): - update_stream_transitions(node, 'upstream') - update_stream_transitions(node, 'downstream') - - return node - - -def update_stream_transitions(node, stream): - """This will updated either upstream or downstream transition counts based on the - the value of 'stream'. This is a meta-programming function that requires the exact - name of the class field 'upstream' or 'downstream' to work.""" - g = getattr # - running = g(node, stream).keys() - setattr(node, stream, defaultdict(lambda: 0)) - for n in running: - if not n.is_nothing(): - g(node, stream)[n] = len(node.specimens.intersection(n.specimens)) - accounted_upstream = sum(g(node, stream).values()) - g(node, stream)[Node.NOTHING] - g(node, stream)[Node.NOTHING] = len(node.specimens) - accounted_upstream - assert all([count > -1 for count in g(node, stream).values()]), node.details() - # Cleans up old keys including Node.NOTHING - to_be_deleted = {key for key, count in g(node, stream).items() if count == 0} - for key in to_be_deleted: - g(node, stream).pop(key, None) - - def simple_merge(current_level: ZoomLevel) -> bool: """ Side effects full_graph by merging any consecutive nodes that have identical specimens and removing the redundant my_node from full_graph. @@ -150,35 +122,31 @@ def simple_merge(current_level: ZoomLevel) -> bool: # Node.objects.filter(path_set == ) modification_happened = False path_ids = current_level.paths.values_list('id', flat=True) - for node_id in range(1, current_level.node_set.order_by('-id').first().id): - try: - my_node = Node.objects.get(id=node_id, zoom=current_level) - - # only one Node Downstream, no matter the number of specimens - if len(my_node.downstream_ids()) == 1: - d = my_node.nodetraversal_set.first().downstream() - if d: - modification_happened = True - next_node = d.node # fetched from DB - if my_node.nodetraversal_set.count() == next_node.nodetraversal_set.count(): # Not a complete guarantee... - # Torsten deletes my_node and modifies next_node - merged_node = Node.objects.create(name=f'{my_node.name}*{current_level.zoom}', - zoom=current_level) - for x in [my_node, next_node]: - x.summarized_by = merged_node - x.save() # TODO: doesn't work because reading and writing same layer. 
next_node gets deleted soon - - # edit existing traversals - next_node.nodetraversal_set.\ - filter(path_id__in=path_ids).\ - update(node_id=merged_node.id) - - # delete my_node and all associates - query = my_node.nodetraversal_set.filter(path_id__in=path_ids) - query._raw_delete(query.db) # https://www.nickang.com/fastest-delete-django/ - # TODO: merged_node.start = my_node.start, length = my_node.length + next_node.length - except Node.DoesNotExist: - pass # node ids are not entirely dense + for my_node in current_level.nodes_xrange(): + + # only one Node Downstream, no matter the number of specimens + if len(my_node.downstream_ids()) == 1: + d = my_node.nodetraversal_set.first().downstream() + if d: + modification_happened = True + next_node = d.node # fetched from DB + if my_node.nodetraversal_set.count() == next_node.nodetraversal_set.count(): # Not a complete guarantee... + # Torsten deletes my_node and modifies next_node + merged_node = Node.objects.create(name=f'{my_node.name}*{current_level.zoom}', + zoom=current_level) + for x in [my_node, next_node]: + x.summarized_by = merged_node + x.save() # TODO: doesn't work because reading and writing same layer. next_node gets deleted soon + + # edit existing traversals + next_node.nodetraversal_set.\ + filter(path_id__in=path_ids).\ + update(node_id=merged_node.id) + + # delete my_node and all associates + query = my_node.nodetraversal_set.filter(path_id__in=path_ids) + query._raw_delete(query.db) # https://www.nickang.com/fastest-delete-django/ + # TODO: merged_node.start = my_node.start, length = my_node.length + next_node.length return modification_happened @@ -205,7 +173,7 @@ def neglect_nodes(zoom_level : ZoomLevel, deletion_cutoff=FILTER_THRESHOLD): # next_level, zoom = prep_next_summary_layer(current_level) - for node in zoom_level.nodes: # TODO optimize distinct count + for node in zoom_level.nodes_xrange(): # TODO optimize distinct count if len(node.specimens()) <= deletion_cutoff: delete_node(node, deletion_cutoff, zoom_level) @@ -254,7 +222,7 @@ def split_groups(zoom_level: ZoomLevel): TODO: Ideally, the database would retain some record of how many nucleotides are shared between the two new haplotype nodes.""" - for node in zoom_level.nodes: + for node in zoom_level.nodes_xrange(): # check if all transition upstream match with one of my downstream nodes if len(node.specimens()) > 0: # Matchup upstream and downstream with specimen identities From 6c3c8793ddb13210a1ab8cd87bc93fe78993d4b8 Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Wed, 6 Nov 2019 10:39:19 +0000 Subject: [PATCH 14/15] First working version of random gfa generator. 
All positive strand --- GFA_Generator.ipynb | 183 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 GFA_Generator.ipynb diff --git a/GFA_Generator.ipynb b/GFA_Generator.ipynb new file mode 100644 index 0000000..f183072 --- /dev/null +++ b/GFA_Generator.ipynb @@ -0,0 +1,183 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "H\tVN:Z:1.0\n", + "S\t1\tTTTT\n", + "S\t2\tAGTCGCAT\n", + "S\t3\tGTTG\n", + "S\t4\tCCTA\n", + "P\t7\t2+,2+,4+,4+\t*,*,*,*\n", + "P\t8\t4+,1+,2+,2+\t*,*,*,*\n", + "P\t9\t1+,4+,4+,3+\t*,*,*,*\n", + "P\t10\t3+,1+,4+,4+\t*,*,*,*\n", + "P\t11\t2+,3+,4+,2+\t*,*,*,*\n", + "L\t1\t+\t2\t+\t0M\n", + "L\t2\t+\t3\t+\t0M\n", + "L\t2\t+\t4\t+\t0M\n", + "L\t2\t+\t2\t+\t0M\n", + "L\t4\t+\t3\t+\t0M\n", + "L\t4\t+\t4\t+\t0M\n", + "L\t4\t+\t1\t+\t0M\n", + "L\t3\t+\t4\t+\t0M\n", + "L\t1\t+\t4\t+\t0M\n", + "L\t3\t+\t1\t+\t0M\n", + "L\t4\t+\t2\t+\t0M\n" + ] + } + ], + "source": [ + "from random import choice\n", + "\n", + "\n", + "def generate_random_gfa(size = 5, printer=print):\n", + " printer('H\tVN:Z:1.0')\n", + " printer(\"\"\"S\t1\tTTTT\n", + "S\t2\tAGTCGCAT\n", + "S\t3\tGTTG\n", + "S\t4\tCCTA\"\"\")\n", + "\n", + " nodes = ['1', '2', '3', '4']\n", + "\n", + " paths = []\n", + " for path in range(size):\n", + " my_path = [choice(nodes) for i in range(4)]\n", + " # for step in range(4):\n", + " paths.append(my_path)\n", + "\n", + " links = set()\n", + " for y, p in enumerate(paths):\n", + " for i in range(len(p)-1):\n", + " links.add('L\\t'+p[i]+'\\t+\\t'+p[i+1]+'\\t+\\t0M')\n", + "\n", + " printer('\\t'.join(['P', str(7+y), ','.join([n+'+' for n in p]), ','.join('*'*len(p))]))\n", + "\n", + " for l in links:\n", + " printer(l)\n", + "generate_random_gfa()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def write_gfa(size):\n", + " name = r'D:\\josiah\\Documents\\1001G\\Graph_Genome_Browser\\scalability_test\\4_node_' + str(size) + '_paths.gfa'\n", + " with open(name, 'w') as f:\n", + " lines = []\n", + " generate_random_gfa(size, lines.append)\n", + " f.write('\\n'.join(lines))" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "write_gfa(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "write_gfa(100)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "write_gfa(1000)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "ImportError", + "evalue": "No module named 'gfapy'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[1;32mimport\u001b[0m \u001b[0mgfapy\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mg1\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mgfapy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mGfa\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrom_file\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"4_node_1000_paths.gfa\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mg1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[0;31mImportError\u001b[0m: No module named 'gfapy'" + ] + } + ], + "source": [ + "import gfapy\n", + "g1 = gfapy.Gfa.from_file(\"4_node_1000_paths.gfa\")\n", + "str(g1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [] + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From c7b8af03431ac12333513ad8e9127d66f67ab86d Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Thu, 21 Nov 2019 17:28:31 +0100 Subject: [PATCH 15/15] Unknown: Dangling changes for halpo_conversion --- GFA_Generator.ipynb | 235 +++++++++++++++++++++++++++++++++----------- Graph/models.py | 16 +-- 2 files changed, 180 insertions(+), 71 deletions(-) diff --git a/GFA_Generator.ipynb b/GFA_Generator.ipynb index f183072..fc7add7 100644 --- a/GFA_Generator.ipynb +++ b/GFA_Generator.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 42, + "execution_count": 74, "metadata": { "collapsed": false }, @@ -16,30 +16,31 @@ "S\t2\tAGTCGCAT\n", "S\t3\tGTTG\n", "S\t4\tCCTA\n", - "P\t7\t2+,2+,4+,4+\t*,*,*,*\n", - "P\t8\t4+,1+,2+,2+\t*,*,*,*\n", - "P\t9\t1+,4+,4+,3+\t*,*,*,*\n", - "P\t10\t3+,1+,4+,4+\t*,*,*,*\n", - "P\t11\t2+,3+,4+,2+\t*,*,*,*\n", - "L\t1\t+\t2\t+\t0M\n", - "L\t2\t+\t3\t+\t0M\n", - "L\t2\t+\t4\t+\t0M\n", - "L\t2\t+\t2\t+\t0M\n", + "P\t7\t3-,4+,3+,3+\t*,*,*,*\n", + "P\t8\t3+,2+,4-,3+\t*,*,*,*\n", + "P\t9\t3+,3+,1+,2+\t*,*,*,*\n", + "P\t10\t2+,1-,1+,2+\t*,*,*,*\n", + "P\t11\t1-,1+,3+,2+\t*,*,*,*\n", + "L\t1\t+\t3\t+\t0M\n", "L\t4\t+\t3\t+\t0M\n", - "L\t4\t+\t4\t+\t0M\n", - "L\t4\t+\t1\t+\t0M\n", - "L\t3\t+\t4\t+\t0M\n", - "L\t1\t+\t4\t+\t0M\n", + "L\t1\t-\t1\t+\t0M\n", + "L\t3\t-\t4\t+\t0M\n", + "L\t3\t+\t2\t+\t0M\n", + "L\t3\t+\t3\t+\t0M\n", + "L\t1\t+\t2\t+\t0M\n", "L\t3\t+\t1\t+\t0M\n", - "L\t4\t+\t2\t+\t0M\n" + "L\t4\t-\t3\t+\t0M\n", + "L\t2\t+\t1\t-\t0M\n", + "L\t2\t+\t4\t-\t0M\n", + "11 Edges\n" ] } ], "source": [ - "from random import choice\n", + "from random import choice, randint\n", "\n", "\n", - "def generate_random_gfa(size = 5, printer=print):\n", + "def generate_random_gfa(rows = 5, printer=print, stranded=True):\n", " printer('H\tVN:Z:1.0')\n", " printer(\"\"\"S\t1\tTTTT\n", "S\t2\tAGTCGCAT\n", @@ -47,97 +48,229 @@ "S\t4\tCCTA\"\"\")\n", "\n", " nodes = ['1', '2', '3', '4']\n", - "\n", + " chance = 10 if stranded else 0\n", " paths = []\n", - " for path in range(size):\n", - " my_path = [choice(nodes) for i in range(4)]\n", + " for path in range(rows):\n", + " length = 4\n", + " my_path = [choice(nodes) + ('+' if randint(0,chance) != 9 else '-') for i in range(length)]\n", " # for step in range(4):\n", " paths.append(my_path)\n", "\n", 
" links = set()\n", " for y, p in enumerate(paths):\n", " for i in range(len(p)-1):\n", - " links.add('L\\t'+p[i]+'\\t+\\t'+p[i+1]+'\\t+\\t0M')\n", + " links.add('L\\t'+p[i][:-1]+'\\t'+p[i][-1:]+'\\t'+p[i+1][:-1]+'\\t'+p[i+1][-1:]+'\\t0M')\n", "\n", - " printer('\\t'.join(['P', str(7+y), ','.join([n+'+' for n in p]), ','.join('*'*len(p))]))\n", + " printer('\\t'.join(['P', str(7+y), ','.join([n for n in p]), ','.join('*'*len(p))]))\n", "\n", " for l in links:\n", " printer(l)\n", + " print(len(links), \"Edges\")\n", "generate_random_gfa()" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 75, "metadata": { "collapsed": true }, "outputs": [], "source": [ - "def write_gfa(size):\n", - " name = r'D:\\josiah\\Documents\\1001G\\Graph_Genome_Browser\\scalability_test\\4_node_' + str(size) + '_paths.gfa'\n", + "def write_gfa(size, stranded=True):\n", + " mark = '_stranded' if stranded else '_no_inversions'\n", + " name = r'D:\\josiah\\Documents\\1001G\\Graph_Genome_Browser\\scalability_test\\second_batch\\\\' + str(size) + '_paths'+mark+'.gfa'\n", " with open(name, 'w') as f:\n", " lines = []\n", - " generate_random_gfa(size, lines.append)\n", + " generate_random_gfa(size, lines.append, stranded)\n", " f.write('\\n'.join(lines))" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 77, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16 Edges\n" + ] + } + ], + "source": [ + "write_gfa(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "28 Edges\n" + ] + } + ], "source": [ "write_gfa(20)" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 79, "metadata": { - "collapsed": true + "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "45 Edges\n" + ] + } + ], "source": [ "write_gfa(100)" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 80, "metadata": { - "collapsed": true + "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "61 Edges\n" + ] + } + ], "source": [ "write_gfa(1000)" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 81, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14 Edges\n" + ] + } + ], + "source": [ + "write_gfa(10, False)" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15 Edges\n" + ] + } + ], + "source": [ + "write_gfa(20, False)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16 Edges\n" + ] + } + ], + "source": [ + "write_gfa(100, False)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, "metadata": { "collapsed": false }, "outputs": [ { - "ename": "ImportError", - "evalue": "No module named 'gfapy'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[1;32mimport\u001b[0m \u001b[0mgfapy\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mg1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgfapy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mGfa\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrom_file\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"4_node_1000_paths.gfa\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mg1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mImportError\u001b[0m: No module named 'gfapy'"
     ]
    }
   ],
   "source": [
    "import gfapy\n",
    "g1 = gfapy.Gfa.from_file(r'D:\\josiah\\Documents\\1001G\\Graph_Genome_Browser\\scalability_test\\5_node_1000_paths.gfa')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from random import choice, randint\n",
    "randint(0,0)"
   ]
  },
  {
@@ -156,18 +289,6 @@
    "language": "python",
    "name": "python3"
   },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.4.3"
-  },
   "pycharm": {
    "stem_cell": {
     "cell_type": "raw",
diff --git a/Graph/models.py b/Graph/models.py
index cbc695c..1d4688a 100644
--- a/Graph/models.py
+++ b/Graph/models.py
@@ -86,23 +86,11 @@ def node(self, node_name):
 
     def path_xrange(self):
         """Lazy evaluation of Paths to ensure that we don't overrun SQL"""
-        start = self.paths.aggregate(Min('id'))['id__min']
-        stop = self.paths.aggregate(Max('id'))['id__max'] + 1
-        for path_id in range(start, stop):
-            try:
-                yield Path.objects.get(id=path_id)
-            except Path.DoesNotExist:
-                pass  # path ids sometimes will have gaps
+        return self.paths.all().iterator(chunk_size=100)
 
     def nodes_xrange(self):
         """Lazy evaluation of Nodes to ensure that we don't overrun SQL"""
-        start = self.nodes.aggregate(Min('id'))['id__min']
-        stop = self.nodes.aggregate(Max('id'))['id__max'] + 1
-        for path_id in range(start, stop):
-            try:
-                yield Node.objects.get(id=path_id)
-            except Node.DoesNotExist:
-                pass  # node ids sometimes will have gaps
+        return self.nodes.all().iterator(chunk_size=100)
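For use outside the notebook, the generator committed in GFA_Generator.ipynb condenses to the standalone script below. The logic follows the committed cells; the SEGMENTS constant, the write_gfa signature, and the __main__ guard are illustrative packaging, not part of the patch.

    from random import choice, randint

    SEGMENTS = {'1': 'TTTT', '2': 'AGTCGCAT', '3': 'GTTG', '4': 'CCTA'}

    def generate_random_gfa(rows=5, printer=print, stranded=True):
        """Emit a small random GFA 1.0 graph: four fixed segments, `rows`
        random paths of length four, and the links those paths imply."""
        printer('H\tVN:Z:1.0')
        for name, seq in SEGMENTS.items():
            printer('\t'.join(['S', name, seq]))
        chance = 10 if stranded else 0  # roughly 1 step in 11 inverted when stranded
        paths = [[choice(sorted(SEGMENTS)) + ('+' if randint(0, chance) != 9 else '-')
                  for _ in range(4)]
                 for _ in range(rows)]
        links = set()  # deduplicated L lines implied by consecutive path steps
        for y, p in enumerate(paths):
            for i in range(len(p) - 1):
                links.add('\t'.join(['L', p[i][:-1], p[i][-1], p[i + 1][:-1], p[i + 1][-1], '0M']))
            printer('\t'.join(['P', str(7 + y), ','.join(p), ','.join('*' * len(p))]))
        for link in sorted(links):
            printer(link)
        print(len(links), 'Edges')

    def write_gfa(rows, filename, stranded=True):
        lines = []
        generate_random_gfa(rows, lines.append, stranded)
        with open(filename, 'w') as f:
            f.write('\n'.join(lines))

    if __name__ == '__main__':
        write_gfa(20, '20_paths_stranded.gfa')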