diff --git a/.travis.yml b/.travis.yml index 05d839f..42ba70b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,10 +8,9 @@ python: # Command to install dependencies, e.g. pip install -r requirements_dev.txt --use-mirrors install: - pip install -r requirements_dev.txt - - python manage.py migrate + - python manage.py migrate Graph env: - - DJANGO_VERSION=2.2.1 - DJANGO_SETTINGS_MODULE=vgbrowser.settings # Command to run tests, e.g. python setup.py test diff --git a/GFA_Generator.ipynb b/GFA_Generator.ipynb new file mode 100644 index 0000000..fc7add7 --- /dev/null +++ b/GFA_Generator.ipynb @@ -0,0 +1,304 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "H\tVN:Z:1.0\n", + "S\t1\tTTTT\n", + "S\t2\tAGTCGCAT\n", + "S\t3\tGTTG\n", + "S\t4\tCCTA\n", + "P\t7\t3-,4+,3+,3+\t*,*,*,*\n", + "P\t8\t3+,2+,4-,3+\t*,*,*,*\n", + "P\t9\t3+,3+,1+,2+\t*,*,*,*\n", + "P\t10\t2+,1-,1+,2+\t*,*,*,*\n", + "P\t11\t1-,1+,3+,2+\t*,*,*,*\n", + "L\t1\t+\t3\t+\t0M\n", + "L\t4\t+\t3\t+\t0M\n", + "L\t1\t-\t1\t+\t0M\n", + "L\t3\t-\t4\t+\t0M\n", + "L\t3\t+\t2\t+\t0M\n", + "L\t3\t+\t3\t+\t0M\n", + "L\t1\t+\t2\t+\t0M\n", + "L\t3\t+\t1\t+\t0M\n", + "L\t4\t-\t3\t+\t0M\n", + "L\t2\t+\t1\t-\t0M\n", + "L\t2\t+\t4\t-\t0M\n", + "11 Edges\n" + ] + } + ], + "source": [ + "from random import choice, randint\n", + "\n", + "\n", + "def generate_random_gfa(rows = 5, printer=print, stranded=True):\n", + " printer('H\tVN:Z:1.0')\n", + " printer(\"\"\"S\t1\tTTTT\n", + "S\t2\tAGTCGCAT\n", + "S\t3\tGTTG\n", + "S\t4\tCCTA\"\"\")\n", + "\n", + " nodes = ['1', '2', '3', '4']\n", + " chance = 10 if stranded else 0\n", + " paths = []\n", + " for path in range(rows):\n", + " length = 4\n", + " my_path = [choice(nodes) + ('+' if randint(0,chance) != 9 else '-') for i in range(length)]\n", + " # for step in range(4):\n", + " paths.append(my_path)\n", + "\n", + " links = set()\n", + " for y, p in enumerate(paths):\n", + " for i in range(len(p)-1):\n", + " links.add('L\\t'+p[i][:-1]+'\\t'+p[i][-1:]+'\\t'+p[i+1][:-1]+'\\t'+p[i+1][-1:]+'\\t0M')\n", + "\n", + " printer('\\t'.join(['P', str(7+y), ','.join([n for n in p]), ','.join('*'*len(p))]))\n", + "\n", + " for l in links:\n", + " printer(l)\n", + " print(len(links), \"Edges\")\n", + "generate_random_gfa()" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def write_gfa(size, stranded=True):\n", + " mark = '_stranded' if stranded else '_no_inversions'\n", + " name = r'D:\\josiah\\Documents\\1001G\\Graph_Genome_Browser\\scalability_test\\second_batch\\\\' + str(size) + '_paths'+mark+'.gfa'\n", + " with open(name, 'w') as f:\n", + " lines = []\n", + " generate_random_gfa(size, lines.append, stranded)\n", + " f.write('\\n'.join(lines))" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16 Edges\n" + ] + } + ], + "source": [ + "write_gfa(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "28 Edges\n" + ] + } + ], + "source": [ + "write_gfa(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": 
[ + "45 Edges\n" + ] + } + ], + "source": [ + "write_gfa(100)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "61 Edges\n" + ] + } + ], + "source": [ + "write_gfa(1000)" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14 Edges\n" + ] + } + ], + "source": [ + "write_gfa(10, False)" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15 Edges\n" + ] + } + ], + "source": [ + "write_gfa(20, False)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16 Edges\n" + ] + } + ], + "source": [ + "write_gfa(100, False)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16 Edges\n" + ] + } + ], + "source": [ + "write_gfa(1000, False)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import gfapy\n", + "g1 = gfapy.Gfa.from_file(r'D:\\josiah\\Documents\\1001G\\Graph_Genome_Browser\\scalability_test\\5_node_1000_paths.gfa')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from random import choice, randint\n", + "randint(0,0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [] + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Graph/gfa.py b/Graph/gfa.py index 671d1a9..106367a 100644 --- a/Graph/gfa.py +++ b/Graph/gfa.py @@ -6,7 +6,7 @@ import io import os import tempfile -from Graph.models import * +from Graph.models import Node, Path, GraphGenome def pairwise(iterable): @@ -103,30 +103,33 @@ def save_as_gfa(self, file: str): self.gfa.to_file(file) @classmethod - def from_graph(cls, graph: GraphGenome): + def from_graph(cls, graph: GraphGenome): # TODO: should be given ZoomLevel instead """Constructs the lines of a GFA file listing paths, then sequence nodes in arbitrary order.""" gfa = gfapy.Gfa() - for path in graph.paths: - node_series = ",".join([traverse.node.name + traverse.strand for traverse in path.nodes]) - gfa.add_line('\t'.join(['P', path.accession, node_series, ",".join(['*' for _ in path.nodes])])) - for node in graph.nodes: # in no particular order + for path in graph.paths.all(): + visits = [] + # example of using lazy queries and values_list for fast lookup + node_infos = path.nodes.values_list('node_id', 'strand', named=True) + for traverse in node_infos: + name = Node.objects.values_list('name', flat=True).get(id=traverse.node_id) # fast lookup + visits.append(name + traverse.strand) + 
node_series = ",".join(visits) + connections = ",".join(['*'] * path.nodes.count()) # count -1? + gfa.add_line('\t'.join(['P', path.accession, node_series, connections])) + for node in graph.nucleotide_level.nodes_xrange(): # in no particular order gfa.add_line('\t'.join(['S', str(node.name), node.seq])) return cls(gfa, "from Graph") - def to_paths(self) -> GraphGenome: - graph = self.to_graph() - return graph.paths - def to_graph(self) -> GraphGenome: """Create parent object for this genome and save it in the database. This can create duplicates appended in Paths if it is called twice.""" - gdb = GraphGenome.objects.get_or_create(name=self.source_path)[0] + gdb = GraphGenome.objects.create(name=self.source_path) + z = gdb.nucleotide_level for segment in self.gfa.segments: - Node.objects.get_or_create(seq=segment.sequence, name=(segment.name), graph=gdb) + Node.objects.get_or_create(seq=segment.sequence, name=segment.name, zoom=z) for path in self.gfa.paths: - p = Path(accession=path.name, graph=gdb) - p.save() + p = Path.objects.create(accession=path.name, zoom=z) p.append_gfa_nodes(path.segment_names) return gdb diff --git a/Graph/migrations/0001_initial.py b/Graph/migrations/0001_initial.py index 0df4016..f90df1c 100644 --- a/Graph/migrations/0001_initial.py +++ b/Graph/migrations/0001_initial.py @@ -22,8 +22,9 @@ class Migration(migrations.Migration): migrations.CreateModel( name='Node', fields=[ + ('id', models.AutoField(auto_created=True, default=0, primary_key=True, serialize=False, verbose_name='ID')), ('seq', models.CharField(blank=True, max_length=255)), - ('name', models.CharField(max_length=15, primary_key=True, serialize=False)), + ('name', models.CharField(max_length=15,serialize=False)), ('graph', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='Graph.GraphGenome')), ], options={ diff --git a/Graph/migrations/0003_add_zoom_levels.py b/Graph/migrations/0003_add_zoom_levels.py new file mode 100644 index 0000000..46a8a15 --- /dev/null +++ b/Graph/migrations/0003_add_zoom_levels.py @@ -0,0 +1,49 @@ +# Generated by Django 2.2.1 on 2019-08-22 13:40 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('Graph', '0002_Path_unique_together'), + ] + + operations = [ + migrations.AddField( + model_name='node', + name='summarized_by', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='children', to='Graph.Node'), + ), + migrations.AddField( + model_name='path', + name='zoom', + field=models.IntegerField(default=0), + preserve_default=False, + ), + migrations.AlterField( + model_name='node', + name='id', + field=models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID'), + ), + migrations.AlterField( + model_name='node', + name='name', + field=models.CharField(max_length=15), + ), + migrations.AlterField( + model_name='nodetraversal', + name='order', + field=models.IntegerField(help_text='Defines the order a path lists traversals. 
The scale of order is not preserved between zoom levels.'), + ), + migrations.AlterUniqueTogether( + name='path', + unique_together={('graph', 'accession', 'zoom')}, + ), + migrations.AddField( + model_name='path', + name='summarized_by', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='summary_child', to='Graph.Path'), + ), + ] diff --git a/Graph/migrations/0004_ZoomLevel_objects.py b/Graph/migrations/0004_ZoomLevel_objects.py new file mode 100644 index 0000000..8c05de2 --- /dev/null +++ b/Graph/migrations/0004_ZoomLevel_objects.py @@ -0,0 +1,38 @@ +# Generated by Django 2.2.1 on 2019-08-23 16:23 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('Graph', '0003_add_zoom_levels'), + ] + + operations = [ + migrations.AlterUniqueTogether( + name='path', + unique_together=set(), + ), + migrations.RemoveField( + model_name='path', + name='graph', + ), + migrations.RemoveField( + model_name='path', + name='zoom', + ), + migrations.CreateModel( + name='ZoomLevel', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('zoom', models.IntegerField(default=0)), + ('graph', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='Graph.GraphGenome')), + ('paths', models.ManyToManyField(blank=True, help_text='One path for each accession in a zoom level. Paths can be reused unmodified in multiple levels.', related_name='zoom_levels', to='Graph.Path')), + ], + options={ + 'unique_together': {('graph', 'zoom')}, + }, + ), + ] diff --git a/Graph/migrations/0005_Node_zoomlevel.py b/Graph/migrations/0005_Node_zoomlevel.py new file mode 100644 index 0000000..04d7bd4 --- /dev/null +++ b/Graph/migrations/0005_Node_zoomlevel.py @@ -0,0 +1,28 @@ +# Generated by Django 2.2.1 on 2019-09-03 14:33 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('Graph', '0004_ZoomLevel_objects'), + ] + + operations = [ + migrations.AddField( + model_name='node', + name='zoom', + field=models.ForeignKey(default=1, on_delete=django.db.models.deletion.CASCADE, to='Graph.ZoomLevel'), + preserve_default=False, + ), + migrations.AlterUniqueTogether( + name='node', + unique_together={('zoom', 'name')}, + ), + migrations.RemoveField( + model_name='node', + name='graph', + ), + ] diff --git a/Graph/migrations/0006_Path_zoomlevel.py b/Graph/migrations/0006_Path_zoomlevel.py new file mode 100644 index 0000000..7e75858 --- /dev/null +++ b/Graph/migrations/0006_Path_zoomlevel.py @@ -0,0 +1,28 @@ +# Generated by Django 2.2.1 on 2019-09-03 14:53 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('Graph', '0005_Node_zoomlevel'), + ] + + operations = [ + migrations.RemoveField( + model_name='zoomlevel', + name='paths', + ), + migrations.AddField( + model_name='path', + name='zoom', + field=models.ForeignKey(default=1, on_delete=django.db.models.deletion.CASCADE, to='Graph.ZoomLevel'), + preserve_default=False, + ), + migrations.AlterUniqueTogether( + name='path', + unique_together={('accession', 'zoom')}, + ), + ] diff --git a/Graph/migrations/0007_NodeTraversal_unique_together_order.py b/Graph/migrations/0007_NodeTraversal_unique_together_order.py new file mode 100644 index 0000000..b2bfc8b --- /dev/null +++ 
b/Graph/migrations/0007_NodeTraversal_unique_together_order.py @@ -0,0 +1,17 @@ +# Generated by Django 2.2.1 on 2019-09-03 16:07 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('Graph', '0006_Path_zoomlevel'), + ] + + operations = [ + migrations.AlterUniqueTogether( + name='nodetraversal', + unique_together={('path', 'order')}, + ), + ] diff --git a/Graph/migrations/0008_remove_path_summarized_by.py b/Graph/migrations/0008_remove_path_summarized_by.py new file mode 100644 index 0000000..c0eeb2f --- /dev/null +++ b/Graph/migrations/0008_remove_path_summarized_by.py @@ -0,0 +1,17 @@ +# Generated by Django 2.2.1 on 2019-09-03 16:59 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('Graph', '0007_NodeTraversal_unique_together_order'), + ] + + operations = [ + migrations.RemoveField( + model_name='path', + name='summarized_by', + ), + ] diff --git a/Graph/models.py b/Graph/models.py index cbf06ca..1d4688a 100644 --- a/Graph/models.py +++ b/Graph/models.py @@ -1,5 +1,7 @@ -from typing import List, Iterable +from typing import List, Iterable, Set, Optional from django.db import models +from django.db.models import QuerySet, Max, Min + from Utils.models import CustomSaveManager @@ -14,29 +16,134 @@ class NodeMissingError(ValueError): pass +class ZoomLevelManager(models.Manager): + def create(self, graph, zoom, blank_layer=False) -> 'ZoomLevel': + """Creates a new ZoomLevel. Unless zoom == 0 or blank_layer is True, it also + copies the previous level's Nodes, Paths, and NodeTraversals in their entirety + so that the new copies can be edited without altering the layer below.""" + me = super(ZoomLevelManager, self).create(graph=graph, zoom=zoom) # now saved to DB + if zoom == 0 or blank_layer: + return me # We've done all the necessary work + # Copy previous level in entirety + previous_level = graph.zoomlevel_set.get(zoom=zoom - 1) + Node.objects.bulk_create([Node(name=n.name, zoom=me) for n in previous_level.nodes_xrange()], 100) + # TODO: This loop can be sped up by bulk_create and bulk_update + for path in previous_level.path_xrange(): + name = path.name + p = Path(accession=name, zoom=me) # new Path for a new level + p.save() + # TODO: this is REALLY SLOW AND WASTEFUL! + # Makes a full copy of every traversal in the Path so new copies can be edited + copies = [NodeTraversal(path=p, node=traverse.node, strand=traverse.strand, order=traverse.order) + for traverse in path.nodetraversal_set.all()] + NodeTraversal.objects.bulk_create(copies, 100) + return me + + +class ZoomLevel(models.Model): + """Each Graph Genome has multiple Zoom levels. A zoom level is a collection of Paths + at that zoom level. When a Path is unmodified from a summarization step, it can reuse + a previous Path without duplicating it. + Zoom was originally a single integer in Path, but it turned out a helper class for queries + would result in more robust code and allow Path reuse.""" + graph = models.ForeignKey('GraphGenome', on_delete=models.CASCADE) + zoom = models.IntegerField(default=0) # Zoom level starts at 0 for nucleotide level and moves up + # Nodes can be shared between zoom levels.
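+ # Usage sketch (hypothetical variable names): ZoomLevel.objects.create(graph=g, zoom=1) copies every + # Path, Node, and NodeTraversal from zoom 0 so that layer 1 can be summarized in place, + # while ZoomLevel.objects.create(graph=g, zoom=1, blank_layer=True) returns an empty layer.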
+ objects = ZoomLevelManager() + + class Meta: + unique_together = ['graph', 'zoom'] # one ZoomLevel at each integer + + @property + def paths(self) -> QuerySet: + return self.path_set + + def __len__(self): + """The number of unique Nodes in this level""" + return self.node_set.count() + + def __repr__(self): + """Warning: the representation strings are very sensitive to whitespace""" + return f"Graph: {self.graph.name}\n{self.paths.count()} paths {self.node_set.count()} nodes." + + def __eq__(self, other: 'ZoomLevel'): + if isinstance(other, ZoomLevel): + return other.node_set.count() == self.node_set.count() and \ + other.paths.count() == self.paths.count() # other.name == self.name and \ + return False + + def node_ids(self) -> Set[int]: # TODO: optimize + # path_ids = self.paths.values_list('id', flat=True) + # nodes = set(NodeTraversal.objects.filter(path_id__in=path_ids).values_list('node_id', flat=True)) + return set(self.node_set.values_list('id', flat=True)) + + @property + def nodes(self) -> QuerySet: + """Getter only. Shortcut for DB.""" + return self.node_set.all() + + def node(self, node_name): + return Node.objects.get(name=node_name, zoom=self) + + def path_xrange(self): + """Lazy evaluation of Paths to ensure that we don't overrun SQL""" + return self.paths.all().iterator(chunk_size=100) + + def nodes_xrange(self): + """Lazy evaluation of Nodes to ensure that we don't overrun SQL""" + return self.nodes.iterator(chunk_size=100) + + +class GraphManager(models.Manager): + """custom create() methods for consistency and convenience""" + def create(self, **kwargs): + """When a graph is created, it should also automatically create the first ZoomLevel.""" + graph = super(GraphManager, self).create(**kwargs) + ZoomLevel.objects.create(graph=graph, zoom=0) # nucleotide_level created + return graph + + +class GraphGenome(models.Model): + """Graph Genomes in general are defined as a collection of unordered Nodes which contain sequence, + and one Path per individual. A Path visits Nodes in any order, on either strand. This directed + graph then contains the relationships of all individuals to each other through shared sequence. + GraphGenomes in this database contain an extra concept not found in GFA or VG: Summarization. + GraphGenome has multiple ZoomLevels which each contain a full set of Paths. Paths at higher + zoom levels are shorter and visit summary nodes that explain or discard trivial variation to + allow the user to focus on larger graph structure.""" name = models.CharField(max_length=1000) + # Zoom levels are defined by ZoomLevel objects. + # Nodes can be shared between zoom levels. + + objects = GraphManager() @property - def paths(self): - """Getter only. Shortcut for DB.""" - return self.path_set.all() + def nucleotide_level(self) -> 'ZoomLevel': + return self.zoomlevel_set.filter(zoom=0).first() @property - def nodes(self): + def paths(self) -> QuerySet: """Getter only.
Shortcut for DB.""" + # return self.nucleotide_level.nodes - def __eq__(self, other): + def __eq__(self, other: 'GraphGenome'): if isinstance(other, GraphGenome): - return other.node_set.count() == self.node_set.count() and \ - other.path_set.count() == self.path_set.count() # other.name == self.name and \ + if other.zoomlevel_set.count() == self.zoomlevel_set.count(): + for a,b in zip(other.zoomlevel_set.all(),self.zoomlevel_set.all()): + if a != b: + return False + return True + return False return False + def __repr__(self): + """Warning: the representation strings are very sensitive to whitespace""" + return f"Graph: {self.name}\n{self.paths.count()} paths {self.nucleotide_level.node_set.count()} nodes at base in {self.zoomlevel_set.count()} layers." + @classmethod def load_from_xg(cls, file: str, xg_bin: str) -> 'GraphGenome': """XG is a graph format used by VG (variation graph). This method builds a @@ -52,26 +159,27 @@ def save_as_xg(self, file: str, xg_bin: str): gfa = GFA.from_graph(self) gfa.save_as_xg(file, xg_bin) - def append_node_to_path(self, node_id, strand, path_name) -> None: - """This is the preferred way to build a graph in a truly non-linear way. - Nodes will be created if necessary. - NodeTraversal is appended to Path (order dependent) and PathIndex is added to Node - (order independent).""" - if node_id not in self.nodes: # hasn't been created yet, need to retrieve from dictionary of guid - if isinstance(node_id, str): - self.nodes[node_id] = Node('', [], node_id) - else: - raise ValueError("Provide the id of the node, not", node_id) - Path.objects.get(name=path_name).append_node(Node.objects.get(name=node_id), strand) + + def highest_zoom_level(self): + return self.zoomlevel_set.all().order_by('-zoom').first().zoom class Node(models.Model): + """Nodes are the fundamental content carriers for sequence. A Path + can traverse nodes in any order, on both + and - strands. + Nodes can be utilized by Paths in multiple zoom levels. 
+ Only first order Nodes (zoom=0) have sequences, but all have names which can be used + to fetch the node from ZoomLevel.node().""" seq = models.CharField(max_length=255, blank=True) - name = models.CharField(primary_key=True, max_length=15) - graph = models.ForeignKey(GraphGenome, on_delete=models.CASCADE) + name = models.CharField(max_length=15) + zoom = models.ForeignKey(ZoomLevel, on_delete=models.CASCADE) + summarized_by = models.ForeignKey('Node', null=True, blank=True, on_delete=models.SET_NULL, + related_name='children') + # Break points for haploblocks - Erik Garrison - service for coordinates + # Start and stop positions for a node class Meta: - unique_together = ['graph', 'name'] + unique_together = ['zoom', 'name'] def __len__(self): return self.nodetraversal_set.count() @@ -93,6 +201,67 @@ def __hash__(self): def to_gfa(self, segment_id: int): return '\t'.join(['S', str(segment_id), self.seq]) + def specimens(self) -> Set[int]: + return set(self.traverses().values_list('path_id', flat=True)) + + def traverses(self) -> QuerySet: + return self.nodetraversal_set.all() + + def upstream_ids(self) -> Set[int]: + """Returns the node ids that are upstream of this node.""" + traverses = self.traverses() #.values_list('node_id', flat=True) + # Node.objects.filter(id__in=traverses).values_list('id', flat=True) + return set(t.upstream_id() for t in traverses) + + def upstream(self) -> Set['Node']: + """Returns the Nodes that are upstream of this node.""" + traverses = self.traverses() #.values_list('node_id', flat=True) + nodes = set() + for t in traverses: + n = t.upstream() # upstream may be None + if n: + nodes.add(n.node) + return nodes + + def downstream_ids(self) -> Set[int]: + """Returns the node ids that are downstream of this node.""" + traverses = self.traverses() + return set(t.downstream_id() for t in traverses) + + def downstream(self) -> Set['Node']: + """Returns the Nodes that are downstream of this node.""" + traverses = self.traverses() #.values_list('node_id', flat=True) + return set(t.downstream().node for t in traverses if t.downstream()) + + def __repr__(self): + return "N%s(%s)" % (str(self.name), self.seq) + + def details(self, zoom=0): + return f"""Node{self.name}: {self.seq} + upstream: {self.upstream_ids()} + downstream: {self.downstream_ids()} + {len(self.specimens())} specimens: {self.specimens()}""" + + def is_nothing(self): + """Useful in Node class definition to check for Node.NOTHING""" + return self.name == '-1' + + def validate(self): + """Returns True if at least one NodeTraversal references this Node; + raises an AssertionError for an orphaned Node otherwise.""" + if not self.nodetraversal_set.count(): + assert False, "Orphaned: No NodeTraversals are referencing this Node." + self.details() + return True + + +# Node.NOTHING is essentially "Not Applicable" when used to track transition rates between nodes. +# Node.NOTHING is an important concept to Haploblocker, used to track upstream and downstream +# that transitions to an unknown or untracked state. As neglect_nodes removes minority +# allele nodes, there will be specimens downstream that "come from" Node.NOTHING, meaning their +# full history is no longer tracked. Node.NOTHING is a regular exception case for missing data, +# the ends of chromosomes, and the gaps between haplotype blocks.
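+# A sketch of the neighbor API above, assuming a loaded graph (values are hypothetical): +#   node = level.node('3')  # ZoomLevel.node() fetches one Node by name within a layer +#   node.upstream_ids()     # e.g. {1, 2}: ids of the Nodes one step back on any path +#   node.downstream()       # set of Node objects one step forward +#   node.specimens()        # ids of every Path that traverses this node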
+Node.NOTHING = Node(id=-1, name='-1') # unsaved sentinel; matches is_nothing() + class Path(models.Model): """Paths represent the linear order of one particular individual (accession) as its genome @@ -100,13 +269,13 @@ sequences is the accession's genome. Create Paths first from accession names, then append them to Nodes to link together.""" accession = models.CharField(max_length=1000) # one path per accession - graph = models.ForeignKey(GraphGenome, on_delete=models.CASCADE) + zoom = models.ForeignKey(ZoomLevel, on_delete=models.CASCADE) - class Meta: - unique_together = ['graph', 'accession'] + class Meta: # we need a database check where each accession name only occurs once per zoom level + unique_together = ['accession', 'zoom'] def __getitem__(self, path_index): - return self.nodes[path_index] + return self.nodes.filter(order=path_index) def __repr__(self): """Warning: the representation strings are very sensitive to whitespace""" @@ -119,33 +288,51 @@ def __hash__(self): return hash(self.accession) @property - def nodes(self) -> Iterable['NodeTraversal']: - return NodeTraversal.objects.filter(path=self).order_by('order').all() + def nodes(self) -> QuerySet: + return NodeTraversal.objects.filter(path=self).order_by('order') + + @property + def graph(self) -> GraphGenome: + return self.zoom.graph + + @property + def name(self): + return self.accession + + @property + def summary_child(self): + if self.zoom.zoom == 0: + return None + previous_zoom = self.zoom.graph.zoomlevel_set.get(zoom=self.zoom.zoom - 1) + return previous_zoom.path_set.get(accession=self.accession) def append_gfa_nodes(self, nodes): assert hasattr(nodes[0], 'orient') and hasattr(nodes[0], 'name'), 'Expecting gfapy.Gfa.path' for node in nodes: - NodeTraversal(node=Node.objects.get(name=node.name), - path=self, strand=node.orient).save() + # TODO: could be sped up with bulk_create after checking and setting order + NodeTraversal.objects.create(node=Node.objects.get(name=node.name, zoom=self.zoom), + path=self, strand=node.orient) def append_node(self, node: Node, strand: str): """This is the preferred way to build a graph in a truly non-linear way. + NodeTraversal.order is guaranteed to be contiguous, making upstream() and downstream() work properly. NodeTraversal is appended to Path (order dependent) and PathIndex is added to Node (order independent).""" - NodeTraversal(node=node, path=self, strand=strand).save() + NodeTraversal.objects.create(node=node, path=self, strand=strand) # calculates order - # @classmethod - # def build(cls, name: str, seq_of_nodes: List[str]): - # node = Node.objects.create(seq) - # for p in paths: - # NodeTraversal.objects.create(node, path) - - def name(self): - return self.accession def to_gfa(self): return '\t'.join(['P', self.accession, "+,".join([x.node.name + x.strand for x in self.nodes]) + "+", ",".join(['*' for x in self.nodes])]) + class NodeTraversalManager(models.Manager): + def create(self, node, path, strand, order=None): + """Checks the largest 'order' value in the current path and increments by 1.
+ IMPORTANT NOTE: save() does not get called if you do NodeTraversal.objects.create + or get_or_create, so the order logic lives in this manager instead of save()""" + if order is None: + last_traversal = path.nodetraversal_set.all().order_by('-order').first() + order = 0 if not last_traversal else last_traversal.order + 1 + return super(NodeTraversalManager, self).create(node=node, path=path, strand=strand, order=order) class NodeTraversal(models.Model): @@ -153,9 +340,14 @@ node = models.ForeignKey(Node, db_index=True, on_delete=models.CASCADE) path = models.ForeignKey(Path, db_index=True, on_delete=models.CASCADE, help_text='') strand = models.CharField(choices=[('+', '+'),('-', '-')], default='+', max_length=1) - order = models.IntegerField(help_text='Defines the order a path lists traversals') # set automatically + # order is set automatically in NodeTraversalManager.create() + order = models.IntegerField(help_text='Defines the order a path lists traversals. ' + 'The scale of order is not preserved between zoom levels.') - objects = CustomSaveManager() + objects = NodeTraversalManager() + + class Meta: + unique_together = ['path', 'order'] def __repr__(self): if self.strand == '+': @@ -167,12 +359,50 @@ def __repr__(self): def __eq__(self, other): return self.node.id == other.node.id and self.strand == other.strand - def save(self, **kwargs): - """Checks the largest 'order' value in the current path and increments by 1. - IMPORTANT NOTE: save() does not get called if you do NodeTraverseal.objects.create - or get_or_create""" - if self.order is None: - last_traversal = self.path.nodetraversal_set.all().order_by('-order').first() - self.order = 0 if not last_traversal else last_traversal.order + 1 - super(NodeTraversal, self).save(**kwargs) - + def fetch_neighbor(self, target_index): + query = NodeTraversal.objects.filter(path=self.path, order=target_index).values_list('node__id', flat=True) + if query: + return query[0] + return -1 + + def upstream_id(self): + target_index = self.order - 1 + return self.neighbor_id(target_index, True) + + def downstream_id(self): + target_index = self.order + 1 + return self.neighbor_id(target_index, False) + + def neighbor_id(self, target_index: int, less_than: bool): + """Faster query that just returns the node_id, not the NodeTraversal.""" + try: # This query version can tolerate non-contiguous Path sequences + if less_than: + return NodeTraversal.objects.\ + filter(path=self.path, order__lte=target_index).\ + order_by('-order').values_list('node_id', flat=True).first() + return NodeTraversal.objects.\ + filter(path=self.path, order__gte=target_index).\ + order_by('order').values_list('node_id', flat=True).first() + except (NodeTraversal.DoesNotExist, IndexError): + return None + + def neighbor(self, target_index: int, less_than: bool) -> Optional['NodeTraversal']: + try: # This query version can tolerate non-contiguous Path sequences + if less_than: + return NodeTraversal.objects. \ + filter(path=self.path, order__lte=target_index). \ + order_by('-order').first() + return NodeTraversal.objects. \ + filter(path=self.path, order__gte=target_index). \ + order_by('order').first() + except (NodeTraversal.DoesNotExist, IndexError): + return None + + def upstream(self) -> Optional['NodeTraversal']: + """Slower query that returns the neighboring NodeTraversal. This can be chained.""" + return self.neighbor(self.order - 1, True) + + def downstream(self) -> Optional['NodeTraversal']: + """Slower query that returns the neighboring NodeTraversal.
This can be chained.""" + return self.neighbor(self.order + 1, False) diff --git a/Graph/sort.py b/Graph/sort.py index dbea358..83687a9 100644 --- a/Graph/sort.py +++ b/Graph/sort.py @@ -2,7 +2,7 @@ import dataclasses from typing import List, Set -from Graph.models import NodeTraversal, Path, Node +from Graph.models import NodeTraversal, Path, Node, GraphGenome @dataclasses.dataclass @@ -32,22 +32,21 @@ class DAGify: """ DAGify accepts a set of paths, and """ - def __init__(self, paths: List[Path], nodes={}): + def __init__(self, graph: GraphGenome): """ :type paths: List[Path], nodes: Set[Node] """ - self.paths = paths - self.nodes = nodes + self.graph_genome = graph def generate_profiles_with_minimizing_replications(self) -> (List[Profile], int): """ - Generate profiles with minimizing the number of node repication by trying to assign each path as a primary path one by one. + Generate profiles with minimizing the number of node replication by trying to assign each path as a primary path one by one. It returns profiles and whose number of replication. :return: Profiles and the number of replicated nodes. """ min_rep = sys.maxsize profile = [] - for i, _ in enumerate(self.paths): + for i in range(self.graph_genome.paths.count()): profile_candidate = self.generate_profiles(i) if min_rep > len([x.duplicate for x in profile_candidate if x.duplicate]): min_rep = len([x.duplicate for x in profile_candidate if x.duplicate]) @@ -61,12 +60,15 @@ def generate_profiles(self, primary_path_index: int = 0) -> List[Profile]: :return: a list of profiles """ profile = [] - for node_index in self.paths[primary_path_index].nodes: + # TODO: replace this with database queries. Possibly pass in path, rather than an primary_path_index. + raise NotImplementedError() + primary_path = self.paths[primary_path_index] + for node_index in primary_path.nodes: if node_index.strand == "+": - profile.append(Profile(node_index, [self.paths[primary_path_index]], [], {self.paths[primary_path_index]}, False)) + profile.append(Profile(node_index, [primary_path], [], {primary_path}, False)) else: profile.append( - Profile(node_index, [], [self.paths[primary_path_index]], {self.paths[primary_path_index]}, False)) + Profile(node_index, [], [primary_path], {primary_path}, False)) for i, path in enumerate(self.paths): if i == primary_path_index: continue diff --git a/Graph/test.py b/Graph/test.py index 9a44150..8af2213 100644 --- a/Graph/test.py +++ b/Graph/test.py @@ -1,15 +1,14 @@ import unittest -from datetime import datetime from django.test import TestCase -from typing import List import os from os.path import join from Graph.gfa import GFA -from Graph.models import Node, GraphGenome, Path +from Graph.models import Node, Path, NodeTraversal, ZoomLevel from Graph.sort import DAGify # Define the working directory +from Graph.utils import build_graph_from_slices from vgbrowser.settings import BASE_DIR PATH_TO_TEST_DATA = join(BASE_DIR, "test_data") location_of_xg = join(BASE_DIR, "test_data","xg") @@ -19,34 +18,6 @@ x, y, z = 'x', 'y', 'z' -def build_from_test_slices(cmd: List): - """This factory uses test data shorthand for linear graph slices to build - a database GraphGenome with all the necessary Paths and Nodes. Path order populated in the order - that they are mentioned in the slices. Currently, this is + only and does not support non-linear - orderings. 
Use Path.append_node() to build non-linear graphs.""" - if isinstance(cmd, str): - cmd = eval(cmd) - # preemptively grab all the path names from every odd list entry - graph = GraphGenome.objects.get_or_create(name='test_data')[0] # + str(datetime.now()) - node_count = 0 - paths = {key for sl in cmd for i in range(0, len(sl), 2) for key in sl[i + 1]} - path_objs = {} - for path_name in paths: - path_objs[path_name] = Path.objects.get_or_create(graph=graph, accession=path_name)[0] - for sl in cmd: - try: - for i in range(0, len(sl), 2): - paths_mentioned = [path_objs[key] for key in sl[i + 1]] - node, is_new = Node.objects.get_or_create(seq=sl[i], name=graph.name + str(node_count), graph=graph) - node_count += 1 - for path in paths_mentioned: - path.append_node(node, '+') - except IndexError: - raise IndexError("Expecting two terms: ", sl[0]) # sl[i:i+2]) - - return graph - - class GraphTest(TestCase): """Constructing a node with an existing Path object will modify that Path object (doubly linked) which means care must be taken when constructing Graphs. From factory_input we have an example of @@ -65,23 +36,55 @@ def test_equalities(self): def test_graph_factory(self): original_test = [['ACGT', {a, b, c, d}], - ['C', {a, b, d}, 'T', {c}], # SNP + ['C', {a, b, d}, 'T', {c}], # SNP ['GGA', {a, b, c, d}], # anchor - ['C', {a, b, d}], # [3] repeated from [1] SNP + ['C', {a, b, d}], # [3] repeated from [1] SNP ['AGTACG', {a, b, c}, 'CGTACT', {d}], # [4] different membership from [3] - ['TTG', {a, b, c, d}], # [5] anchor + ['TTG', {a, b, c, d}], # [5] anchor ['A', {a, b}, 'C', {d, e}, 'T', {c}], # [6] third allele ['GG', {a, b}, 'TT', {c, d}], # [7] equal size nodes ['C', {a, b, c, e}, 'T', {d}], # [8] path slip ['C', {a, b, e}, 'T', {c, d}], # [9] path slip ['C', {a, b, c}, 'T', {d}], # [10]path slip ['TATA', {a, b, c, d}]] # [11] anchor - g1, g2 = build_from_test_slices(original_test), build_from_test_slices(original_test) + g1, g2 = build_graph_from_slices(original_test), build_graph_from_slices(original_test) assert g1 == g2, \ ('\n' + repr(g1) + '\n' + repr(g2)) g_from_GFA = self.test_example_graph() # comes from matching assert g1 == g_from_GFA, repr(g1) + '\n' + repr(g_from_GFA) - + # TODO: stricter equality checks: currently just counting nodes + + def test_summary_storage(self): + graph = self.test_example_graph() + zoom0 = graph.nucleotide_level + zoom1 = ZoomLevel.objects.create(graph=graph, zoom=1, blank_layer=False) + path1 = zoom1.paths.get(accession='a') + self.assertEqual(zoom1.paths.count(), 5) + new_node = Node.objects.create(seq='ACGTCGGA', name='2*2', zoom=zoom1) + base_nodes = graph.nucleotide_level.nodes + base_nodes.filter(name__in=['1', '2', '4']).update(summarized_by=new_node) + assert new_node.children.count() == 3 + zoom1.nodes.filter(name__in=['1', '2', '4']).delete() # delete copies made redundant in next layer + for node in base_nodes.filter(name__in=['1', '2', '4']): + assert node.summarized_by_id == new_node.id + print(list(zoom1.nodes.all())) + self.assertEqual(NodeTraversal.objects.get(order=0, path=path1).downstream().downstream().node.name, '4') + self.assertEqual(graph.zoomlevel_set.count(), 2) + self.assertTrue(path1.summary_child, "Path should be linked to its child.")
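+ # The merge above replaced nucleotide nodes '1', '2', '4' with a single summary + # node at zoom 1, so the layer shrinks by three nodes and gains one; the id and + # name checks below pin down exactly which copies survived.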
+ self.assertEqual(zoom1.paths.count(), 5) + # ZoomLevel + self.assertEqual(len(zoom1), len(zoom0) - 3 + 1) + self.assertEqual(zoom1.node_ids(), set(range(23, 42)) - {24}) # 24 deleted + self.assertEqual(zoom0.node_ids(), set(range(1, 21))) + names = list(zoom1.nodes.values_list('name', flat=True)) + self.assertEqual(names, ['5', '6', '2*2']) + sequences = [x.seq for x in zoom1.nodes] + self.assertEqual(sequences, ['C', 'AGTACG', 'ACGTCGGA']) + + + # def test_zoom_level(self): + # graph = self.test_example_graph() + # node_ids @unittest.skip # DAGify has not been converted to databases yet. class DAGifyTest(TestCase): """ DAGify has not been converted to databases yet. """ def test_dagify(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "test.gfa")) - paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile = dagify.generate_profiles(0) # self.assertEqual([['CAAATAAG', {x,y,z}], ['A', {y,z}, 'G', {x}], ['C', {x,y,z}], ['TTG', {x,y,z}], ['A', {z}, 'G', {x,y}], ['AAATTTTCTGGAGTTCTAT', {x,y,z}], ['T', {x,y,z}], ['ATAT', {x,y,z}], ['T', {x,y,z}], ['CCAACTCTCTG', {x,y,z}]], graph) def test_dagify2(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "test2.gfa")) - paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile = dagify.generate_profiles(0) # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) # x,y,z,a = 'x', 'y', 'z', 'a' @@ -110,8 +113,8 @@ def test_dagify3(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "test3.gfa")) - paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() self.assertEqual(rep_count, 1) # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) @@ -119,8 +122,8 @@ def test_dagify_altpath(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "alternate_paths.gfa")) - paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() self.assertEqual(rep_count, 1) # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) @@ -128,8 +131,8 @@ def test_dagify_dup(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "duplicate.gfa")) - paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() self.assertEqual(rep_count, 2) # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) @@ -138,8 +141,8 @@ def test_unresolved_repreat(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "unresolved_repeat.gfa")) - paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile, rep_count =
dagify.generate_profiles_with_minimizing_replications() # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) # self.assertEqual(graph, []) @@ -156,8 +159,8 @@ def test_inversion(self): @unittest.skip("Inversion is unsupported") def test_nested_inversion(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "nested_inv.gfa")) - paths = gfa.to_paths() - dagify = DAGify(paths) + graph = gfa.to_graph() + dagify = DAGify(graph) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) # self.assertEqual(graph, []) @@ -166,7 +169,7 @@ def test_nested_inversion(self): def test_simple_inversion(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "simple_inv.gfa")) graph = gfa.to_graph() - dagify = DAGify(graph.paths) + dagify = DAGify(graph) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) # self.assertEqual(graph, [['CAAATAAG', {x,y}], ['AC', {x}, 'AC', {y}], ['G', {x, y}]]) @@ -189,7 +192,7 @@ def test_gfa(self): def test_load_gfa_to_graph(self): graph, gfa = self.make_graph_from_gfa() self.assertEqual(graph.paths.count(), 3) - self.assertEqual(graph.nodes.count(), 15) + self.assertEqual(graph.nucleotide_level.nodes.count(), 15) def make_graph_from_gfa(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "test.gfa")) @@ -213,10 +216,10 @@ def test_load_gfa_via_xg(self): graph2 = GFA.load_from_xg(join(PATH_TO_TEST_DATA, "test.xg"), location_of_xg) graph = graph2.to_graph() x,y,z = 'x','y','z' - self.assertEqual(graph, build_from_test_slices([['CAAATAAG', {x, y, z}], ['A', {y, z}, 'G', {x}], - ['C', {x, y, z}], ['TTG', {x, y, z}], - ['A', {z}, 'G', {x, y}], ['AAATTTTCTGGAGTTCTAT', {x, y, z}], ['T', {x, y, z}], - ['ATAT', {x, y, z}], ['T', {x, y, z}], ['CCAACTCTCTG', {x, y, z}]])) + self.assertEqual(graph, build_graph_from_slices([['CAAATAAG', {x, y, z}], ['A', {y, z}, 'G', {x}], + ['C', {x, y, z}], ['TTG', {x, y, z}], + ['A', {z}, 'G', {x, y}], ['AAATTTTCTGGAGTTCTAT', {x, y, z}], ['T', {x, y, z}], + ['ATAT', {x, y, z}], ['T', {x, y, z}], ['CCAACTCTCTG', {x, y, z}]])) @staticmethod def is_different(gfa1, gfa2): diff --git a/Graph/utils.py b/Graph/utils.py index df3e172..83f12e0 100644 --- a/Graph/utils.py +++ b/Graph/utils.py @@ -1,4 +1,7 @@ from collections import defaultdict +from typing import List + +from Graph.models import GraphGenome, Path, Node class keydefaultdict(defaultdict): @@ -7,4 +10,36 @@ def __missing__(self, key): raise KeyError( key ) else: ret = self[key] = self.default_factory(key) - return ret \ No newline at end of file + return ret + + +def build_graph_from_slices(cmd: List, graph_name='test_data') -> GraphGenome: + """This factory uses test data shorthand for linear graph slices to build + a database GraphGenome with all the necessary Paths and Nodes. Path order populated in the order + that they are mentioned in the slices. Currently, this is + only and does not support non-linear + orderings. 
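For example (slice shorthand from the tests), ['C', {a, b, d}, 'T', {c}] is one slice describing a SNP: node 'C' is traversed by paths a, b, and d, while node 'T' is traversed by path c.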
Use Path.append_node() to build non-linear graphs.""" + if isinstance(cmd, str): + cmd = eval(cmd) + # preemptively grab all the path names from every odd list entry + graph = GraphGenome.objects.create(name=graph_name) + node_count = 0 + paths = {key for sl in cmd for i in range(0, len(sl), 2) for key in sl[i + 1]} + path_objs = {} + for path_name in paths: + if graph.paths.filter(accession=path_name).count() == 0: + path_objs[path_name] = Path.objects.create(accession=path_name, zoom=graph.nucleotide_level) + for sl in cmd: + try: + for i in range(0, len(sl), 2): + paths_mentioned = [path_objs[key] for key in sl[i + 1]] + node, is_new = Node.objects.get_or_create( + seq=sl[i], + name=graph.name + str(node_count), + zoom=graph.nucleotide_level) + node_count += 1 + for path in paths_mentioned: + path.append_node(node, '+') + except IndexError: + raise IndexError("Expecting two terms: ", sl[0]) # sl[i:i+2]) + + return graph \ No newline at end of file diff --git a/Graph/views.py b/Graph/views.py index 71c2249..adee7d1 100644 --- a/Graph/views.py +++ b/Graph/views.py @@ -1,3 +1,3 @@ -from django.shortcuts import render - # View contains the endpoints on the server for the browser to fetch data + + diff --git a/HaploBlocker/haplonetwork.py b/HaploBlocker/haplonetwork.py index 7f062f6..853785c 100644 --- a/HaploBlocker/haplonetwork.py +++ b/HaploBlocker/haplonetwork.py @@ -3,10 +3,11 @@ Graph Summarization: https://github.com/graph-genome/vgbrowser/issues/3 HaploBlocker: https://github.com/graph-genome/vgbrowser/issues/19 """ -from typing import List +from typing import List, Iterable import numpy as np from collections import defaultdict from copy import copy +from Graph.models import Node, Path, ZoomLevel, NodeTraversal, GraphGenome BLOCK_SIZE = 20 FILTER_THRESHOLD = 4 @@ -15,80 +16,6 @@ def first(iterable): return next(iter(iterable)) - -class Node: - """This definition of Node is designed to be equivalent to the R code HaploBlocker Nodes. - This will be combined with the VG definition of Graph.models.Node and extended to support the - concept of summarization layers. - - Currently, It has no concept of strand, CNV. It uses an absolute Start and End position, but those - are not referenced very often. - Critically, it needs Node.NOTHING which is used frequently to mark specimens whose - upstream or downstream nodes have been pruned. The usage of Node.NOTHING is equivalent to - our sequence mismatch penalties. In both cases information is being discarded for the - purpose of summarization.""" - def __init__(self, ident, start, end, specimens=None, upstream=None, downstream=None): - self.ident = ident - self.start = start # bp, arbitrary coordinates, used for debugging - self.end = end # bp, arbitrary coordinates, used for debugging - self.specimens = set() if specimens is None else specimens - self.upstream = defaultdict(lambda: 0) if not upstream else upstream - # E.g. {Node.NOTHING:501, Node: 38, Node: 201, Node: 3} - self.downstream = defaultdict(lambda: 0) if not downstream else downstream - # E.g.
{Node: 38, Node: 201, Node: 3} - assert not self.is_nothing() or (self.end is None and self.start is None), self.details() - - def __len__(self): - return len(self.specimens) - - def __repr__(self): - return "N%s(%s, %s)" % (str(self.ident), str(self.start), str(self.end)) - - def __hash__(self): - return hash(self.ident + 1) * hash(self.start) * hash(self.end) - - def details(self): - return f"""Node{self.ident}: {self.start} - {self.end} - upstream: {dict((key, value) for key, value in self.upstream.items())} - downstream: {dict((key, value) for key, value in self.downstream.items())} - {len(self.specimens)} specimens: {self.specimens}""" - - def is_nothing(self): - """Useful in Node class definition to check for Node.NOTHING""" - return self.ident == -1 and self.start is None and self.end is None - - def validate(self): - """Returns true if the Node has specimens and does not have any negative - transition values, raises an AssertionError otherwise.""" - if not self.specimens: - assert self.specimens, "Specimens are empty" + self.details() - for node, weight in self.upstream.items(): - if not node.is_nothing() and weight < 0: - print(self.details()) - assert weight > -1, node.details() - - for node, weight in self.downstream.items(): - if not node.is_nothing() and weight < 0: - print(self.details()) - assert weight > -1, node.details() - return True - - def is_beginning(self) -> bool: - return self.start == 0 - - def is_end(self) -> bool: - return len(self.downstream) == 1 and first(self.downstream).is_nothing() - - -# Node.NOTHING is essential "Not Applicable" when used to track transition rates between nodes. -# Node.NOTHING is an important concept to Haploblocker, used to track upstream and downstream -# that transitions to an unknown or untracked state. As neglect_nodes removes minority -# allele nodes, there will be specimens downstream that "come from" Node.NOTHING, meaning their -# full history is no longer tracked. Node.NOTHING is a regular exception case for missing data, -# the ends of chromosomes, and the gaps between haplotype blocks. -Node.NOTHING = Node(-1, None, None) - - def read_data(file_path): """Reads one of Torsten's SNP files. In the file, Individuals are columns, not rows. This method returns both loci (all 501 individual alleles at one loci) and the Transposed @@ -106,7 +33,7 @@ def signature(individual, start_locus): return tuple(individual[start_locus: start_locus + BLOCK_SIZE]) -def get_unique_signatures(individuals, start_locus): +def nodes_from_unique_signatures(individuals, start_locus, current_graph: GraphGenome): """A signature is a series of BLOCK_SIZE SNPs inside of a locus. We want to know how many unique signatures are present inside of one locus. A Node is created for each unique signature found. 
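For context on the hunk below, a minimal runnable sketch of the 'signature' idea that nodes_from_unique_signatures() keeps (BLOCK_SIZE and signature() mirror this file; the individuals data is made up):

```python
# Hypothetical data: four individuals, each a row of 40 SNP states.
BLOCK_SIZE = 20
individuals = [[0] * 40, [0] * 40, [1] * 40, [1] * 40]

def signature(individual, start_locus):
    """A window of BLOCK_SIZE SNP states starting at start_locus."""
    return tuple(individual[start_locus: start_locus + BLOCK_SIZE])

# Two unique signatures in this window, so two Nodes would be created for this slice.
unique = {signature(ind, 0) for ind in individuals}
print(len(unique))  # 2
```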
@@ -117,41 +44,49 @@ def get_unique_signatures(individuals, start_locus): for individual in individuals: sig = signature(individual, start_locus) if sig not in unique_blocks: - unique_blocks[sig] = Node(len(unique_blocks), - start_locus // BLOCK_SIZE, - start_locus // BLOCK_SIZE) # Inclusive end + unique_blocks[sig] = Node.objects.create( # saves to Database + name=f'{len(unique_blocks)}:{start_locus // BLOCK_SIZE}-{start_locus // BLOCK_SIZE}', + seq=''.join(str(x) for x in sig), + zoom=current_graph.nucleotide_level) return unique_blocks -def get_all_signatures(alleles, individuals): - unique_signatures = [] - for locus_start in range(0, len(alleles) - BLOCK_SIZE, BLOCK_SIZE): # discards remainder - sig = get_unique_signatures(individuals, locus_start) - unique_signatures.append(sig) - return unique_signatures +def build_all_slices(alleles, individuals, current_graph): + """Each item in this list is a slice, representing all the possible states for one locus. + Inside a slice is a set of Nodes, one for each unique 'signature' or sequence state. + Paths that all have the same state in this slice all reference the same Node object.""" + slices = [] + for slice_start in range(0, len(alleles) - BLOCK_SIZE, BLOCK_SIZE): # discards remainder + nodes = nodes_from_unique_signatures(individuals, slice_start, current_graph) + slices.append(nodes) + return slices -def build_individuals(individuals, unique_signatures): - """Describes an individual as a list of Nodes that individual visits. - simplified_individuals is a list of loci which contain a list of Nodes which each contain specimen +def build_paths(individuals, unique_signatures, graph: GraphGenome): + """Describes an individual as a Path (list of Nodes) that the individual visits (NodeTraversals). + accessions is a list of loci which contain a list of Nodes which each contain specimens. build nodes: [0] first 4 are the 4 starting signatures in window 0. Nodes represent a collection of individuals with the same signature at that locus - For each node list which individuals are present at that node""" - simplified_individuals = [] + For each node list which individuals are present at that node + :param graph: the GraphGenome whose nucleotide level receives the new Paths """ + # TODO: It may be more performant to merge build_all_slices and build_paths so that global lists are never stored + print(f"Building paths from {len(individuals)} individuals and {len(unique_signatures)} loci") + accessions = [] for i_specimen, specimen in enumerate(individuals): - my_simplification = [] - for w, window in enumerate(unique_signatures): # the length of the genome - sig = signature(specimen, w * BLOCK_SIZE) - my_simplification.append(unique_signatures[w][sig]) - simplified_individuals.append(my_simplification) - return simplified_individuals + my_path = Path.objects.create(accession=str(i_specimen), zoom=graph.nucleotide_level) + my_sigs = [unique_signatures[w][signature(specimen, w * BLOCK_SIZE)] for w in range(len(unique_signatures))] + traverses = [NodeTraversal(node=sig, path=my_path, strand='+', order=i) for i, sig in enumerate(my_sigs)] + NodeTraversal.objects.bulk_create(traverses, 100) + accessions.append(my_path) + print(f"Done building {len(accessions)} Paths") + return accessions def populate_transitions(simplified_individuals): """ List transition rates from one node to all other upstream and downstream. This method populates Node.specimens and begins the process of side-effecting Nodes.
- To rebuild a fresh Graph copy, you must start at get_all_signatures() + To rebuild a fresh Graph copy, you must start at build_all_slices() :param simplified_individuals: """ for i, indiv in enumerate(simplified_individuals): @@ -160,152 +95,123 @@ node.specimens.add(i) if x + 1 < len(indiv): node.downstream[indiv[x + 1]] += 1 - else: - node.downstream[Node.NOTHING] += 1 + # else: + # node.downstream[Node.NOTHING] += 1 if x - 1 >= 0: node.upstream[indiv[x - 1]] += 1 - else: - node.upstream[Node.NOTHING] += 1 - - -def update_transition(node): - """Only transition values for nodes already listed in upstream and downstream will be calculated.""" - if node is not Node.NOTHING: - update_stream_transitions(node, 'upstream') - update_stream_transitions(node, 'downstream') - - return node - - -def update_stream_transitions(node, stream): - """This will updated either upstream or downstream transition counts based on the - the value of 'stream'. This is a meta-programming function that requires the exact - name of the class field 'upstream' or 'downstream' to work.""" - g = getattr # - running = g(node, stream).keys() - setattr(node, stream, defaultdict(lambda: 0)) - for n in running: - if n is not Node.NOTHING: - g(node, stream)[n] = len(node.specimens.intersection(n.specimens)) - accounted_upstream = sum(g(node, stream).values()) - g(node, stream)[Node.NOTHING] - g(node, stream)[Node.NOTHING] = len(node.specimens) - accounted_upstream - assert all([count > -1 for count in g(node, stream).values()]), node.details() - # Cleans up old keys including Node.NOTHING - to_be_deleted = {key for key, count in g(node, stream).items() if count == 0} - for key in to_be_deleted: - g(node, stream).pop(key, None) - - -def simple_merge(full_graph): +def simple_merge(current_level: ZoomLevel) -> bool: """ Side effects current_level by merging any consecutive nodes that have - identical specimens and removing the redundant node from full_graph. - :param full_graph: - :return: full_graph modified + identical specimens and removing the redundant node from the level. + :param current_level: ZoomLevel that will be read and edited in place + :return: True if a modification occurred """ - n = 0 - while n < len(full_graph): # size of global_nodes changes, necessitating this weird loop - node = full_graph[n] - if len(node.downstream) == 1: - next_node = first(node.downstream.keys()) - if len(node.specimens) == len(next_node.specimens): - # Torsten deletes nodeA and modifies next_node - next_node.upstream = node.upstream - next_node.start = node.start - # prepare to delete node by removing references - for parent in node.upstream.keys(): - if parent is not Node.NOTHING: - count = parent.downstream[node] - del parent.downstream[node] # updating pointer - parent.downstream[next_node] = count - full_graph.remove(node) # delete node - n -= 1 - n += 1 - return full_graph - - -def delete_node(node, cutoff): + # TODO: track a global bool for whether or not a particular path was modified.
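+ # Merge rule used below: when every traversal of a node continues to one and the + # same downstream node, and that neighbor is visited by exactly as many traversals, + # the pair is fused into a new summary node and the old traversals are repointed or deleted.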
-    n = 0
-    while n < len(full_graph):  # size of global_nodes changes, necessitating this weird loop
-        node = full_graph[n]
-        if len(node.downstream) == 1:
-            next_node = first(node.downstream.keys())
-            if len(node.specimens) == len(next_node.specimens):
-                # Torsten deletes nodeA and modifies next_node
-                next_node.upstream = node.upstream
-                next_node.start = node.start
-                # prepare to delete node by removing references
-                for parent in node.upstream.keys():
-                    if parent is not Node.NOTHING:
-                        count = parent.downstream[node]
-                        del parent.downstream[node]  # updating pointer
-                        parent.downstream[next_node] = count
-                full_graph.remove(node)  # delete node
-                n -= 1
-        n += 1
-    return full_graph
-
-
-def delete_node(node, cutoff):
-    """Changes references to this node to add to references to Node.NOTHING"""
+    # TODO: Global bool for whether or not a particular path was modified.
+    # TODO: replace the per-node scan below with a single optimized bulk query
+    modification_happened = False
+    path_ids = current_level.paths.values_list('id', flat=True)
+    for my_node in current_level.nodes_xrange():
+
+        # only one Node downstream, no matter the number of specimens
+        if len(my_node.downstream_ids()) == 1:
+            d = my_node.nodetraversal_set.first().downstream()
+            if d:
+                next_node = d.node  # fetched from DB
+                if my_node.nodetraversal_set.count() == next_node.nodetraversal_set.count():  # Not a complete guarantee...
+                    modification_happened = True
+                    # Torsten deletes my_node and modifies next_node
+                    merged_node = Node.objects.create(name=f'{my_node.name}*{current_level.zoom}',
+                                                      zoom=current_level)
+                    for x in [my_node, next_node]:
+                        x.summarized_by = merged_node
+                        x.save()  # TODO: doesn't work because reading and writing same layer. next_node gets deleted soon
+
+                    # edit existing traversals
+                    next_node.nodetraversal_set.\
+                        filter(path_id__in=path_ids).\
+                        update(node_id=merged_node.id)
+
+                    # delete my_node and all associates
+                    query = my_node.nodetraversal_set.filter(path_id__in=path_ids)
+                    query._raw_delete(query.db)  # https://www.nickang.com/fastest-delete-django/
+                    # TODO: merged_node.start = my_node.start, length = my_node.length + next_node.length
+    return modification_happened
+
+
+def prep_next_summary_layer(current_level):
+    zoom = current_level.zoom
+    assert current_level.graph.highest_zoom_level() == zoom, \
+        "You should only be summarizing the topmost layer"
+    next_level = ZoomLevel.objects.create(graph=current_level.graph, zoom=zoom + 1)
+    return next_level, zoom
+
+
+def delete_node(node: Node, cutoff: int, layer: ZoomLevel):
+    """Deletes every NodeTraversal through this node, removing it from the layer."""
     if cutoff < 1:
         return  # if cutoff is 0, then don't touch upstream and downstream
-    for parent, count in node.upstream.items():
-        parent.downstream[Node.NOTHING] += parent.downstream[node]
-        del parent.downstream[node]
-    for descendant, count in node.downstream.items():
-        descendant.upstream[Node.NOTHING] += descendant.upstream[node]
-        del descendant.upstream[node]
+    node.traverses().delete()
+    node.validate()
 
 
-def neglect_nodes(all_nodes, deletion_cutoff=FILTER_THRESHOLD):
+def neglect_nodes(zoom_level: ZoomLevel, deletion_cutoff=FILTER_THRESHOLD):
     """Deletes nodes if they have too few specimens supporting them defined by :param deletion_cutoff
-    :returns a new list of nodes lacking the pruned nodes in all_nodes"""
-    nodes_to_delete = set()
-    for node in all_nodes:
-        if len(node.specimens) <= deletion_cutoff:
-            delete_node(node, deletion_cutoff)  # TODO: check if this will orphan
-            nodes_to_delete.add(node)
-    filtered_nodes = [x for x in all_nodes if x not in nodes_to_delete]
-    # TODO: remove orphaned haplotypes in a node that transition to and from zero within a 10 window length
-    return filtered_nodes
+    Prunes the ZoomLevel in place rather than returning a filtered node list."""
+    # next_level, zoom = prep_next_summary_layer(current_level)
+
+    for node in zoom_level.nodes_xrange():  # TODO optimize distinct count
+        if len(node.specimens()) <= deletion_cutoff:
+            delete_node(node, deletion_cutoff, zoom_level)
 
 
-def split_one_group(prev_node, anchor, next_node):
+
+def split_one_group(prev_node, anchor, next_node, zoom_level: ZoomLevel):
     """ Called when
up.specimens == down.specimens Comment: That is actually the case we want to split up to obtain longer blocks later Extension of full windows will take care of potential loss of information later""" - my_specimens = copy(anchor.specimens) # important to copy or side effects occur - if prev_node is not Node.NOTHING: # normal case - my_specimens = my_specimens.intersection(prev_node.specimens) - if next_node is not Node.NOTHING: # normal case - my_specimens = my_specimens.intersection(next_node.specimens) - if prev_node is Node.NOTHING and next_node is Node.NOTHING: # exceptional: both are nothing node - my_specimens = copy(anchor.specimens) - # removing all specimens that transition to nothing - for n in anchor.downstream.keys(): - if n is Node.NOTHING: # remove dead leads - my_specimens -= n.specimens - for n in anchor.upstream.keys(): - if n is Node.NOTHING: # remove dead leads - my_specimens -= n.specimens - - my_start, my_end = prev_node.start, next_node.end - my_upstream, my_downstream = copy(prev_node.upstream), copy(next_node.downstream) - if Node.NOTHING is prev_node: # Rare case - my_start = anchor.start - my_upstream = copy(anchor.upstream) - if Node.NOTHING is next_node: # Rare case - my_end = anchor.end - my_downstream = copy(anchor.downstream) - - # TODO: what about case where more content is joining downstream? - new = Node(777, my_start, my_end, my_specimens, my_upstream, my_downstream) - - # Update Specimens in prev_node, anchor, next_node - anchor.specimens -= new.specimens - prev_node.specimens -= new.specimens - next_node.specimens -= new.specimens - - # Update upstream/downstream - update_neighbor_pointers(new) - suspects = {new, prev_node, anchor, next_node}.union(set(new.upstream.keys()), set(new.downstream.keys())) - for n in suspects: - update_transition(n) - new.validate() - return new + + my_specimens = anchor.specimens() # list of path_ids + my_specimens = my_specimens.intersection(prev_node.specimens()) + my_specimens = my_specimens.intersection(next_node.specimens()) + new_node = Node.objects.create(zoom=zoom_level, name=f'{anchor.name}:{zoom_level.zoom}') + for a in (prev_node, anchor, next_node): + a.summarized_by = new_node + a.save() + + NodeTraversal.objects.filter(path_id__in=my_specimens, node_id=anchor.id).update(node_id=new_node.id) + NodeTraversal.objects.filter(path_id__in=my_specimens, node_id=prev_node.id).delete() + NodeTraversal.objects.filter(path_id__in=my_specimens, node_id=next_node.id).delete() + # TODO: if this is slow use query._raw_delete + + new_node.validate() + return new_node def update_neighbor_pointers(new_node): """Ensure that my new upstream pointers have matching downstream pointers in neighbors, and vice versa. This does not set correct transition rates, it only makes the nodes connected.""" for n in new_node.upstream.keys(): - if n is not Node.NOTHING: + if not n.is_nothing(): n.downstream[new_node] = 1 for n in new_node.downstream.keys(): - if n is not Node.NOTHING: + if not n.is_nothing(): n.upstream[new_node] = 1 -def split_groups(all_nodes: List[Node]): +def split_groups(zoom_level: ZoomLevel): """When two haplotypes have a locus in common with no variation, then the graph represents this with a single anchor node flanked by 2 haplotypes on either side. This means 5 Nodes are present where 2 would suffice. Split groups splits the anchor node and gives pieces @@ -315,29 +221,15 @@ def split_groups(all_nodes: List[Node]): Note: This is called crossmerge in the R code. 
TODO: Ideally, the database would retain some record of how many nucleotides are shared between the two new haplotype nodes.""" - new_graph = list(all_nodes) - for node in all_nodes: + + for node in zoom_level.nodes_xrange(): # check if all transition upstream match with one of my downstream nodes - if len(node.specimens) > 0: + if len(node.specimens()) > 0: # Matchup upstream and downstream with specimen identities - for up in tuple(node.upstream.keys()): - for down in tuple(node.downstream.keys()): - set1 = copy(up.specimens) - set2 = copy(down.specimens) - if up is Node.NOTHING: - set1 = copy(node.specimens) - for index in tuple(node.upstream.keys()): - if index is not Node.NOTHING: - set1.difference_update(index.specimens) - if down is Node.NOTHING: - set2 = copy(node.specimens) - for index in tuple(node.downstream.keys()): - if index is not Node.NOTHING: - set2.difference_update(index.specimens) - - if set1 == set2 and len(set1) > 0: - new_node = split_one_group(up, node, down) - new_graph.append(new_node) - - filtered = neglect_nodes(new_graph, 0) # Delete nodes with zero specimens from the Graph? - return filtered + for up in node.upstream(): + set1 = up.specimens() + if len(set1): + for down in node.downstream(): + set2 = down.specimens() + if set1 == set2 and len(set2) > 0: + new_node = split_one_group(up, node, down, zoom_level) diff --git a/HaploBlocker/migrations/0001_initial.py b/HaploBlocker/migrations/0001_initial.py deleted file mode 100644 index a7b3890..0000000 --- a/HaploBlocker/migrations/0001_initial.py +++ /dev/null @@ -1,40 +0,0 @@ -# Generated by Django 2.2.1 on 2019-07-23 21:25 - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - initial = True - - dependencies = [ - ] - - operations = [ - migrations.CreateModel( - name='Node', - fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('seq', models.CharField(max_length=1000)), - ], - ), - migrations.CreateModel( - name='Path', - fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('accession', models.CharField(max_length=1000, unique=True)), - ], - ), - migrations.CreateModel( - name='Edge', - fields=[ - ('node', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='HaploBlocker.Node')), - ('path', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='HaploBlocker.Path')), - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ], - options={ - 'ordering': ['path', 'id'], - }, - ), - ] diff --git a/HaploBlocker/models.py b/HaploBlocker/models.py index 3796ddd..c8945cd 100644 --- a/HaploBlocker/models.py +++ b/HaploBlocker/models.py @@ -3,18 +3,3 @@ # Create your models here. 
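Before the model changes: to make the split_groups pattern above concrete, this is the five-node anchor situation it targets, written in the slice notation used by the tests later in this diff (a node name followed by the set of specimens that traverse it):

    nodes = [
        ['90', {1, 2, 3, 4}],          # shared upstream anchor
        ['91', {1, 2, 4}, '92', {3}],  # two alternative haplotypes
        ['93', {1, 2, 3, 4}],          # middle anchor with no variation
        ['94', {1, 2, 4}, '95', {3}],  # the same two haplotypes again
        ['96', {1, 2, 3, 4}],          # shared downstream anchor
    ]

Calling split_one_group on nodes 91, 93, and 94 carves the {1, 2, 4} traversals out of the middle anchor, so the majority haplotype 91-93-94 can later merge into one long node while the {3} minority keeps its own 92 and 95.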
from django.db import models - -class Node(models.Model): - seq = models.CharField(max_length=1000) - - -class Path(models.Model): - accession = models.CharField(unique=True, max_length=1000) - - -class Edge(models.Model): - node = models.ForeignKey(Node, on_delete=models.CASCADE) - path = models.ForeignKey(Path, on_delete=models.CASCADE) - - class Meta: - ordering = ['path','id'] \ No newline at end of file diff --git a/HaploBlocker/tests.py b/HaploBlocker/tests.py index 500785c..3ae93d9 100644 --- a/HaploBlocker/tests.py +++ b/HaploBlocker/tests.py @@ -1,41 +1,53 @@ +from unittest import skip, skipIf from django.test import TestCase +import Graph.utils +import Graph.views +from Graph.models import GraphGenome, Path, ZoomLevel, NodeTraversal from vgbrowser.settings import BASE_DIR -import unittest import os # Create your tests here. # from HaploBlocker.models import Node, Path, Edge -from HaploBlocker.haplonetwork import Node, split_one_group -from HaploBlocker.haplonetwork import read_data, get_all_signatures, build_individuals, get_unique_signatures, \ +from HaploBlocker.haplonetwork import Node, split_one_group, prep_next_summary_layer +from HaploBlocker.haplonetwork import read_data, build_all_slices, build_paths, nodes_from_unique_signatures, \ populate_transitions, simple_merge, neglect_nodes, split_groups -# -# class ModelTest(TestCase): -# def test_creation(self): -# p = Path.objects.create(accession='watermelon') -# n = Node.objects.create(seq='ACGT') -# Edge.objects.create(node=n, path=p) -# Edge.objects.create(node=Node.objects.create(seq='AAAA'), path=p) -# Edge.objects.create(node=Node.objects.create(seq='TTT'), path=p) -# print(Edge.objects.all()) +class ModelTest(TestCase): + def test_creation(self): + g = GraphGenome.objects.create(name='test_creation') + z = g.nucleotide_level + p = Path.objects.create(accession='watermelon', zoom=z) + n = Node.objects.create(seq='ACGT', zoom=z) + NodeTraversal.objects.create(node=n, path=p, strand='+') + NodeTraversal.objects.create(node=Node.objects.create(seq='AAAA', name='-9', zoom=z), + path=p, strand='+') + NodeTraversal.objects.create(node=Node.objects.create(seq='TTT', name='-8', zoom=z), + path=p, strand='+') + print(NodeTraversal.objects.all()) -class HaploTest(unittest.TestCase): + +class HaploTest(TestCase): @classmethod def setUpClass(self) -> None: """Reads the input data file once. Tests that need a fresh graph must - call create_nodes()""" - print(os.getcwd()) + call create_graph()""" + print("Setting up HaploTest:", os.getcwd()) self.alleles, self.individuals = read_data(os.path.join(BASE_DIR, "test_data/KE_chromo10.txt")) + print("Finished reading SNP file") + super(HaploTest, self).setUpClass() - def create_nodes(self): - """Tests that need a fresh graph must call create_nodes() FIRST! + def create_graph(self, graph_name): + """Tests that need a fresh graph must call create_graph() FIRST! Graph summarization works by side effecting Node objects. Tests can not run independently with order dependent side effects. This method is slow, so don't use it unless you - need it.""" - self.unique_signatures = get_all_signatures(self.alleles, self.individuals) - self.simplified_individuals = build_individuals(self.individuals, self.unique_signatures) - # G = build_graph(simplified_individuals) - populate_transitions(self.simplified_individuals) + need it. 
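The Node/Path/Edge definitions removed above are superseded by models in the Graph app, which this diff imports and exercises but does not define. Inferring only from the usages visible here (ModelTest above, plus the ZoomLevel and NodeTraversal calls in haplonetwork.py), a rough reconstruction might look like the following; the real definitions live in Graph/models.py and will differ in detail:

    from django.db import models

    class GraphGenome(models.Model):
        name = models.CharField(max_length=1000)

    class ZoomLevel(models.Model):
        graph = models.ForeignKey(GraphGenome, on_delete=models.CASCADE)
        zoom = models.IntegerField(default=0)  # 0 == nucleotide level

    class Node(models.Model):
        name = models.CharField(max_length=1000, blank=True)
        seq = models.CharField(max_length=1000)
        zoom = models.ForeignKey(ZoomLevel, on_delete=models.CASCADE)
        summarized_by = models.ForeignKey('self', null=True, blank=True,
                                          on_delete=models.SET_NULL)

    class Path(models.Model):
        accession = models.CharField(max_length=1000)
        zoom = models.ForeignKey(ZoomLevel, on_delete=models.CASCADE)

    class NodeTraversal(models.Model):
        node = models.ForeignKey(Node, on_delete=models.CASCADE)
        path = models.ForeignKey(Path, on_delete=models.CASCADE)
        strand = models.CharField(max_length=1)  # '+' or '-'
        order = models.IntegerField()  # position along the Path

        class Meta:
            ordering = ['path', 'order']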
+ :param graph_name: """ + print("Creating test Graph") + graph = GraphGenome.objects.create(name=graph_name) + slices = build_all_slices(self.alleles, self.individuals, graph) + self.paths = build_paths(self.individuals, slices, graph) + print("Finished test graph.") + return graph def test_read(self): @@ -49,29 +61,35 @@ def test_build_individuals(self): def internal_build_individuals(self, alleles, individuals): - unique_signatures = get_all_signatures(alleles, individuals) - assert repr(unique_signatures[ - 21]) == '{(0, 0, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2): N0(21, 21), (0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2): N1(21, 21), (0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N2(21, 21), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N3(21, 21), (0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N4(21, 21), (0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2, 0, 0, 0, 2): N5(21, 21), (0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N6(21, 21), (0, 0, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2): N7(21, 21)}' - simplified_individuals = build_individuals(individuals, unique_signatures) - assert repr(simplified_individuals[500][ - :100]) == '[N2(0, 0), N2(1, 1), N2(2, 2), N2(3, 3), N2(4, 4), N2(5, 5), N3(6, 6), N3(7, 7), N3(8, 8), N2(9, 9), N0(10, 10), N1(11, 11), N2(12, 12), N2(13, 13), N2(14, 14), N2(15, 15), N3(16, 16), N3(17, 17), N4(18, 18), N3(19, 19), N5(20, 20), N3(21, 21), N3(22, 22), N10(23, 23), N4(24, 24), N3(25, 25), N4(26, 26), N3(27, 27), N1(28, 28), N1(29, 29), N4(30, 30), N3(31, 31), N21(32, 32), N1(33, 33), N1(34, 34), N1(35, 35), N1(36, 36), N1(37, 37), N1(38, 38), N1(39, 39), N1(40, 40), N1(41, 41), N1(42, 42), N1(43, 43), N1(44, 44), N1(45, 45), N1(46, 46), N1(47, 47), N1(48, 48), N1(49, 49), N1(50, 50), N1(51, 51), N1(52, 52), N1(53, 53), N1(54, 54), N1(55, 55), N1(56, 56), N1(57, 57), N1(58, 58), N1(59, 59), N1(60, 60), N1(61, 61), N1(62, 62), N1(63, 63), N1(64, 64), N1(65, 65), N1(66, 66), N1(67, 67), N1(68, 68), N1(69, 69), N1(70, 70), N1(71, 71), N1(72, 72), N1(73, 73), N1(74, 74), N1(75, 75), N1(76, 76), N1(77, 77), N0(78, 78), N0(79, 79), N1(80, 80), N1(81, 81), N1(82, 82), N1(83, 83), N1(84, 84), N1(85, 85), N1(86, 86), N1(87, 87), N1(88, 88), N1(89, 89), N1(90, 90), N1(91, 91), N1(92, 92), N1(93, 93), N1(94, 94), N1(95, 95), N1(96, 96), N1(97, 97), N0(98, 98), N0(99, 99)]' - assert len(simplified_individuals) == 501 and len(simplified_individuals[60]) == 1638 + graph = GraphGenome.objects.create(name='internal_build_individuals') + unique_signatures = build_all_slices(alleles, individuals, graph) + peek = repr(list(unique_signatures[21].values())) + assert peek == '[N0:21-21(00002202022222220202), N1:21-21(00202202022222220202), N2:21-21(00022200000000000000), N3:21-21(00000000000000000000), N4:21-21(00002200000000000000), N5:21-21(00022202022220020002), N6:21-21(02000000000000000000), N7:21-21(00002202222220020022)]', peek + simplified_individuals = build_paths(individuals, unique_signatures, graph) + traverses = simplified_individuals[500].nodes.filter(order__lt=100) # [:100] + nodes = [t.node for t in traverses.prefetch_related('node').all()] + peek = repr(nodes) + self.maxDiff = None # tells the debugger to show the whole thing + self.assertEqual(peek, '[N2:0-0(00000000000000000000), N2:1-1(00000000000000000000), N2:2-2(00000000000000000000), N2:3-3(00000000000000000000), N2:4-4(00000000000000000000), N2:5-5(00000000000000000000), N3:6-6(00000000000000000000), 
N3:7-7(00000000000000000000), N3:8-8(00000000000000000000), N2:9-9(00000000000000000000), N0:10-10(00000000000000000000), N1:11-11(00000000000000000000), N2:12-12(00000000000000000000), N2:13-13(00000000000000000000), N2:14-14(00000000000000000000), N2:15-15(00000000000000000000), N3:16-16(00000000000000000000), N3:17-17(00000000000000000000), N4:18-18(00000000000000000000), N3:19-19(00000000000000000000), N5:20-20(00000000000000000000), N3:21-21(00000000000000000000), N3:22-22(00000000000000000000), N10:23-23(00200000000000000000), N4:24-24(00002200222220002000), N3:25-25(02000222220002020222), N4:26-26(20022000002002220002), N3:27-27(22222202020222220000), N1:28-28(00000000000000000000), N1:29-29(00000000000000000022), N4:30-30(00002222222000002200), N3:31-31(00022222202000000000), N21:32-32(00000020202200022020), N1:33-33(02202220022020222000), N1:34-34(00020002000202222002), N1:35-35(22220002220022200022), N1:36-36(22222200000000000000), N1:37-37(00202002222220000200), N1:38-38(00000200000202022200), N1:39-39(02202000202202220000), N1:40-40(00020222200020000020), N1:41-41(20220020022200022200), N1:42-42(00000000000000000000), N1:43-43(00000000000000000000), N1:44-44(00000000000000000000), N1:45-45(00000000000000000000), N1:46-46(00000002220220020020), N1:47-47(00202220222220222202), N1:48-48(00000000000000000002), N1:49-49(20002200000002220022), N1:50-50(22020002002020202022), N1:51-51(02202222220222202000), N1:52-52(20000020000000000000), N1:53-53(00000000000000000000), N1:54-54(00000000000000000000), N1:55-55(00220220000200000220), N1:56-56(20000000202022022020), N1:57-57(20222022022202222220), N1:58-58(22022202222222020200), N1:59-59(22202200202220202220), N1:60-60(22020022220200022022), N1:61-61(20202220000220000220), N1:62-62(00022002000000000000), N1:63-63(00000220000000000000), N1:64-64(00000000000220200000), N1:65-65(00022020200000020022), N1:66-66(20020222222020200020), N1:67-67(00000000000000202222), N1:68-68(22222222000202222202), N1:69-69(22022222020020000022), N1:70-70(00002002220022222200), N1:71-71(22002020020202000000), N1:72-72(00022202000202220020), N1:73-73(22000000000000200020), N1:74-74(22220222220200202202), N1:75-75(00022202222200000000), N1:76-76(00000220220200200022), N1:77-77(02200202020020200000), N0:78-78(00002000000000000000), N0:79-79(00000000000000000000), N1:80-80(00000000000022220000), N1:81-81(00000000000000000000), N1:82-82(00022220200202202202), N1:83-83(20202222200202202202), N1:84-84(00000020000000000000), N1:85-85(00222022020000000002), N1:86-86(22020222020222222000), N1:87-87(00022222002020222022), N1:88-88(00002222000000000200), N1:89-89(00000000000000220022), N1:90-90(22020202200020222220), N1:91-91(00002000002220002222), N1:92-92(22200000000000000000), N1:93-93(00000000000000000000), N1:94-94(00202022200202222222), N1:95-95(22222202202020222222), N1:96-96(00222220200202222020), N1:97-97(22002202220222222022), N0:98-98(20222222222222020220), N0:99-99(20222222220222222002)]') + self.assertEqual(len(simplified_individuals),501) + self.assertEqual(simplified_individuals[60].nodes.count(), 1638) def test_get_unique_signatures(self): - unique_blocks = get_unique_signatures(self.individuals, 0) + graph = GraphGenome.objects.create(name='test_get_unique_signatures') + unique_blocks = nodes_from_unique_signatures(self.individuals, 0, graph) assert len(unique_blocks) == 4 - assert unique_blocks.__repr__() == '{(0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0): N0(0, 0), ' \ - '(0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): 
N1(0, 0), ' \ - '(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N2(0, 0), ' \ - '(2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): N3(0, 0)}' + peek = repr(list(unique_blocks.values())) + assert peek == \ + '[N0:0-0(02002020002000220000), N1:0-0(00220202220222002222), ' \ + 'N2:0-0(00000000000000000000), N3:0-0(20220202220222002222)]', peek - @unittest.skip + @skip def test_no_duplicate_nodes(self): - self.create_nodes() + graph = self.create_graph('test') unique_nodes = set() duplicates_found = 0 - for locus in self.simplified_individuals: + for locus in self.paths: for node in locus: # assert isinstance(node, Node) if node in unique_nodes: # If two nodes have the same __hash__ they'll be "in" @@ -81,19 +99,31 @@ def test_no_duplicate_nodes(self): unique_nodes.add(node) assert duplicates_found == 0, f"Found {duplicates_found} duplicated nodes in the graph" + def _test_simple_merge(self, layer: ZoomLevel): + # these tests could be made independent of test_workflow, but it would be slower + graph = layer.graph + zoom_level = layer.zoom + assert graph.highest_zoom_level() == zoom_level + self.assertEqual(len(layer), 7180) + next_level = simple_merge(layer) + # Test every Path has a representative in this ZoomLevel + self.assertEqual(Path.objects.filter(graph=graph, zoom=zoom_level + 1).count(), + Path.objects.filter(graph=graph, zoom=zoom_level + 0).count()) + self.assertEqual(NodeTraversal.objects.filter(graph=graph, zoom=zoom_level+1).count(), 3690) #*501? + - def _test_simple_merge(self, all_nodes): - assert len(all_nodes) == 7180 - summary1 = simple_merge(all_nodes) - assert len(summary1) == 3690 - return summary1 + def test_simple_merge(self): + graph = self.create_graph('test') + self._test_simple_merge(graph.nucleotide_level) - def _test_neglect_nodes(self, all_nodes): - summary2 = neglect_nodes(all_nodes) - assert len(summary2) == 2854 - unchanged = neglect_nodes(summary2, 0) - assert len([n for n in unchanged if len(n.specimens) == 0]) == 0 - return summary2 + + def _test_neglect_nodes(self, zoom_level : ZoomLevel): + neglect_nodes(zoom_level) + assert len(zoom_level) == 2854 + unchanged = neglect_nodes(zoom_level, 0) + assert len(zoom_level) == 2854 + # TODO: assert no orphans + # assert len([n for n in unchanged if len(n.specimens(zoom_level)) == 0]) == 0 def test_split_one_group(self): @@ -103,59 +133,55 @@ def test_split_one_group(self): ['C', {a, b, d}, '', {c}], # [3] repeated from [1] SNP """ nodes = [ - Node(91, 1, 1, {1, 2, 4}), - Node(92, 1, 1, {3}), - Node(93, 2, 2, {1, 2, 3, 4}), # [2] anchor - Node(94, 3, 3, {1, 2, 4}), - Node(95, 3, 3, {3}), - # additional bracketing to anchor - Node(90, 0, 0, {1, 2, 3, 4}), - Node(96, 4, 4, {1, 2, 3, 4}) + ['90', {1, 2, 3, 4}], #[5] + ['91', {1, 2, 4}, '92', {3}], + ['93', {1, 2, 3, 4}], # [2] anchor + ['94', {1, 2, 4}, '95', {3}], #[3] [4] + ['96', {1, 2, 3, 4}] #[6] ] - # connections - nodes[5].downstream[nodes[0]] = 3 - nodes[5].downstream[nodes[1]] = 1 - nodes[0].downstream[nodes[2]] = 3 - nodes[1].downstream[nodes[2]] = 1 - nodes[2].downstream[nodes[3]] = 3 - nodes[2].downstream[nodes[4]] = 1 - nodes[3].downstream[nodes[6]] = 3 - nodes[4].downstream[nodes[6]] = 1 - nodes[0].upstream[nodes[5]] = 3 - nodes[1].upstream[nodes[5]] = 1 - nodes[2].upstream[nodes[0]] = 3 - nodes[2].upstream[nodes[1]] = 1 - nodes[3].upstream[nodes[2]] = 3 - nodes[4].upstream[nodes[2]] = 1 - nodes[6].upstream[nodes[3]] = 3 - nodes[6].upstream[nodes[4]] = 1 - new_node = split_one_group(nodes[0], nodes[2], nodes[3]) # no mentions 
of minorities [1] or [4]
         print(new_node.details())
-        assert new_node in nodes[5].downstream and nodes[1] in nodes[5].downstream
-        assert nodes[0] not in nodes[5].downstream
-        assert nodes[5] in new_node.upstream and nodes[6] in new_node.downstream
-        assert new_node in nodes[6].upstream and nodes[4] in nodes[6].upstream
-        assert nodes[3] not in nodes[6].upstream
-
-    def _test_split_groups(self, all_nodes):
-        summary3 = split_groups(all_nodes)
-        assert len(summary3) > 10
-        return summary3
-
+        self.assertIn(new_node.id, z.node('90').downstream_ids())
+        self.assertIn(z.node('92').id, z.node('90').downstream_ids())
+        self.assertNotIn(z.node('91').id, z.node('90').downstream_ids())
+        self.assertIn(z.node('90').id, new_node.upstream_ids())
+        self.assertIn(z.node('96').id, new_node.downstream_ids())
+        self.assertIn(new_node.id, z.node('96').upstream_ids())
+        self.assertIn(z.node('95').id, z.node('96').upstream_ids())
+        self.assertNotIn(z.node('94').id, z.node('96').upstream_ids())
+
+    def test_split_groups(self):
+        nodes = [
+            ['90', {1, 2, 3, 4}],  # [5]
+            ['91', {1, 2, 4}, '92', {3}],
+            ['93', {1, 2, 3, 4}],  # [2] anchor
+            ['94', {1, 2, 4}, '95', {3}],  # [3] [4]
+            ['96', {1, 2, 3, 4}]  # [6]
+        ]
+        g = Graph.utils.build_graph_from_slices(nodes, 'test_split_groups')
+        # zoom_level, zoom = prep_next_summary_layer(g.nucleotide_level())
+        split_groups(g.nucleotide_level)
+        self.assertEqual(len(g.nucleotide_level), 5)
+        # simple_merge(g.nucleotide_level())
+        # self.assertEqual(len(g.nucleotide_level()), 3)
 
     def test_workflow(self):
-        self.create_nodes()
-        all_nodes = [node for window in self.unique_signatures for node in window.values()]  # think about referencing and deletion
-        summary1 = self._test_simple_merge(all_nodes)
-        summary2 = self._test_neglect_nodes(summary1)
-        summary3 = self._test_split_groups(summary2)
-        assert len(summary1) > len(summary2) > len(summary3), "Each summarization should result in less nodes"
-        summary4 = simple_merge(summary3)
-        bad = summary3[2]
-        print(bad.details())
+        graph = self.create_graph('test')
+        summary1, zoom = prep_next_summary_layer(graph.nucleotide_level)
+        self._test_simple_merge(summary1)
+        summary2, zoom = prep_next_summary_layer(summary1)
+        self._test_neglect_nodes(summary2)
+        summary3, zoom = prep_next_summary_layer(summary2)
+        split_groups(summary3)
+        assert len(summary1) > len(summary2) > len(summary3), "Each summarization should result in fewer nodes"
+        summary4, zoom = prep_next_summary_layer(summary3)
+        simple_merge(summary4)
 
-        # test_signatures = get_all_signatures(alleles, individuals)
-        # test_individuals = build_individuals(individuals, test_signatures)
+        # test_signatures = build_all_slices(alleles, individuals)
+        # test_individuals = build_paths(individuals, test_signatures)
         # populate_transitions(test_individuals)  # no return val
         #
         # test1 = test_simple_merge(test_signatures)
diff --git a/HaploBlocker/views.py b/HaploBlocker/views.py
index a283b04..a47de89 100644
--- a/HaploBlocker/views.py
+++ b/HaploBlocker/views.py
@@ -2,10 +2,9 @@
 
 from django.http import HttpResponse
 
-from HaploBlocker.models import Edge
 
 def index(request):
-    return HttpResponse("Index of Edges.\n" + str(Edge.objects.all()))
+    return HttpResponse()  # "Index of Edges.\n" + str(Edge.objects.all())
 
 # Create your views here.
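If the placeholder index view is ever restored, an equivalent listing against the replacement models could read as below (a sketch only; it assumes nothing beyond the Graph.models import path already used by the tests in this diff):

    from django.http import HttpResponse
    from Graph.models import NodeTraversal

    def index(request):
        # Mirrors the old Edge listing: one entry per traversal.
        return HttpResponse("Index of NodeTraversals.\n" + str(NodeTraversal.objects.all()))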
diff --git a/vgbrowser/settings.py b/vgbrowser/settings.py index ef7bf9b..af17561 100644 --- a/vgbrowser/settings.py +++ b/vgbrowser/settings.py @@ -31,14 +31,14 @@ # Application definition INSTALLED_APPS = [ - 'HaploBlocker.apps.HaploblockerConfig', - 'Graph.apps.GraphConfig', 'django.contrib.admin', 'django.contrib.auth', 'django.contrib.contenttypes', 'django.contrib.sessions', 'django.contrib.messages', 'django.contrib.staticfiles', + 'Graph.apps.GraphConfig', + 'HaploBlocker.apps.HaploblockerConfig', ] MIDDLEWARE = [
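With both apps registered, the reworked suite runs under the standard Django test runner, for example:

    python manage.py test HaploBlocker

The runner builds a scratch test database before any test executes, so the ORM-backed graph construction in HaploTest.setUpClass starts from a clean schema on every run.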