Skip to content
This repository has been archived by the owner on Mar 20, 2020. It is now read-only.

I23 haplo conversion #27

Draft
wants to merge 15 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
9884ea3
#23 WIP: First round of conversions moving haplonetwork to Graph.Node…
josiahseaman Aug 20, 2019
a2bbcef
#23 #3 #4 MAJOR: Added Zoom layer concept by giving Path.zoom so Path…
josiahseaman Aug 22, 2019
c02b7f1
#23 WIP: Converting Haploblocker, automating transitions simplifies t…
josiahseaman Aug 22, 2019
52c5c0f
#23 ZoomLevel objects, custom create() methods for consistency and co…
josiahseaman Aug 23, 2019
005d1d8
#23 test_export_as_gfa now working with example fast queries for fetc…
josiahseaman Aug 23, 2019
44a3bd1
Updated some of DAGify to use GraphGenomes. Unfortunately, I don't u…
josiahseaman Aug 23, 2019
bc49253
#26 Simple_merge converted by copying the entire GraphGenome, Paths, …
josiahseaman Aug 26, 2019
59c18fd
#26 Simple_merge test cases improved. internal_build_individuals() i…
josiahseaman Aug 27, 2019
2fba2d8
#26 Split_one_group() updated for Database but Path - NodeTraversal -…
josiahseaman Sep 3, 2019
96df52a
#26 Split_groups Working fully. ZoomLevels are full copies of NodeTr…
josiahseaman Sep 3, 2019
5159eae
#26 Fix for Graph copying operation. Still hitting "too many SQL var…
josiahseaman Sep 3, 2019
90570c4
#26 changes to the DB schema for ZoomLevels being directly linked wit…
josiahseaman Sep 3, 2019
eba726d
#26 Added xrange iterators
josiahseaman Sep 4, 2019
6c3c879
First working version of random gfa generator. All positive strand
josiahseaman Nov 6, 2019
c7b8af0
Unknown: Dangling changes for haplo_conversion
josiahseaman Nov 21, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@ python:
# Command to install dependencies, e.g. pip install -r requirements_dev.txt --use-mirrors
install:
- pip install -r requirements_dev.txt
- python manage.py migrate
- python manage.py migrate Graph

env:
- DJANGO_VERSION=2.2.1
- DJANGO_SETTINGS_MODULE=vgbrowser.settings

# Command to run tests, e.g. python setup.py test
Expand Down
304 changes: 304 additions & 0 deletions GFA_Generator.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,304 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"H\tVN:Z:1.0\n",
"S\t1\tTTTT\n",
"S\t2\tAGTCGCAT\n",
"S\t3\tGTTG\n",
"S\t4\tCCTA\n",
"P\t7\t3-,4+,3+,3+\t*,*,*,*\n",
"P\t8\t3+,2+,4-,3+\t*,*,*,*\n",
"P\t9\t3+,3+,1+,2+\t*,*,*,*\n",
"P\t10\t2+,1-,1+,2+\t*,*,*,*\n",
"P\t11\t1-,1+,3+,2+\t*,*,*,*\n",
"L\t1\t+\t3\t+\t0M\n",
"L\t4\t+\t3\t+\t0M\n",
"L\t1\t-\t1\t+\t0M\n",
"L\t3\t-\t4\t+\t0M\n",
"L\t3\t+\t2\t+\t0M\n",
"L\t3\t+\t3\t+\t0M\n",
"L\t1\t+\t2\t+\t0M\n",
"L\t3\t+\t1\t+\t0M\n",
"L\t4\t-\t3\t+\t0M\n",
"L\t2\t+\t1\t-\t0M\n",
"L\t2\t+\t4\t-\t0M\n",
"11 Edges\n"
]
}
],
"source": [
"from random import choice, randint\n",
"\n",
"\n",
"def generate_random_gfa(rows = 5, printer=print, stranded=True):\n",
" printer('H\tVN:Z:1.0')\n",
" printer(\"\"\"S\t1\tTTTT\n",
"S\t2\tAGTCGCAT\n",
"S\t3\tGTTG\n",
"S\t4\tCCTA\"\"\")\n",
"\n",
" nodes = ['1', '2', '3', '4']\n",
" chance = 10 if stranded else 0\n",
" paths = []\n",
" for path in range(rows):\n",
" length = 4\n",
" my_path = [choice(nodes) + ('+' if randint(0,chance) != 9 else '-') for i in range(length)]\n",
" # for step in range(4):\n",
" paths.append(my_path)\n",
"\n",
" links = set()\n",
" for y, p in enumerate(paths):\n",
" for i in range(len(p)-1):\n",
" links.add('L\\t'+p[i][:-1]+'\\t'+p[i][-1:]+'\\t'+p[i+1][:-1]+'\\t'+p[i+1][-1:]+'\\t0M')\n",
"\n",
" printer('\\t'.join(['P', str(7+y), ','.join([n for n in p]), ','.join('*'*len(p))]))\n",
"\n",
" for l in links:\n",
" printer(l)\n",
" print(len(links), \"Edges\")\n",
"generate_random_gfa()"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def write_gfa(size, stranded=True):\n",
" mark = '_stranded' if stranded else '_no_inversions'\n",
" name = r'D:\\josiah\\Documents\\1001G\\Graph_Genome_Browser\\scalability_test\\second_batch\\\\' + str(size) + '_paths'+mark+'.gfa'\n",
" with open(name, 'w') as f:\n",
" lines = []\n",
" generate_random_gfa(size, lines.append, stranded)\n",
" f.write('\\n'.join(lines))"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"16 Edges\n"
]
}
],
"source": [
"write_gfa(10)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"28 Edges\n"
]
}
],
"source": [
"write_gfa(20)"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"45 Edges\n"
]
}
],
"source": [
"write_gfa(100)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"61 Edges\n"
]
}
],
"source": [
"write_gfa(1000)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"14 Edges\n"
]
}
],
"source": [
"write_gfa(10, False)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"15 Edges\n"
]
}
],
"source": [
"write_gfa(20, False)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"16 Edges\n"
]
}
],
"source": [
"write_gfa(100, False)"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"16 Edges\n"
]
}
],
"source": [
"write_gfa(1000, False)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import gfapy\n",
"g1 = gfapy.Gfa.from_file(r'D:\\josiah\\Documents\\1001G\\Graph_Genome_Browser\\scalability_test\\5_node_1000_paths.gfa')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from random import choice, randint\n",
"randint(0,0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}
31 changes: 17 additions & 14 deletions Graph/gfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import io
import os
import tempfile
from Graph.models import *
from Graph.models import Node, Path, GraphGenome


def pairwise(iterable):
Expand Down Expand Up @@ -103,30 +103,33 @@ def save_as_gfa(self, file: str):
self.gfa.to_file(file)

@classmethod
def from_graph(cls, graph: GraphGenome):
def from_graph(cls, graph: GraphGenome): # TODO: should be given ZoomLevel instead
"""Constructs the lines of a GFA file listing paths, then sequence nodes in arbitrary order."""
gfa = gfapy.Gfa()
for path in graph.paths:
node_series = ",".join([traverse.node.name + traverse.strand for traverse in path.nodes])
gfa.add_line('\t'.join(['P', path.accession, node_series, ",".join(['*' for _ in path.nodes])]))
for node in graph.nodes: # in no particular order
for path in graph.paths.all():
visits = []
# example of using lazy queries and values_list for fast lookup
node_infos = path.nodes.values_list('node_id', 'strand', named=True)
for traverse in node_infos:
name = Node.objects.values_list('name', flat=True).get(id=traverse.node_id) # fast lookup
visits.append(name + traverse.strand)
node_series = ",".join(visits)
connections = ",".join(['*'] * path.nodes.count()) # count -1?
gfa.add_line('\t'.join(['P', path.accession, node_series, connections]))
for node in graph.nucleotide_level.nodes_xrange(): # in no particular order
gfa.add_line('\t'.join(['S', str(node.name), node.seq]))
return cls(gfa, "from Graph")

def to_paths(self) -> GraphGenome:
graph = self.to_graph()
return graph.paths

def to_graph(self) -> GraphGenome:
"""Create parent object for this genome and save it in the database.
This can create duplicates appended in Paths if it is called twice."""
gdb = GraphGenome.objects.get_or_create(name=self.source_path)[0]
gdb = GraphGenome.objects.create(name=self.source_path)
z = gdb.nucleotide_level
for segment in self.gfa.segments:
Node.objects.get_or_create(seq=segment.sequence, name=(segment.name), graph=gdb)
Node.objects.get_or_create(seq=segment.sequence, name=segment.name, zoom=z)

for path in self.gfa.paths:
p = Path(accession=path.name, graph=gdb)
p.save()
p = Path.objects.create(accession=path.name, zoom=z)
p.append_gfa_nodes(path.segment_names)
return gdb

3 changes: 2 additions & 1 deletion Graph/migrations/0001_initial.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@ class Migration(migrations.Migration):
migrations.CreateModel(
name='Node',
fields=[
('id', models.AutoField(auto_created=True, default=0, primary_key=True, serialize=False, verbose_name='ID')),
('seq', models.CharField(blank=True, max_length=255)),
('name', models.CharField(max_length=15, primary_key=True, serialize=False)),
('name', models.CharField(max_length=15,serialize=False)),
('graph', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='Graph.GraphGenome')),
],
options={
Expand Down
Loading