From b420ee277240e120cc8ecd598a38616268a67447 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 25 Sep 2024 13:47:55 -0700 Subject: [PATCH 01/41] create first iteration of forest builder --- happler/__main__.py | 54 +- happler/tree/__init__.py | 1 + happler/tree/forest_builder.py | 107 +++ happler/tree/tree_builder.py | 4 - .../data/19_45401409-46401409_1000G.multi.hap | 12 + .../19_45401409-46401409_1000G.multi.pheno | 777 ++++++++++++++++++ tests/test_examples.py | 20 + 7 files changed, 969 insertions(+), 6 deletions(-) create mode 100644 happler/tree/forest_builder.py create mode 100644 tests/data/19_45401409-46401409_1000G.multi.hap create mode 100644 tests/data/19_45401409-46401409_1000G.multi.pheno diff --git a/happler/__main__.py b/happler/__main__.py index f596afa..4b7d83a 100644 --- a/happler/__main__.py +++ b/happler/__main__.py @@ -103,6 +103,20 @@ def main(): hidden=True, help="Do not check that variants are phased. Saves time and memory.", ) +@click.option( + "--max-signals", + type=int, + default=1, + show_default=True, + help="The maximum number of expected causal signals", +) +@click.option( + "--max-iterations", + type=int, + default=1, + show_default=True, + help="The max number of times to repeat the tree building", +) @click.option( "-t", "--threshold", @@ -150,6 +164,13 @@ def main(): "no correction (n)" ), ) +@click.option( + "--remove-SNPs", + is_flag=True, + show_default=True, + default=False, + help="Remove haplotypes with only a single variant", +) @click.option( "-o", "--output", @@ -179,12 +200,15 @@ def run( chunk_size: int = None, maf: float = None, phased: bool = False, + max_signals: int = 1, + max_iterations: int = 1, threshold: float = 0.05, indep_thresh: float = 0.1, ld_prune_thresh: float = None, show_tree: bool = False, covars: Path = None, corrector: str = "n", + remove_snps: bool = False, output: Path = Path("/dev/stdout"), verbosity: str = "INFO", ): @@ -316,9 +340,35 @@ def run( indep_thresh=indep_thresh, ld_prune_thresh=ld_prune_thresh, log=log, - ).run() + ) + forest = tree.ForestBuilder( + hap_tree, + num_bins=max_signals, + max_iterations=max_iterations, + log=log, + ) log.info("Outputting haplotypes") - tree.Haplotypes.from_tree(fname=output, tree=hap_tree, gts=gt, log=log).write() + haplotypes = data.Haplotypes( + fname=output, + haplotype=tree.haplotypes.HapplerHaplotype, + variant=tree.haplotypes.HapplerVariant, + log=log, + ) + haplotypes.data = {} + hap_id = 0 + # merge the Haplotypes objects + # TODO: use a method of the Haplotypes class + for haps in forest.run(): + if haps is None: + continue + for hap in haps.data.values(): + if len(hap.variants) <= 1 and remove_snps: + continue + hap.id = f"H{hap_id}" + haplotypes.data[hap.id] = hap + hap_id += 1 + haplotypes.index() + haplotypes.write() if show_tree: dot_output = output if Path(output) != Path("/dev/stdout"): diff --git a/happler/tree/__init__.py b/happler/tree/__init__.py index e325bab..2bc2dc2 100644 --- a/happler/tree/__init__.py +++ b/happler/tree/__init__.py @@ -1,5 +1,6 @@ from .tree import Tree from .tree_builder import TreeBuilder +from .forest_builder import ForestBuilder from .variant import VariantType, Variant from .haplotypes import Haplotype, Haplotypes from .corrector import Corrector, BH, Bonferroni, BHSM diff --git a/happler/tree/forest_builder.py b/happler/tree/forest_builder.py new file mode 100644 index 0000000..ee03c3e --- /dev/null +++ b/happler/tree/forest_builder.py @@ -0,0 +1,107 @@ +from __future__ import 
annotations
+from logging import Logger
+
+import numpy as np
+import numpy.typing as npt
+import statsmodels.api as sm
+from haptools.logging import getLogger
+from haptools.data import Genotypes, Phenotypes
+
+from .tree import Tree
+from .haplotypes import Haplotypes
+from .tree_builder import TreeBuilder
+
+
+class ForestBuilder:
+    """
+    Creates a ForestBuilder object from a provided TreeBuilder
+
+    Attributes
+    ----------
+    tree_builder: TreeBuilder
+        A TreeBuilder instance that we can run repeatedly
+    log: Logger
+        A logging instance for recording debug statements.
+    """
+
+    def __init__(
+        self,
+        tree_builder: TreeBuilder,
+        num_bins: int,
+        max_iterations: int = None,
+        log: Logger = None,
+    ):
+        self.tree_builder = tree_builder
+        self.max_iterations = max_iterations
+        self.trees = [None]*num_bins
+        self.haplotypes = [None]*num_bins
+        self.genotypes = self.tree_builder.gens
+        self.phenotypes = self.tree_builder.phens
+        self.log = log or getLogger(self.__class__.__name__)
+
+    def run(self):
+        """
+        Build a tree for each expected causal signal, iterating until done
+        """
+        iterations = self.max_iterations
+        tree_idx = 0
+        while iterations:
+            self.log.debug(f"Iteration {self.max_iterations - iterations}: tree {tree_idx}")
+            self.tree_builder.phens = self.get_residuals(tree_idx)
+            self.tree_builder.run()
+            self.trees[tree_idx] = self.tree_builder.tree
+            self.haplotypes[tree_idx] = Haplotypes.from_tree(
+                None, self.trees[tree_idx], self.genotypes, self.log,
+            )
+            # increment tree_idx until it reaches the end
+            tree_idx += 1
+            if tree_idx >= len(self.trees):
+                # TODO: implement a test to check whether to stop iterating
+                if iterations is not None:
+                    iterations -= 1
+                tree_idx = 0
+        return self.haplotypes
+
+    def get_residuals(self, tree_idx: int) -> Phenotypes:
+        """
+        Retrieve the current phenotype values, computed after regressing out the
+        haplotype effects from each tree except the one indicated by tree_idx
+
+        Parameters
+        ----------
+        tree_idx: int
+            The index of the tree whose haplotypes to exclude from consideration
+
+        Returns
+        -------
+        Phenotypes
+            The residuals after regressing out the effects of the haplotypes from all
+            other trees
+        """
+        # first: count the number of effects amongst the other trees' haplotypes
+        total_num_effects = sum(
+            len(hps.data) if hps is not None else 0
+            for hps_idx, hps in enumerate(self.haplotypes)
+            if hps_idx != tree_idx
+        )
+        # base case: if there are no effects to regress out
+        if total_num_effects == 0:
+            return self.phenotypes
+        num_samps = len(self.phenotypes.samples)
+        effects = np.empty((num_samps, total_num_effects), dtype=self.phenotypes.data.dtype)
+        effect_arr_idx = 0
+        # iterate through every other tree's Haplotypes obj and transform the haps into effects
+        self.log.debug(f"Regressing out {total_num_effects} haplotype effects from the other trees")
+        for hap_idx, hap in enumerate(self.haplotypes):
+            if hap is None or hap_idx == tree_idx:
+                continue
+            new_idx = effect_arr_idx + len(hap.data)
+            effects[:, effect_arr_idx:new_idx] = hap.transform(self.genotypes).data[:,:,:2].sum(axis=2)
+            effect_arr_idx = new_idx
+        resids = Phenotypes(fname=None, log=self.phenotypes.log)
+        resids.samples = self.phenotypes.samples
+        resids.names = self.phenotypes.names
+        # get the residuals from a regression with the effects as covariates
+        self.log.debug(f"Computing residuals for tree {tree_idx}")
+        resids.data = sm.OLS(self.phenotypes.data, sm.add_constant(effects)).fit().resid[:, np.newaxis]
+        return resids
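
The residualization in get_residuals() is the heart of this patch: for tree k, the haplotype dosages of every other tree are stacked into a covariate matrix, and the phenotype is replaced with the residuals of an OLS fit on those covariates, so each tree only searches for signal that the other trees have not already explained. A minimal sketch of that step, with illustrative names rather than happler's own API:

    import numpy as np
    import statsmodels.api as sm

    def residualize(pheno: np.ndarray, other_effects: np.ndarray) -> np.ndarray:
        """pheno: (num_samps,); other_effects: (num_samps, num_effects) hap dosages."""
        # regress the phenotype on the other trees' effects (plus an intercept)
        fit = sm.OLS(pheno, sm.add_constant(other_effects)).fit()
        # the residuals are whatever signal the other trees leave unexplained
        return fit.resid

diff --git a/happler/tree/tree_builder.py b/happler/tree/tree_builder.py
index c060ed8..41bc440 100644
--- 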
a/happler/tree/tree_builder.py +++ b/happler/tree/tree_builder.py @@ -89,10 +89,6 @@ def run(self): """ Run the tree builder and create a tree rooted at the provided variant """ - if self.tree is not None: - raise AssertionError( - "A tree already exists for this TreeBuilder. Please create a new one." - ) # step one: initialize the tree self.tree = Tree(log=self.log) # step two: create a haplotype diff --git a/tests/data/19_45401409-46401409_1000G.multi.hap b/tests/data/19_45401409-46401409_1000G.multi.hap new file mode 100644 index 0000000..6fc8474 --- /dev/null +++ b/tests/data/19_45401409-46401409_1000G.multi.hap @@ -0,0 +1,12 @@ +# orderH beta pval +# orderV score +# version 0.2.0 +#H beta .2f Effect size in linear model +#H pval .2f -log(pval) in linear model +#V score .2f -log(pval) assigned to this variant +H 19 45410444 46310808 H0 0.36 30.13 +H 19 45892145 45910673 H1 0.36 30.24 +V H0 46310807 46310808 rs12984785 C 9.52 +V H0 45410444 45410445 rs769450 A 30.13 +V H1 45892145 45892146 rs36046716 G 6.30 +V H1 45910672 45910673 rs1046282 G 30.24 diff --git a/tests/data/19_45401409-46401409_1000G.multi.pheno b/tests/data/19_45401409-46401409_1000G.multi.pheno new file mode 100644 index 0000000..93d2808 --- /dev/null +++ b/tests/data/19_45401409-46401409_1000G.multi.pheno @@ -0,0 +1,777 @@ +#IID H0-H1 +HG00096 1.3017394225909524 +HG00097 -0.27455070497718187 +HG00099 0.9223961552249307 +HG00100 -0.4444101713652624 +HG00101 0.8260910716500838 +HG00102 -2.192160408889623 +HG00103 1.937411234430382 +HG00105 -0.6766475045485754 +HG00106 1.2938113228688546 +HG00107 -0.7601234658999931 +HG00108 1.3340426734082755 +HG00109 -0.3702278392333091 +HG00110 -0.657796336291472 +HG00111 0.7163642258408758 +HG00112 -0.5119745970171313 +HG00113 -0.7490020951497627 +HG00114 1.0517322599183143 +HG00115 0.03970553035468305 +HG00116 0.9382503170083054 +HG00117 -0.48446533442500284 +HG00118 0.1605062768449892 +HG00119 0.6650392570450632 +HG00120 -1.2239557284874587 +HG00121 -0.28155916154351257 +HG00122 -0.694600223401998 +HG00123 0.016002165837225357 +HG00125 0.6610124959978709 +HG00126 -1.8824897934414873 +HG00127 -0.0022863086650215525 +HG00128 -0.07817244477032392 +HG00129 -0.060144088248919036 +HG00130 0.20815469441867915 +HG00131 -0.4434759299302991 +HG00132 -0.3862605139969777 +HG00133 -0.01584928114183487 +HG00136 -0.5947118164891941 +HG00137 0.6007178043843452 +HG00138 1.341960465226173 +HG00139 -1.732650802300685 +HG00140 -0.06574912378085612 +HG00141 -1.1173921209561168 +HG00142 -0.8313815665565609 +HG00143 1.4673031969056392 +HG00145 -0.21654238430913353 +HG00146 0.8559756623313552 +HG00148 -0.6547746894235088 +HG00149 0.6512205590579918 +HG00150 0.1771917767565636 +HG00151 0.2835877343172233 +HG00154 -1.3223836301651573 +HG00155 -0.4867863486518235 +HG00157 -0.311207123560055 +HG00158 -0.7591601736064473 +HG00159 -0.46909876113819504 +HG00160 -0.45228839698679396 +HG00171 0.05534744773817596 +HG00173 -0.05728526482670221 +HG00174 0.11912007184337481 +HG00176 -0.5635001943233273 +HG00177 0.1927763795549795 +HG00178 1.5693430077698323 +HG00179 0.8442480505447929 +HG00180 0.5563159735566817 +HG00181 0.25739949585716204 +HG00182 0.20070166807808176 +HG00183 0.6315534221299418 +HG00185 0.2532479084681132 +HG00186 -1.538315230397934 +HG00187 0.4205562997540997 +HG00188 -0.3671931147770918 +HG00189 -0.682298236222033 +HG00190 1.1334295789662376 +HG00231 -1.6518714633900213 +HG00232 -0.5946064662889645 +HG00233 1.456274458550981 +HG00234 -0.673410561447305 +HG00235 0.7324998961620618 +HG00236 
-0.7476782340145596 +HG00237 1.3742564614367883 +HG00238 -0.42666365031731934 +HG00239 0.8359563911296215 +HG00240 -0.8395670312438419 +HG00242 -1.926006404324073 +HG00243 0.8588343808065176 +HG00244 -1.0365863223317784 +HG00245 0.1843317854385289 +HG00246 0.732871711072655 +HG00250 -0.7025024579392856 +HG00251 0.2084041227613298 +HG00252 -1.4545955830939332 +HG00253 2.945009837767178 +HG00254 0.503912008880156 +HG00255 -1.3334829520303777 +HG00256 -0.5480397928115349 +HG00257 -0.938556923042386 +HG00258 -1.9106942563377778 +HG00259 1.6559216835838548 +HG00260 0.07826046127315978 +HG00261 0.09915001009411378 +HG00262 -0.03674349382971431 +HG00263 0.13961998399334552 +HG00264 -0.7313470923718625 +HG00265 -0.4638843982228111 +HG00266 1.883097199158406 +HG00267 0.6482813187326648 +HG00268 0.19621125364483505 +HG00269 0.23454471446401665 +HG00271 0.3856839957159506 +HG00272 -0.7613980713107256 +HG00273 0.5847564149458971 +HG00274 -2.200340761207475 +HG00275 0.9399114427840759 +HG00276 0.6208580988865791 +HG00277 1.9921086351108939 +HG00278 -0.7452347633051035 +HG00280 -0.04146856072515981 +HG00281 0.039852090431971354 +HG00282 -1.0473708422676937 +HG00284 1.210779477111332 +HG00285 -0.9212433227842408 +HG00288 -0.45267676876761587 +HG00290 0.21879672908691095 +HG00304 -1.4019032397917557 +HG00306 1.6882682875304393 +HG00308 -0.40454620804361807 +HG00309 0.14589913994153003 +HG00310 -0.8603828426604014 +HG00311 0.7859249725806237 +HG00313 -1.423515824608892 +HG00315 -0.2390376441256331 +HG00318 -0.23442109584659665 +HG00319 0.10717344738698581 +HG00320 -1.1879282057763563 +HG00321 2.010856476651868 +HG00323 1.9536925186312748 +HG00324 1.5630788196071053 +HG00325 -0.6746593314170943 +HG00326 0.645697774512938 +HG00327 0.27531983513078506 +HG00328 -0.3240679844184028 +HG00329 1.0858071408796113 +HG00330 -0.1380942934078231 +HG00331 -0.6617302418606026 +HG00332 -0.8196133472385092 +HG00334 1.8964153527428376 +HG00335 1.0979767394742055 +HG00336 0.45942599485848723 +HG00337 -0.29927846123948887 +HG00338 1.6589405080120647 +HG00339 1.0244855637755055 +HG00341 -0.1106888927519688 +HG00342 -1.6805177057993594 +HG00343 0.9484579844835173 +HG00344 -1.0120010079512851 +HG00345 -1.3587334871064143 +HG00346 -1.4503482183381182 +HG00349 -0.5961763016136395 +HG00350 -1.2248086186506415 +HG00351 0.14578426476484357 +HG00353 0.07659525915044929 +HG00355 1.112412479404136 +HG00356 -0.8814785938735148 +HG00357 -0.9926854251137129 +HG00358 -1.0394944051752386 +HG00360 0.2171639569494055 +HG00361 -0.8036340494724711 +HG00362 0.10075077066547061 +HG00364 0.4835962293815953 +HG00365 0.19382680907013505 +HG00366 0.886385556034768 +HG00367 -0.3880716375161831 +HG00368 0.47239284200124787 +HG00369 -0.7839477950837497 +HG00371 1.0910733414788725 +HG00372 0.8663702947127541 +HG00373 0.19311323966514193 +HG00375 1.0061648184902738 +HG00376 0.9748409248884979 +HG00378 -0.6743931986307602 +HG00379 -0.6408435039796199 +HG00380 1.7644488312594047 +HG00381 0.6343595917549711 +HG00382 1.5614824358020498 +HG00383 -0.010665982476151414 +HG00384 0.5756403538114765 +HG00403 -0.7644713441588533 +HG00404 -1.6355736051237253 +HG00406 0.034088299656111654 +HG00407 -0.06974035486253505 +HG00409 -0.3266224287338094 +HG00410 1.285133304190869 +HG00419 -0.17747468981685946 +HG00421 -0.5216809527310943 +HG00422 1.14580062711893 +HG00428 0.47079269261721013 +HG00436 -1.1991826901148706 +HG00437 1.0922802894213188 +HG00442 1.9524696339807277 +HG00443 -1.276527509624545 +HG00445 0.54720269777494 +HG00446 1.1167374950321065 +HG00448 
-0.9943615939542239 +HG00449 0.3015798433428688 +HG00451 0.4581203738623839 +HG00452 -0.3992703612852424 +HG00457 0.010823749383855108 +HG00458 -0.8098101517066841 +HG00463 -0.2448101536344522 +HG00464 -0.8733383667141027 +HG00472 0.7939345553658519 +HG00473 -1.0655863577921791 +HG00475 -0.7222165148083026 +HG00476 -0.9624957925891318 +HG00478 -0.0013715478358110045 +HG00479 -2.40327667661413 +HG00500 0.022552083785861265 +HG00513 -0.0032847044070901665 +HG00524 1.3696957342330283 +HG00525 0.09996072091096792 +HG00530 -0.9253087837168033 +HG00531 0.05387819253735748 +HG00533 0.7185769926770722 +HG00534 0.782101367150142 +HG00536 1.3953406773510117 +HG00537 1.307723084377591 +HG00542 -0.24537359254826419 +HG00543 -1.1289364983902215 +HG00551 -0.3586001725428226 +HG00553 -0.40832412475470525 +HG00554 3.675822203483924 +HG00556 -0.024706080231385097 +HG00557 1.4971197939347933 +HG00559 -0.04398445083603797 +HG00560 -0.16407710687711266 +HG00565 -0.7682990906804746 +HG00566 0.4678824762747546 +HG00580 -0.6669300269342622 +HG00581 -0.6700846349247087 +HG00583 0.5326664895129668 +HG00584 -0.2779053234796991 +HG00589 0.7781462516317879 +HG00590 -2.2572436631832327 +HG00592 -0.6433114427345645 +HG00593 -0.8295006793537826 +HG00595 0.0333143832658932 +HG00596 -0.9998587655976234 +HG00598 1.2013710076031352 +HG00599 0.5780529636331116 +HG00607 0.19702922956298496 +HG00608 -2.3574654810466606 +HG00610 0.42108704704301125 +HG00611 0.20412384859956612 +HG00613 1.2923109821657752 +HG00614 0.4287672171393162 +HG00619 -1.6119721910689218 +HG00620 -0.4329263648859035 +HG00622 0.5107996725772186 +HG00623 1.4808488027236744 +HG00625 -0.7980477107711863 +HG00626 1.8805724032304343 +HG00628 -0.07851834224053256 +HG00629 0.6951191348864068 +HG00631 0.5565444486212561 +HG00632 0.35114830609902814 +HG00634 -0.4048559861541365 +HG00637 -1.0590360908856513 +HG00638 0.4012517489601062 +HG00640 1.9041051366764545 +HG00641 -0.11858765354391748 +HG00650 -0.2720992279196343 +HG00651 -1.1782306581396977 +HG00653 1.666163243028577 +HG00654 0.5642021449803208 +HG00656 0.3597572204389563 +HG00657 -0.22112341590961582 +HG00662 -0.042381968527955505 +HG00663 -0.3884744651759044 +HG00671 -1.0650830457336684 +HG00672 -0.10252285397338956 +HG00674 2.585256684460668 +HG00675 -0.8129770930286141 +HG00683 0.2444522922101258 +HG00684 -0.43755239020277786 +HG00689 0.8245288974242369 +HG00690 0.5723664131283869 +HG00692 -0.3255448137258444 +HG00693 0.653983451319051 +HG00698 0.46514845909471514 +HG00699 -0.22295696340365445 +HG00701 0.5971173027939229 +HG00704 0.5093580294077722 +HG00705 -0.8307086637095085 +HG00707 0.5822457415535465 +HG00708 1.0270007475453435 +HG00717 0.05359074401090719 +HG00728 -0.3114946729690229 +HG00729 0.6751192883172805 +HG00731 -0.5150034836230196 +HG00732 -0.12798522904888415 +HG00734 1.013546450246632 +HG00736 0.6869753353027072 +HG00737 -0.43850863654157746 +HG00739 -0.12255652316791327 +HG00740 -0.527991999123023 +HG00742 -1.0608342352170212 +HG00743 0.5488493979377136 +HG00759 1.6224631515376773 +HG00766 0.7129815223518607 +HG00844 1.6689149744084937 +HG00851 3.4439936596775818 +HG00864 0.4702543572838877 +HG00867 -0.36112178875507417 +HG00879 -0.8401056803849913 +HG00881 0.479824583361269 +HG00956 -0.9284945721170539 +HG00978 -0.5651294761397843 +HG00982 -0.2462626268569882 +HG01028 1.3308648016408742 +HG01029 0.5024251758196058 +HG01031 -0.5355374532246513 +HG01046 -0.09355501647908948 +HG01047 -0.24342541106565116 +HG01048 -1.4135462455456238 +HG01049 0.3184435881775337 +HG01051 0.08654867357833768 
+HG01052 -0.9700670489633084 +HG01054 1.1559454140871412 +HG01055 0.954942760323882 +HG01058 -0.7945090781828414 +HG01060 -0.03160599031617728 +HG01061 1.1863047071089463 +HG01063 1.5466296361248897 +HG01064 2.3499793828226445 +HG01066 0.41661528341296594 +HG01067 -0.6671381843672244 +HG01069 -0.5700338006896557 +HG01070 -0.5356882437925449 +HG01072 1.2553552346346728 +HG01073 -0.08577863872989477 +HG01075 -0.5132933981410371 +HG01077 -0.20636577600968187 +HG01079 -0.2359923693120397 +HG01080 1.3485351237334335 +HG01082 -0.08055547871995244 +HG01083 -0.41481781990904054 +HG01085 0.6846503975446758 +HG01086 -0.5107365703121772 +HG01088 0.20198090541546027 +HG01089 0.8904506995124579 +HG01092 1.0944270476855125 +HG01094 -0.21981626608057178 +HG01095 -0.42575883800617836 +HG01097 -0.5421369768663082 +HG01098 1.4859840653728797 +HG01101 0.1594650524518848 +HG01102 -0.277404564506499 +HG01104 -0.3045459641613543 +HG01105 -0.9325085789807781 +HG01107 0.6424914557878557 +HG01108 1.3163034090697114 +HG01110 0.7228953081839569 +HG01111 1.446641348653603 +HG01112 2.758413479417553 +HG01113 0.6077301224018714 +HG01119 -1.0529642439135634 +HG01121 0.07800350107491305 +HG01122 -0.41451465734361315 +HG01124 0.7607002016290159 +HG01125 0.11432801847372626 +HG01130 -0.7547045432285291 +HG01131 -0.36652083695629256 +HG01133 -0.26931656477459875 +HG01134 -1.0605782936561656 +HG01136 -1.035459208762127 +HG01137 1.4085149809557134 +HG01139 1.5342969923703986 +HG01140 0.847378452146835 +HG01142 1.1869804368689174 +HG01148 -2.975475585672566 +HG01149 -1.4491824340278092 +HG01161 -0.27099060632372735 +HG01162 0.5608862674548538 +HG01164 -0.9619239427802997 +HG01167 0.03698832406871888 +HG01168 -0.27540547256201225 +HG01170 2.0834892540141996 +HG01171 1.5374771041833983 +HG01173 1.1906553399713435 +HG01174 2.0146504504029226 +HG01176 -0.3447339318710625 +HG01177 -0.8604036385814623 +HG01182 0.14568185748744056 +HG01183 0.6662993491207352 +HG01187 -1.6974530748263668 +HG01188 -0.4205852001293092 +HG01190 0.6589674340269704 +HG01191 0.5732249874049651 +HG01197 0.10351652834159647 +HG01198 -0.10986771760944436 +HG01200 -0.2660634544870619 +HG01204 -0.464694630514974 +HG01205 -0.649340330132664 +HG01241 0.7377542115358544 +HG01242 1.7102390787723925 +HG01247 -0.04887599452558372 +HG01248 0.6869072846148201 +HG01250 1.6122307271180065 +HG01251 -0.006269529786685435 +HG01253 -0.1535991560141787 +HG01254 0.06486385324861255 +HG01256 0.1624736133399305 +HG01257 0.12720126896016382 +HG01259 -0.5611041978979303 +HG01260 0.8452007910129891 +HG01269 0.5765633665009684 +HG01271 1.4559524762179028 +HG01272 0.11539500759997834 +HG01275 0.24790776340921294 +HG01277 -0.043472289345663884 +HG01280 -0.4625647482063142 +HG01281 -0.018876707413124416 +HG01284 -0.6796253368687399 +HG01286 -0.3044217529345804 +HG01302 -2.0884863350227922 +HG01303 -0.1895879405963826 +HG01305 -0.3639634579361043 +HG01308 -0.28096127211380345 +HG01311 -1.067006406153747 +HG01312 -0.9598078660623194 +HG01323 0.2975749948154106 +HG01325 1.2012638264667665 +HG01326 -0.8243348201288716 +HG01334 -0.1598622584409964 +HG01341 -0.19555366908325295 +HG01342 0.4205481991288811 +HG01344 -1.1857084242467257 +HG01345 0.18791668758807312 +HG01348 -0.24858056282419405 +HG01350 1.0647884665460232 +HG01351 0.3821792758340964 +HG01353 -0.7577895465340109 +HG01354 1.4973311894872399 +HG01356 0.08993696498914311 +HG01357 -0.07137276654248531 +HG01359 -0.24485034819562637 +HG01360 -0.19399268806109768 +HG01362 -0.3421934499181302 +HG01363 0.013594478789405562 +HG01365 
-1.1735883598885404 +HG01366 -0.40882640279430355 +HG01369 -0.6980593857929052 +HG01372 -0.14920281916494887 +HG01374 -0.22564984724333909 +HG01375 -1.9057224481945723 +HG01377 -0.3349897961833549 +HG01378 -0.6893113802602657 +HG01383 0.06399427332450408 +HG01384 2.0879822732079005 +HG01389 0.2905065963973552 +HG01390 -1.8161393350145354 +HG01392 0.21471482465500513 +HG01393 -0.30892617532578204 +HG01395 -2.3077217301646984 +HG01396 0.6514749358352425 +HG01398 0.4574977350848046 +HG01402 -0.45921566369595745 +HG01403 -1.1713628906570077 +HG01405 -1.1567176774100134 +HG01412 -0.21502671123675055 +HG01413 0.9924112201862092 +HG01414 -1.1139932031627564 +HG01431 0.061196914990173656 +HG01432 -0.6163207046627842 +HG01435 -0.6807933307860816 +HG01437 1.7868857357670271 +HG01438 0.9800844685974784 +HG01440 -1.0449999061114703 +HG01441 0.024951109934860716 +HG01443 0.6300148056020438 +HG01444 -0.21867101194856767 +HG01447 0.5770469936526864 +HG01455 1.0428562054430608 +HG01456 0.2710026628017298 +HG01459 0.34076449871031944 +HG01461 -0.6359349294097668 +HG01462 -1.6552750583804026 +HG01464 -0.0695625090644314 +HG01465 2.0255787662136453 +HG01468 0.26621377307968563 +HG01474 -0.2106567704185593 +HG01479 -0.44983305929705014 +HG01485 0.14488375652371077 +HG01486 0.2543365153611554 +HG01488 -0.213711743816126 +HG01489 1.4281623092837072 +HG01491 0.8024576594075794 +HG01492 0.23204715616732297 +HG01494 -0.371881854159044 +HG01495 0.8919333998823771 +HG01497 -1.4175503353096701 +HG01498 2.9357075502684578 +HG01500 -0.8874043521741435 +HG01501 -1.0720726873652815 +HG01503 -1.3571194490760414 +HG01504 -0.49892894310137426 +HG01506 -0.03931832059904905 +HG01507 -0.9412032420446552 +HG01509 0.31813781483934755 +HG01510 0.3508669576237833 +HG01512 1.4729719731022497 +HG01513 -0.984343032601463 +HG01515 0.46543101076195226 +HG01516 0.4460419126941211 +HG01518 -0.30959991646573015 +HG01519 -0.9398263010270295 +HG01521 -1.056578331271975 +HG01522 -1.4239638772246848 +HG01524 0.28144387350616684 +HG01525 0.7343342379454842 +HG01527 0.901922144462221 +HG01528 -0.08574761688772858 +HG01530 1.3662470368018946 +HG01531 0.9365544794916194 +HG01536 -1.423792342169354 +HG01537 1.975297967956009 +HG01550 -0.1595864688955843 +HG01551 0.3876305272480949 +HG01556 0.6011569906689535 +HG01565 -0.4401050318778242 +HG01566 -0.25402628945193184 +HG01571 -0.13603079578388413 +HG01572 -1.0867770031518553 +HG01577 -0.6069600379514265 +HG01578 -0.2527960568553665 +HG01583 -0.27998804624640916 +HG01586 0.3367042336962939 +HG01589 0.5992709593147879 +HG01593 -0.5782306881740438 +HG01595 0.6188787969948111 +HG01596 -0.5319272035027639 +HG01597 0.4633309062649581 +HG01598 -0.14212038865574916 +HG01599 -0.9693440536402446 +HG01600 1.329534410844258 +HG01602 -0.7027942179755869 +HG01603 1.3258050259785459 +HG01605 0.7583742247627854 +HG01606 -0.03806140482527687 +HG01607 0.5228190851369188 +HG01608 0.5614434937244929 +HG01610 -2.154321123984033 +HG01612 -0.770020813211133 +HG01613 -0.057682039549263764 +HG01615 0.17553713718651348 +HG01617 0.8680841503716705 +HG01618 -0.09365334577581458 +HG01619 0.7567266158704564 +HG01620 -0.15316680848114536 +HG01623 -0.07167547502727623 +HG01624 -0.7098691112479755 +HG01625 -0.5376565585030909 +HG01626 -1.1934123893983926 +HG01628 -1.3137578792784572 +HG01630 -0.3885220706417852 +HG01631 0.06380003793158934 +HG01632 0.9882096449012903 +HG01668 -1.2272834638886467 +HG01669 1.3148086098043614 +HG01670 -0.7194148905373262 +HG01672 -0.6727658059545862 +HG01673 1.104059086797995 +HG01675 
-0.05270769798165276 +HG01676 0.026817985814787115 +HG01678 -1.3430039189000706 +HG01679 2.26409616179076 +HG01680 -0.18251160976200353 +HG01682 -2.3800023943973114 +HG01684 -0.5838253170821261 +HG01685 -0.2624810541679711 +HG01686 0.6676106843315626 +HG01694 -1.4791573816281445 +HG01695 0.07375590012010846 +HG01697 -0.674513249152488 +HG01699 0.07580824435391786 +HG01700 -0.6064964653997095 +HG01702 0.46059787936145224 +HG01704 1.5242524603163903 +HG01705 -1.2629813567475658 +HG01707 -0.3341063588736711 +HG01708 0.26000648605015186 +HG01709 1.4530481156739392 +HG01710 1.3905686564870718 +HG01746 0.7199055806858283 +HG01747 -2.055455595648907 +HG01756 2.0605421164194477 +HG01757 1.0498093758574174 +HG01761 0.9819684152254962 +HG01762 -1.0015790396866084 +HG01765 1.3718804568385248 +HG01766 0.08083252031872545 +HG01767 -0.6699710837703073 +HG01768 0.32734826418883145 +HG01770 -0.29685780134204226 +HG01771 -0.7126405956209039 +HG01773 -0.1826718106288978 +HG01775 -0.3284833437322132 +HG01776 0.32753059127823597 +HG01777 -0.16288878224321937 +HG01779 -1.7150146316175758 +HG01781 1.1251339075733333 +HG01783 0.04474406200438674 +HG01784 -0.14884787663011023 +HG01785 0.3076492887242234 +HG01786 0.9799621694745567 +HG01789 -0.7007101666193455 +HG01790 -0.32123129675212875 +HG01791 0.7551301160304824 +HG01794 1.3310246063851228 +HG01795 0.9876089909632227 +HG01796 -1.7025563253400566 +HG01797 -0.20852960737783074 +HG01798 1.0381327312289705 +HG01799 -0.5763532553403252 +HG01800 0.6792764360942509 +HG01801 -1.0825184866495092 +HG01802 -0.8551577350061023 +HG01804 0.06947664819157517 +HG01805 -2.0075674260015806 +HG01806 -0.9769109706355256 +HG01807 2.235744016243585 +HG01808 1.1261728493320944 +HG01809 -0.16532081221444295 +HG01810 0.7993077856593014 +HG01811 1.008630911516867 +HG01812 0.7987820324660475 +HG01813 -1.1010309129607947 +HG01815 0.0941042900031055 +HG01816 0.07479391251206335 +HG01817 0.039881199213996577 +HG01840 0.0008107907883523335 +HG01841 -1.7886038967713078 +HG01842 2.057534865314539 +HG01843 1.1813704618564589 +HG01844 -1.3070428653364963 +HG01845 -0.10415054047715061 +HG01846 -1.6347736522854883 +HG01847 0.5981082049215186 +HG01848 0.49950714611498737 +HG01849 -0.5765966932048827 +HG01850 2.2319784225108634 +HG01851 -1.20065235030314 +HG01852 -2.693691429962721 +HG01853 -0.4551628489456577 +HG01855 0.010416563200790374 +HG01857 -0.872012578333183 +HG01858 -0.00406205758991629 +HG01859 0.1221464121021395 +HG01860 -1.21101586360234 +HG01861 -1.74648042284672 +HG01862 1.4403951528372323 +HG01863 0.6952367542086773 +HG01864 -0.5261923337560288 +HG01865 -1.7138386295283845 +HG01866 -0.9079108023365258 +HG01867 1.3602716532873491 +HG01868 -0.1417979917964054 +HG01869 -0.8708257438590182 +HG01870 -0.04784419231061443 +HG01871 -0.4825374843931246 +HG01872 -1.4039256375821905 +HG01873 -0.08833353399627719 +HG01874 0.876278638618728 +HG01878 0.7726110474957442 +HG01879 2.0191601784878435 +HG01880 -0.39752922689275405 +HG01882 0.7546151453166463 +HG01883 0.157970379027911 +HG01885 0.41436692431391553 +HG01886 -0.10523086231530376 +HG01889 -0.2814716907530671 +HG01890 -0.8412892446081937 +HG01892 -0.2663219512249372 +HG01893 -0.5026892598671168 +HG01894 -1.1154829810938125 +HG01896 -0.7017998643291423 +HG01912 0.2970673421544838 +HG01914 0.6118689092417687 +HG01915 -0.6479071868113228 +HG01917 0.07777768776857702 +HG01918 -0.14628277487012908 +HG01920 1.7681672680378433 +HG01921 -0.5881650245475535 +HG01923 0.760405468975605 +HG01924 -0.2889807756235889 +HG01926 0.43621226021683823 
+HG01927 -0.23730446674064543 +HG01932 -0.43699599580467485 +HG01933 0.6528533731003703 +HG01935 -0.7475526354879827 +HG01936 1.5094554510014961 +HG01938 0.27680408013140567 +HG01939 0.2055593358165455 +HG01941 2.1313080055667206 +HG01942 -0.588644766506938 +HG01944 1.4556856718913858 +HG01945 0.4734631906831199 +HG01947 -2.0126453915253872 +HG01948 -0.9430370879616923 +HG01950 2.057970260358122 +HG01951 0.3662051249610203 +HG01953 1.6481910566574476 +HG01954 -0.19638725549125463 +HG01956 -0.6689727087737826 +HG01958 0.191203691445877 +HG01961 0.5246736326505623 +HG01965 -0.24383069116823602 +HG01967 0.04427224701346849 +HG01968 -0.8908742825629983 +HG01970 -1.6463329063438394 +HG01971 0.9858644361877709 +HG01973 -0.10704467374751558 +HG01974 -0.9699953853969524 +HG01976 -0.062087789341763455 +HG01977 0.46202489031645044 +HG01979 1.0267036217537027 +HG01980 -0.4526578880597739 +HG01982 -0.8737852556329553 +HG01985 0.23359501037722807 +HG01986 3.1371193492918787 +HG01988 -0.5475025726786117 +HG01989 0.47435323460452494 +HG01990 -0.7465050935474141 +HG01991 -0.7788253215478809 +HG01992 0.08730436546292941 +HG01997 2.3062118697618788 +HG02002 0.10162781629363232 +HG02003 0.15859012616985357 +HG02006 1.6777132942618456 +HG02008 0.663470945207608 +HG02009 1.3060371169640814 +HG02010 -0.13844724424252397 +HG02012 1.2101340707437047 +HG02013 -0.8861603038151078 +HG02014 -0.04874235699909152 +HG02016 1.010456347263374 +HG02017 0.8203727150068628 +HG02019 0.04450042815337485 +HG02020 0.8754819935860505 +HG02023 -1.5485864250277281 +HG02025 -0.5231651040972453 +HG02026 1.306239127064317 +HG02028 1.1220011665698573 +HG02029 -1.3101376762530863 +HG02031 0.5318604881477949 +HG02032 -1.454155235116148 +HG02035 -0.1240485386500875 +HG02040 0.36235164987935087 +HG02047 0.037354545301426845 +HG02048 -0.8901052017075429 +HG02049 -0.8902400343183031 +HG02050 0.15737609529255614 +HG02051 0.36827034879898163 +HG02052 -1.1733976935691066 +HG02053 -0.1942191451687208 +HG02054 0.42557959365741316 +HG02057 0.6864527381317347 +HG02058 -0.7021161310752574 +HG02060 0.25330874383085666 +HG02061 1.0197142120800042 +HG02064 -0.39158335442623227 +HG02067 -0.8839840865164575 +HG02069 -1.0947804286230107 +HG02070 -0.4746157067039667 +HG02072 -0.5444193957629733 +HG02073 1.2416663295950512 +HG02075 -0.11171153408631879 diff --git a/tests/test_examples.py b/tests/test_examples.py index 5084f8b..1604532 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -684,6 +684,26 @@ def test_1000G_simulated(capfd): assert filecmp.cmp(hp_file, out_hp_file) +def test_1000G_simulated_multihap(capfd): + """ + Test using simulated data from the 1000G dataset + + In this case, more than one haplotype is causal + """ + gt_file = DATADIR / "19_45401409-46401409_1000G.pgen" + pt_file = DATADIR / "19_45401409-46401409_1000G.multi.pheno" + hp_file = DATADIR / "19_45401409-46401409_1000G.multi.hap" + out_hp_file = "test.hap" + + cmd = f"run --remove-SNPs --max-signals 3 --max-iterations 3 -o {out_hp_file} {gt_file} {pt_file}" + runner = CliRunner() + result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) + captured = capfd.readouterr() + assert captured.out == "" + assert result.exit_code == 0 + assert filecmp.cmp(hp_file, out_hp_file) + + def test_1000G_simulated_maf(capfd): """ Test MAF filtering using simulated data from the 1000G dataset From c937bb6e9ba17c6d5d4691d7006a1345b39fda3c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:02:18 -0700 Subject: 
[PATCH 02/41] use new config file for simulations

---
 analysis/config/config-simulations.yml | 84 ++++++++++++++++++++++++++
 analysis/config/config.yml             |  2 +-
 2 files changed, 85 insertions(+), 1 deletion(-)
 create mode 100644 analysis/config/config-simulations.yml

diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml
new file mode 100644
index 0000000..5b05122
--- /dev/null
+++ b/analysis/config/config-simulations.yml
@@ -0,0 +1,84 @@
+# This is the Snakemake configuration file that specifies paths and options
+# for the pipeline. Anybody wishing to use the provided snakemake pipeline
+# should first fill out this file with paths to their own data, as the
+# Snakefile requires it.
+# Every config option has reasonable defaults unless it is labeled as "required."
+# All paths are relative to the directory that Snakemake is executed in.
+# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/)
+
+
+# Paths to a SNP-STR haplotype reference panel
+# You can download this from http://gymreklab.com/2018/03/05/snpstr_imputation.html
+# If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}"
+# The VCF(s) must be sorted and indexed (with a .tbi file in the same directory)
+# required!
+snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen"
+# str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz"
+
+# Path to a list of samples to exclude from the analysis
+# There should be one sample ID per line
+# exclude_samples: data/ukb_random_samples_exclude.tsv
+
+# If SNPs are unphased, provide the path to a SHAPEIT4 map file like these:
+# https://github.com/odelaneau/shapeit4/tree/master/maps
+# The map file should use the same reference genome as the reference panel VCFs
+# phase_map: data/genetic_maps/chr{chr}.b37.gmap.gz
+
+# A "locus" is a string with a contig name, a colon, the start position, a dash, and
+# the end position, or a BED file with a ".bed" file ending
+# There are different simulation modes that you can use:
+# 1. "str" - a tandem repeat, specified as a string with a contig name, a colon, and the start position
+# 2. "snp" - a SNP, which follows the same format as "str"
+# 3. "hap" - a haplotype
+# 4. "ld_range" - creates random two-SNP haplotypes with a range of LD values between the alleles of each haplotype
+# 5. "run" - execute happler on a locus without simulating anything
+# The STR and SNP positions should be contained within the locus.
+# The positions should be provided in the same coordinate system as the reference
+# genome of the reference panel VCFs
+# The contig should correspond with the contig name from the {chr} wildcard in the VCF
+# required!
and unless otherwise noted, all attributes of each mode are required +locus: 19:45401409-46401409 # center: 45901409 (APOe4) +modes: + str: + pos: 19:45903857 # STR_691361 + snp: + pos: 19:45910672 # rs1046282 + hap: + alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] + ld_range: + reps: 1 + min_ld: 0 + max_ld: 1 + step: 0.1 + min_af: 0.25 + max_af: 0.75 + # beta: [0.35] + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] + alpha: [0.05] + random: false # whether to also produce random haplotypes + run: + pheno: data/geuvadis/phenos/{trait}.pheno + SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen + # pheno_matrix: data/geuvadis/EUR_converted_expr_hg38.csv # optional +mode: ld_range + +# Covariates to use if they're needed +# Otherwise, they're assumed to be regressed out of the phenotypes +# Note: the finemapping methods won't be able to handle these covariates +# covar: data/geuvadis/5PCs_sex.covar + +# Discard rare variants with a MAF below this number +# Defaults to 0 if not specified +min_maf: 0.05 + +# Sample sizes to use +# sample_size: [500, 1000, 1500, 2000, 2500] +sample_size: 777 + +# Whether to include the causal variant in the set of genotypes provided to the +# finemapping methods. Set this to true if you're interested in seeing how the +# methods perform when the causal variant is absent from the data. +# Defaults to false if not specified +# You can also provide a list of booleans, if you want to test both values +exclude_causal: [true, false] diff --git a/analysis/config/config.yml b/analysis/config/config.yml index 34462af..51c999d 120000 --- a/analysis/config/config.yml +++ b/analysis/config/config.yml @@ -1 +1 @@ -config-ukb.yml \ No newline at end of file +config-simulations.yml \ No newline at end of file From 0bcf1b292b4d7fa6f75154c0b9e915c182ff7ff1 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:11:06 -0700 Subject: [PATCH 03/41] use new --max-signals param in happler --- analysis/workflow/rules/happler.smk | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index 08cc773..10e879c 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -36,6 +36,8 @@ rule run: covar=lambda wildcards, input: ("--covar " + input["covar"] + " ") if check_config("covar") else "", maf = check_config("min_maf", 0), indep=lambda wildcards: 0.05 if "indep_alpha" not in wildcards else wildcards.indep_alpha, + max_signals=3, + max_iterations=3, output: hap=out + "/happler.hap", gz=out + "/happler.hap.gz", @@ -62,8 +64,10 @@ rule run: "happler" shell: "happler run -o {output.hap} --verbosity DEBUG --maf {params.maf} " - "--discard-multiallelic --region {params.region} {params.covar} --indep-thresh" - " {params.indep} -t {params.thresh} --show-tree {input.gts} {input.pts} &>{log}" + "--max-signals {params.max_signals} --max-iterations {params.max_iterations} " + "--discard-multiallelic --remove-SNPs --region {params.region}" + " {params.covar} --indep-thresh {params.indep} -t {params.thresh} " + "--show-tree {input.gts} {input.pts} &>{log}" " && haptools index -o {output.gz} {output.hap} &>>{log}" From 7a84684a3d95c0a316db5072543bfefa6e717b3c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:11:23 -0700 Subject: [PATCH 04/41] make 
simulate workflow reproducible --- analysis/workflow/rules/simulate.smk | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/analysis/workflow/rules/simulate.smk b/analysis/workflow/rules/simulate.smk index 7a9dbc3..e1071c1 100644 --- a/analysis/workflow/rules/simulate.smk +++ b/analysis/workflow/rules/simulate.smk @@ -31,13 +31,15 @@ checkpoint create_hap_ld_range: gts_pvar=Path(config["gts_snp_panel"]).with_suffix(".pvar"), gts_psam=Path(config["gts_snp_panel"]).with_suffix(".psam"), params: - reps = config["modes"]["ld_range"]["reps"], min_ld = config["modes"]["ld_range"]["min_ld"], max_ld = config["modes"]["ld_range"]["max_ld"], step = config["modes"]["ld_range"]["step"], min_af = config["modes"]["ld_range"]["min_af"], max_af = config["modes"]["ld_range"]["max_af"], - out = lambda wildcards: hap_ld_range_output, + out = lambda wildcards: expand( + hap_ld_range_output, **wildcards, allow_missing=True, + ), + seed = 42, output: hap=directory(out + "/create_ld_range") resources: @@ -49,9 +51,9 @@ checkpoint create_hap_ld_range: conda: "happler" shell: - "workflow/scripts/choose_different_ld.py -r {params.reps} " + "workflow/scripts/choose_different_ld.py " "--min-ld {params.min_ld} --max-ld {params.max_ld} " - "--step {params.step} --min-af {params.min_af} " + "--step {params.step} --min-af {params.min_af} --seed {params.seed} " "--max-af {params.max_af} {input.gts} {params.out} &> {log}" @@ -115,6 +117,7 @@ rule simphenotype: psam=rules.transform.output.psam, params: beta=lambda wildcards: wildcards.beta, + seed = 42, output: pheno=out + "/{beta}.pheno", resources: @@ -126,5 +129,5 @@ rule simphenotype: conda: "happler" shell: - "haptools simphenotype -o {output.pheno} {input.pgen} " + "haptools simphenotype --seed {params.seed} -o {output.pheno} {input.pgen} " "<( sed 's/\\t0.99$/\\t{params.beta}/' {input.hap} ) &>{log}" From 0431d4bf2abbd3ff10ef3371642dd178df654a8a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:12:36 -0700 Subject: [PATCH 05/41] output multiple haps in choose_different_ld --- .../workflow/scripts/choose_different_ld.py | 45 ++++++++++--------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/analysis/workflow/scripts/choose_different_ld.py b/analysis/workflow/scripts/choose_different_ld.py index 82f7fd1..befe50b 100755 --- a/analysis/workflow/scripts/choose_different_ld.py +++ b/analysis/workflow/scripts/choose_different_ld.py @@ -17,11 +17,11 @@ def find_haps( log: Logger, min_ld: float, max_ld: float, - reps: int = 1, + num_haps: int = 1, step: float = 0.1, seed: np.random.Generator = np.random.default_rng(None), ): - log.info(f"Using min_ld {min_ld}, max_ld {max_ld}, reps {reps}, and step {step}") + log.info(f"Using min_ld {min_ld}, max_ld {max_ld}, num_haps {num_haps}, and step {step}") sum_gts = gts.data.sum(axis=2) chrom = np.unique(gts.variants["chrom"]) @@ -29,8 +29,8 @@ def find_haps( chrom = chrom[0] intervals = np.arange(max(0, min_ld), min(1, max_ld), step) + step - ld_bins = {idx: 0 for idx in range(len(intervals))} - count = len(intervals) * reps + ld_bins = {idx: [] for idx in range(len(intervals))} + count = len(intervals) * num_haps combos = np.array(list(range(len(gts.variants)))) seed.shuffle(combos) @@ -51,10 +51,11 @@ def find_haps( combo_ld = np.abs(pearson_corr_ld(*tuple(sum_gts[:, snp_id] for snp_id in combo_id))) # which bin does this LD value fall in? 
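+            # `intervals` holds the right edge of each LD bin, so np.argmax
+            # below picks the first bin whose upper bound exceeds this LD value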
bin_idx = np.argmax(combo_ld < intervals) - if ld_bins[bin_idx] < reps: - log.info(f"Outputting {hp_id} with LD {combo_ld} for bin {intervals[bin_idx]}") - yield combo_ld, hp - ld_bins[bin_idx] += 1 + if len(ld_bins[bin_idx]) < num_haps: + log.info(f"Adding {hp_id} with LD {combo_ld} to bin {intervals[bin_idx]}") + ld_bins[bin_idx].append((combo_ld, hp)) + if len(ld_bins[bin_idx]) == num_haps: + yield ld_bins[bin_idx] count -= 1 break if count == 0: @@ -64,14 +65,6 @@ def find_haps( @click.command() @click.argument("gts", type=click.Path(exists=True, path_type=Path)) @click.argument("output", type=click.Path(path_type=Path)) -@click.option( - "-r", - "--reps", - type=int, - default=1, - show_default=True, - help="The number of replicates to perform within each LD bin", -) @click.option( "--min-ld", type=float, @@ -107,6 +100,13 @@ def find_haps( show_default=True, help="The maximum LD value to allow", ) +@click.option( + "--num-haps", + type=int, + default=1, + show_default=True, + help="The number of haplotypes to output", +) @click.option( "--seed", type=int, @@ -125,12 +125,12 @@ def find_haps( def main( gts: Path, output: Path, - reps: int = 1, min_ld: float = 0, max_ld: float = 1, step: float = 0.1, min_af: float = 0.25, max_af: float = 0.75, + num_haps: int = 1, seed: int = None, verbosity: str = "DEBUG", ): @@ -138,7 +138,7 @@ def main( Create haplotypes with a range of LD values between their alleles LD values are injected into the output from brace expressions - For example, if we are given a a path like "ld_{ld}/happler.hap", then + For example, if we are given a path like "ld_{ld}/happler.hap", then we will replace {ld} with the provided ld value """ log = getLogger("choose", verbosity) @@ -155,21 +155,22 @@ def main( seed = np.random.default_rng(seed) - for combo_ld, hp in find_haps( + for haps in find_haps( gts, log, min_ld=min_ld, max_ld=max_ld, - reps=reps, + num_haps=num_haps, step=step, seed=seed, ): - out_path = Path(str(output).format(ld=round(combo_ld, 2))) + avg_combo_ld = np.mean([combo_ld for combo_ld, hp in haps]) + out_path = Path(str(output).format(ld=round(avg_combo_ld, 2))) out_path.parent.mkdir(parents=True, exist_ok=True) hps = Haplotypes( out_path, log=log, haplotype=Haplotype, ) - hps.data = {hp.id: hp} + hps.data = {hp.id: hp for combo_ld, hp in haps} hps.write() From c1dbe5bcb62dd065cfde7427976714c32eb49741 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:22:02 -0700 Subject: [PATCH 06/41] try (and fail) to import graphviz trees TODO: fix this later --- happler/__main__.py | 2 +- happler/tree/forest_builder.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/happler/__main__.py b/happler/__main__.py index 4b7d83a..86db884 100644 --- a/happler/__main__.py +++ b/happler/__main__.py @@ -377,7 +377,7 @@ def run( dot_output = dot_output.with_suffix(".dot") log.info("Writing tree to dot file") with open(dot_output, "w") as dot_file: - dot_file.write(hap_tree.dot()) + dot_file.write(forest.dot()) @main.command(context_settings=CONTEXT_SETTINGS) diff --git a/happler/tree/forest_builder.py b/happler/tree/forest_builder.py index ee03c3e..e4d235b 100644 --- a/happler/tree/forest_builder.py +++ b/happler/tree/forest_builder.py @@ -105,3 +105,27 @@ def get_residuals(self, tree_idx: int) -> Phenotypes: self.log.debug(f"Computing residuals for tree {tree_idx}") resids.data = sm.OLS(self.phenotypes.data, sm.add_constant(effects)).fit().resid[:, np.newaxis] return 
resids
+
+    def __repr__(self):
+        return self.dot()
+
+    def dot(self) -> str:
+        """
+        Convert the trees to a representation in the dot language
+        This is useful for quickly viewing the forest on the command line
+
+        Returns
+        -------
+        str
+            A string representing the trees
+
+            Nodes are labeled by their variant ID and edges are labeled by their allele
+        """
+        trees_str = "strict digraph {\nforcelabels=true;\nrankdir=TB;\n"
+        for idx, tree in enumerate(self.trees):
+            if tree is None:
+                continue
+            trees_str += f"subgraph tree_{idx}"+" {\nlabel=\""+f"Tree {idx}"+"\";\n"
+            trees_str += "\n".join(tree.dot().split("\n")[2:])
+            trees_str += "}"
+        return trees_str

From 7a589bb9686156784fab0946f2cae13c50cf82c9 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Mon, 30 Sep 2024 14:22:58 -0700
Subject: [PATCH 07/41] simply output an empty file when all haps are SNPs

---
 happler/__main__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/happler/__main__.py b/happler/__main__.py
index 86db884..af01260 100644
--- a/happler/__main__.py
+++ b/happler/__main__.py
@@ -367,6 +367,11 @@ def run(
             hap.id = f"H{hap_id}"
             haplotypes.data[hap.id] = hap
             hap_id += 1
+    if haplotypes.data == {}:
+        log.critical("No haplotypes were found")
+        with open(output, "w") as hap_file:
+            pass
+        return 1
     haplotypes.index()
     haplotypes.write()
     if show_tree:

From 51766a5c44aafd55d1b76bfd37e8496041446d8e Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Mon, 30 Sep 2024 16:27:04 -0700
Subject: [PATCH 08/41] enable graphviz output for multiple trees

---
 happler/__main__.py            |  8 ++++----
 happler/tree/forest_builder.py | 21 +++++++++++++++++++--
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/happler/__main__.py b/happler/__main__.py
index af01260..ea0f171 100644
--- a/happler/__main__.py
+++ b/happler/__main__.py
@@ -371,9 +371,9 @@ def run(
         log.critical("No haplotypes were found")
         with open(output, "w") as hap_file:
             pass
-        return 1
-    haplotypes.index()
-    haplotypes.write()
+    else:
+        haplotypes.index()
+        haplotypes.write()
     if show_tree:
         dot_output = output
         if Path(output) != Path("/dev/stdout"):
@@ -382,7 +382,7 @@ def run(
             dot_output = dot_output.with_suffix(".dot")
         log.info("Writing tree to dot file")
         with open(dot_output, "w") as dot_file:
-            dot_file.write(forest.dot())
+            dot_file.write(forest.dot(remove_singletons=remove_snps))

diff --git a/happler/tree/forest_builder.py b/happler/tree/forest_builder.py
index e4d235b..cecce98 100644
--- a/happler/tree/forest_builder.py
+++ b/happler/tree/forest_builder.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+import re
 from logging import Logger
 
 import numpy as np
-    def dot(self) -> str:
+    def dot(self, remove_singletons: bool = False) -> str:
         """
         Convert the trees to a representation in the dot language
         This is useful for quickly viewing the forest on the command line
 
+        Parameters
+        ----------
+        remove_singletons: bool
+            Whether to ignore trees composed of only a single SNP
+
         Returns
         -------
         str
             A string representing the trees
 
             Nodes are labeled by their variant ID and edges are labeled by their allele
         """
+        # match the integer node IDs in the dot output so they can be renumbered
+        node_pattern = r"(?<=[\s;{])(\d+)(?=\s*(?:\[|->|;))"
+        max_node_id = 0
         trees_str = "strict digraph {\nforcelabels=true;\nrankdir=TB;\n"
         for idx, tree in enumerate(self.trees):
             if tree is None:
                 continue
+            tree_str = tree.dot().split("\n")[2:]
+            if
remove_singletons and len(tree_str) <= 5: + continue trees_str += f"subgraph tree_{idx}"+" {\nlabel=\""+f"Tree {idx}"+"\";\n" - trees_str += "\n".join(tree.dot().split("\n")[2:]) + tree_str = "\n".join(tree_str) + # increment the node IDs to ensure they remain unique across all trees + increment = lambda match: str(int(match.group(1))+max_node_id) + trees_str += re.sub(node_pattern, increment, tree_str) + # what was the greatest node ID in this tree? + # increment the next tree's IDs by that number + 1 if there were any nodes + try: + max_node_id = max(map(int, re.findall(node_pattern, tree_str))) + max_node_id + 1 + except ValueError: + pass trees_str += "}" return trees_str From 7cd90cb0d4df3c6ccf487ffebc7c5550ab3a85db Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 7 Oct 2024 12:51:23 -0700 Subject: [PATCH 09/41] simulate multiple haps as well --- analysis/config/config-simulations.yml | 1 + analysis/workflow/Snakefile | 8 +++- analysis/workflow/rules/happler.smk | 2 +- analysis/workflow/rules/plots.smk | 11 +++++- analysis/workflow/rules/simulate.smk | 11 +++--- .../workflow/scripts/choose_different_ld.py | 37 ++++++++++++++----- analysis/workflow/scripts/parameter_plot.py | 1 + 7 files changed, 51 insertions(+), 20 deletions(-) diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml index 5b05122..35bb959 100644 --- a/analysis/config/config-simulations.yml +++ b/analysis/config/config-simulations.yml @@ -57,6 +57,7 @@ modes: beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] alpha: [0.05] random: false # whether to also produce random haplotypes + num_haps: [1, 2, 3, 4] run: pheno: data/geuvadis/phenos/{trait}.pheno SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile index df56b8d..c9ffb76 100644 --- a/analysis/workflow/Snakefile +++ b/analysis/workflow/Snakefile @@ -59,6 +59,10 @@ else: config["gts_snp_panel"] = rules.genotypes_subset.input.pgen config["random"] = False +if mode == "ld_range" and check_config("num_haps", place=config["modes"][mode]): + if isinstance(config["modes"][mode]["num_haps"], int): + config["modes"][mode]["num_haps"] = [config["modes"][mode]["num_haps"],] + if mode != "run": module simulate: snakefile: "rules/simulate.smk" @@ -106,7 +110,7 @@ if mode in ("snp", "hap", "ld_range"): happler_config["snps"] = config["modes"][mode]["alleles"] happler_config["causal_gt"] = rules.simulate_transform.output if mode == "ld_range": - happler_config["out"] = config["out"] + "/happler/"+mode+"/ld_{ld}/beta_{beta}/alpha_{alpha}" + happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}" elif mode == "run": happler_config = { "pheno": config["modes"][mode]["pheno"], @@ -133,7 +137,7 @@ merged_happler = rules.happler_merge.output.pgen if mode == "ld_range" and config["modes"][mode]["random"]: happler_config["random"] = rules.simulate_random_transform.output happler_config["random_hap"] = rules.simulate_random_transform.input.hap - happler_config["out"] = config["out"] + "/happler/"+mode+"/random/ld_{ld}/beta_{beta}/alpha_{alpha}" + happler_config["out"] = config["out"] + "/happler/"+mode+"/random/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}" module happler_random: snakefile: "rules/happler.smk" config: happler_config diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index 10e879c..d6e1c4a 
100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -65,7 +65,7 @@ rule run: shell: "happler run -o {output.hap} --verbosity DEBUG --maf {params.maf} " "--max-signals {params.max_signals} --max-iterations {params.max_iterations} " - "--discard-multiallelic --remove-SNPs --region {params.region}" + "--discard-multiallelic --region {params.region}" " {params.covar} --indep-thresh {params.indep} -t {params.thresh} " "--show-tree {input.gts} {input.pts} &>{log}" " && haptools index -o {output.gz} {output.hap} &>>{log}" diff --git a/analysis/workflow/rules/plots.smk b/analysis/workflow/rules/plots.smk index 2ef6632..e4c67f0 100644 --- a/analysis/workflow/rules/plots.smk +++ b/analysis/workflow/rules/plots.smk @@ -1,3 +1,4 @@ +import sys from pathlib import Path out = config["out"] @@ -12,8 +13,8 @@ bench += "/" + mode def agg_ld(wildcards): """ a helper function for the other agg functions """ checkpoint_output = config["ld_range_checkpoint"].get(**wildcards).output.hap - ld_vals = glob_wildcards(Path(checkpoint_output) / "ld_{ld}/haplotype.hap").ld - return checkpoint_output, ld_vals + ld_vals = glob_wildcards(Path(checkpoint_output) / "{num_haps}_haps/ld_{ld}/haplotype.hap").ld + return checkpoint_output, sorted(list(set(ld_vals))) def agg_ld_range_obs(wildcards): """ return a list of hap files from happler """ @@ -24,12 +25,14 @@ def agg_ld_range_obs(wildcards): ld=ld_vals, alpha=config["mode_attrs"]["alpha"], beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], **wildcards, ) else: return expand( config["happler_hap"], beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], **wildcards, ) @@ -41,12 +44,14 @@ def agg_ld_range_causal(wildcards): str(checkpoint_output), ld=ld_vals, beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], **wildcards, ) else: return expand( config["causal_hap"], beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], **wildcards, ) @@ -58,6 +63,7 @@ def agg_ld_range_metrics(wildcards): config["happler_metrics"], ld=ld_vals, beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], alpha=config["mode_attrs"]["alpha"], **wildcards, ) @@ -65,6 +71,7 @@ def agg_ld_range_metrics(wildcards): return expand( config["happler_metrics"], beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], **wildcards, ) diff --git a/analysis/workflow/rules/simulate.smk b/analysis/workflow/rules/simulate.smk index e1071c1..a56aed3 100644 --- a/analysis/workflow/rules/simulate.smk +++ b/analysis/workflow/rules/simulate.smk @@ -22,7 +22,7 @@ def parse_locus(locus): start = locus.split("_")[1].split("-")[0] return chrom, start, end -hap_ld_range_output = out + "/create_ld_range/ld_{ld}/haplotype.hap" +hap_ld_range_output = out + "/create_ld_range/{num_haps}_haps/ld_{ld}/haplotype.hap" checkpoint create_hap_ld_range: """ create a hap file suitable for haptools transform and simphenotype """ @@ -36,6 +36,7 @@ checkpoint create_hap_ld_range: step = config["modes"]["ld_range"]["step"], min_af = config["modes"]["ld_range"]["min_af"], max_af = config["modes"]["ld_range"]["max_af"], + num_haps = "-n " + " -n ".join(map(str, config["modes"]["ld_range"]["num_haps"])), out = lambda wildcards: expand( hap_ld_range_output, **wildcards, allow_missing=True, ), @@ -51,7 +52,7 @@ checkpoint create_hap_ld_range: conda: "happler" shell: - "workflow/scripts/choose_different_ld.py " + "workflow/scripts/choose_different_ld.py 
{params.num_haps} " "--min-ld {params.min_ld} --max-ld {params.max_ld} " "--step {params.step} --min-af {params.min_af} --seed {params.seed} " "--max-af {params.max_af} {input.gts} {params.out} &> {log}" @@ -81,9 +82,9 @@ rule create_hap: "../scripts/create_hap_file.sh" if mode == "ld_range": - out += "/pheno/ld_{ld}" - logs += "/pheno/ld_{ld}" - bench += "/pheno/ld_{ld}" + out += "/pheno/{num_haps}_haps/ld_{ld}" + logs += "/pheno/{num_haps}_haps/ld_{ld}" + bench += "/pheno/{num_haps}_haps/ld_{ld}" rule transform: """ use the hap file to create a pgen of the haplotype """ diff --git a/analysis/workflow/scripts/choose_different_ld.py b/analysis/workflow/scripts/choose_different_ld.py index befe50b..f09dd9b 100755 --- a/analysis/workflow/scripts/choose_different_ld.py +++ b/analysis/workflow/scripts/choose_different_ld.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from typing import Tuple from pathlib import Path from logging import Logger from itertools import product @@ -29,6 +30,7 @@ def find_haps( chrom = chrom[0] intervals = np.arange(max(0, min_ld), min(1, max_ld), step) + step + # we return lists of haplotypes: one list for each bin ld_bins = {idx: [] for idx in range(len(intervals))} count = len(intervals) * num_haps @@ -101,11 +103,16 @@ def find_haps( help="The maximum LD value to allow", ) @click.option( + "-n", "--num-haps", type=int, - default=1, + multiple=True, + default=(1,), show_default=True, - help="The number of haplotypes to output", + help=( + "The number of haplotypes to output in each hap file. " + "If multiple, we will also match on the {num_haps} brace expression" + ), ) @click.option( "--seed", @@ -130,7 +137,7 @@ def main( step: float = 0.1, min_af: float = 0.25, max_af: float = 0.75, - num_haps: int = 1, + num_haps: Tuple[int] = (1,), seed: int = None, verbosity: str = "DEBUG", ): @@ -155,23 +162,33 @@ def main( seed = np.random.default_rng(seed) + if "{num_haps}" not in str(output) and len(num_haps) > 1: + log.warning( + "Multiple num_haps values provided, but {num_haps} brace expression is " + "absent from output path. Only the last --num-haps value will be used." 
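+            # in that case, every value of num_haps formats to the same path,
+            # so later values overwrite the files written by earlier ones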
+ ) + for haps in find_haps( gts, log, min_ld=min_ld, max_ld=max_ld, - num_haps=num_haps, + num_haps=max(num_haps), step=step, seed=seed, ): avg_combo_ld = np.mean([combo_ld for combo_ld, hp in haps]) - out_path = Path(str(output).format(ld=round(avg_combo_ld, 2))) - out_path.parent.mkdir(parents=True, exist_ok=True) - hps = Haplotypes( - out_path, log=log, haplotype=Haplotype, + out_path_temp = Path(str(output).format( + ld=round(avg_combo_ld, 2), num_haps="{num_haps}"), ) - hps.data = {hp.id: hp for combo_ld, hp in haps} - hps.write() + for num_hap in num_haps: + out_path = Path(str(out_path_temp).format(num_haps=num_hap)) + out_path.parent.mkdir(parents=True, exist_ok=True) + hps = Haplotypes( + out_path, log=log, haplotype=Haplotype, + ) + hps.data = {hp.id: hp for combo_ld, hp in haps[:num_hap]} + hps.write() if __name__ == "__main__": diff --git a/analysis/workflow/scripts/parameter_plot.py b/analysis/workflow/scripts/parameter_plot.py index 34ce946..2aaacc5 100755 --- a/analysis/workflow/scripts/parameter_plot.py +++ b/analysis/workflow/scripts/parameter_plot.py @@ -22,6 +22,7 @@ "gt": "U5", "alpha": np.float64, "samp": np.uint32, + "num_haps": np.uint8, } LOG_SCALE = {"alpha",} From 35eded450f4d5e51b1abef795242b2f3669eabf5 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 7 Oct 2024 12:53:44 -0700 Subject: [PATCH 10/41] match obs and causal haps by their LD --- analysis/workflow/rules/plots.smk | 2 +- analysis/workflow/scripts/parameter_plot.py | 184 ++++++++++---------- 2 files changed, 95 insertions(+), 91 deletions(-) diff --git a/analysis/workflow/rules/plots.smk b/analysis/workflow/rules/plots.smk index e4c67f0..246bc34 100644 --- a/analysis/workflow/rules/plots.smk +++ b/analysis/workflow/rules/plots.smk @@ -104,7 +104,7 @@ rule params: conda: "happler" shell: - "workflow/scripts/parameter_plot.py -o {output.png} " + "workflow/scripts/parameter_plot.py --show-extras -o {output.png} " "{input.gts} {params.observed_haps} {params.causal_hap} &> {log}" diff --git a/analysis/workflow/scripts/parameter_plot.py b/analysis/workflow/scripts/parameter_plot.py index 2aaacc5..905f677 100755 --- a/analysis/workflow/scripts/parameter_plot.py +++ b/analysis/workflow/scripts/parameter_plot.py @@ -10,7 +10,8 @@ from haptools import logging import matplotlib.pyplot as plt from haptools.ld import pearson_corr_ld -from haptools.data import GenotypesVCF, GenotypesPLINK, Haplotypes +from scipy.optimize import linear_sum_assignment +from haptools.data import Genotypes, GenotypesVCF, GenotypesPLINK, Haplotypes from snakemake_io import glob_wildcards @@ -28,12 +29,68 @@ LOG_SCALE = {"alpha",} -def get_num_haps(hap: Path, log: Logger = None): - hap = Haplotypes(hap, log=log) - hap.read() - return len(hap.data) +def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes, drop_extras: bool = True) -> tuple: + """ + Match the observed and causal haplotypes to maximize best pairwise LD + + This function uses scipy.optimize.linear_sum_assignment to find the optimal + match between causal and observed haplotypes to maximize the sum of the pairwise LD -def get_best_ld(gts: GenotypesVCF, observed_hap: Path, causal_hap: Path, region: str = None, observed_id: str = None, causal_id: str = None, log: Logger = None): + Parameters + ---------- + gts: Genotypes + The set of genotypes for each of the variants in the haplotypes + observed: Path + A path to a .hap file containing haplotypes output by happler + causal: Path + A path to a .hap file containing 
a simulated causal haplotype + drop_extras: bool, optional + Drop any observed haplotypes that don't have a match in the causal haps. + Otherwise, include a match for every observed haplotype! + + Returns + ------- + npt.NDArray + An array of the LD values for each pair of observed and causal haplotypes + npt.NDArray + An array of the row indices indicating the optimal match of causal haplotypes + to observed haplotypes + npt.NDArray + An array of the column indices indicating the optimal match of causal haplotypes + to observed haplotypes + """ + obs = observed.transform(gts).data.sum(axis=2) + exp = causal.transform(gts).data.sum(axis=2) + # Compute the pairwise LD between each observed and causal haplotype + # The rows and columns of ld_mat both correspond to the observed and causal haps in + # the order they were given. For example, if there is 1 observed hap and 3 causal + # haps, then there should be four rows and four columns in ld_mat + # Note that ld_mat is symmetric, so the upper triangle is always redundant + # TODO: use the latest haptools.ld.pearson_corr_ld to do this, instead + ld_mat = np.corrcoef(obs, exp, rowvar=False) + # Retrieve only the correlation of observed haps (as rows) vs causal haps (as cols) + ld_mat = np.abs(ld_mat[:obs.shape[1], obs.shape[1]:]) + # Return the ld_mat idxs of the optimal match of each observed hap to a causal hap + row_idx, col_idx = linear_sum_assignment(ld_mat, maximize=True) + if not drop_extras: + missing_rows = np.setdiff1d(range(ld_mat.shape[0]), row_idx) + # If there are more observed haps than causal haps, then we just assign the + # remaining haps to the best causal hap that matches for each of them + row_idx = np.concatenate((row_idx, missing_rows)) + col_idx = np.concatenate((col_idx, ld_mat[missing_rows].argmax(axis=1))) + return ld_mat, row_idx, col_idx + + +def get_best_ld( + gts: GenotypesVCF, + observed_hap: Path, + causal_hap: Path, + region: str = None, + observed_id: str = None, + causal_id: str = None, + show_extras: bool = False, + log: Logger = None + ): """ Compute the best LD between the observed haplotypes and the causal haplotype @@ -51,23 +108,24 @@ def get_best_ld(gts: GenotypesVCF, observed_hap: Path, causal_hap: Path, region: The ID to load from the observed_hap file (or just all of the IDs, otherwise) causal_id: str, optional The ID to load from the causal_hap file (or just the first hap, otherwise) + show_extras: bool, optional + Whether to show observed haps that don't match with a causal hap log: Logger, optional A logging module to pass to haptools """ # load the causal haplotype given by 'causal_id' or just the first hap causal_hap = Haplotypes(causal_hap, log=log) causal_hap.read(haplotypes=(set((causal_id,)) if causal_id is not None else None)) - if causal_id is None: - causal_id = list(causal_hap.data.keys())[0] - causal_hap.subset(haplotypes=(causal_id,), inplace=True) - causal_hap = causal_hap.data[causal_id] # load the observed haplotype given by 'observed_id' or just all of the haps observed_hap = Haplotypes(observed_hap, log=log) observed_hap.read(haplotypes=(set((observed_id,)) if observed_id is not None else None)) # load the genotypes - variants = {v.id for v in causal_hap.variants} + variants = { + v.id for hap in causal_hap.data + for v in causal_hap.data[hap].variants + } variants.update({ v.id for hap in observed_hap.data for v in observed_hap.data[hap].variants @@ -79,20 +137,11 @@ def get_best_ld(gts: GenotypesVCF, observed_hap: Path, causal_hap: Path, region: gts.check_biallelic() # compute 
LD between every observed haplotype and the causal haplotype - causal_gt = causal_hap.transform(gts).sum(axis=1) - observed_gt = observed_hap.transform(gts) - observed_ld = np.array([ - pearson_corr_ld(causal_gt, observed_gt.data[:, o].sum(axis=1)) - for o in range(observed_gt.data.shape[1]) - ]) - - # return the strongest LD among all haps - try: - best_observed_hap_idx = np.abs(observed_ld).argmax() - except ValueError: - # this can happen when there weren't any haplotypes output by happler - return 0 - return observed_ld[best_observed_hap_idx] + observed_ld, best_row_idx, best_col_idx = match_haps( + gts, observed_hap, causal_hap, drop_extras=(not show_extras), + ) + # return the strongest possible LD for each observed hap with a causal hap + return observed_ld[best_row_idx, best_col_idx] def get_finemap_metrics(metrics_path: Path, log: Logger = None): @@ -150,8 +199,6 @@ def plot_params( params: npt.NDArray, vals: npt.NDArray, val_title: str, - hap_counts: npt.NDArray = None, - sign: bool = False, ): """ Plot vals against parameter values @@ -164,46 +211,37 @@ def plot_params( A numpy array containing the values of the plot val_title: str The name of the value that we are plotting - hap_counts: npt.NDArray, optional - The number of haplotypes present in each hap file - sign: bool, optional - If True, depict positive LD values via a special marker symbol for the point """ - also_plot_counts = hap_counts is not None figsize = matplotlib.rcParams["figure.figsize"] if len(params) > 75: figsize[0] = len(params) / 15 if len(params.dtype) > 5: figsize[1] = len(params.dtype) * 1.25 fig, axs = plt.subplots( - nrows=len(params.dtype)+1+also_plot_counts, ncols=1, + nrows=len(params.dtype)+1, ncols=1, sharex=True, figsize=figsize, ) fig.subplots_adjust(hspace=0) # create a plot for the vals, first - axs[0].plot(np.abs(vals), "g-") - if sign: - axs[0].scatter( - np.squeeze(np.where(vals > 0)), vals[vals > 0], c="g", marker="o", - ) + x_vals = [j for j, arr in enumerate(vals) for i in arr] + axs[0].plot(x_vals, np.concatenate(vals), "go", markersize=2) axs[0].set_ylabel(val_title, rotation="horizontal", ha="right") axs[0].set_xticks(range(0, len(vals))) axs[0].set_xticklabels([]) axs[0].grid(axis="x") axs[0].set_ylim(None, 1) - if also_plot_counts: - axs[1].plot(hap_counts, "m-") - axs[1].set_ylabel("num_haps", rotation="horizontal", ha="right") - axs[1].grid(axis="x") # now, plot each of the parameter values on the other axes for idx, param in enumerate(params.dtype.names): val_title = param + if len(np.unique(params[param])) == 1: + axs[idx+1].remove() + continue if param in LOG_SCALE: params[param] = -np.log10(params[param]) val_title = "-log " + val_title - axs[idx+1+also_plot_counts].plot(params[param], "-") - axs[idx+1+also_plot_counts].set_ylabel(val_title, rotation="horizontal", ha="right") - axs[idx+1+also_plot_counts].grid(axis="x") + axs[idx+1].plot(params[param], "-") + axs[idx+1].set_ylabel(val_title, rotation="horizontal", ha="right") + axs[idx+1].grid(axis="x") return fig @@ -211,8 +249,6 @@ def plot_params_simple( params: npt.NDArray, vals: npt.NDArray, val_title: str, - hap_counts: npt.NDArray = None, - sign: bool = False ): """ Plot vals against only a single column of parameter values @@ -225,31 +261,17 @@ def plot_params_simple( A numpy array containing the values of the plot val_title: str The name of the value that we are plotting - hap_counts: npt.NDArray, optional - The number of haplotypes present in each hap file - sign: bool, optional - If True, depict positive LD values 
via a special marker symbol for the point """ fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True) val_xlabel = params.dtype.names[0] if val_xlabel in LOG_SCALE: params = -np.log10(params) val_xlabel = "-log " + val_xlabel + params = [j for j, arr in zip(params, vals) for i in arr] # plot params against vals - ax.plot(params, np.abs(vals), "g-") - if sign: - ax.scatter( - params[vals > 0], vals[vals > 0], c="g", marker="o", - ) + ax.plot(params, np.concatenate(vals), "go", markersize=2) ax.set_ylabel(val_title, color="g") ax.set_xlabel(val_xlabel) - # also plot hap_counts as a second y-axis - if hap_counts is not None: - ax = ax.twinx() - ax.plot(params, hap_counts, "m-") - ax.set_ylabel("Number of observed haplotypes", color="m") - ax.set_yticks(list(range(max(hap_counts)+1))) - ax.set_yticklabels(list(range(max(hap_counts)+1))) return fig @@ -289,19 +311,11 @@ def plot_params_simple( help="A haplotype ID from the causal .hap file", ) @click.option( - "-n", - "--num-haps", - is_flag=True, - default=True, - show_default=True, - help="Whether to also depict the total number of haplotypes in the file", -) -@click.option( - "--sign", + "--show-extras", is_flag=True, default=False, show_default=True, - help="Depict the sign (pos vs neg) of the LD via the shape of the points", + help="Whether to show observed haps that don't match with a causal hap", ) @click.option( "-o", @@ -327,8 +341,7 @@ def main( region: str = None, observed_id: str = None, causal_id: str = None, - num_haps: bool = True, - sign: bool = False, + show_extras: bool = False, output: Path = Path("/dev/stdout"), verbosity: str = "ERROR", ): @@ -359,7 +372,7 @@ def main( get_hap_fname = lambda hap_path, param_set: Path(str(hap_path).format(**dict(zip(dtypes.keys(), param_set)))) # compute LD between the causal hap and the best observed hap across param vals - ld_vals = np.array([ + ld_vals = [ get_best_ld( gts, get_hap_fname(observed_hap, params[idx]), @@ -367,10 +380,11 @@ def main( region=region, observed_id=observed_id, causal_id=causal_id, + show_extras=show_extras, log=log ) for idx in range(len(params)) - ]) + ] # extract fine-mapping metrics for the observed hap if metrics is not None: @@ -382,28 +396,18 @@ def main( for idx in range(len(params)) ]) - hap_counts = None - if num_haps: - hap_counts = np.array([ - get_num_haps(get_hap_fname(observed_hap, params[idx])) - for idx in range(len(params)) - ]) - # if they're all 1, then there's no reason to depict it - if (hap_counts == 1).all(): - hap_counts = None - if len(dtypes) > 1 or metrics is not None: merged = params if metrics is not None: merged = np.lib.recfunctions.merge_arrays([params, metrics], flatten=True) - fig = plot_params(merged, ld_vals, "causal LD", hap_counts, sign) + fig = plot_params(merged, ld_vals, "causal LD") elif len(dtypes) == 1: - fig = plot_params_simple(params, ld_vals, "causal LD", hap_counts, sign) + fig = plot_params_simple(params, ld_vals, "causal LD") else: raise ValueError("No parameter values found") fig.tight_layout() - fig.savefig(output) + fig.savefig(output, bbox_inches="tight", pad_inches=0.05) if __name__ == "__main__": main() From f28dda577d67244e4d9477d7ccb3ec2da89cb702 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 15 Oct 2024 20:22:51 -0700 Subject: [PATCH 11/41] implement a final haplotype threshold too --- happler/__main__.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/happler/__main__.py b/happler/__main__.py index ea0f171..ef6e565 100644 --- 
a/happler/__main__.py +++ b/happler/__main__.py @@ -141,6 +141,13 @@ def main(): show_default=True, help="The LD threshold used to prune leaf nodes based on LD with their siblings", ) +@click.option( + "--out-thresh", + type=float, + default=5e-8, + show_default=True, + help="Threshold used to determine whether to output haplotypes (single-SNP)", +) @click.option( "--show-tree", is_flag=True, @@ -203,6 +210,7 @@ def run( max_signals: int = 1, max_iterations: int = 1, threshold: float = 0.05, + out_thresh: float = 5e-8, indep_thresh: float = 0.1, ld_prune_thresh: float = None, show_tree: bool = False, @@ -223,6 +231,7 @@ def run( Ex: happler run tests/data/simple.vcf tests/data/simple.tsv > simple.hap """ from . import tree + import numpy as np from haptools import data from haptools import logging @@ -361,6 +370,11 @@ def run( for haps in forest.run(): if haps is None: continue + if len(haps.data) == 1: + hap = next(iter(haps.data.values())) + if hap.pval < -np.log10(out_thresh): + log.info("Ignoring haplotype with low pval") + continue for hap in haps.data.values(): if len(hap.variants) <= 1 and remove_snps: continue From 635079e28e10deaf9038c5000df30554fb7f86f0 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 15 Oct 2024 20:23:36 -0700 Subject: [PATCH 12/41] color each point of the parameter plot by whether it is an extra --- analysis/workflow/scripts/parameter_plot.py | 41 +++++++++++++++------ 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/analysis/workflow/scripts/parameter_plot.py b/analysis/workflow/scripts/parameter_plot.py index 905f677..5a01a16 100755 --- a/analysis/workflow/scripts/parameter_plot.py +++ b/analysis/workflow/scripts/parameter_plot.py @@ -58,6 +58,9 @@ def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes, drop_ex npt.NDArray An array of the column indices indicating the optimal match of causal haplotypes to observed haplotypes + npt.NDArray + An array of boolean values indicating whether each observed haplotype has a match + in the causal haplotypes """ obs = observed.transform(gts).data.sum(axis=2) exp = causal.transform(gts).data.sum(axis=2) @@ -72,13 +75,17 @@ def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes, drop_ex ld_mat = np.abs(ld_mat[:obs.shape[1], obs.shape[1]:]) # Return the ld_mat idxs of the optimal match of each observed hap to a causal hap row_idx, col_idx = linear_sum_assignment(ld_mat, maximize=True) + extras_labels = np.zeros( + len(row_idx) if drop_extras else ld_mat.shape[0], dtype=np.bool_, + ) if not drop_extras: missing_rows = np.setdiff1d(range(ld_mat.shape[0]), row_idx) + extras_labels[missing_rows] = True # If there are more observed haps than causal haps, then we just assign the # remaining haps to the best causal hap that matches for each of them row_idx = np.concatenate((row_idx, missing_rows)) col_idx = np.concatenate((col_idx, ld_mat[missing_rows].argmax(axis=1))) - return ld_mat, row_idx, col_idx + return ld_mat, row_idx, col_idx, extras_labels def get_best_ld( @@ -121,6 +128,10 @@ def get_best_ld( observed_hap = Haplotypes(observed_hap, log=log) observed_hap.read(haplotypes=(set((observed_id,)) if observed_id is not None else None)) + # if there are no observed haplotypes, then we can't compute LD + if len(observed_hap.data) == 0: + return np.array([np.nan,]), np.array([False,]) + # load the genotypes variants = { v.id for hap in causal_hap.data @@ -137,11 +148,11 @@ def get_best_ld( gts.check_biallelic() # compute LD between 
every observed haplotype and the causal haplotype - observed_ld, best_row_idx, best_col_idx = match_haps( + observed_ld, best_row_idx, best_col_idx, extras_labels = match_haps( gts, observed_hap, causal_hap, drop_extras=(not show_extras), ) # return the strongest possible LD for each observed hap with a causal hap - return observed_ld[best_row_idx, best_col_idx] + return observed_ld[best_row_idx, best_col_idx], extras_labels def get_finemap_metrics(metrics_path: Path, log: Logger = None): @@ -199,6 +210,7 @@ def plot_params( params: npt.NDArray, vals: npt.NDArray, val_title: str, + val_color: npt.NDArray, ): """ Plot vals against parameter values @@ -207,10 +219,12 @@ def plot_params( ---------- params: npt.NDArray A numpy array of mixed dtype. Each column contains the values of a parameter - vals: npt.NDArray + vals: list[npt.NDArray] A numpy array containing the values of the plot val_title: str The name of the value that we are plotting + val_color: list[npt.NDArray] + Whether to color each point corresponding to this value """ figsize = matplotlib.rcParams["figure.figsize"] if len(params) > 75: @@ -224,7 +238,8 @@ def plot_params( fig.subplots_adjust(hspace=0) # create a plot for the vals, first x_vals = [j for j, arr in enumerate(vals) for i in arr] - axs[0].plot(x_vals, np.concatenate(vals), "go", markersize=2) + val_color = ["red" if j else "green" for i in val_color for j in i] + axs[0].scatter(x_vals, np.concatenate(vals), marker="o", color=val_color, s=5) axs[0].set_ylabel(val_title, rotation="horizontal", ha="right") axs[0].set_xticks(range(0, len(vals))) axs[0].set_xticklabels([]) @@ -249,6 +264,7 @@ def plot_params_simple( params: npt.NDArray, vals: npt.NDArray, val_title: str, + val_color: npt.NDArray, ): """ Plot vals against only a single column of parameter values @@ -257,10 +273,12 @@ def plot_params_simple( ---------- params: npt.NDArray A numpy array containing the values of a parameter - vals: npt.NDArray + vals: list[npt.NDArray] A numpy array containing the values of the plot val_title: str The name of the value that we are plotting + val_color: list[npt.NDArray] + Whether to color each point corresponding to this value """ fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True) val_xlabel = params.dtype.names[0] @@ -268,8 +286,9 @@ def plot_params_simple( params = -np.log10(params) val_xlabel = "-log " + val_xlabel params = [j for j, arr in zip(params, vals) for i in arr] + val_color = ["red" if j else "green" for i in val_color for j in i] # plot params against vals - ax.plot(params, np.concatenate(vals), "go", markersize=2) + ax.scatter(params, np.concatenate(vals), marker="o", color=val_color, s=5) ax.set_ylabel(val_title, color="g") ax.set_xlabel(val_xlabel) return fig @@ -372,7 +391,7 @@ def main( get_hap_fname = lambda hap_path, param_set: Path(str(hap_path).format(**dict(zip(dtypes.keys(), param_set)))) # compute LD between the causal hap and the best observed hap across param vals - ld_vals = [ + ld_vals, ld_extras_labels = zip(*tuple( get_best_ld( gts, get_hap_fname(observed_hap, params[idx]), @@ -384,7 +403,7 @@ def main( log=log ) for idx in range(len(params)) - ] + )) # extract fine-mapping metrics for the observed hap if metrics is not None: @@ -400,9 +419,9 @@ def main( merged = params if metrics is not None: merged = np.lib.recfunctions.merge_arrays([params, metrics], flatten=True) - fig = plot_params(merged, ld_vals, "causal LD") + fig = plot_params(merged, ld_vals, "causal LD", ld_extras_labels) elif len(dtypes) == 1: - fig = 
plot_params_simple(params, ld_vals, "causal LD") + fig = plot_params_simple(params, ld_vals, "causal LD", ld_extras_labels) else: raise ValueError("No parameter values found") From c89f6e5073d2cff9b19542943a569c77f3b7c8ee Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 17 Oct 2024 13:30:17 -0700 Subject: [PATCH 13/41] enable simulation replicates --- analysis/config/config-simulations.yml | 2 +- analysis/workflow/Snakefile | 4 +- analysis/workflow/rules/happler.smk | 5 +- analysis/workflow/rules/plots.smk | 4 +- analysis/workflow/rules/simulate.smk | 4 +- analysis/workflow/scripts/parameter_plot.py | 173 +++++++++++++++----- happler/__main__.py | 4 - 7 files changed, 141 insertions(+), 55 deletions(-) diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml index 35bb959..59fd987 100644 --- a/analysis/config/config-simulations.yml +++ b/analysis/config/config-simulations.yml @@ -47,7 +47,7 @@ modes: alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] ld_range: - reps: 1 + reps: 10 min_ld: 0 max_ld: 1 step: 0.1 diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile index c9ffb76..93d4640 100644 --- a/analysis/workflow/Snakefile +++ b/analysis/workflow/Snakefile @@ -110,7 +110,7 @@ if mode in ("snp", "hap", "ld_range"): happler_config["snps"] = config["modes"][mode]["alleles"] happler_config["causal_gt"] = rules.simulate_transform.output if mode == "ld_range": - happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}" + happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}/rep_{rep}" elif mode == "run": happler_config = { "pheno": config["modes"][mode]["pheno"], @@ -137,7 +137,7 @@ merged_happler = rules.happler_merge.output.pgen if mode == "ld_range" and config["modes"][mode]["random"]: happler_config["random"] = rules.simulate_random_transform.output happler_config["random_hap"] = rules.simulate_random_transform.input.hap - happler_config["out"] = config["out"] + "/happler/"+mode+"/random/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}" + happler_config["out"] = config["out"] + "/happler/"+mode+"/random/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}/rep_{rep}" module happler_random: snakefile: "rules/happler.smk" config: happler_config diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index d6e1c4a..2c75df5 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -38,6 +38,7 @@ rule run: indep=lambda wildcards: 0.05 if "indep_alpha" not in wildcards else wildcards.indep_alpha, max_signals=3, max_iterations=3, + rep=lambda wildcards: int(wildcards.rep), output: hap=out + "/happler.hap", gz=out + "/happler.hap.gz", @@ -65,8 +66,8 @@ rule run: shell: "happler run -o {output.hap} --verbosity DEBUG --maf {params.maf} " "--max-signals {params.max_signals} --max-iterations {params.max_iterations} " - "--discard-multiallelic --region {params.region}" - " {params.covar} --indep-thresh {params.indep} -t {params.thresh} " + "--discard-multiallelic --region {params.region} --pheno {params.rep} " + "{params.covar}--indep-thresh {params.indep} -t {params.thresh} " "--show-tree {input.gts} {input.pts} &>{log}" " && haptools index -o {output.gz} {output.hap} &>>{log}" diff --git a/analysis/workflow/rules/plots.smk b/analysis/workflow/rules/plots.smk index 
246bc34..403f175 100644 --- a/analysis/workflow/rules/plots.smk +++ b/analysis/workflow/rules/plots.smk @@ -26,6 +26,7 @@ def agg_ld_range_obs(wildcards): alpha=config["mode_attrs"]["alpha"], beta=config["mode_attrs"]["beta"], num_haps=config["mode_attrs"]["num_haps"], + rep=range(config["mode_attrs"]["reps"]), **wildcards, ) else: @@ -65,6 +66,7 @@ def agg_ld_range_metrics(wildcards): beta=config["mode_attrs"]["beta"], num_haps=config["mode_attrs"]["num_haps"], alpha=config["mode_attrs"]["alpha"], + rep=range(config["mode_attrs"]["reps"]), **wildcards, ) else: @@ -104,7 +106,7 @@ rule params: conda: "happler" shell: - "workflow/scripts/parameter_plot.py --show-extras -o {output.png} " + "workflow/scripts/parameter_plot.py -o {output.png} " "{input.gts} {params.observed_haps} {params.causal_hap} &> {log}" diff --git a/analysis/workflow/rules/simulate.smk b/analysis/workflow/rules/simulate.smk index a56aed3..249e64d 100644 --- a/analysis/workflow/rules/simulate.smk +++ b/analysis/workflow/rules/simulate.smk @@ -119,6 +119,7 @@ rule simphenotype: params: beta=lambda wildcards: wildcards.beta, seed = 42, + reps = config["modes"]["ld_range"]["reps"], output: pheno=out + "/{beta}.pheno", resources: @@ -130,5 +131,6 @@ rule simphenotype: conda: "happler" shell: - "haptools simphenotype --seed {params.seed} -o {output.pheno} {input.pgen} " + "haptools simphenotype --seed {params.seed} -o {output.pheno} " + "--replications {params.reps} {input.pgen} " "<( sed 's/\\t0.99$/\\t{params.beta}/' {input.hap} ) &>{log}" diff --git a/analysis/workflow/scripts/parameter_plot.py b/analysis/workflow/scripts/parameter_plot.py index 5a01a16..bea84d2 100755 --- a/analysis/workflow/scripts/parameter_plot.py +++ b/analysis/workflow/scripts/parameter_plot.py @@ -7,6 +7,7 @@ import numpy as np matplotlib.use('Agg') import numpy.typing as npt +from scipy.stats import sem from haptools import logging import matplotlib.pyplot as plt from haptools.ld import pearson_corr_ld @@ -28,8 +29,10 @@ LOG_SCALE = {"alpha",} +plt.rcParams['figure.dpi'] = 400 # Set the figure DPI to 400 +plt.rcParams['savefig.dpi'] = plt.rcParams['figure.dpi'] # Set the DPI for saving figures -def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes, drop_extras: bool = True) -> tuple: +def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes) -> tuple: """ Match the observed and causal haplotypes to maximize best pairwise LD @@ -44,9 +47,6 @@ def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes, drop_ex A path to a .hap file containing haplotypes output by happler causal: Path A path to a .hap file containing a simulated causal haplotype - drop_extras: bool, optional - Drop any observed haplotypes that don't have a match in the causal haps. - Otherwise, include a match for every observed haplotype!
Returns ------- @@ -58,6 +58,8 @@ def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes, drop_ex npt.NDArray An array of the column indices indicating the optimal match of causal haplotypes to observed haplotypes + npt.NDArray + The index of the causal haplotype that each observed haplotype is matched to npt.NDArray An array of boolean values indicating whether each observed haplotype has a match in the causal haplotypes @@ -75,17 +77,20 @@ def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes, drop_ex ld_mat = np.abs(ld_mat[:obs.shape[1], obs.shape[1]:]) # Return the ld_mat idxs of the optimal match of each observed hap to a causal hap row_idx, col_idx = linear_sum_assignment(ld_mat, maximize=True) - extras_labels = np.zeros( - len(row_idx) if drop_extras else ld_mat.shape[0], dtype=np.bool_, - ) - if not drop_extras: - missing_rows = np.setdiff1d(range(ld_mat.shape[0]), row_idx) - extras_labels[missing_rows] = True - # If there are more observed haps than causal haps, then we just assign the - # remaining haps to the best causal hap that matches for each of them - row_idx = np.concatenate((row_idx, missing_rows)) - col_idx = np.concatenate((col_idx, ld_mat[missing_rows].argmax(axis=1))) - return ld_mat, row_idx, col_idx, extras_labels + # now deal with the extras (observed haps with no causal hap match) + extras_labels = np.zeros(ld_mat.shape[0], dtype=np.uint8) + extras_labels_bool = np.zeros(ld_mat.shape[0], dtype=bool) + missing_rows = np.setdiff1d(range(ld_mat.shape[0]), row_idx) + extras_labels[row_idx] = col_idx + extras_labels_bool[row_idx] = True + # If there are more observed haps than causal haps, then we just assign the + # remaining haps to the best causal hap that matches for each of them + row_idx = np.concatenate((row_idx, missing_rows)) + missing_cols = ld_mat[missing_rows].argmax(axis=1) + col_idx = np.concatenate((col_idx, missing_cols)) + extras_labels[missing_rows] = missing_cols + extras_labels_bool[missing_rows] = False + return ld_mat, row_idx, col_idx, extras_labels, extras_labels_bool def get_best_ld( @@ -95,7 +100,6 @@ def get_best_ld( region: str = None, observed_id: str = None, causal_id: str = None, - show_extras: bool = False, log: Logger = None ): """ @@ -115,8 +119,6 @@ def get_best_ld( The ID to load from the observed_hap file (or just all of the IDs, otherwise) causal_id: str, optional The ID to load from the causal_hap file (or just the first hap, otherwise) - show_extras: bool, optional - Whether to show observed haps that don't match with a causal hap log: Logger, optional A logging module to pass to haptools """ @@ -130,7 +132,11 @@ def get_best_ld( # if there are no observed haplotypes, then we can't compute LD if len(observed_hap.data) == 0: - return np.array([np.nan,]), np.array([False,]) + return ( + np.array([np.nan,]), + np.array([np.iinfo(np.uint8).max,], dtype=np.uint8), + np.array([False,], dtype=bool), + ) # load the genotypes variants = { @@ -148,11 +154,11 @@ def get_best_ld( gts.check_biallelic() # compute LD between every observed haplotype and the causal haplotype - observed_ld, best_row_idx, best_col_idx, extras_labels = match_haps( - gts, observed_hap, causal_hap, drop_extras=(not show_extras), + observed_ld, best_row_idx, best_col_idx, extras_labels, labels_bool = match_haps( + gts, observed_hap, causal_hap, ) # return the strongest possible LD for each observed hap with a causal hap - return observed_ld[best_row_idx, best_col_idx], extras_labels + return observed_ld[best_row_idx, best_col_idx], 
extras_labels, labels_bool def get_finemap_metrics(metrics_path: Path, log: Logger = None): @@ -209,6 +215,7 @@ def count_shared(observed, causal, log, observed_id: str = None, causal_id: str def plot_params( params: npt.NDArray, vals: npt.NDArray, + vals_sem: npt.NDArray, val_title: str, val_color: npt.NDArray, ): @@ -219,11 +226,13 @@ def plot_params( ---------- params: npt.NDArray A numpy array of mixed dtype. Each column contains the values of a parameter - vals: list[npt.NDArray] - A numpy array containing the values of the plot + vals: npt.NDArray + A numpy array of numpy arrays containing the values of the plot + vals_sem: npt.NDArray + A numpy array of numpy arrays containing the standard error of the values val_title: str The name of the value that we are plotting - val_color: list[npt.NDArray] + val_color: npt.NDArray Whether to color each point corresponding to this value """ figsize = matplotlib.rcParams["figure.figsize"] @@ -238,10 +247,13 @@ def plot_params( fig.subplots_adjust(hspace=0) # create a plot for the vals, first x_vals = [j for j, arr in enumerate(vals) for i in arr] - val_color = ["red" if j else "green" for i in val_color for j in i] - axs[0].scatter(x_vals, np.concatenate(vals), marker="o", color=val_color, s=5) + val_color = ["green" if j else "red" for i in val_color for j in i] + vals = np.concatenate(vals) + vals_sem = np.concatenate(vals_sem) + for v in zip(x_vals, vals, vals_sem, val_color): + axs[0].errorbar(v[0], v[1], yerr=v[2], marker="o", c=v[3], markersize=3) axs[0].set_ylabel(val_title, rotation="horizontal", ha="right") - axs[0].set_xticks(range(0, len(vals))) + axs[0].set_xticks(range(0, len(x_vals))) axs[0].set_xticklabels([]) axs[0].grid(axis="x") axs[0].set_ylim(None, 1) @@ -263,6 +275,7 @@ def plot_params( def plot_params_simple( params: npt.NDArray, vals: npt.NDArray, + val_sem: npt.NDArray, val_title: str, val_color: npt.NDArray, ): @@ -273,11 +286,13 @@ def plot_params_simple( ---------- params: npt.NDArray A numpy array containing the values of a parameter - vals: list[npt.NDArray] - A numpy array containing the values of the plot + vals: npt.NDArray + A numpy array of numpy arrays containing the values of the plot + val_sem: npt.NDArray + A numpy array of numpy arrays containing the standard error of the values val_title: str The name of the value that we are plotting - val_color: list[npt.NDArray] + val_color: npt.NDArray Whether to color each point corresponding to this value """ fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True) @@ -286,7 +301,7 @@ def plot_params_simple( params = -np.log10(params) val_xlabel = "-log " + val_xlabel params = [j for j, arr in zip(params, vals) for i in arr] - val_color = ["red" if j else "green" for i in val_color for j in i] + val_color = ["green" if j else "red" for i in val_color for j in i] # plot params against vals ax.scatter(params, np.concatenate(vals), marker="o", color=val_color, s=5) ax.set_ylabel(val_title, color="g") @@ -294,6 +309,77 @@ def plot_params_simple( return fig +def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArray, bools: npt.NDArray): + """ + Group replicates with identical parameter values + + Compute mean and standard error of the values for each group + + Parameters + ---------- + params: npt.NDArray + A numpy array of mixed dtype. 
Each column contains the values of a parameter + vals: npt.NDArray + A numpy array containing the values of the plot + causal_idxs: npt.NDArray + The index of the causal haplotype that each observed haplotype belongs to + bools: npt.NDArray + An array of boolean values indicating whether each observed haplotype has a match + + Returns + ------- + npt.NDArray + A numpy array of the unique parameter values + npt.NDArray + A numpy array containing the mean and std of the values over the replicates + """ + other_param_names = [name for name in params.dtype.names if name != "rep"] + grouped_params = np.unique(params[other_param_names]) + get_mean_std = lambda x: (np.mean(x), sem(x) if len(x) > 1 else np.nan) + filter_causal_idxs = lambda x: np.unique(x[x != np.iinfo(np.uint8).max]) + grouped_vals = [] + grouped_sem = [] + grouped_bools = [] + for group in grouped_params: + subgrouped_vals = [] + subgrouped_sem = [] + subgrouped_bools = [] + curr_vals = vals[params[other_param_names] == group] + curr_causal_idxs = causal_idxs[params[other_param_names] == group] + curr_bools = bools[params[other_param_names] == group] + # compute mean and std err for each causal match + for causal_idx in filter_causal_idxs(np.concatenate(curr_causal_idxs)): + try: + curr_vals_causal_idxs = np.concatenate([ + val[(indices == causal_idx) & curr_bool] + for val, indices, curr_bool in zip(curr_vals, curr_causal_idxs, curr_bools) + ]) + except ValueError: + continue + val_mean, val_sem = get_mean_std(curr_vals_causal_idxs) + subgrouped_vals.append(val_mean) + subgrouped_sem.append(val_sem) + subgrouped_bools.append(True) + # compute mean and std err for each unmatched observed hap + for unmatched_idx in range(max([(~i).sum() for i in curr_bools])): + try: + curr_vals_unmatched_idxs = np.array([ + val[(indices == causal_idx) & ~curr_bool][unmatched_idx] + for val, indices, curr_bool in zip(curr_vals, curr_causal_idxs, curr_bools) + if unmatched_idx < ((indices == causal_idx) & ~curr_bool).sum() + ]) + except ValueError: + continue + val_mean, val_sem = get_mean_std(curr_vals_unmatched_idxs) + subgrouped_vals.append(val_mean) + subgrouped_sem.append(val_sem) + subgrouped_bools.append(False) + grouped_vals.append(np.array(subgrouped_vals, dtype=object)) + grouped_sem.append(np.array(subgrouped_sem, dtype=object)) + grouped_bools.append(np.array(subgrouped_bools, dtype=object)) + return grouped_params, grouped_vals, grouped_sem, grouped_bools + + @click.command() @click.argument("genotypes", type=click.Path(exists=True, path_type=Path)) @click.argument("observed_hap", type=click.Path(path_type=Path)) @@ -329,13 +415,6 @@ def plot_params_simple( show_default="the first haplotype in the file", help="A haplotype ID from the causal .hap file", ) -@click.option( - "--show-extras", - is_flag=True, - default=False, - show_default=True, - help="Whether to show observed haps that don't match with a causal hap", -) @click.option( "-o", "--output", @@ -360,7 +439,6 @@ def main( region: str = None, observed_id: str = None, causal_id: str = None, - show_extras: bool = False, output: Path = Path("/dev/stdout"), verbosity: str = "ERROR", ): @@ -391,7 +469,7 @@ def main( get_hap_fname = lambda hap_path, param_set: Path(str(hap_path).format(**dict(zip(dtypes.keys(), param_set)))) # compute LD between the causal hap and the best observed hap across param vals - ld_vals, ld_extras_labels = zip(*tuple( + ld_vals, ld_extras_idxs, ld_extras_bool = zip(*tuple( get_best_ld( gts, get_hap_fname(observed_hap, params[idx]), @@ -399,11 +477,13 @@ def 
main( region=region, observed_id=observed_id, causal_id=causal_id, - show_extras=show_extras, log=log ) for idx in range(len(params)) )) + ld_vals = np.array(ld_vals, dtype=object) + ld_extras_idxs = np.array(ld_extras_idxs, dtype=object) + ld_extras_bool = np.array(ld_extras_bool, dtype=object) # extract fine-mapping metrics for the observed hap if metrics is not None: @@ -415,13 +495,18 @@ def main( for idx in range(len(params)) ]) + # TODO: also handle the metrics in group_by_rep() + params, ld_vals, ld_sem, ld_extras_bool = group_by_rep( + params, ld_vals, ld_extras_idxs, ld_extras_bool, + ) + if len(dtypes) > 1 or metrics is not None: merged = params if metrics is not None: merged = np.lib.recfunctions.merge_arrays([params, metrics], flatten=True) - fig = plot_params(merged, ld_vals, "causal LD", ld_extras_labels) + fig = plot_params(merged, ld_vals, ld_sem, "causal LD", ld_extras_bool) elif len(dtypes) == 1: - fig = plot_params_simple(params, ld_vals, "causal LD", ld_extras_labels) + fig = plot_params_simple(params, ld_vals, ld_sem, "causal LD", ld_extras_bool) else: raise ValueError("No parameter values found") diff --git a/happler/__main__.py b/happler/__main__.py index ef6e565..2ed1739 100644 --- a/happler/__main__.py +++ b/happler/__main__.py @@ -297,10 +297,6 @@ def run( log.info("There are {} samples and {} variants".format(*gt.data.shape)) # subset to just one phenotype - if len(ph.names) > 1: - log.warning("Ignoring all but the first trait in the phenotypes file") - ph.names = ph.names[:1] - ph.data = ph.data[:, :1] # also reorder and subset samples in Phenotypes to match those in the Genotypes ph.subset(samples=tuple(gt.samples), names=(pheno,), inplace=True) From 8070c4e60045ed6f39cc18652d86665524ff230a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 10:02:40 -0700 Subject: [PATCH 14/41] output finemapping metrics for multiple haps --- analysis/workflow/rules/happler.smk | 4 +- analysis/workflow/scripts/susie_metrics.R | 79 +++++++++++------------ analysis/workflow/scripts/utils.R | 13 ++++ 3 files changed, 52 insertions(+), 44 deletions(-) diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index 2c75df5..a83481e 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -392,11 +392,11 @@ rule metrics: input: finemap=expand(rules.finemapper.output.susie, ex="in", allow_missing=True), obs_hap=rules.run.output.hap, - caus_hap=config["hap_file"], + # caus_hap=config["hap_file"], output: metrics=out + "/susie_metrics.tsv", resources: - runtime=5, + runtime=10, threads: 6, log: logs + "/metrics", diff --git a/analysis/workflow/scripts/susie_metrics.R b/analysis/workflow/scripts/susie_metrics.R index 6147b76..d02a3df 100755 --- a/analysis/workflow/scripts/susie_metrics.R +++ b/analysis/workflow/scripts/susie_metrics.R @@ -4,7 +4,6 @@ # param1: The path to a RDS file containing the output of SuSiE # param2: The path to a .hap file containing the observed haplotype -# param3: The path to a .hap file containing the causal haplotype # Output will be written in tab-separated format to stdout # To debug this script, add a browser() call somewhere in it. Then run it with its command line args but replace the script name with "R --args". 
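The rewritten script below loops over every observed haplotype in the .hap file and emits one row of metrics per haplotype. As a rough Python sketch of those per-haplotype metrics (not part of the patch; `pips` and `credible_sets` are hypothetical stand-ins for SuSiE's `fitted$pip` vector and `fitted$sets$cs` list, and purity is omitted since it requires `fitted$sets$purity`):

def susie_metrics(pips: dict, credible_sets: dict, hap_id: str) -> tuple:
    # R indices are 1-based, so offset the haplotype's position accordingly
    hap_idx = list(pips).index(hap_id) + 1
    # 1) the PIP of the observed hap
    obs_pip = pips[hap_id]
    # 2) does the observed hap get the highest PIP?
    has_highest_pip = int(all(obs_pip >= p for p in pips.values()))
    # 3) the best PIP among the other variables
    best_variant_pip = max(p for v, p in pips.items() if v != hap_id)
    # 4) and 6) membership in a credible set, and that credible set's length
    cs_name = next((n for n, cs in credible_sets.items() if hap_idx in cs), None)
    in_credible_set = int(cs_name is not None)
    cs_length = len(credible_sets[cs_name]) if cs_name is not None else 0
    return obs_pip, has_highest_pip, best_variant_pip, in_credible_set, cs_length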
@@ -13,25 +12,12 @@ args = commandArgs(trailingOnly = TRUE) susie_res = args[1] obs_hap = args[2] -caus_hap = args[3] -write("Parsing observed hap file", stderr()) -for (line in readLines(obs_hap)) { - # we assume the first line in the hap file that begins with H denotes the haplotype - if (grepl("^H\\t", line)) break -} -# extract the start, end, and ID of the haplotype -happler_hap = as.integer(strsplit(line, "\t")[[1]][c(3,4)]) -happler_hap_id = strsplit(line, "\t")[[1]][c(5)] +# load functions to help read various file types +source("workflow/scripts/utils.R") -write("Parsing causal hap file", stderr()) -for (line in readLines(caus_hap)) { - # we assume the first line in the hap file that begins with H denotes the haplotype - if (grepl("^H\\t", line)) break -} -# extract the start, end, and ID of the haplotype -causal_hap = as.integer(strsplit(line, "\t")[[1]][c(3,4)]) -causal_hap_id = strsplit(line, "\t")[[1]][c(5)] +write("Parsing observed hap file", stderr()) +happler_hap = readHap(obs_hap) write("Parsing SuSiE results", stderr()) # parse the susie data: @@ -40,30 +26,39 @@ write("Parsing SuSiE results", stderr()) susie_res = readRDS(susie_res) fitted = susie_res$fitted susie_pip = fitted$pip -happler_hap_idx = which(names(susie_pip) %in% happler_hap_id) -write("Computing metrics", stderr()) -# The metrics are: -# 1) What is the PIP of the observed hap? -# 2) Does the observed hap get the highest PIP? -# 3) What is the best PIP among the variants? -# 4) Is the observed hap in a credible set? -# 5) What is the purity of the credible set? -obs_pip = susie_pip[happler_hap_id] -has_highest_pip = as.integer(sum(obs_pip < susie_pip) == 0) -best_variant_pip = max(susie_pip[names(susie_pip) != names(obs_pip)]) -credible_set = NULL -for (cs in names(fitted$sets$cs)) { - if (happler_hap_idx %in% fitted$sets$cs[cs]) { - credible_set = cs - break +for (happler_hap_file_idx in seq_len(nrow(happler_hap))) { + + happler_hap_id = happler_hap[happler_hap_file_idx, "id"] + happler_hap_idx = which(names(susie_pip) %in% happler_hap_id) + + write(paste("Computing metrics for", happler_hap_id), stderr()) + # The metrics are: + # 1) What is the PIP of the observed hap? + # 2) Does the observed hap get the highest PIP? + # 3) What is the best PIP among the variants? + # 4) Is the observed hap in a credible set? + # 5) What is the purity of the credible set? + # 6) What is the length of the credible set?
+  obs_pip = susie_pip[happler_hap_id] +  has_highest_pip = as.integer(sum(obs_pip < susie_pip) == 0) +  best_variant_pip = max(susie_pip[names(susie_pip) != names(obs_pip)]) +  credible_set = NULL +  for (cs in names(fitted$sets$cs)) { +    if (happler_hap_idx %in% fitted$sets$cs[[cs]]) { +      credible_set = cs +      break +    } } -purity = 0 -if (!is.null(credible_set)) { -  purity = fitted$sets$purity[cs, "mean.abs.corr"] -} -in_credible_set = as.integer(!is.null(credible_set)) +  purity = 0 +  cs_len = 0 +  if (!is.null(credible_set)) { +    purity = fitted$sets$purity[cs, "mean.abs.corr"] +    cs_length = length(fitted$sets$cs[[credible_set]]) +  } +  in_credible_set = as.integer(!is.null(credible_set)) -write("Outputting metrics", stderr()) -write(paste(obs_pip, has_highest_pip, best_variant_pip, in_credible_set, purity), stdout()) +  write(paste("Outputting metrics for", happler_hap_id), stderr()) +  write(paste(obs_pip, has_highest_pip, best_variant_pip, in_credible_set, purity, cs_length), stdout()) + +} diff --git a/analysis/workflow/scripts/utils.R b/analysis/workflow/scripts/utils.R index ba54c1f..b3a8219 100644 --- a/analysis/workflow/scripts/utils.R +++ b/analysis/workflow/scripts/utils.R @@ -97,3 +97,16 @@ save_curr_env = function(env = parent.frame(), filePath = "myenv.RData") { save(list = ls(all.names = TRUE, envir = env), file = filePath, envir = env) } + +readHap = function(hapfile) { + # Return a data.frame with the start, end, and ID columns of the haplotype lines + # in the .hap file + # Parameters: + # hapfile: The path to the .hap file + data <- read.table(hapfile, header = FALSE, fill = TRUE, stringsAsFactors = FALSE) + # Filter rows that start with "H" and select columns 3 to 5 + h_data <- data[grep("^H", data$V1), c(3, 4, 5)] + # Rename the columns for clarity (optional) + colnames(h_data) <- c("start", "end", "id") + h_data +} From 4edfd019a93daffe4782e42fdf4b71cf6acbc2bf Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:43:05 -0700 Subject: [PATCH 15/41] allow for setting output threshold as a param in the config --- analysis/config/config-simulations.yml | 5 ++++- analysis/workflow/Snakefile | 1 + analysis/workflow/rules/happler.smk | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml index 59fd987..a48ae14 100644 --- a/analysis/config/config-simulations.yml +++ b/analysis/config/config-simulations.yml @@ -74,9 +74,12 @@ mode: ld_range min_maf: 0.05 # Sample sizes to use -# sample_size: [500, 1000, 1500, 2000, 2500] +# sample_size: [777, 800, 1600, 2400, 3200] sample_size: 777 +# A threshold on the pvalues of the haplotypes +out_thresh: 5e-5 + # Whether to include the causal variant in the set of genotypes provided to the # finemapping methods. Set this to true if you're interested in seeing how the # methods perform when the causal variant is absent from the data.
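The new out_thresh setting is forwarded to happler's --out-thresh flag (added in PATCH 11 above). A minimal sketch of the filter it controls, assuming, as in the happler/__main__.py hunk above, that a haplotype's pval attribute stores a -log10(p) value:

import numpy as np

def passes_out_thresh(hap_pval: float, out_thresh: float = 5e-5) -> bool:
    # hap_pval is assumed to be -log10(p); e.g. out_thresh=5e-5 gives a
    # cutoff of -log10(5e-5) ~= 4.3, so only more significant haps are kept
    return hap_pval >= -np.log10(out_thresh)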
diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile index 93d4640..2747744 100644 --- a/analysis/workflow/Snakefile +++ b/analysis/workflow/Snakefile @@ -103,6 +103,7 @@ if mode in ("snp", "hap", "ld_range"): "random": None, "covar": covar_file, "min_maf": check_config("min_maf", 0), + "out_thresh": check_config("out_thresh", 5e-8), } if mode in ("hap", "ld_range"): happler_config["snps"] = [] diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index a83481e..11c5b24 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -39,6 +39,7 @@ rule run: max_signals=3, max_iterations=3, rep=lambda wildcards: int(wildcards.rep), + out_thresh=check_config("out_thresh", 5e-8), output: hap=out + "/happler.hap", gz=out + "/happler.hap.gz", @@ -68,7 +69,7 @@ rule run: "--max-signals {params.max_signals} --max-iterations {params.max_iterations} " "--discard-multiallelic --region {params.region} --pheno {params.rep} " "{params.covar}--indep-thresh {params.indep} -t {params.thresh} " - "--show-tree {input.gts} {input.pts} &>{log}" + "--out-thresh {params.out_thresh} --show-tree {input.gts} {input.pts} &>{log}" " && haptools index -o {output.gz} {output.hap} &>>{log}" From 8371c80ccec013eb03f7cefd8089630a5a79a57b Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:44:13 -0700 Subject: [PATCH 16/41] clean up and update scripts README --- analysis/workflow/scripts/README.md | 12 +++-- analysis/workflow/scripts/compute_pgen_ld.py | 2 +- analysis/workflow/scripts/haplotype_ld.py | 49 -------------------- 3 files changed, 10 insertions(+), 53 deletions(-) delete mode 100755 analysis/workflow/scripts/haplotype_ld.py diff --git a/analysis/workflow/scripts/README.md b/analysis/workflow/scripts/README.md index 24538c9..99f7f4f 100644 --- a/analysis/workflow/scripts/README.md +++ b/analysis/workflow/scripts/README.md @@ -6,6 +6,12 @@ All python scripts implement the --help argument. For bash, awk, and R scripts, ## [choose_different_ld.py](choose_different_ld.py) Choose haplotype alleles such that their SNPs have a range of strengths of LD with the causal haplotype. +## [compute_pgen_ld.py](compute_pgen_ld.py) +Quickly compute the LD between a single variant and all other variants in a PGEN file. + +## [conditional_regression_plots.py](conditional_regression_plots.py) +Make manhattan plots for a region after conditioning on 1) haplotypes and 2) their alleles together and 3) separately. Also, 4) show the original manhattan plot without any conditioning. + ## [create_hap_file.sh](create_hap_file.sh) Create a [.hap file](https://haptools.readthedocs.io/en/stable/formats/haplotypes.html) for use by haptools when simulating causal variables. @@ -33,12 +39,12 @@ Create a manhattan plot to visualize the results of a GWAS. ## [merge_plink.py](merge_plink.py) Merge variants from two PGEN files that have the same set of samples. -## [midway_manhattan.bash](midway_manhattan.bash) -Create a "midway" manhattan plot depicting the p-values on a branch of a tree mid-way through happler's tree-building process. - ## [midway_manhattan_summary.py](midway_manhattan_summary.py) Summarize the results of multiple midway manhattan plots via an ROC curve +## [midway_manhattan.bash](midway_manhattan.bash) +Create a "midway" manhattan plot depicting the p-values on a branch of a tree mid-way through happler's tree-building process. 
+ ## [parameter_plot.py](parameter_plot.py) Plot the LD between a causal haplotype and a set of observed haplotypes across a range of hyper-parameters. diff --git a/analysis/workflow/scripts/compute_pgen_ld.py b/analysis/workflow/scripts/compute_pgen_ld.py index bc95a39..075af17 100755 --- a/analysis/workflow/scripts/compute_pgen_ld.py +++ b/analysis/workflow/scripts/compute_pgen_ld.py @@ -7,7 +7,7 @@ from haptools.logging import getLogger from haptools.ld import pearson_corr_ld -from haptools.data import Data, Genotypes, GenotypesPLINK +from haptools.data import Data, GenotypesPLINK def corr(a, b): diff --git a/analysis/workflow/scripts/haplotype_ld.py b/analysis/workflow/scripts/haplotype_ld.py deleted file mode 100755 index b4f9012..0000000 --- a/analysis/workflow/scripts/haplotype_ld.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -import pandas as pd -import seaborn as sns -from matplotlib import pyplot as plt - - -sns.set(rc={'figure.figsize':(11.7,8.27)}) - -phens = pd.read_csv("../phens.tsv.gz", sep="\t", index_col=0, header=0) -gens = pd.read_csv("genotypes.tsv", sep="\t", index_col=1) -gt = gens.iloc[:,3:] -alleles = gens['alleles'] - -SNPs = gt.iloc[~gt.index.str.startswith('STR')] -STR = gt.iloc[gt.index.str.startswith('STR')] - -samples = phens.index.astype(str) -num_samp = len(samples) - -lens = {str(i): len(al) for i, al in enumerate(gens.loc[STR.index[0],'alleles'].split(','))} -str_gt = STR.iloc[0].str.split('|', expand=True).applymap(lens.__getitem__) -str_gt = str_gt.loc[samples] -str_gts = pd.concat((str_gt[0], str_gt[1]), axis=0, ignore_index=True) -split_chroms = pd.DataFrame({STR.index[0]: str_gts}) - -fig, axes = plt.subplots(1, len(SNPs.index)) -for idx, snp in enumerate(SNPs.index): - snp_gt = gt.loc[snp].str.split('|', expand=True) - snp_gt = snp_gt.loc[samples] - snp_gts = pd.concat((snp_gt[0], snp_gt[1]), axis=0, ignore_index=True).astype(int) - split_chroms[snp] = snp_gts - sns.violinplot(ax=axes[idx], x=snp, y=STR.index[0], data=split_chroms, order=[0,1]) - -plt.savefig('snps.png') -plt.clf() - -fig, axes = plt.subplots(1, 2) -hap_alleles = {snp: 1 for snp in SNPs.index} -hap = pd.DataFrame([(split_chroms[snp] == hap_alleles[snp]).astype(bool) for snp in SNPs.index]).T -hap['haplotype'] = hap.all(axis=1).astype(int) -hap[STR.index[0]] = split_chroms[STR.index[0]] -sns.violinplot(ax=axes[0], x='haplotype', y=STR.index[0], data=hap, order=[0,1]) - -hap_sum = pd.DataFrame({ - 'haplotype': (hap['haplotype'].iloc[:num_samp].to_numpy() + hap['haplotype'].iloc[num_samp:].to_numpy()), - STR.index[0]: (hap[STR.index[0]].iloc[:num_samp].to_numpy() + hap[STR.index[0]].iloc[num_samp:].to_numpy()), -}) -sns.regplot(ax=axes[1], x='haplotype', y=STR.index[0], data=hap_sum) -plt.savefig('haplotype.png') From 1b5a9edca3633fa3974833a870a1ea4f6768364a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:47:45 -0700 Subject: [PATCH 17/41] handle empty hap files in utils.R --- analysis/workflow/scripts/susie_metrics.R | 2 +- analysis/workflow/scripts/utils.R | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/analysis/workflow/scripts/susie_metrics.R b/analysis/workflow/scripts/susie_metrics.R index d02a3df..3d1733e 100755 --- a/analysis/workflow/scripts/susie_metrics.R +++ b/analysis/workflow/scripts/susie_metrics.R @@ -51,7 +51,7 @@ for (happler_hap_file_idx in 1:nrow(happler_hap)) { } } purity = 0 - cs_len = 0 + cs_length = 0 if (!is.null(credible_set)) { purity = 
fitted$sets$purity[cs, "mean.abs.corr"] cs_length = length(fitted$sets$cs[[credible_set]]) diff --git a/analysis/workflow/scripts/utils.R b/analysis/workflow/scripts/utils.R index b3a8219..52c0886 100644 --- a/analysis/workflow/scripts/utils.R +++ b/analysis/workflow/scripts/utils.R @@ -103,10 +103,14 @@ readHap = function(hapfile) { # in the .hap file # Parameters: # hapfile: The path to the .hap file - data <- read.table(hapfile, header = FALSE, fill = TRUE, stringsAsFactors = FALSE) - # Filter rows that start with "H" and select columns 3 to 5 - h_data <- data[grep("^H", data$V1), c(3, 4, 5)] - # Rename the columns for clarity (optional) - colnames(h_data) <- c("start", "end", "id") - h_data + data <- tryCatch(read.table(hapfile, header = FALSE, fill = TRUE, stringsAsFactors = FALSE), error=function(e) data.frame()) + if (nrow(data) != 0) { + # Filter rows that start with "H" and select columns 3 to 5 + h_data <- data[grep("^H", data$V1), c(3, 4, 5)] + # Rename the columns for clarity (optional) + colnames(h_data) <- c("start", "end", "id") + h_data + } else { + data + } } From c8e2eb5bfe9114c4a4be621c3e8c0e70f41cb8fa Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:50:02 -0700 Subject: [PATCH 18/41] allow for reordering rows of parameter_plot --- analysis/workflow/rules/plots.smk | 6 ++- analysis/workflow/scripts/parameter_plot.py | 51 +++++++++++++++++++-- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/analysis/workflow/rules/plots.smk b/analysis/workflow/rules/plots.smk index 403f175..890cbff 100644 --- a/analysis/workflow/rules/plots.smk +++ b/analysis/workflow/rules/plots.smk @@ -98,7 +98,7 @@ rule params: output: png=out + "/happler_params.png", resources: - runtime=10, + runtime=20, log: logs + "/plot_params", benchmark: @@ -107,6 +107,7 @@ rule params: "happler" shell: "workflow/scripts/parameter_plot.py -o {output.png} " + "--order num_haps,beta,ld " "{input.gts} {params.observed_haps} {params.causal_hap} &> {log}" @@ -126,7 +127,7 @@ rule metrics: output: png=out + "/finemapping_metrics.png", resources: - runtime=10, + runtime=20, log: logs + "/metrics", benchmark: @@ -135,4 +136,5 @@ rule metrics: "happler" shell: "workflow/scripts/parameter_plot.py -o {output.png} -m {params.metrics} " + "--order num_haps,beta,ld " "{input.gts} {params.observed_haps} {params.causal_hap} &> {log}" diff --git a/analysis/workflow/scripts/parameter_plot.py b/analysis/workflow/scripts/parameter_plot.py index bea84d2..af1db31 100755 --- a/analysis/workflow/scripts/parameter_plot.py +++ b/analysis/workflow/scripts/parameter_plot.py @@ -218,6 +218,7 @@ def plot_params( vals_sem: npt.NDArray, val_title: str, val_color: npt.NDArray, + hide_extras: bool = False, ): """ Plot vals against parameter values @@ -234,6 +235,8 @@ def plot_params( The name of the value that we are plotting val_color: npt.NDArray Whether to color each point corresponding to this value + hide_extras: bool + Whether to hide the observed haps that have no causal hap match """ figsize = matplotlib.rcParams["figure.figsize"] if len(params) > 75: @@ -251,6 +254,8 @@ def plot_params( vals = np.concatenate(vals) vals_sem = np.concatenate(vals_sem) for v in zip(x_vals, vals, vals_sem, val_color): + if hide_extras and v[3] != "green": + continue axs[0].errorbar(v[0], v[1], yerr=v[2], marker="o", c=v[3], markersize=3) axs[0].set_ylabel(val_title, rotation="horizontal", ha="right") axs[0].set_xticks(range(0, len(x_vals))) @@ -275,9 +280,10 @@
def plot_params_simple( params: npt.NDArray, vals: npt.NDArray, - val_sem: npt.NDArray, + vals_sem: npt.NDArray, val_title: str, val_color: npt.NDArray, + hide_extras: bool = False, ): """ Plot vals against only a single column of parameter values @@ -288,12 +294,14 @@ def plot_params_simple( A numpy array containing the values of a parameter vals: npt.NDArray A numpy array of numpy arrays containing the values of the plot - val_sem: npt.NDArray + vals_sem: npt.NDArray A numpy array of numpy arrays containing the standard error of the values val_title: str The name of the value that we are plotting val_color: npt.NDArray Whether to color each point corresponding to this value + hide_extras: bool + Whether to hide the observed haps that have no causal hap match """ fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True) val_xlabel = params.dtype.names[0] @@ -302,8 +310,13 @@ def plot_params_simple( val_xlabel = "-log " + val_xlabel params = [j for j, arr in zip(params, vals) for i in arr] val_color = ["green" if j else "red" for i in val_color for j in i] + vals = np.concatenate(vals) + vals_sem = np.concatenate(vals_sem) # plot params against vals - ax.scatter(params, np.concatenate(vals), marker="o", color=val_color, s=5) + for v in zip(params, vals, vals_sem, val_color): + if hide_extras and v[3] != "green": + continue + ax.errorbar(v[0], v[1], yerr=v[2], marker="o", c=v[3], markersize=5) ax.set_ylabel(val_title, color="g") ax.set_xlabel(val_xlabel) return fig @@ -415,6 +428,20 @@ def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArra show_default="the first haplotype in the file", help="A haplotype ID from the causal .hap file", ) +@click.option( + "--hide-extras", + is_flag=True, + default=False, + show_default=True, + help="Hide the observed haps that have no causal hap match", +) +@click.option( + "--order", + type=str, + default=None, + show_default="The order of the wildcards in the observed_hap path", + help="The order of the parameters in the plot, as a comma-separated list", +) @click.option( "-o", "--output", @@ -439,6 +466,8 @@ def main( region: str = None, observed_id: str = None, causal_id: str = None, + hide_extras: bool = False, + order: str = None, output: Path = Path("/dev/stdout"), verbosity: str = "ERROR", ): @@ -463,6 +492,17 @@ def main( params = dict(glob_wildcards(observed_hap)._asdict()) # convert the dictionary to a numpy mixed dtype array dtypes = {k: DTYPES[k] for k in params.keys()} + # if the user specified an order, then we will sort the parameters by that order + if order is not None: + new_params = {} + for k in order.split(",")[::-1]: + if k not in dtypes: + log.warning(f"Parameter {k} not found in the observed hap path") + continue + dtypes = {k: dtypes.pop(k), **dtypes} + for k in dtypes: + new_params[k] = params.pop(k) + params = new_params params = np.array(list(zip(*params.values())), dtype=list(dtypes.items())) params.sort() @@ -499,14 +539,15 @@ def main( params, ld_vals, ld_sem, ld_extras_bool = group_by_rep( params, ld_vals, ld_extras_idxs, ld_extras_bool, ) + del dtypes["rep"] if len(dtypes) > 1 or metrics is not None: merged = params if metrics is not None: merged = np.lib.recfunctions.merge_arrays([params, metrics], flatten=True) - fig = plot_params(merged, ld_vals, ld_sem, "causal LD", ld_extras_bool) + fig = plot_params(merged, ld_vals, ld_sem, "causal LD", ld_extras_bool, hide_extras) elif len(dtypes) == 1: - fig = plot_params_simple(params, ld_vals, ld_sem, "causal LD", ld_extras_bool) + fig = 
plot_params_simple(params, ld_vals, ld_sem, "causal LD", ld_extras_bool, hide_extras) else: raise ValueError("No parameter values found") From 4f5b24b6cabefa175a095ee3e2dc874b252076b7 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:50:49 -0700 Subject: [PATCH 19/41] allow for leniency when estimating pvals in happler --- happler/tree/assoc_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/happler/tree/assoc_test.py b/happler/tree/assoc_test.py index f1e6f95..fa9c02d 100644 --- a/happler/tree/assoc_test.py +++ b/happler/tree/assoc_test.py @@ -146,7 +146,7 @@ def pval_as_decimal(t_stat: float, df: int, precision: int = 1000) -> Decimal: ------ ValueError This function is only valid when the t distribution is approximately - equivalent to the normal distribution. Thus, if df < 10000, we raise a + equivalent to the normal distribution. Thus, if df < 1000, we raise a ValueError to indicate that the function will not return an accurate value Returns @@ -154,8 +154,8 @@ def pval_as_decimal(t_stat: float, df: int, precision: int = 1000) -> Decimal: Decimal An approximate, higher precision p-value for the provided t statistic """ - if df < 10000: - raise ValueError("You need a larger sample size to approximate this p-value") + if df < 1000: + log.warning("You need a larger sample size to approximate this p-value") log10_pval = stats.norm.logsf(np.abs(t_stat)) / np.log(10) + np.log10(2) # set the desired precision getcontext().prec = precision From 46fe5ca8048a5359e098f7ac182b593b0aff3099 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 21:35:19 -0700 Subject: [PATCH 20/41] make thresh more lenient --- analysis/config/config-simulations.yml | 2 +- happler/__main__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml index a48ae14..d983344 100644 --- a/analysis/config/config-simulations.yml +++ b/analysis/config/config-simulations.yml @@ -78,7 +78,7 @@ min_maf: 0.05 sample_size: 777 # A threshold on the pvalues of the haplotypes -out_thresh: 5e-5 +out_thresh: 5e-3 # Whether to include the causal variant in the set of genotypes provided to the # finemapping methods. 
Set this to true if you're interested in seeing how the diff --git a/happler/__main__.py b/happler/__main__.py index 2ed1739..5748e97 100644 --- a/happler/__main__.py +++ b/happler/__main__.py @@ -369,7 +369,7 @@ def run( if len(haps.data) == 1: hap = next(iter(haps.data.values())) if hap.pval < -np.log10(out_thresh): - log.info("Ignoring haplotype with low pval") + log.info(f"Ignoring haplotype with low pval {hap.pval}") continue for hap in haps.data.values(): if len(hap.variants) <= 1 and remove_snps: From 15848901e0c8883ba7757b5c95634a0a0431a94b Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 21:36:20 -0700 Subject: [PATCH 21/41] split pheno for happler rules --- analysis/workflow/rules/happler.smk | 43 +++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index 11c5b24..f98f666 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -24,11 +24,36 @@ def parse_locus(locus): return chrom, start, end +rule sub_pheno: + """subset the phenotype file to include only the desired replicate""" + input: + pheno=config["pheno"], + params: + rep=lambda wildcards: int(wildcards.rep)+2, + output: + pheno=out + "/phen.pheno", + resources: + runtime=1, + threads: 1, + log: + logs + "/sub_pheno", + benchmark: + bench + "/sub_pheno", + conda: + "../envs/default.yml" + shell: + "cut -f 1,{params.rep} {input.pheno} | " + "(echo -e \"#IID\\thap\" && tail -n+2) >{output.pheno} 2>{log}" + + +pheno = rules.sub_pheno.output.pheno if "{rep}" in out else config["pheno"] + + rule run: """ execute happler! """ input: gts=config["snp_panel"], - pts=config["pheno"], + pts=pheno, covar=config["covar"], params: thresh=lambda wildcards: 0.05 if "alpha" not in wildcards else wildcards.alpha, @@ -106,7 +131,7 @@ rule cond_linreg: pvar=Path(config["snp_panel"]).with_suffix(".pvar"), psam=Path(config["snp_panel"]).with_suffix(".psam"), hap=rules.run.output.hap, - pts=config["pheno"], + pts=pheno, params: maf = check_config("min_maf", 0), region=lambda wildcards: wildcards.locus.replace("_", ":"), @@ -138,7 +163,7 @@ rule heatmap: pvar=Path(config["snp_panel"]).with_suffix(".pvar"), psam=Path(config["snp_panel"]).with_suffix(".psam"), hap=rules.run.output.hap, - pts=config["pheno"], + pts=pheno, params: region=lambda wildcards: wildcards.locus.replace("_", ":"), output: @@ -202,7 +227,7 @@ rule transform: pgen=config["snp_panel"], pvar=Path(config["snp_panel"]).with_suffix(".pvar"), psam=Path(config["snp_panel"]).with_suffix(".psam"), - pts=config["pheno"], + pts=pheno, params: region=lambda wildcards: wildcards.locus.replace("_", ":"), output: @@ -229,7 +254,7 @@ rule linreg: hap=rules.transform.output.pgen, hap_pvar=rules.transform.output.pvar, hap_psam=rules.transform.output.psam, - pts=config["pheno"], + pts=pheno, params: region=lambda wildcards: wildcards.locus.replace("_", ":"), output: @@ -339,7 +364,7 @@ rule finemapper: gt=lambda wildcards: finemapper_input(wildcards).pgen, gt_pvar=lambda wildcards: finemapper_input(wildcards).pvar, gt_psam=lambda wildcards: finemapper_input(wildcards).psam, - phen=config["pheno"], + phen=pheno, params: outdir=lambda wildcards, output: Path(output.susie).parent, exclude_causal="NULL", @@ -347,7 +372,7 @@ rule finemapper: # num_signals=lambda wildcards: get_num_haplotypes( # expand(rules.run.output.hap, **wildcards)[0] # ) + 1, - num_signals=5, + num_signals=10, # TODO: 
also add MAF filter output: susie=out + "/{ex}clude/susie.rds", @@ -433,7 +458,7 @@ rule results: """ input: gt=rules.finemapper.input.gt, - phen=config["pheno"], + phen=pheno, susie=rules.finemapper.output.susie, happler_hap=results_happler_hap_input, causal_gt=config["causal_gt"].pgen if "causal_gt" in config else [], @@ -491,7 +516,7 @@ rule gwas: pgen=(rules.merge_SVs if check_config("SVs") else rules.merge).output.pgen, pvar=(rules.merge_SVs if check_config("SVs") else rules.merge).output.pvar, psam=(rules.merge_SVs if check_config("SVs") else rules.merge).output.psam, - pts=config["pheno"], + pts=pheno, covar=config["covar"], params: maf = check_config("min_maf", 0), From 63bd60be7846353ab3e45e6de37bad93333efe37 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 23 Oct 2024 20:43:45 -0700 Subject: [PATCH 22/41] set up workflow for running on geuvadis again --- analysis/workflow/Snakefile | 1 + analysis/workflow/rules/happler.smk | 28 +++++++++++----------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile index 2747744..5a9672c 100644 --- a/analysis/workflow/Snakefile +++ b/analysis/workflow/Snakefile @@ -121,6 +121,7 @@ elif mode == "run": "random": None, "covar": covar_file, "min_maf": check_config("min_maf", 0), + "out_thresh": check_config("out_thresh", 5e-8), } if check_config("SVs", place=config["modes"][mode]): happler_config["SVs"] = config["modes"][mode]["SVs"] diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index f98f666..6bd4b65 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -33,7 +33,7 @@ rule sub_pheno: output: pheno=out + "/phen.pheno", resources: - runtime=1, + runtime=5, threads: 1, log: logs + "/sub_pheno", @@ -63,8 +63,8 @@ rule run: indep=lambda wildcards: 0.05 if "indep_alpha" not in wildcards else wildcards.indep_alpha, max_signals=3, max_iterations=3, - rep=lambda wildcards: int(wildcards.rep), out_thresh=check_config("out_thresh", 5e-8), + keep_SNPs="--remove-SNPs " if "{rep}" in out else "", output: hap=out + "/happler.hap", gz=out + "/happler.hap.gz", @@ -72,15 +72,12 @@ rule run: dot=out + "/happler.dot", resources: runtime=lambda wildcards, input: ( - Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 0.5454649806119475 + 0.6935132147046765 - if Path(input.gts).suffix == ".pgen" and check_config("dynamic_resources") else 45 + max(45, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 0.5454649806119475 + 0.6935132147046765) ), # slurm_partition="hotel", # slurm_extra="--qos=hotel", - # mem_mb=lambda wildcards, threads: threads*4.57, mem_mb=lambda wildcards, input: ( - Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 14.90840694595845 + 401.8011129664152 - if Path(input.gts).suffix == ".pgen" and check_config("dynamic_resources") else 5000 + max(5000, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 14.90840694595845 + 401.8011129664152) ), threads: 1 log: @@ -92,7 +89,7 @@ rule run: shell: "happler run -o {output.hap} --verbosity DEBUG --maf {params.maf} " "--max-signals {params.max_signals} --max-iterations {params.max_iterations} " - "--discard-multiallelic --region {params.region} --pheno {params.rep} " + "--discard-multiallelic --region {params.region} {params.keep_SNPs}" "{params.covar}--indep-thresh {params.indep} -t {params.thresh} " "--out-thresh {params.out_thresh} --show-tree {input.gts} {input.pts} 
&>{log}" " && haptools index -o {output.gz} {output.hap} &>>{log}" @@ -139,9 +136,9 @@ rule cond_linreg: png=out + "/cond_linreg.pdf", resources: runtime=lambda wildcards, input: ( - 1.5 * Path(input.pvar).stat().st_size/1000 * ( + max(50, 1.5 * Path(input.pvar).stat().st_size/1000 * ( 0.2400978997329614 + get_num_variants(input.hap) * 0.045194464826048095 - ) if Path(input.pgen).suffix == ".pgen" and check_config("dynamic_resources") else 50 + )) ), log: logs + "/cond_linreg", @@ -335,8 +332,7 @@ rule merge: psam=temp(out + "/{ex}clude/merged.psam"), resources: runtime=lambda wildcards, input: ( - Path(input.gts_pvar).stat().st_size/1000 * 0.05652315368728583 + 2.0888654705656844 - if Path(input.gts).suffix == ".pgen" and check_config("dynamic_resources") else 10 + max(10, Path(input.gts_pvar).stat().st_size/1000 * 0.05652315368728583 + 2.0888654705656844) ), log: logs + "/{ex}clude/merge", @@ -377,11 +373,9 @@ rule finemapper: output: susie=out + "/{ex}clude/susie.rds", resources: -# runtime=lambda wildcards, input: ( -# Path(input.gt_pvar).stat().st_size/1000 * 0.08 -# if Path(input.gt).suffix == ".pgen" and check_config("dynamic_resources") else 75 -# ), - runtime=120, + runtime=lambda wildcards, input: ( + max(120, Path(input.gt_pvar).stat().st_size/1000 * 0.08) + ), mem_mb = 7000, threads: 1, log: From eeb4659b9c276dfb04783c9ec87a18076a966122 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 23 Oct 2024 20:44:41 -0700 Subject: [PATCH 23/41] switch to geuvadis config --- analysis/config/config-geuvadis.yml | 3 +++ analysis/config/config.yml | 2 +- happler/tree/assoc_test.py | 1 - happler/tree/terminator.py | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/analysis/config/config-geuvadis.yml b/analysis/config/config-geuvadis.yml index b5f23d9..6089e55 100644 --- a/analysis/config/config-geuvadis.yml +++ b/analysis/config/config-geuvadis.yml @@ -79,6 +79,9 @@ min_maf: 0.1 # sample_size: [500, 1000, 1500, 2000, 2500] # sample_size: 777 +# A threshold on the pvalues of the haplotypes +out_thresh: 5e-6 + # Whether to include the causal variant in the set of genotypes provided to the # finemapping methods. Set this to true if you're interested in seeing how the # methods perform when the causal variant is absent from the data. 
diff --git a/analysis/config/config.yml b/analysis/config/config.yml index 51c999d..c7dda66 120000 --- a/analysis/config/config.yml +++ b/analysis/config/config.yml @@ -1 +1 @@ -config-simulations.yml \ No newline at end of file +config-geuvadis.yml \ No newline at end of file diff --git a/happler/tree/assoc_test.py b/happler/tree/assoc_test.py index fa9c02d..f17f202 100644 --- a/happler/tree/assoc_test.py +++ b/happler/tree/assoc_test.py @@ -7,7 +7,6 @@ from scipy import stats import numpy.typing as npt import statsmodels.api as sm -from scipy.stats import t as t_dist # We declare this class to be a dataclass to automatically define __init__ and a few diff --git a/happler/tree/terminator.py b/happler/tree/terminator.py index 13daee3..105ce2a 100644 --- a/happler/tree/terminator.py +++ b/happler/tree/terminator.py @@ -132,7 +132,7 @@ def compute_val( # terminate if the effect sizes have gone in the opposite direction self.log.debug("Terminated b/c effect size did not improve") return True - # perform a two tailed, two-sample t-test using the difference of the effect sizes + # perform a one tailed, two-sample t-test using the difference of the effect sizes # first, we compute the standard error of the difference of the effect sizes std_err = np.sqrt( ((results.data["stderr"] ** 2) + (parent_res.stderr**2)) / 2 From 1dbe559733493114831ae0588f2f51a471b4b29a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 23 Oct 2024 20:45:34 -0700 Subject: [PATCH 24/41] enable pickling as a parameter plot option --- analysis/workflow/scripts/parameter_plot.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/analysis/workflow/scripts/parameter_plot.py b/analysis/workflow/scripts/parameter_plot.py index af1db31..58a18aa 100755 --- a/analysis/workflow/scripts/parameter_plot.py +++ b/analysis/workflow/scripts/parameter_plot.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +import pickle from pathlib import Path from logging import Logger @@ -303,7 +304,7 @@ def plot_params_simple( hide_extras: bool Whether to hide the observed haps that have no causal hap match """ - fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True) + fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(3.5, 3)) val_xlabel = params.dtype.names[0] if val_xlabel in LOG_SCALE: params = -np.log10(params) @@ -319,6 +320,7 @@ def plot_params_simple( ax.errorbar(v[0], v[1], yerr=v[2], marker="o", c=v[3], markersize=5) ax.set_ylabel(val_title, color="g") ax.set_xlabel(val_xlabel) + ax.set_ylim(0, 1.03) return fig @@ -435,6 +437,13 @@ def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArra show_default=True, help="Hide the observed haps that have no causal hap match", ) +@click.option( + "--pickle-out", + is_flag=True, + default=False, + show_default=True, + help="Save the output as a pickle file as well", +) @click.option( "--order", type=str, @@ -467,6 +476,7 @@ def main( observed_id: str = None, causal_id: str = None, hide_extras: bool = False, + pickle_out: bool = False, order: str = None, output: Path = Path("/dev/stdout"), verbosity: str = "ERROR", @@ -541,6 +551,10 @@ def main( ) del dtypes["rep"] + if pickle_out: + with open(output.with_suffix(".pickle"), "wb") as f: + pickle.dump([params, ld_vals, ld_sem, ld_extras_bool], f) + if len(dtypes) > 1 or metrics is not None: merged = params if metrics is not None: From 4383e3f7f97dbe9201408651b318023e4c469dee Mon Sep 17 00:00:00 2001 From: Arya Massarat 
<23412689+aryarm@users.noreply.github.com> Date: Fri, 25 Oct 2024 06:43:34 -0700 Subject: [PATCH 25/41] start to try to handle multiple haps in summarize_results.R --- analysis/workflow/scripts/summarize_results.R | 42 ++++++------------- analysis/workflow/scripts/susie_metrics.R | 1 + 2 files changed, 13 insertions(+), 30 deletions(-) diff --git a/analysis/workflow/scripts/summarize_results.R b/analysis/workflow/scripts/summarize_results.R index b31cdf9..5a92017 100755 --- a/analysis/workflow/scripts/summarize_results.R +++ b/analysis/workflow/scripts/summarize_results.R @@ -28,9 +28,10 @@ X = readPGEN(snakemake@input[["gt"]], region=region, samples=samples[,2]) # also load the positions of each of the variants pos = readPVAR(snakemake@input[["gt"]], region=region) if (length(snakemake@input[["causal_gt"]])) { + write("Loading causal genotype data too", stderr()) causal_gt_file = snakemake@input[["causal_gt"]] causal_gt_samples = readPSAM(causal_gt_file, samples=readPheno(phen)[,1]) - causal_gt = readPGEN(causal_gt_file, region=region, samples=causal_gt_samples) + causal_gt = readPGEN(causal_gt_file, region=region, samples=causal_gt_samples[,2]) X = cbind(X, causal_gt) causal_variant = colnames(causal_gt)[1] # also load the positions of each of the variants @@ -52,43 +53,24 @@ hap = FALSE causal_hap_is_provided = FALSE if (length(snakemake@input[["happler_hap"]]) > 0) { hap = TRUE - happler_haplotype = file(snakemake@input[["happler_hap"]], "rt") + happler_haplotype = snakemake@input[["happler_hap"]] } if (length(snakemake@params[["causal_hap"]]) > 0) { causal_hap_is_provided = TRUE - causal_haplotype = file(snakemake@params[["causal_hap"]], "rt") + causal_haplotype = snakemake@params[["causal_hap"]] } if (hap) { write("Parsing happler hap file", stderr()) - while(TRUE) { - line = readLines(happler_haplotype, 1) - # we assume the first line in the hap file that begins with H denotes the haplotype - if (grepl("^H\\t", line)) break - } - # extract the start, end, and ID of the haplotype - happler_hap = as.integer(strsplit(line, "\t")[[1]][c(3,4)]) - happler_hap_id = strsplit(line, "\t")[[1]][c(5)] + happler_hap = readHap(happler_haplotype) } if (causal_hap_is_provided) { write("Parsing causal hap file", stderr()) - while(TRUE) { - line = readLines(causal_haplotype, 1) - # we assume the first line in the hap file that begins with H denotes the haplotype - if (grepl("^H\\t", line)) break - } - # extract the start, end, and ID of the haplotype - causal_hap = as.integer(strsplit(line, "\t")[[1]][c(3,4)]) - causal_hap_id = strsplit(line, "\t")[[1]][c(5)] - stopifnot(causal_hap_id == causal_variant) - haplotypes = t(data.frame( - causal_hap = c("start"=causal_hap[1], "end"=causal_hap[2], "id"=causal_hap_id), - happler_hap = c("start"=happler_hap[1], "end"=happler_hap[2], "id"=happler_hap_id) - )) -} else { - haplotypes = t(data.frame( - happler_hap = c("start"=happler_hap[1], "end"=happler_hap[2], "id"=happler_hap_id) - )) + causal_hap = readHap(causal_haplotype) + stopifnot(causal_variant %in% causal_hap["id"]) + haplotypes = cbind(causal_hap, happler_hap) +} else if (hap) { + haplotypes = happler_hap } write("Formatting genotypes and creating output dir", stderr()) @@ -253,7 +235,7 @@ if (hap) { } else { pip_plot(susie_pip, X, b, pos, susie_cs=names(susie_pip[fitted$sets$cs[['L1']]])) } -ggsave(paste0(out,'/susie.pdf'), width=10, height=5, device='pdf') +ggsave(paste0(out,'/susie.pdf'), width=6, height=5, device='pdf') dev.off() if (!is.null(finemap_results)) { @@ -263,6 +245,6 @@ if 
(!is.null(finemap_results)) { } else { pip_plot(finemap_pip, X, b, pos) } - ggsave(paste0(out,'/finemap.pdf'), width=10, height=5, device='pdf') + ggsave(paste0(out,'/finemap.pdf'), width=6, height=5, device='pdf') while (!is.null(dev.list())) dev.off() } diff --git a/analysis/workflow/scripts/susie_metrics.R b/analysis/workflow/scripts/susie_metrics.R index 3d1733e..f3fd402 100755 --- a/analysis/workflow/scripts/susie_metrics.R +++ b/analysis/workflow/scripts/susie_metrics.R @@ -42,6 +42,7 @@ for (happler_hap_file_idx in 1:nrow(happler_hap)) { # 6) What is the length of the credible set? obs_pip = susie_pip[happler_hap_id] has_highest_pip = as.integer(sum(obs_pip < susie_pip) == 0) + # TODO: fix this so that it also excludes other haplotypes besides obs_pip best_variant_pip = max(susie_pip[names(susie_pip) != names(obs_pip)]) credible_set = NULL for (cs in names(fitted$sets$cs)) { From 23735cfe6deb72dcae756a2517652573127b03d9 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 25 Oct 2024 06:44:27 -0700 Subject: [PATCH 26/41] enable computing mean and stderr of finemapping metrics in parameter_plot --- analysis/workflow/scripts/parameter_plot.py | 221 +++++++++++++++++--- 1 file changed, 187 insertions(+), 34 deletions(-) diff --git a/analysis/workflow/scripts/parameter_plot.py b/analysis/workflow/scripts/parameter_plot.py index 58a18aa..107482d 100755 --- a/analysis/workflow/scripts/parameter_plot.py +++ b/analysis/workflow/scripts/parameter_plot.py @@ -179,17 +179,59 @@ def get_finemap_metrics(metrics_path: Path, log: Logger = None): # 3) What is the best PIP among the variants? # 4) Is the observed hap in a credible set? # 5) What is the purity of the credible set? - return np.loadtxt( - fname=metrics_path, - delimiter=" ", - dtype=[ - ("pip", np.float64), - ("has_highest_pip", np.bool_), - ("best_variant_pip", np.float64), - ("in_credible_set", np.bool_), - ("cs_purity", np.float64), - ], + # 6) What is the length of the credible set? 
+    # If you add or remove any metrics here, make sure to also change the
+    # "num_expected_vals" in get_metrics_mean_std so it knows the number of metrics
+    dtype = [
+        ("pip", np.float64),
+        ("has_highest_pip", np.bool_),
+        ("best_variant_pip", np.float64),
+        ("in_credible_set", np.bool_),
+        ("cs_purity", np.float64),
+        ("cs_length", np.float64)
+    ]
+    null_val = np.array([
+        (np.nan, False, np.nan, False, np.nan, np.nan),
+    ], dtype=dtype)
+    # if there are no observed haplotypes, then we can't compute LD
+    if not metrics_path.exists():
+        return null_val
+    metrics = np.loadtxt(fname=metrics_path, delimiter=" ", dtype=dtype)
+    if not metrics.shape:
+        log.debug(f"No metrics found in the metrics file {metrics_path}")
+        return null_val
+    return metrics
+
+
+def get_metrics_mean_std(metrics_vals: npt.NDArray, num_expected_vals: int = 6):
+    """
+    Compute the mean and standard error of the metrics
+
+    Parameters
+    ----------
+    metrics_vals: npt.NDArray
+        A numpy array containing the metrics for each observed hap
+    num_expected_vals: int, optional
+        The number of expected metrics
+
+    Returns
+    -------
+    npt.NDArray
+        A numpy array containing the mean of the metrics
+    npt.NDArray
+        A numpy array containing the standard error of the metrics
+    """
+    if len(metrics_vals) == 0:
+        return np.array([np.nan,]*num_expected_vals), np.array([np.nan,]*num_expected_vals)
+    nan_vals = np.isnan(metrics_vals["pip"])
+    metrics_vals = metrics_vals[~nan_vals]
+    mean_vals = np.array(
+        [np.nanmean(metrics_vals[field]) for field in metrics_vals.dtype.names],
+    )
+    sem_vals = np.array(
+        [sem(metrics_vals[field], nan_policy="omit") for field in metrics_vals.dtype.names],
    )
+    return mean_vals, sem_vals


def count_shared(observed, causal, log, observed_id: str = None, causal_id: str = None):
@@ -219,6 +261,7 @@ def plot_params(
    vals_sem: npt.NDArray,
    val_title: str,
    val_color: npt.NDArray,
+    metrics: dict = None,
    hide_extras: bool = False,
):
    """
@@ -236,16 +279,22 @@ def plot_params(
        The name of the value that we are plotting
    val_color: npt.NDArray
        Whether to color each point corresponding to this value
-    hide_extras: bool
+    metrics: dict, optional
+        A dict mapping fine-mapping metrics to tuples of two lists of numpy arrays
+        containing means and standard errors of fine-mapping metrics for each observed hap
+    hide_extras: bool, optional
        Whether to hide the observed haps that have no causal hap match
    """
    figsize = matplotlib.rcParams["figure.figsize"]
    if len(params) > 75:
        figsize[0] = len(params) / 15
-    if len(params.dtype) > 5:
-        figsize[1] = len(params.dtype) * 1.25
+    num_rows = len(params.dtype)
+    if metrics is not None:
+        num_rows = len(params.dtype) + len(metrics)
+    if num_rows > 5:
+        figsize[1] = num_rows * 1.25
    fig, axs = plt.subplots(
-        nrows=len(params.dtype)+1, ncols=1,
+        nrows=num_rows+1, ncols=1,
        sharex=True, figsize=figsize,
    )
    fig.subplots_adjust(hspace=0)
@@ -275,6 +324,21 @@ def plot_params(
        axs[idx+1].plot(params[param], "-")
        axs[idx+1].set_ylabel(val_title, rotation="horizontal", ha="right")
        axs[idx+1].grid(axis="x")
+    if metrics is not None:
+        for idx, metric in enumerate(metrics.keys()):
+            curr_ax = axs[idx+len(params.dtype)+1]
+            val_title = metric
+            vals = np.concatenate(metrics[metric][0])
+            if len(np.unique(vals)) == 1:
+                curr_ax.remove()
+                continue
+            vals_sem = np.concatenate(metrics[metric][1])
+            for v in zip(x_vals, vals, vals_sem, val_color):
+                if hide_extras and v[3] != "green":
+                    continue
+                curr_ax.errorbar(v[0], v[1], yerr=v[2], marker="o", c=v[3], markersize=3)
+            curr_ax.set_ylabel(val_title, rotation="horizontal", 
ha="right") + curr_ax.grid(axis="x") return fig @@ -301,7 +365,7 @@ def plot_params_simple( The name of the value that we are plotting val_color: npt.NDArray Whether to color each point corresponding to this value - hide_extras: bool + hide_extras: bool, optional Whether to hide the observed haps that have no causal hap match """ fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(3.5, 3)) @@ -324,7 +388,13 @@ def plot_params_simple( return fig -def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArray, bools: npt.NDArray): +def group_by_rep( + params: npt.NDArray, + vals: npt.NDArray, + causal_idxs: npt.NDArray, + bools: npt.NDArray, + metrics: npt.NDArray = None, + ): """ Group replicates with identical parameter values @@ -340,6 +410,8 @@ def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArra The index of the causal haplotype that each observed haplotype belongs to bools: npt.NDArray An array of boolean values indicating whether each observed haplotype has a match + metrics: npt.NDArray + A numpy array of numpy arrays containing the metrics for each observed hap Returns ------- @@ -350,18 +422,25 @@ def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArra """ other_param_names = [name for name in params.dtype.names if name != "rep"] grouped_params = np.unique(params[other_param_names]) - get_mean_std = lambda x: (np.mean(x), sem(x) if len(x) > 1 else np.nan) + get_mean_std = lambda x: (np.nanmean(x), sem(x, nan_policy="omit") if len(x) > 1 else np.nan) filter_causal_idxs = lambda x: np.unique(x[x != np.iinfo(np.uint8).max]) grouped_vals = [] grouped_sem = [] grouped_bools = [] + grouped_metrics = [] + grouped_metrics_sem = [] for group in grouped_params: subgrouped_vals = [] subgrouped_sem = [] subgrouped_bools = [] + subgrouped_metrics = [] + subgrouped_metrics_sem = [] curr_vals = vals[params[other_param_names] == group] curr_causal_idxs = causal_idxs[params[other_param_names] == group] curr_bools = bools[params[other_param_names] == group] + curr_metrics = None + if metrics is not None: + curr_metrics = metrics[params[other_param_names] == group] # compute mean and std err for each causal match for causal_idx in filter_causal_idxs(np.concatenate(curr_causal_idxs)): try: @@ -375,6 +454,14 @@ def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArra subgrouped_vals.append(val_mean) subgrouped_sem.append(val_sem) subgrouped_bools.append(True) + if curr_metrics is not None: + curr_metrics_causal_idxs = np.concatenate([ + val[(indices == causal_idx) & curr_bool] + for val, indices, curr_bool in zip(curr_metrics, curr_causal_idxs, curr_bools) + ]) + metrics_mean, metrics_sem = get_metrics_mean_std(curr_metrics_causal_idxs) + subgrouped_metrics.append(metrics_mean) + subgrouped_metrics_sem.append(metrics_sem) # compute mean and std err for each unmatched observed hap for unmatched_idx in range(max([(~i).sum() for i in curr_bools])): try: @@ -389,9 +476,23 @@ def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArra subgrouped_vals.append(val_mean) subgrouped_sem.append(val_sem) subgrouped_bools.append(False) + if curr_metrics is not None: + curr_metrics_unmatched_idxs = np.array([ + val[(indices == causal_idx) & ~curr_bool][unmatched_idx] + for val, indices, curr_bool in zip(curr_metrics, curr_causal_idxs, curr_bools) + if unmatched_idx < ((indices == causal_idx) & ~curr_bool).sum() + ]) + metrics_mean, metrics_sem = 
get_metrics_mean_std(curr_metrics_unmatched_idxs)
+                subgrouped_metrics.append(metrics_mean)
+                subgrouped_metrics_sem.append(metrics_sem)
        grouped_vals.append(np.array(subgrouped_vals, dtype=object))
        grouped_sem.append(np.array(subgrouped_sem, dtype=object))
        grouped_bools.append(np.array(subgrouped_bools, dtype=object))
+        if curr_metrics is not None:
+            grouped_metrics.append(np.array(subgrouped_metrics, dtype=object))
+            grouped_metrics_sem.append(np.array(subgrouped_metrics_sem, dtype=object))
+    if curr_metrics is not None:
+        return grouped_params, grouped_vals, grouped_sem, grouped_bools, grouped_metrics, grouped_metrics_sem
    return grouped_params, grouped_vals, grouped_sem, grouped_bools
@@ -407,6 +508,13 @@ def group_by_rep(
    show_default=True,
    help="Plot fine-mapping metrics, as well",
)
+@click.option(
+    "--use-metric",
+    type=str,
+    default=None,
+    show_default="observed LD",
+    help="Use this fine-mapping metric to color the plot",
+)
@click.option(
    "--region",
    type=str,
@@ -472,6 +580,7 @@ def main(
    observed_hap: Path,
    causal_hap: Path,
    metrics: Path,
+    use_metric: str = None,
    region: str = None,
    observed_id: str = None,
    causal_id: str = None,
@@ -543,30 +652,74 @@ def main(
                log=log
            ) for idx in range(len(params))
-        ])
-
-    # TODO: also handle the metrics in group_by_rep()
-    params, ld_vals, ld_sem, ld_extras_bool = group_by_rep(
-        params, ld_vals, ld_extras_idxs, ld_extras_bool,
-    )
+        ], dtype=object)
+        params, ld_vals, ld_sem, ld_extras_bool, metrics_vals, metrics_vals_sem = group_by_rep(
+            params, ld_vals, ld_extras_idxs, ld_extras_bool, metrics,
+        )
+        extract_metrics_vals = lambda x, idx: np.array([i[:, idx] for i in x], dtype=object)
+        metrics = {
+            name: (
+                extract_metrics_vals(metrics_vals, idx),
+                extract_metrics_vals(metrics_vals_sem, idx)
+            ) for idx, name in enumerate(metrics[0].dtype.names)
+        }
+    else:
+        params, ld_vals, ld_sem, ld_extras_bool = group_by_rep(
+            params, ld_vals, ld_extras_idxs, ld_extras_bool,
+        )

    del dtypes["rep"]

    if pickle_out:
        with open(output.with_suffix(".pickle"), "wb") as f:
-            pickle.dump([params, ld_vals, ld_sem, ld_extras_bool], f)
-
-    if len(dtypes) > 1 or metrics is not None:
-        merged = params
-        if metrics is not None:
-            merged = np.lib.recfunctions.merge_arrays([params, metrics], flatten=True)
-        fig = plot_params(merged, ld_vals, ld_sem, "causal LD", ld_extras_bool, hide_extras)
-    elif len(dtypes) == 1:
-        fig = plot_params_simple(params, ld_vals, ld_sem, "causal LD", ld_extras_bool, hide_extras)
+            if metrics is not None:
+                pickle.dump([params, ld_vals, ld_sem, ld_extras_bool, metrics], f)
+            else:
+                pickle.dump([params, ld_vals, ld_sem, ld_extras_bool], f)
+
+    if use_metric is None:
+        if len(dtypes) > 1:
+            fig = plot_params(
+                params,
+                ld_vals,
+                ld_sem,
+                "Observed LD",
+                ld_extras_bool,
+                metrics=metrics,
+                hide_extras=hide_extras
+            )
+        elif len(dtypes) == 1:
+            fig = plot_params_simple(
+                params,
+                ld_vals,
+                ld_sem,
+                "Observed LD",
+                ld_extras_bool,
+                hide_extras=hide_extras
+            )
    else:
-        raise ValueError("No parameter values found")
+        if len(dtypes) > 1:
+            fig = plot_params(
+                params,
+                metrics[use_metric][0],
+                metrics[use_metric][1],
+                use_metric,
+                ld_extras_bool,
+                hide_extras=hide_extras
+            )
+        elif len(dtypes) == 1:
+            fig = plot_params_simple(
+                params,
+                metrics[use_metric][0],
+                metrics[use_metric][1],
+                use_metric,
+                ld_extras_bool,
+                hide_extras=hide_extras
+            )

    fig.tight_layout()
-    fig.savefig(output, bbox_inches="tight", pad_inches=0.05)
+    
fig.savefig(output, + bbox_inches="tight", pad_inches=0.05) if __name__ == "__main__": main() From b9a1c22fda8914042fd188ccb3fcc9b898dee3da Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 25 Oct 2024 06:46:32 -0700 Subject: [PATCH 27/41] adjust the amount of time for the multiline rule --- analysis/workflow/Snakefile | 2 +- analysis/workflow/rules/genotypes.smk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile index 5a9672c..1cefc93 100644 --- a/analysis/workflow/Snakefile +++ b/analysis/workflow/Snakefile @@ -196,7 +196,7 @@ elif mode == "run": output: hps = orig_out + "/multiline.tsv", resources: - runtime=3, + runtime=10, log: orig_out + "/logs/multiline", benchmark: diff --git a/analysis/workflow/rules/genotypes.smk b/analysis/workflow/rules/genotypes.smk index 8520d46..bfbc555 100644 --- a/analysis/workflow/rules/genotypes.smk +++ b/analysis/workflow/rules/genotypes.smk @@ -163,7 +163,7 @@ rule vcf2plink: rule subset_str: - """ subset samples from a STR VCF """ + """ subset samples from an STR VCF """ input: vcf=lambda wildcards: expand(config["str_panel"], chr=parse_locus(wildcards.locus)[0])[0], vcf_idx=lambda wildcards: expand(config["str_panel"] + ".tbi", chr=parse_locus(wildcards.locus)[0]), From e1f01c3a5bd105987a6c0081dad43d2937997217 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 25 Oct 2024 06:47:05 -0700 Subject: [PATCH 28/41] start on STR analysis --- analysis/config/config-simulations.STR.yml | 89 ++++++++++++++++++++++ analysis/config/config-simulations.yml | 1 + 2 files changed, 90 insertions(+) create mode 100644 analysis/config/config-simulations.STR.yml diff --git a/analysis/config/config-simulations.STR.yml b/analysis/config/config-simulations.STR.yml new file mode 100644 index 0000000..e346e6d --- /dev/null +++ b/analysis/config/config-simulations.STR.yml @@ -0,0 +1,89 @@ +# This is the Snakemake configuration file that specifies paths and +# and options for the pipeline. Anybody wishing to use +# the provided snakemake pipeline should first fill out this file with paths to +# their own data, as the Snakefile requires it. +# Every config option has reasonable defaults unless it is labeled as "required." +# All paths are relative to the directory that Snakemake is executed in. +# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/) + + +# Paths to a SNP-STR haplotype reference panel +# You can download this from http://gymreklab.com/2018/03/05/snpstr_imputation.html +# If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}" +# The VCF(s) must be sorted and indexed (with a .tbi file in the same directory) +# required! 
+snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen" +# str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz" +str_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.STRs.pgen" + +# Path to a list of samples to exclude from the analysis +# There should be one sample ID per line +# exclude_samples: data/ukb_random_samples_exclude.tsv + +# If SNPs are unphased, provide the path to a SHAPEIT4 map file like these: +# https://github.com/odelaneau/shapeit4/tree/master/maps +# The map file should use the same reference genome as the reference panel VCFs +# phase_map: data/genetic_maps/chr{chr}.b37.gmap.gz + +# A "locus" is a string with a contig name, a colon, the start position, a dash, and +# the end position or a BED file with a ".bed" file ending +# There are different simulation modes that you can use: +# 1. "str" - a tandem repeat is a string with a contig name, a colon, and the start position +# 2. "snp" - a SNP follows the same format as "str" +# 3. "hap" - a haplotype +# 4. "ld_range" - creates random two-SNP haplotypes with a range of LD values between the alleles of each haplotype +# 5. "run" - execute happler on a locus without simulating anything +# The STR and SNP positions should be contained within the locus. +# The positions should be provided in the same coordinate system as the reference +# genome of the reference panel VCFs +# The contig should correspond with the contig name from the {chr} wildcard in the VCF +# required! and unless otherwise noted, all attributes of each mode are required +locus: 19:45401409-46401409 # center: 45901409 (APOe4) +modes: + str: + pos: 19:45903859 + snp: + pos: 19:45910672 # rs1046282 + hap: + alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] + ld_range: + reps: 10 + min_ld: 0 + max_ld: 1 + step: 0.1 + min_af: 0.25 + max_af: 0.75 + # beta: [0.35] + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] + alpha: [0.05] + random: false # whether to also produce random haplotypes + num_haps: [1, 2, 3, 4] + run: + pheno: data/geuvadis/phenos/{trait}.pheno + SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen + # pheno_matrix: data/geuvadis/EUR_converted_expr_hg38.csv # optional +mode: str + +# Covariates to use if they're needed +# Otherwise, they're assumed to be regressed out of the phenotypes +# Note: the finemapping methods won't be able to handle these covariates +# covar: data/geuvadis/5PCs_sex.covar + +# Discard rare variants with a MAF below this number +# Defaults to 0 if not specified +min_maf: 0.05 + +# Sample sizes to use +# sample_size: [777, 800, 1600, 2400, 3200] +sample_size: 777 + +# A threshold on the pvalues of the haplotypes +out_thresh: 5e-3 + +# Whether to include the causal variant in the set of genotypes provided to the +# finemapping methods. Set this to true if you're interested in seeing how the +# methods perform when the causal variant is absent from the data. +# Defaults to false if not specified +# You can also provide a list of booleans, if you want to test both values +exclude_causal: [true, false] diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml index d983344..156c1fe 100644 --- a/analysis/config/config-simulations.yml +++ b/analysis/config/config-simulations.yml @@ -14,6 +14,7 @@ # required! 
snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen" # str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz" +# str_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.pgen" # Path to a list of samples to exclude from the analysis # There should be one sample ID per line From 321ced6e0ba07cfb50291cc00c734dc40cb49a21 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 25 Oct 2024 06:47:29 -0700 Subject: [PATCH 29/41] revert to min for now. need to come up with a better way of doing this eventually --- analysis/workflow/rules/happler.smk | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index 6bd4b65..f185191 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -64,7 +64,7 @@ rule run: max_signals=3, max_iterations=3, out_thresh=check_config("out_thresh", 5e-8), - keep_SNPs="--remove-SNPs " if "{rep}" in out else "", + keep_SNPs="--remove-SNPs " if not ("{rep}" in out) else "", output: hap=out + "/happler.hap", gz=out + "/happler.hap.gz", @@ -72,12 +72,12 @@ rule run: dot=out + "/happler.dot", resources: runtime=lambda wildcards, input: ( - max(45, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 0.5454649806119475 + 0.6935132147046765) + min(45, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 0.5454649806119475 + 0.6935132147046765) ), # slurm_partition="hotel", # slurm_extra="--qos=hotel", mem_mb=lambda wildcards, input: ( - max(5000, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 14.90840694595845 + 401.8011129664152) + min(5000, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 14.90840694595845 + 401.8011129664152) ), threads: 1 log: @@ -136,7 +136,7 @@ rule cond_linreg: png=out + "/cond_linreg.pdf", resources: runtime=lambda wildcards, input: ( - max(50, 1.5 * Path(input.pvar).stat().st_size/1000 * ( + min(50, 1.5 * Path(input.pvar).stat().st_size/1000 * ( 0.2400978997329614 + get_num_variants(input.hap) * 0.045194464826048095 )) ), @@ -280,7 +280,7 @@ rule sv_ld: sv_pvar=lambda wildcards: Path(config["SVs"]).with_suffix(".pvar"), sv_psam=lambda wildcards: Path(config["SVs"]).with_suffix(".psam"), params: - start=lambda wildcards: max(0, int(parse_locus(wildcards.locus)[1])-1000000), + start=lambda wildcards: min(0, int(parse_locus(wildcards.locus)[1])-1000000), end=lambda wildcards: int(parse_locus(wildcards.locus)[2])+1000000, chrom=lambda wildcards: parse_locus(wildcards.locus)[0], maf = check_config("min_maf", 0), @@ -332,7 +332,7 @@ rule merge: psam=temp(out + "/{ex}clude/merged.psam"), resources: runtime=lambda wildcards, input: ( - max(10, Path(input.gts_pvar).stat().st_size/1000 * 0.05652315368728583 + 2.0888654705656844) + min(10, Path(input.gts_pvar).stat().st_size/1000 * 0.05652315368728583 + 2.0888654705656844) ), log: logs + "/{ex}clude/merge", @@ -374,7 +374,7 @@ rule finemapper: susie=out + "/{ex}clude/susie.rds", resources: runtime=lambda wildcards, input: ( - max(120, Path(input.gt_pvar).stat().st_size/1000 * 0.08) + min(120, Path(input.gt_pvar).stat().st_size/1000 * 0.08) ), mem_mb = 7000, threads: 1, @@ -455,8 +455,8 @@ rule results: phen=pheno, susie=rules.finemapper.output.susie, happler_hap=results_happler_hap_input, - causal_gt=config["causal_gt"].pgen if "causal_gt" in config else [], - causal_hap=results_causal_hap_input, + # 
causal_gt=config["causal_gt"].pgen if "causal_gt" in config else [], + # causal_hap=results_causal_hap_input, params: outdir=lambda wildcards, output: Path(output.susie_pdf).parent, region=lambda wildcards: wildcards.locus.replace("_", ":"), From f0b5b38a0355a72e40d4287033618004f03245ea Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 30 Oct 2024 13:31:35 -0700 Subject: [PATCH 30/41] allow for pickled output for explained_variances.png --- analysis/workflow/scripts/variance_explained_plot.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/analysis/workflow/scripts/variance_explained_plot.py b/analysis/workflow/scripts/variance_explained_plot.py index 451536a..57fbf95 100755 --- a/analysis/workflow/scripts/variance_explained_plot.py +++ b/analysis/workflow/scripts/variance_explained_plot.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +import pickle from pathlib import Path from logging import Logger @@ -300,6 +301,9 @@ def main( f, (ax1, ax2) = plt.subplots(1, 2) + with open(output.with_suffix(".pickle"), "wb") as picklef: + pickle.dump((explained_variances, rsquareds), picklef) + ax1.scatter(explained_variances[:, 1], explained_variances[:, 0]) ax1.axline([0, 0], [max_ev_val, max_ev_val]) ax1.set_title("Variance Explained") From 03f9eef34e5690fd5f927184902df8660a99632f Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 22 Nov 2024 12:51:26 -0800 Subject: [PATCH 31/41] upgrade to newest susie --- analysis/workflow/envs/susie.post-deploy.sh | 4 ---- analysis/workflow/envs/susie.yml | 20 +++++++++++--------- 2 files changed, 11 insertions(+), 13 deletions(-) delete mode 100644 analysis/workflow/envs/susie.post-deploy.sh diff --git a/analysis/workflow/envs/susie.post-deploy.sh b/analysis/workflow/envs/susie.post-deploy.sh deleted file mode 100644 index 272cfdb..0000000 --- a/analysis/workflow/envs/susie.post-deploy.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!env bash -set -o pipefail - -R -e 'install.packages("pgenlibr",repos="http://cran.us.r-project.org")' diff --git a/analysis/workflow/envs/susie.yml b/analysis/workflow/envs/susie.yml index fbd91b7..8e704f3 100644 --- a/analysis/workflow/envs/susie.yml +++ b/analysis/workflow/envs/susie.yml @@ -2,15 +2,17 @@ channels: - conda-forge - nodefaults dependencies: - - conda-forge::r-susier==0.11.42 + - conda-forge::r-susier==0.12.35 - conda-forge::r-abind==1.4_5 - - conda-forge::r-r.utils==2.12.0 - - conda-forge::r-dplyr==1.0.7 - - conda-forge::r-readr==1.4.0 + - conda-forge::r-r.utils==2.12.3 + - conda-forge::r-dplyr==1.1.4 + - conda-forge::r-readr==2.1.5 - aryarm::r-dscrutils==0.4.3 - - conda-forge::r-irkernel==1.2 - - conda-forge::sos-notebook==0.22.3 - - conda-forge::coreutils==8.31 + - aryarm::r-pgenlibr==0.3.7 + - conda-forge::r-irkernel==1.3.2 + - conda-forge::sos-notebook==0.24.4 + - conda-forge::coreutils=9.5 - conda-forge::sed==4.8 - - conda-forge::bash==5.0.018 - - conda-forge::gzip==1.10 + - conda-forge::bash==5.2.21 + - conda-forge::gzip==1.13 + - conda-forge::r-rfast==2.1.0 From 32f1a32bb530c35e72fbfae21095dd904ade3393 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 22 Nov 2024 12:55:08 -0800 Subject: [PATCH 32/41] recalibrate timing and mem --- analysis/.gitignore | 4 ++-- analysis/workflow/rules/happler.smk | 22 ++++++++++++---------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/analysis/.gitignore b/analysis/.gitignore index 034afe0..06a969f 100644 --- 
a/analysis/.gitignore +++ b/analysis/.gitignore @@ -1,4 +1,4 @@ midway_manhattans-indepterminator !midway_manhattans-indepterminator/LINKS -old-outs -HAPFILES \ No newline at end of file +outs +HAPFILES diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index f185191..d34f643 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -47,6 +47,8 @@ rule sub_pheno: pheno = rules.sub_pheno.output.pheno if "{rep}" in out else config["pheno"] +# if the pvar size is larger than 0.5 GB, just use the default memory instead +rsrc_func = lambda x: max if .5 > Path(x).with_suffix(".pvar").stat().st_size/1000/1000/1000 else min rule run: @@ -72,12 +74,12 @@ rule run: dot=out + "/happler.dot", resources: runtime=lambda wildcards, input: ( - min(45, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 0.5454649806119475 + 0.6935132147046765) + rsrc_func(input.gts)(45, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 2.5379343786643838 + 20.878965342140603) ), # slurm_partition="hotel", # slurm_extra="--qos=hotel", mem_mb=lambda wildcards, input: ( - min(5000, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 14.90840694595845 + 401.8011129664152) + rsrc_func(input.gts)(2500, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 7.5334226167661384 + 22.471377010118147) ), threads: 1 log: @@ -136,7 +138,7 @@ rule cond_linreg: png=out + "/cond_linreg.pdf", resources: runtime=lambda wildcards, input: ( - min(50, 1.5 * Path(input.pvar).stat().st_size/1000 * ( + rsrc_func(input.pgen)(50, 1.5 * Path(input.pvar).stat().st_size/1000 * ( 0.2400978997329614 + get_num_variants(input.hap) * 0.045194464826048095 )) ), @@ -166,7 +168,7 @@ rule heatmap: output: png=out + "/heatmap.pdf", resources: - runtime=4, + runtime=10, log: logs + "/heatmap", benchmark: @@ -232,7 +234,7 @@ rule transform: pvar=temp(out + "/happler.pvar"), psam=temp(out + "/happler.psam"), resources: - runtime=4, + runtime=10, log: logs + "/transform", benchmark: @@ -332,7 +334,7 @@ rule merge: psam=temp(out + "/{ex}clude/merged.psam"), resources: runtime=lambda wildcards, input: ( - min(10, Path(input.gts_pvar).stat().st_size/1000 * 0.05652315368728583 + 2.0888654705656844) + rsrc_func(input.gts)(10, Path(input.gts_pvar).stat().st_size/1000 * 0.05652315368728583 + 2.0888654705656844) ), log: logs + "/{ex}clude/merge", @@ -374,9 +376,9 @@ rule finemapper: susie=out + "/{ex}clude/susie.rds", resources: runtime=lambda wildcards, input: ( - min(120, Path(input.gt_pvar).stat().st_size/1000 * 0.08) + rsrc_func(input.gt)(120, Path(input.gt_pvar).stat().st_size/1000 * 0.3) ), - mem_mb = 7000, + mem_mb = 8500, threads: 1, log: logs + "/{ex}clude/finemapper", @@ -395,7 +397,7 @@ rule pips: output: tsv=out + "/{ex}clude/susie_pips.tsv", resources: - runtime=5, + runtime=10, threads: 3, log: logs + "/{ex}clude/pips", @@ -465,7 +467,7 @@ rule results: susie_pdf = out + "/{ex}clude/susie.pdf", # susie_eff_pdf=temp(out + "/susie_eff.pdf"), resources: - runtime=5, + runtime=10, log: logs + "/{ex}clude/results", benchmark: From f0164e0b98f74bbc4abf858eaf5be6627da41973 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 22 Nov 2024 13:02:22 -0800 Subject: [PATCH 33/41] collapse sim configs --- analysis/config/config-geuvadis.yml | 18 ----- analysis/config/config-simulations.STR.yml | 89 ---------------------- analysis/config/config-simulations.yml | 12 ++- analysis/config/config.yml | 2 +- 4 files changed, 9 
insertions(+), 112 deletions(-) delete mode 100644 analysis/config/config-simulations.STR.yml diff --git a/analysis/config/config-geuvadis.yml b/analysis/config/config-geuvadis.yml index 6089e55..2229a2c 100644 --- a/analysis/config/config-geuvadis.yml +++ b/analysis/config/config-geuvadis.yml @@ -42,24 +42,6 @@ snp_panel: "data/geuvadis/geuvadis_ensemble_phasing.pgen" # locus: 19:45401409-46401409 # center: 45901409 (APOe4) locus: data/geuvadis/geuvadis_eqtl_genes.full.liftover.bed modes: - str: - pos: 19:45903857 # STR_691361 - snp: - pos: 19:45910672 # rs1046282 - hap: - alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 - beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] - ld_range: - reps: 1 - min_ld: 0 - max_ld: 1 - step: 0.1 - min_af: 0.25 - max_af: 0.75 - # beta: [0.35] - beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] - alpha: [0.05] - random: false # whether to also produce random haplotypes run: pheno: data/geuvadis/phenos/{trait}.pheno SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen diff --git a/analysis/config/config-simulations.STR.yml b/analysis/config/config-simulations.STR.yml deleted file mode 100644 index e346e6d..0000000 --- a/analysis/config/config-simulations.STR.yml +++ /dev/null @@ -1,89 +0,0 @@ -# This is the Snakemake configuration file that specifies paths and -# and options for the pipeline. Anybody wishing to use -# the provided snakemake pipeline should first fill out this file with paths to -# their own data, as the Snakefile requires it. -# Every config option has reasonable defaults unless it is labeled as "required." -# All paths are relative to the directory that Snakemake is executed in. -# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/) - - -# Paths to a SNP-STR haplotype reference panel -# You can download this from http://gymreklab.com/2018/03/05/snpstr_imputation.html -# If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}" -# The VCF(s) must be sorted and indexed (with a .tbi file in the same directory) -# required! -snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen" -# str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz" -str_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.STRs.pgen" - -# Path to a list of samples to exclude from the analysis -# There should be one sample ID per line -# exclude_samples: data/ukb_random_samples_exclude.tsv - -# If SNPs are unphased, provide the path to a SHAPEIT4 map file like these: -# https://github.com/odelaneau/shapeit4/tree/master/maps -# The map file should use the same reference genome as the reference panel VCFs -# phase_map: data/genetic_maps/chr{chr}.b37.gmap.gz - -# A "locus" is a string with a contig name, a colon, the start position, a dash, and -# the end position or a BED file with a ".bed" file ending -# There are different simulation modes that you can use: -# 1. "str" - a tandem repeat is a string with a contig name, a colon, and the start position -# 2. "snp" - a SNP follows the same format as "str" -# 3. "hap" - a haplotype -# 4. "ld_range" - creates random two-SNP haplotypes with a range of LD values between the alleles of each haplotype -# 5. "run" - execute happler on a locus without simulating anything -# The STR and SNP positions should be contained within the locus. 
-# The positions should be provided in the same coordinate system as the reference -# genome of the reference panel VCFs -# The contig should correspond with the contig name from the {chr} wildcard in the VCF -# required! and unless otherwise noted, all attributes of each mode are required -locus: 19:45401409-46401409 # center: 45901409 (APOe4) -modes: - str: - pos: 19:45903859 - snp: - pos: 19:45910672 # rs1046282 - hap: - alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 - beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] - ld_range: - reps: 10 - min_ld: 0 - max_ld: 1 - step: 0.1 - min_af: 0.25 - max_af: 0.75 - # beta: [0.35] - beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] - alpha: [0.05] - random: false # whether to also produce random haplotypes - num_haps: [1, 2, 3, 4] - run: - pheno: data/geuvadis/phenos/{trait}.pheno - SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen - # pheno_matrix: data/geuvadis/EUR_converted_expr_hg38.csv # optional -mode: str - -# Covariates to use if they're needed -# Otherwise, they're assumed to be regressed out of the phenotypes -# Note: the finemapping methods won't be able to handle these covariates -# covar: data/geuvadis/5PCs_sex.covar - -# Discard rare variants with a MAF below this number -# Defaults to 0 if not specified -min_maf: 0.05 - -# Sample sizes to use -# sample_size: [777, 800, 1600, 2400, 3200] -sample_size: 777 - -# A threshold on the pvalues of the haplotypes -out_thresh: 5e-3 - -# Whether to include the causal variant in the set of genotypes provided to the -# finemapping methods. Set this to true if you're interested in seeing how the -# methods perform when the causal variant is absent from the data. -# Defaults to false if not specified -# You can also provide a list of booleans, if you want to test both values -exclude_causal: [true, false] diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml index 156c1fe..dfc35d7 100644 --- a/analysis/config/config-simulations.yml +++ b/analysis/config/config-simulations.yml @@ -12,9 +12,11 @@ # If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}" # The VCF(s) must be sorted and indexed (with a .tbi file in the same directory) # required! 
-snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen" +# snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen" # str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz" -# str_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.pgen" +snp_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.SNPs.pgen" +str_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.STRs.pgen" + # Path to a list of samples to exclude from the analysis # There should be one sample ID per line @@ -41,7 +43,9 @@ snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen" locus: 19:45401409-46401409 # center: 45901409 (APOe4) modes: str: - pos: 19:45903857 # STR_691361 + pos: 19:45903859 + id: 19:45903859 + reps: 10 snp: pos: 19:45910672 # rs1046282 hap: @@ -63,7 +67,7 @@ modes: pheno: data/geuvadis/phenos/{trait}.pheno SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen # pheno_matrix: data/geuvadis/EUR_converted_expr_hg38.csv # optional -mode: ld_range +mode: str # Covariates to use if they're needed # Otherwise, they're assumed to be regressed out of the phenotypes diff --git a/analysis/config/config.yml b/analysis/config/config.yml index c7dda66..51c999d 120000 --- a/analysis/config/config.yml +++ b/analysis/config/config.yml @@ -1 +1 @@ -config-geuvadis.yml \ No newline at end of file +config-simulations.yml \ No newline at end of file From 2ac2c270d4773aa3f73c75cbae7ba7f8e093fb09 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 26 Nov 2024 13:46:59 -0800 Subject: [PATCH 34/41] switch to dev version of haptools --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 31b780b..c620062 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,8 @@ networkx = ">=2.6.3" scipy = ">=1.7.3" pydot = ">=1.4.2" pysam = ">=0.19.0" -haptools = ">=0.4.0" +haptools = {git = "https://github.com/cast-genomics/haptools.git", , rev = "f0ba9a3a464725fe66b9bbf31726af886c516374"} # switch to 0.5.0 once released +# haptools = ">=0.5.0" statsmodels = ">=0.13.4" [tool.poetry.group.docs.dependencies] From f6d5a931c253bcefd5e198ede5a6b4e82e45c90e Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 26 Nov 2024 21:57:03 +0000 Subject: [PATCH 35/41] ci: handle deprecations in gh actions and devcontainer --- .devcontainer.json | 5 +---- .github/workflows/conventional-prs.yml | 2 +- .github/workflows/tests.yml | 7 +++---- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/.devcontainer.json b/.devcontainer.json index 253be95..d20cbbf 100644 --- a/.devcontainer.json +++ b/.devcontainer.json @@ -5,10 +5,7 @@ // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile "image": "mcr.microsoft.com/devcontainers/base:jammy", "features": { - "ghcr.io/rocker-org/devcontainer-features/miniforge:1": { - "version": "latest", - "variant": "Mambaforge" - } + "ghcr.io/rocker-org/devcontainer-features/miniforge:2": {} }, // Features to add to the dev container. More info: https://containers.dev/features. 
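A note on the haptools pin introduced in PATCH 34 above: the doubled comma in {git = "...", , rev = "..."} is not valid TOML, so Poetry cannot parse pyproject.toml at this point in the series; the follow-up fix in PATCH 36 below removes the stray comma. For reference — this mirrors that later corrected line rather than introducing anything new — a Poetry git dependency pinned to a commit takes this shape:

    [tool.poetry.dependencies]
    # exactly one comma between inline-table entries; the rev pins the
    # dependency to a single commit until 0.5.0 is released
    haptools = {git = "https://github.com/cast-genomics/haptools.git", rev = "f0ba9a3a464725fe66b9bbf31726af886c516374"}

Anything that runs "poetry lock" or "poetry install" against the tree as of PATCH 34 alone will fail with a TOML parse error, which is worth knowing when bisecting through this range of commits.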
diff --git a/.github/workflows/conventional-prs.yml b/.github/workflows/conventional-prs.yml index d1bc280..8dc0113 100644 --- a/.github/workflows/conventional-prs.yml +++ b/.github/workflows/conventional-prs.yml @@ -17,6 +17,6 @@ jobs: statuses: write # for amannn/action-semantic-pull-request to mark status of analyzed PR runs-on: ubuntu-latest steps: - - uses: amannn/action-semantic-pull-request@v5.0.2 + - uses: amannn/action-semantic-pull-request@v5 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 00cb76b..1238bd1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -29,11 +29,10 @@ jobs: - name: Check out the repository uses: actions/checkout@v4 - - name: Setup Mambaforge + - name: Setup Miniforge uses: conda-incubator/setup-miniconda@v3 with: activate-environment: happler - miniforge-variant: Mambaforge auto-activate-base: false miniforge-version: latest use-mamba: true @@ -44,7 +43,7 @@ jobs: shell: bash - name: Cache Conda env - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ env.CONDA }}/envs key: @@ -94,7 +93,7 @@ jobs: uses: actions/checkout@v4 - name: Check for large files - uses: actionsdesk/lfs-warning@v3.2 + uses: ppremk/lfs-warning@v3.3 with: token: ${{ secrets.GITHUB_TOKEN }} # Optional filesizelimit: 500000b From 786db1d2d4029b601e21bf4bcb2dd24059cf6f14 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 26 Nov 2024 21:57:22 +0000 Subject: [PATCH 36/41] update lock file for latest haptools --- poetry.lock | 16 ++++++++++------ pyproject.toml | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index 9148d8f..2bc94f4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -673,14 +673,12 @@ woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] [[package]] name = "haptools" -version = "0.4.0" +version = "0.4.2" description = "Ancestry and haplotype aware simulation of genotypes and phenotypes for complex trait analysis" optional = false python-versions = ">=3.7,<4.0" -files = [ - {file = "haptools-0.4.0-py3-none-any.whl", hash = "sha256:73fa3c4b6f6119f71e59b84de226c34808646075f61a7fa033d84f434ec59738"}, - {file = "haptools-0.4.0.tar.gz", hash = "sha256:85dba0bde58e33ac71c298eb37e1272ca83276d06714d6516f8bc722c3455fe0"}, -] +files = [] +develop = false [package.dependencies] click = ">=8.0.3" @@ -690,6 +688,12 @@ numpy = ">=1.20.0" Pgenlib = ">=0.90.1" pysam = ">=0.19.0" +[package.source] +type = "git" +url = "https://github.com/cast-genomics/haptools.git" +reference = "f0ba9a3a464725fe66b9bbf31726af886c516374" +resolved_reference = "f0ba9a3a464725fe66b9bbf31726af886c516374" + [[package]] name = "humanfriendly" version = "10.0" @@ -2127,4 +2131,4 @@ test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-it [metadata] lock-version = "2.0" python-versions = ">=3.8,<4.0" -content-hash = "1bb0c82732e8ba5ac5ac5ab616189cede6a4136e3652496196d6ba629fbf1172" +content-hash = "269cf6b4b6f2ca34cd1b9b3483109a94fa95a3748ccd2a3051786d5e26b39091" diff --git a/pyproject.toml b/pyproject.toml index c620062..5c95a4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ networkx = ">=2.6.3" scipy = ">=1.7.3" pydot = ">=1.4.2" pysam = ">=0.19.0" -haptools = {git = "https://github.com/cast-genomics/haptools.git", , rev = "f0ba9a3a464725fe66b9bbf31726af886c516374"} # switch to 0.5.0 once released +haptools = {git = 
"https://github.com/cast-genomics/haptools.git", rev = "f0ba9a3a464725fe66b9bbf31726af886c516374"} # switch to 0.5.0 once released # haptools = ">=0.5.0" statsmodels = ">=0.13.4" From 592bf889cfc13fa16275e6b08f395185f9b10575 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 26 Nov 2024 22:09:07 +0000 Subject: [PATCH 37/41] update min python to 3.9 and add 3.13 ci test --- .github/workflows/tests.yml | 6 +++--- .readthedocs.yaml | 2 +- dev-env.yml | 2 +- noxfile.py | 4 ++-- pyproject.toml | 2 +- recipe/meta.yaml | 34 ---------------------------------- 6 files changed, 8 insertions(+), 42 deletions(-) delete mode 100644 recipe/meta.yaml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1238bd1..16c605a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -10,15 +10,15 @@ jobs: fail-fast: false matrix: include: - - { python: "3.8", os: "ubuntu-latest", session: "lint" } - - { python: "3.8", os: "ubuntu-latest", session: "tests" } + - { python: "3.9", os: "ubuntu-latest", session: "lint" } - { python: "3.9", os: "ubuntu-latest", session: "tests" } - { python: "3.10", os: "ubuntu-latest", session: "tests" } - { python: "3.11", os: "ubuntu-latest", session: "tests" } - { python: "3.12", os: "ubuntu-latest", session: "tests" } + - { python: "3.13", os: "ubuntu-latest", session: "tests" } # - { python: "3.11", os: "windows-latest", session: "tests" } # - { python: "3.9", os: "macos-latest", session: "tests" } - - { python: "3.8", os: "ubuntu-latest", session: "size" } + - { python: "3.9", os: "ubuntu-latest", session: "size" } env: NOXSESSION: ${{ matrix.session }} diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 07c4eb8..968a06e 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -9,7 +9,7 @@ version: 2 build: os: "ubuntu-22.04" tools: - python: "3.8" + python: "3.9" jobs: post_create_environment: # Install poetry diff --git a/dev-env.yml b/dev-env.yml index 985b69e..3c4bfea 100644 --- a/dev-env.yml +++ b/dev-env.yml @@ -3,7 +3,7 @@ channels: - bioconda - nodefaults dependencies: - - conda-forge::python=3.8 # the lowest version of python that we formally support + - conda-forge::python=3.9 # the lowest version of python that we formally support - conda-forge::pip==23.3.2 - conda-forge::poetry==1.7.1 - conda-forge::nox==2023.04.22 diff --git a/noxfile.py b/noxfile.py index a1772ad..4955926 100644 --- a/noxfile.py +++ b/noxfile.py @@ -10,8 +10,8 @@ package = "happler" -python_versions = ["3.8", "3.9", "3.10", "3.11", "3.12"] -locked_python_version = "3.8" +python_versions = ["3.9", "3.10", "3.11", "3.12", "3.13"] +locked_python_version = "3.9" nox.needs_version = ">= 2022.11.21" nox.options.sessions = ( "docs", diff --git a/pyproject.toml b/pyproject.toml index 5c95a4b..54f9a17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ classifiers = [ ] [tool.poetry.dependencies] -python = ">=3.8,<4.0" +python = ">=3.9,<4.0" click = ">=8.0.4" networkx = ">=2.6.3" scipy = ">=1.7.3" diff --git a/recipe/meta.yaml b/recipe/meta.yaml deleted file mode 100644 index c7b0d4d..0000000 --- a/recipe/meta.yaml +++ /dev/null @@ -1,34 +0,0 @@ -package: - name: happler - version: v0.0.0 - -source: - path: '..' - -build: - number: 0 - script: "{{ PYTHON }} -m pip install . 
--no-deps --ignore-installed --no-cache-dir -vvv" - entry_points: - - happler = happler.__main__:main - noarch: python - -requirements: - host: - - pip >=19.0.3 - - python >=3.7 - - poetry-core >=1.0.0 - run: - - python >=3.7 - - statsmodels >=0.13.2 - - click >=8.0.4 - - networkx >=2.6.3 - - scipy >=1.7.3 - - pydot >=1.4.2 - -about: - home: https://github.com/gymrek-lab/happler - license: MIT - summary: 'A haplotype-based fine-mapping method' - dev_url: https://github.com/gymrek-lab/happler - doc_url: https://happler.readthedocs.io/ - doc_source_url: https://github.com/shibukawa/imagesize_py/blob/master/README.rst From b08162fde042f880ebb4d2856165647a3e63f347 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 26 Nov 2024 22:34:34 +0000 Subject: [PATCH 38/41] fix failing example tests with --out-thresh --- noxfile.py | 2 +- tests/test_examples.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/noxfile.py b/noxfile.py index 4955926..f127938 100644 --- a/noxfile.py +++ b/noxfile.py @@ -46,7 +46,7 @@ def install_handle_python_numpy(session): handle incompatibilities with python and numpy versions see https://github.com/cjolowicz/nox-poetry/issues/1116 """ - if session._session.python in ["3.11", "3.12"]: + if session._session.python in ["3.11", "3.12", "3.13"]: session._session.install(".") else: session.install(".") diff --git a/tests/test_examples.py b/tests/test_examples.py index 1604532..5f25730 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -716,7 +716,7 @@ def test_1000G_simulated_maf(capfd): out_vars = ("rs1046282", "rs36046716") for maf in (0.05, 0.30, 0.31, 0.38): - cmd = f"run --maf {maf} -o {out_hp_file} {gt_file} {pt_file}" + cmd = f"run --out-thresh 1 --maf {maf} -o {out_hp_file} {gt_file} {pt_file}" runner = CliRunner() result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) captured = capfd.readouterr() @@ -758,7 +758,7 @@ def test_1000G_real(capfd, caplog): caplog.set_level(logging.INFO) for maf in (0.05, 0.14, 0.22, 0.23, 0.35): - cmd = f"run --maf {maf} -o {out_hp_file} {gt_file} {pt_file}" + cmd = f"run --out-thresh 1 --maf {maf} -o {out_hp_file} {gt_file} {pt_file}" runner = CliRunner() result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) captured = capfd.readouterr() From b2912646f063154b21395117d9e7b4cfdd7add1d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 26 Nov 2024 22:39:52 +0000 Subject: [PATCH 39/41] refmt with black --- happler/tree/assoc_test.py | 4 +++- happler/tree/forest_builder.py | 41 ++++++++++++++++++++++++---------- happler/tree/terminator.py | 14 ++++++++++-- happler/tree/tree_builder.py | 2 +- tests/test_tree.py | 2 +- 5 files changed, 46 insertions(+), 17 deletions(-) diff --git a/happler/tree/assoc_test.py b/happler/tree/assoc_test.py index f17f202..32a7324 100644 --- a/happler/tree/assoc_test.py +++ b/happler/tree/assoc_test.py @@ -325,7 +325,9 @@ def perform_test( # handle cases where our p-values are too powerful if pval == 0: # retrieve the pval at a higher precision - pval = AssocTestSimpleSM.pval_as_decimal(res.tvalues[-1], res.df_resid, precision=10) + pval = AssocTestSimpleSM.pval_as_decimal( + res.tvalues[-1], res.df_resid, precision=10 + ) if self.with_bic: return param, pval, stderr, bic else: diff --git a/happler/tree/forest_builder.py b/happler/tree/forest_builder.py index cecce98..d588d6a 100644 --- a/happler/tree/forest_builder.py +++ b/happler/tree/forest_builder.py 
@@ -34,8 +34,8 @@ def __init__( ): self.tree_builder = tree_builder self.max_iterations = max_iterations - self.trees = [None]*num_bins - self.haplotypes = [None]*num_bins + self.trees = [None] * num_bins + self.haplotypes = [None] * num_bins self.genotypes = self.tree_builder.gens self.phenotypes = self.tree_builder.phens self.log = log or getLogger(self.__class__.__name__) @@ -47,12 +47,17 @@ def run(self): iterations = self.max_iterations tree_idx = 0 while iterations: - self.log.debug(f"Iteration {self.max_iterations - iterations}: tree {tree_idx}") + self.log.debug( + f"Iteration {self.max_iterations - iterations}: tree {tree_idx}" + ) self.tree_builder.phens = self.get_residuals(tree_idx) self.tree_builder.run() self.trees[tree_idx] = self.tree_builder.tree self.haplotypes[tree_idx] = Haplotypes.from_tree( - None, self.trees[tree_idx], self.genotypes, self.log, + None, + self.trees[tree_idx], + self.genotypes, + self.log, ) # increment tree_idx until it reaches the end tree_idx += 1 @@ -89,22 +94,32 @@ def get_residuals(self, tree_idx: int) -> Phenotypes: if total_num_effects == 0: return self.phenotypes num_samps = len(self.phenotypes.samples) - effects = np.empty((num_samps, total_num_effects), dtype=self.phenotypes.data.dtype) + effects = np.empty( + (num_samps, total_num_effects), dtype=self.phenotypes.data.dtype + ) effect_arr_idx = 0 # iterate through every tree's Haplotypes obj and transform the haps into effects - self.log.debug(f"Extracting {total_num_effects} effects from {total_num_effects} total effects") + self.log.debug( + f"Extracting {total_num_effects} effects from {total_num_effects} total effects" + ) for hap_idx, hap in enumerate(self.haplotypes): if hap is None or hap_idx == tree_idx: continue new_idx = effect_arr_idx + len(hap.data) - effects[:, effect_arr_idx:new_idx] = hap.transform(self.genotypes).data[:,:,:2].sum(axis=2) + effects[:, effect_arr_idx:new_idx] = ( + hap.transform(self.genotypes).data[:, :, :2].sum(axis=2) + ) effect_arr_idx = new_idx resids = Phenotypes(fname=None, log=self.phenotypes.log) resids.samples = self.phenotypes.samples # get residuals with effects as covariates resids.names = self.phenotypes.names self.log.debug(f"Computing residuals for tree {tree_idx}") - resids.data = sm.OLS(self.phenotypes.data, sm.add_constant(effects)).fit().resid[:, np.newaxis] + resids.data = ( + sm.OLS(self.phenotypes.data, sm.add_constant(effects)) + .fit() + .resid[:, np.newaxis] + ) return resids def __repr__(self): @@ -124,7 +139,7 @@ def dot(self, remove_singletons: bool = False) -> str: remove_singletons: bool Whether to ignore trees composed of only a single SNP """ - node_pattern = r'(?)' + node_pattern = r"(?)" max_node_id = 0 trees_str = "strict digraph {\nforcelabels=true;\nrankdir=TB;\n" for idx, tree in enumerate(self.trees): @@ -133,15 +148,17 @@ def dot(self, remove_singletons: bool = False) -> str: tree_str = tree.dot().split("\n")[2:] if remove_singletons and len(tree_str) <= 5: continue - trees_str += f"subgraph tree_{idx}"+" {\nlabel=\""+f"Tree {idx}"+"\";\n" + trees_str += f"subgraph tree_{idx}" + ' {\nlabel="' + f"Tree {idx}" + '";\n' tree_str = "\n".join(tree_str) # increment the node IDs to ensure they remain unique across all trees - increment = lambda match: str(int(match.group(1))+max_node_id) + increment = lambda match: str(int(match.group(1)) + max_node_id) trees_str += re.sub(node_pattern, increment, tree_str) # what was the greatest node ID in this tree? 
# increment the next tree's IDs by that number + 1 if there were any nodes try: - max_node_id = max(map(int, re.findall(node_pattern, tree_str))) + max_node_id + 1 + max_node_id = ( + max(map(int, re.findall(node_pattern, tree_str))) + max_node_id + 1 + ) except ValueError: pass trees_str += "}" diff --git a/happler/tree/terminator.py b/happler/tree/terminator.py index 105ce2a..d696450 100644 --- a/happler/tree/terminator.py +++ b/happler/tree/terminator.py @@ -178,7 +178,12 @@ def check( num_tests: int, ) -> bool: computed_val = self.compute_val( - parent_res, node_res, results, best_idx, num_samps, num_tests, + parent_res, + node_res, + results, + best_idx, + num_samps, + num_tests, ) if isinstance(computed_val, bool): return computed_val @@ -256,7 +261,12 @@ def check( num_tests: int, ): computed_val = self.compute_val( - parent_res, node_res, results, best_idx, num_samps, num_tests, + parent_res, + node_res, + results, + best_idx, + num_samps, + num_tests, ) if isinstance(computed_val, bool): return computed_val diff --git a/happler/tree/tree_builder.py b/happler/tree/tree_builder.py index 41bc440..2db22e2 100644 --- a/happler/tree/tree_builder.py +++ b/happler/tree/tree_builder.py @@ -474,7 +474,7 @@ def _find_split_rigid( self.log.debug( f"The haplotype had a pval of {hap_indep_effect} < {self.indep_thresh}" " in an additive model with the allele and parent" - ) + ) best_allele_idx = best_res_idx[best_allele] best_results = results[allele].data[best_allele_idx] node_res = self.results_type.from_np(best_results) diff --git a/tests/test_tree.py b/tests/test_tree.py index 4bf8c76..c0f0278 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -230,7 +230,7 @@ def test_haplotype_transform(): if idx != variant_idx: np.testing.assert_allclose( hap.transform(gens, allele, idx), - gens_without_variant[:,(idx-1,)], + gens_without_variant[:, (idx - 1,)], ) From c4f1dcf2291654779e7e287f9de0d88d0b22da71 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 5 Dec 2024 13:08:09 -0800 Subject: [PATCH 40/41] adapt workflow to simulate STRs --- analysis/config/config-simulations.yml | 7 ++- analysis/workflow/Snakefile | 54 +++++++++----------- analysis/workflow/rules/genotypes.smk | 22 -------- analysis/workflow/rules/happler.smk | 2 +- analysis/workflow/rules/simulate.smk | 29 ++++++----- analysis/workflow/scripts/create_hap_file.sh | 40 +++++++++------ 6 files changed, 69 insertions(+), 85 deletions(-) diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml index dfc35d7..86830a2 100644 --- a/analysis/config/config-simulations.yml +++ b/analysis/config/config-simulations.yml @@ -46,8 +46,10 @@ modes: pos: 19:45903859 id: 19:45903859 reps: 10 + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] snp: pos: 19:45910672 # rs1046282 + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] hap: alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] @@ -80,10 +82,7 @@ min_maf: 0.05 # Sample sizes to use # sample_size: [777, 800, 1600, 2400, 3200] -sample_size: 777 - -# A threshold on the pvalues of the haplotypes -out_thresh: 5e-3 +sample_size: 3200 # Whether to include the causal variant in the set of genotypes provided to the # finemapping methods. 
Set this to true if you're interested in seeing how the diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile index 1cefc93..2f33839 100644 --- a/analysis/workflow/Snakefile +++ b/analysis/workflow/Snakefile @@ -50,7 +50,8 @@ module genotypes: config: config use rule * from genotypes as genotypes_* -config["gts_str_panel"] = rules.genotypes_subset_str.output.vcf +if mode == "str": + config["gts_str_panel"] = config["str_panel"] if check_config("sample_size"): sampsize = config["sample_size"] config["gts_snp_panel"] = rules.genotypes_subset.output.pgen @@ -94,24 +95,32 @@ if check_config("covar"): if "pheno_matrix" in covar_config: covar_file = rules.covariates_merge.output.covar -if mode in ("snp", "hap", "ld_range"): +if mode in ("str", "snp", "hap", "ld_range"): happler_config = { "pheno": rules.simulate_simphenotype.output.pheno, "hap_file": rules.simulate_transform.input.hap, "snp_panel": config["gts_snp_panel"], - "out": config["out"] + "/happler/"+mode+"/{beta}", + "out": config["out"] + "/happler/"+mode+"/beta_{beta}", "random": None, "covar": covar_file, "min_maf": check_config("min_maf", 0), - "out_thresh": check_config("out_thresh", 5e-8), + "out_thresh": 1, # artificially set to 1 to avoid filtering } if mode in ("hap", "ld_range"): happler_config["snps"] = [] + happler_config["causal_gt"] = rules.simulate_transform.output if mode == "hap": happler_config["snps"] = config["modes"][mode]["alleles"] - happler_config["causal_gt"] = rules.simulate_transform.output - if mode == "ld_range": - happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}/rep_{rep}" + elif mode == "ld_range": + happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}" + elif mode in ("str", "snp"): + # also provide the SNP and STR IDs so that they can be used in the manhattan plot + happler_config["snps"] = (config["modes"][mode]["id"],) + # TODO: consider uncommenting this to compute LD with causal STR + # if mode == "str": + # happler_config["causal_gt"] = config["gts_str_panel"] + if check_config("reps", place=config["modes"][mode]): + happler_config["out"] = happler_config["out"] + "/rep_{rep}" elif mode == "run": happler_config = { "pheno": config["modes"][mode]["pheno"], @@ -125,8 +134,6 @@ elif mode == "run": } if check_config("SVs", place=config["modes"][mode]): happler_config["SVs"] = config["modes"][mode]["SVs"] -elif "str" == mode: - pass else: raise ValueError("Unsupported operating 'mode' in config") @@ -145,25 +152,6 @@ if mode == "ld_range" and config["modes"][mode]["random"]: config: happler_config use rule * from happler_random as happler_random_* -# if mode in ("hap", "str", "ld_range"): -# finemappers_config = { -# "pheno": rules.simulate_simphenotype.output.pheno, -# "out": config["out"] + "/finemappers/"+mode+"/{beta}", -# "snp_panel": config["gts_snp_panel"], -# } -# if mode in ("hap", "ld_range"): -# finemappers_config["causal_gt"] = rules.simulate_transform.output.pgen -# if mode == "ld_range": -# finemappers_config["out"] = config["out"] + "/finemappers/"+mode+"/ld_{ld}/beta_{beta}/alpha_{alpha}" -# else: -# raise ValueError("Not yet implemented operating mode 'str' in config") -# module finemappers: -# snakefile: "rules/finemappers.smk" -# config: finemappers_config -# use rule * from finemappers as finemappers_* -# else: -# raise ValueError(f"Unsupported operating mode '{mode}' in config") - plots_config = { "out": config["out"] + "/plots", "mode": mode, @@ 
-237,16 +225,22 @@ elif mode == "run": else: FINAL_OUTPUT = expand( [ + rules.happler_cond_linreg.output.png, rules.happler_tree.output.png, - rules.happler_manhattan.output.png, # rules.finemappers_results.output.susie_pdf, rules.happler_results.output.susie_pdf, - rules.plots_params.output.png, + # rules.plots_params.output.png, ], ex=(("in",) if mode == "hap" else ("ex", "in")), beta=config["modes"]["hap"]["beta"], allow_missing=True, ) + if check_config("reps", place=config["modes"][mode]): + FINAL_OUTPUT = expand( + FINAL_OUTPUT, + rep=range(config["modes"][mode]["reps"]), + allow_missing=True, + ) # If '--config debug=true' is specified, we won't try to build the entire DAG # This can be helpful when you want to play around with a specific target output diff --git a/analysis/workflow/rules/genotypes.smk b/analysis/workflow/rules/genotypes.smk index bfbc555..4eb126e 100644 --- a/analysis/workflow/rules/genotypes.smk +++ b/analysis/workflow/rules/genotypes.smk @@ -162,28 +162,6 @@ rule vcf2plink: "--threads {threads}{params.samps} --out {params.prefix} &>{log}" -rule subset_str: - """ subset samples from an STR VCF """ - input: - vcf=lambda wildcards: expand(config["str_panel"], chr=parse_locus(wildcards.locus)[0])[0], - vcf_idx=lambda wildcards: expand(config["str_panel"] + ".tbi", chr=parse_locus(wildcards.locus)[0]), - samples=rules.keep_samps.output.samples, - output: - vcf=out+"/strs.bcf", - vcf_idx=out+"/strs.bcf.csi", - resources: - runtime=30, - log: - logs + "/subset_str", - benchmark: - bench + "/subset_str", - conda: - "../envs/default.yml" - shell: - "bcftools view -S {input.samples} --write-index " - "-Ob -o {output.vcf} {input.vcf}" - - def subset_input(): if check_config("phase_map") or check_config("exclude_samples") or not config["snp_panel"].endswith(".pgen"): return { diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index d34f643..f3b430c 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -68,7 +68,7 @@ rule run: out_thresh=check_config("out_thresh", 5e-8), keep_SNPs="--remove-SNPs " if not ("{rep}" in out) else "", output: - hap=out + "/happler.hap", + hap=ensure(out + "/happler.hap", non_empty=True), gz=out + "/happler.hap.gz", idx=out + "/happler.hap.gz.tbi", dot=out + "/happler.dot", diff --git a/analysis/workflow/rules/simulate.smk b/analysis/workflow/rules/simulate.smk index 249e64d..72d92e3 100644 --- a/analysis/workflow/rules/simulate.smk +++ b/analysis/workflow/rules/simulate.smk @@ -24,6 +24,10 @@ def parse_locus(locus): hap_ld_range_output = out + "/create_ld_range/{num_haps}_haps/ld_{ld}/haplotype.hap" +gts_panel = Path( + config["gts_str_panel"] if mode == "str" else config["gts_snp_panel"] +) + checkpoint create_hap_ld_range: """ create a hap file suitable for haptools transform and simphenotype """ input: @@ -31,12 +35,12 @@ checkpoint create_hap_ld_range: gts_pvar=Path(config["gts_snp_panel"]).with_suffix(".pvar"), gts_psam=Path(config["gts_snp_panel"]).with_suffix(".psam"), params: - min_ld = config["modes"]["ld_range"]["min_ld"], - max_ld = config["modes"]["ld_range"]["max_ld"], - step = config["modes"]["ld_range"]["step"], - min_af = config["modes"]["ld_range"]["min_af"], - max_af = config["modes"]["ld_range"]["max_af"], - num_haps = "-n " + " -n ".join(map(str, config["modes"]["ld_range"]["num_haps"])), + min_ld = lambda wildcards: config["modes"]["ld_range"]["min_ld"], + max_ld = lambda wildcards: config["modes"]["ld_range"]["max_ld"], + step = lambda wildcards: 
config["modes"]["ld_range"]["step"], + min_af = lambda wildcards: config["modes"]["ld_range"]["min_af"], + max_af = lambda wildcards: config["modes"]["ld_range"]["max_af"], + num_haps = lambda wildcards: "-n " + " -n ".join(map(str, config["modes"]["ld_range"]["num_haps"])), out = lambda wildcards: expand( hap_ld_range_output, **wildcards, allow_missing=True, ), @@ -61,13 +65,14 @@ checkpoint create_hap_ld_range: rule create_hap: """ create a hap file suitable for haptools transform and simphenotype """ input: - gts=Path(config["gts_snp_panel"]).with_suffix(".pvar"), + gts=gts_panel.with_suffix(".pvar"), params: ignore="this", # the first parameter is always ignored for some reason chrom=lambda wildcards: parse_locus(wildcards.locus)[0], locus=lambda wildcards: wildcards.locus.split("_")[1].replace('-', '\t'), beta=0.99, - alleles=lambda wildcards: config["modes"]["hap"]["alleles"], + alleles=lambda wildcards: config["modes"]["hap"]["alleles"] if mode == "hap" else [], + repeat=lambda wildcards: config["modes"]["str"]["id"] if mode == "str" else 0, output: hap=out + "/haplotype.hap" resources: @@ -113,13 +118,13 @@ rule simphenotype: """ use the hap file to create simulated phenotypes for the haplotype """ input: hap=rules.transform.input.hap, - pgen=rules.transform.output.pgen, - pvar=rules.transform.output.pvar, - psam=rules.transform.output.psam, + pgen=rules.transform.output.pgen if mode != "str" else config["gts_str_panel"], + pvar=rules.transform.output.pvar if mode != "str" else Path(config["gts_str_panel"]).with_suffix(".pvar"), + psam=rules.transform.output.psam if mode != "str" else Path(config["gts_str_panel"]).with_suffix(".psam"), params: beta=lambda wildcards: wildcards.beta, seed = 42, - reps = config["modes"]["ld_range"]["reps"], + reps = lambda wildcards: config["modes"]["ld_range"]["reps"], output: pheno=out + "/{beta}.pheno", resources: diff --git a/analysis/workflow/scripts/create_hap_file.sh b/analysis/workflow/scripts/create_hap_file.sh index ae46e5d..cc32072 100755 --- a/analysis/workflow/scripts/create_hap_file.sh +++ b/analysis/workflow/scripts/create_hap_file.sh @@ -2,23 +2,31 @@ exec 2> "${snakemake_log}" -echo -e "#\torderH\tbeta" > "${snakemake_output[hap]}" -echo -e "#\tversion\t0.1.0" >> "${snakemake_output[hap]}" -echo -e "#H\tbeta\t.2f\tEffect size in linear model" >> "${snakemake_output[hap]}" +echo -e "#\tversion\t0.2.0" >> "${snakemake_output[hap]}" +echo -e "#R\tbeta\t.2f\tEffect size in linear model" >> "${snakemake_output[hap]}" -starts=() -ends=() -for snp in ${snakemake_params[4]}; do - snp_id="$(echo "$snp" | sed 's/\(.*\):/\1\t/')" - start_pos="$(grep -m1 "$(echo "$snp_id" | cut -f1)" "${snakemake_input[gts]}" | cut -f2)" - end_pos=$((start_pos+1)) - echo -e "V\thap\t$start_pos\t$end_pos\t$snp_id" >> "${snakemake_output[hap]}" - starts+=("$start_pos") - ends+=("$end_pos") -done +if [ "${snakemake_params[repeat]}" -eq "0" ]; then + starts=() + ends=() + for snp in ${snakemake_params[4]}; do + snp_id="$(echo "$snp" | sed 's/\(.*\):/\1\t/')" + start_pos="$(grep -Ev '^#' -m1 "$(echo "$snp_id" | cut -f1)" "${snakemake_input[gts]}" | cut -f2)" + end_pos=$((start_pos+1)) + echo -e "V\thap\t$start_pos\t$end_pos\t$snp_id" >> "${snakemake_output[hap]}" + starts+=("$start_pos") + ends+=("$end_pos") + done -start_pos="$(echo "${starts[@]}" | tr ' ' '\n' | sort -n | head -n1)" -end_pos="$(echo "${ends[@]}" | tr ' ' '\n' | sort -rn | head -n1)" -echo -e "H\t${snakemake_params[1]}\t$start_pos\t$end_pos\thap\t${snakemake_params[3]}" >> "${snakemake_output[hap]}" 
+ start_pos="$(echo "${starts[@]}" | tr ' ' '\n' | sort -n | head -n1)" + end_pos="$(echo "${ends[@]}" | tr ' ' '\n' | sort -rn | head -n1)" + echo -e "H\t${snakemake_params[1]}\t$start_pos\t$end_pos\thap\t${snakemake_params[3]}" >> "${snakemake_output[hap]}" +else + for str in ${snakemake_params[5]}; do + str_id=$'\t'"$str"$'\t' + start_pos="$(grep --line-buffered -Ev '^#' "${snakemake_input[gts]}" | grep -Pm1 "${start_pos}${str_id}" | cut -f2)" + end_pos="$(grep --line-buffered -Ev '^#' "${snakemake_input[gts]}" | grep -Pm1 "${start_pos}${str_id}" | grep -Po '(?<=;END=)\d+(?=;PERIOD)')" + echo -e "R\t${snakemake_params[1]}\t$start_pos\t$end_pos\t$str\t${snakemake_params[3]}" >> "${snakemake_output[hap]}" + done +fi LC_ALL=C sort -k1,4 -o "${snakemake_output[hap]}" "${snakemake_output[hap]}" From e0dec77f13e0ef673ec84d97bfb802608822a273 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 5 Dec 2024 21:22:24 +0000 Subject: [PATCH 41/41] update lock file --- poetry.lock | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2bc94f4..c554e7f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -51,9 +51,6 @@ files = [ {file = "babel-2.15.0.tar.gz", hash = "sha256:8daf0e265d05768bc6c7a314cf1321e9a123afc328cc635c18622a2f30a04413"}, ] -[package.dependencies] -pytz = {version = ">=2015.7", markers = "python_version < \"3.9\""} - [package.extras] dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] @@ -2130,5 +2127,5 @@ test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-it [metadata] lock-version = "2.0" -python-versions = ">=3.8,<4.0" -content-hash = "269cf6b4b6f2ca34cd1b9b3483109a94fa95a3748ccd2a3051786d5e26b39091" +python-versions = ">=3.9,<4.0" +content-hash = "a7bd4a4e3824ae27cb1cee5bf0599c747842d5c29e81c05e22be81ea45b71490"
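A closing note on the lock-file churn in this series: poetry.lock's content-hash is derived from the dependency specification in pyproject.toml, so both the haptools git pin (PATCH 34/36) and the Python floor bump to 3.9 (PATCH 37) force the hash to change, which is all that PATCH 41 records. A sketch of the refresh-and-verify loop that produces these updates, assuming the Poetry 1.x CLI pinned in dev-env.yml (poetry==1.7.1) — the flag names differ in Poetry 2.x:

    # re-resolve only what the pyproject.toml edit changed, keeping all
    # other pinned versions in poetry.lock intact
    poetry lock --no-update

    # confirm the refreshed lock's content-hash matches pyproject.toml
    poetry check --lock

    # install from the refreshed lock to prove it still resolves
    poetry install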