From b420ee277240e120cc8ecd598a38616268a67447 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 25 Sep 2024 13:47:55 -0700 Subject: [PATCH 01/41] create first iteration of forest builder --- happler/__main__.py | 54 +- happler/tree/__init__.py | 1 + happler/tree/forest_builder.py | 107 +++ happler/tree/tree_builder.py | 4 - .../data/19_45401409-46401409_1000G.multi.hap | 12 + .../19_45401409-46401409_1000G.multi.pheno | 777 ++++++++++++++++++ tests/test_examples.py | 20 + 7 files changed, 969 insertions(+), 6 deletions(-) create mode 100644 happler/tree/forest_builder.py create mode 100644 tests/data/19_45401409-46401409_1000G.multi.hap create mode 100644 tests/data/19_45401409-46401409_1000G.multi.pheno diff --git a/happler/__main__.py b/happler/__main__.py index f596afa..4b7d83a 100644 --- a/happler/__main__.py +++ b/happler/__main__.py @@ -103,6 +103,20 @@ def main(): hidden=True, help="Do not check that variants are phased. Saves time and memory.", ) +@click.option( + "--max-signals", + type=int, + default=1, + show_default=True, + help="The maximum number of expected causal signals", +) +@click.option( + "--max-iterations", + type=int, + default=1, + show_default=True, + help="The max number of times to repeat the tree building", +) @click.option( "-t", "--threshold", @@ -150,6 +164,13 @@ def main(): "no correction (n)" ), ) +@click.option( + "--remove-SNPs", + is_flag=True, + show_default=True, + default=False, + help="Remove haplotypes with only a single variant", +) @click.option( "-o", "--output", @@ -179,12 +200,15 @@ def run( chunk_size: int = None, maf: float = None, phased: bool = False, + max_signals: int = 1, + max_iterations: int = 1, threshold: float = 0.05, indep_thresh: float = 0.1, ld_prune_thresh: float = None, show_tree: bool = False, covars: Path = None, corrector: str = "n", + remove_snps: bool = False, output: Path = Path("/dev/stdout"), verbosity: str = "INFO", ): @@ -316,9 +340,35 @@ def run( indep_thresh=indep_thresh, ld_prune_thresh=ld_prune_thresh, log=log, - ).run() + ) + forest = tree.ForestBuilder( + hap_tree, + num_bins=max_signals, + max_iterations=max_iterations, + log=log, + ) log.info("Outputting haplotypes") - tree.Haplotypes.from_tree(fname=output, tree=hap_tree, gts=gt, log=log).write() + haplotypes = data.Haplotypes( + fname=output, + haplotype=tree.haplotypes.HapplerHaplotype, + variant=tree.haplotypes.HapplerVariant, + log=log, + ) + haplotypes.data = {} + hap_id = 0 + # merge the Haplotypes objects + # TODO: use a method of the Haplotypes class + for haps in forest.run(): + if haps is None: + continue + for hap in haps.data.values(): + if len(hap.variants) <= 1 and remove_snps: + continue + hap.id = f"H{hap_id}" + haplotypes.data[hap.id] = hap + hap_id += 1 + haplotypes.index() + haplotypes.write() if show_tree: dot_output = output if Path(output) != Path("/dev/stdout"): diff --git a/happler/tree/__init__.py b/happler/tree/__init__.py index e325bab..2bc2dc2 100644 --- a/happler/tree/__init__.py +++ b/happler/tree/__init__.py @@ -1,5 +1,6 @@ from .tree import Tree from .tree_builder import TreeBuilder +from .forest_builder import ForestBuilder from .variant import VariantType, Variant from .haplotypes import Haplotype, Haplotypes from .corrector import Corrector, BH, Bonferroni, BHSM diff --git a/happler/tree/forest_builder.py b/happler/tree/forest_builder.py new file mode 100644 index 0000000..ee03c3e --- /dev/null +++ b/happler/tree/forest_builder.py @@ -0,0 +1,107 @@ +from __future__ import 
annotations
+from logging import Logger
+
+import numpy as np
+import numpy.typing as npt
+import statsmodels.api as sm
+from haptools.logging import getLogger
+from haptools.data import Genotypes, Phenotypes
+
+from .tree import Tree
+from .haplotypes import Haplotypes
+from .tree_builder import TreeBuilder
+
+
+class ForestBuilder:
+    """
+    Creates a ForestBuilder object from a provided TreeBuilder
+
+    Attributes
+    ----------
+    tree_builder: TreeBuilder
+        A TreeBuilder instance that we can run repeatedly
+    log: Logger
+        A logging instance for recording debug statements.
+    """
+
+    def __init__(
+        self,
+        tree_builder: TreeBuilder,
+        num_bins: int,
+        max_iterations: int = None,
+        log: Logger = None,
+    ):
+        self.tree_builder = tree_builder
+        self.max_iterations = max_iterations
+        self.trees = [None]*num_bins
+        self.haplotypes = [None]*num_bins
+        self.genotypes = self.tree_builder.gens
+        self.phenotypes = self.tree_builder.phens
+        self.log = log or getLogger(self.__class__.__name__)
+
+    def run(self):
+        """
+        Build a tree for each expected causal signal, iterating until done
+        """
+        iterations = self.max_iterations
+        tree_idx = 0
+        while iterations:
+            self.log.debug(f"Iteration {self.max_iterations - iterations}: tree {tree_idx}")
+            self.tree_builder.phens = self.get_residuals(tree_idx)
+            self.tree_builder.run()
+            self.trees[tree_idx] = self.tree_builder.tree
+            self.haplotypes[tree_idx] = Haplotypes.from_tree(
+                None, self.trees[tree_idx], self.genotypes, self.log,
+            )
+            # increment tree_idx until it reaches the end
+            tree_idx += 1
+            if tree_idx >= len(self.trees):
+                # TODO: implement a test to check whether to stop iterating
+                if iterations is not None:
+                    iterations -= 1
+                tree_idx = 0
+        return self.haplotypes
+
+    def get_residuals(self, tree_idx: int) -> Phenotypes:
+        """
+        Retrieve the current phenotype values, computed after regressing out the
+        haplotype effects from each tree except the one indicated by tree_idx
+
+        Parameters
+        ----------
+        tree_idx: int
+            The index of the tree whose haplotypes to exclude from consideration
+
+        Returns
+        -------
+        Phenotypes
+            The residuals after regressing out the effects of the haplotypes from all
+            other trees
+        """
+        # first: count the number of effects amongst the other trees' haplotypes
+        total_num_effects = sum(
+            len(hps.data) if hps is not None else 0
+            for hps_idx, hps in enumerate(self.haplotypes)
+            if hps_idx != tree_idx
+        )
+        # base case: if there are no effects to regress out
+        if total_num_effects == 0:
+            return self.phenotypes
+        num_samps = len(self.phenotypes.samples)
+        effects = np.empty((num_samps, total_num_effects), dtype=self.phenotypes.data.dtype)
+        effect_arr_idx = 0
+        # iterate through every other tree's Haplotypes obj and transform the haps into effects
+        self.log.debug(f"Regressing out {total_num_effects} haplotype effects from the other trees")
+        for hap_idx, hap in enumerate(self.haplotypes):
+            if hap is None or hap_idx == tree_idx:
+                continue
+            new_idx = effect_arr_idx + len(hap.data)
+            effects[:, effect_arr_idx:new_idx] = hap.transform(self.genotypes).data[:,:,:2].sum(axis=2)
+            effect_arr_idx = new_idx
+        resids = Phenotypes(fname=None, log=self.phenotypes.log)
+        resids.samples = self.phenotypes.samples
+        resids.names = self.phenotypes.names
+        # get the residuals from a regression with the effects as covariates
+        self.log.debug(f"Computing residuals for tree {tree_idx}")
+        resids.data = sm.OLS(self.phenotypes.data, sm.add_constant(effects)).fit().resid[:, np.newaxis]
+        return resids
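
The residualization in get_residuals() is the heart of this patch: for tree k, the haplotype dosages of every other tree are stacked into a covariate matrix, and the phenotype is replaced with the residuals of an OLS fit on those covariates, so each tree only searches for signal that the other trees have not already explained. A minimal sketch of that step, with illustrative names rather than happler's own API:

    import numpy as np
    import statsmodels.api as sm

    def residualize(pheno: np.ndarray, other_effects: np.ndarray) -> np.ndarray:
        """pheno: (num_samps,); other_effects: (num_samps, num_effects) hap dosages."""
        # regress the phenotype on the other trees' effects (plus an intercept)
        fit = sm.OLS(pheno, sm.add_constant(other_effects)).fit()
        # the residuals are whatever signal the other trees leave unexplained
        return fit.resid

diff --git a/happler/tree/tree_builder.py b/happler/tree/tree_builder.py
index c060ed8..41bc440 100644
--- 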
a/happler/tree/tree_builder.py +++ b/happler/tree/tree_builder.py @@ -89,10 +89,6 @@ def run(self): """ Run the tree builder and create a tree rooted at the provided variant """ - if self.tree is not None: - raise AssertionError( - "A tree already exists for this TreeBuilder. Please create a new one." - ) # step one: initialize the tree self.tree = Tree(log=self.log) # step two: create a haplotype diff --git a/tests/data/19_45401409-46401409_1000G.multi.hap b/tests/data/19_45401409-46401409_1000G.multi.hap new file mode 100644 index 0000000..6fc8474 --- /dev/null +++ b/tests/data/19_45401409-46401409_1000G.multi.hap @@ -0,0 +1,12 @@ +# orderH beta pval +# orderV score +# version 0.2.0 +#H beta .2f Effect size in linear model +#H pval .2f -log(pval) in linear model +#V score .2f -log(pval) assigned to this variant +H 19 45410444 46310808 H0 0.36 30.13 +H 19 45892145 45910673 H1 0.36 30.24 +V H0 46310807 46310808 rs12984785 C 9.52 +V H0 45410444 45410445 rs769450 A 30.13 +V H1 45892145 45892146 rs36046716 G 6.30 +V H1 45910672 45910673 rs1046282 G 30.24 diff --git a/tests/data/19_45401409-46401409_1000G.multi.pheno b/tests/data/19_45401409-46401409_1000G.multi.pheno new file mode 100644 index 0000000..93d2808 --- /dev/null +++ b/tests/data/19_45401409-46401409_1000G.multi.pheno @@ -0,0 +1,777 @@ +#IID H0-H1 +HG00096 1.3017394225909524 +HG00097 -0.27455070497718187 +HG00099 0.9223961552249307 +HG00100 -0.4444101713652624 +HG00101 0.8260910716500838 +HG00102 -2.192160408889623 +HG00103 1.937411234430382 +HG00105 -0.6766475045485754 +HG00106 1.2938113228688546 +HG00107 -0.7601234658999931 +HG00108 1.3340426734082755 +HG00109 -0.3702278392333091 +HG00110 -0.657796336291472 +HG00111 0.7163642258408758 +HG00112 -0.5119745970171313 +HG00113 -0.7490020951497627 +HG00114 1.0517322599183143 +HG00115 0.03970553035468305 +HG00116 0.9382503170083054 +HG00117 -0.48446533442500284 +HG00118 0.1605062768449892 +HG00119 0.6650392570450632 +HG00120 -1.2239557284874587 +HG00121 -0.28155916154351257 +HG00122 -0.694600223401998 +HG00123 0.016002165837225357 +HG00125 0.6610124959978709 +HG00126 -1.8824897934414873 +HG00127 -0.0022863086650215525 +HG00128 -0.07817244477032392 +HG00129 -0.060144088248919036 +HG00130 0.20815469441867915 +HG00131 -0.4434759299302991 +HG00132 -0.3862605139969777 +HG00133 -0.01584928114183487 +HG00136 -0.5947118164891941 +HG00137 0.6007178043843452 +HG00138 1.341960465226173 +HG00139 -1.732650802300685 +HG00140 -0.06574912378085612 +HG00141 -1.1173921209561168 +HG00142 -0.8313815665565609 +HG00143 1.4673031969056392 +HG00145 -0.21654238430913353 +HG00146 0.8559756623313552 +HG00148 -0.6547746894235088 +HG00149 0.6512205590579918 +HG00150 0.1771917767565636 +HG00151 0.2835877343172233 +HG00154 -1.3223836301651573 +HG00155 -0.4867863486518235 +HG00157 -0.311207123560055 +HG00158 -0.7591601736064473 +HG00159 -0.46909876113819504 +HG00160 -0.45228839698679396 +HG00171 0.05534744773817596 +HG00173 -0.05728526482670221 +HG00174 0.11912007184337481 +HG00176 -0.5635001943233273 +HG00177 0.1927763795549795 +HG00178 1.5693430077698323 +HG00179 0.8442480505447929 +HG00180 0.5563159735566817 +HG00181 0.25739949585716204 +HG00182 0.20070166807808176 +HG00183 0.6315534221299418 +HG00185 0.2532479084681132 +HG00186 -1.538315230397934 +HG00187 0.4205562997540997 +HG00188 -0.3671931147770918 +HG00189 -0.682298236222033 +HG00190 1.1334295789662376 +HG00231 -1.6518714633900213 +HG00232 -0.5946064662889645 +HG00233 1.456274458550981 +HG00234 -0.673410561447305 +HG00235 0.7324998961620618 +HG00236 
-0.7476782340145596 +HG00237 1.3742564614367883 +HG00238 -0.42666365031731934 +HG00239 0.8359563911296215 +HG00240 -0.8395670312438419 +HG00242 -1.926006404324073 +HG00243 0.8588343808065176 +HG00244 -1.0365863223317784 +HG00245 0.1843317854385289 +HG00246 0.732871711072655 +HG00250 -0.7025024579392856 +HG00251 0.2084041227613298 +HG00252 -1.4545955830939332 +HG00253 2.945009837767178 +HG00254 0.503912008880156 +HG00255 -1.3334829520303777 +HG00256 -0.5480397928115349 +HG00257 -0.938556923042386 +HG00258 -1.9106942563377778 +HG00259 1.6559216835838548 +HG00260 0.07826046127315978 +HG00261 0.09915001009411378 +HG00262 -0.03674349382971431 +HG00263 0.13961998399334552 +HG00264 -0.7313470923718625 +HG00265 -0.4638843982228111 +HG00266 1.883097199158406 +HG00267 0.6482813187326648 +HG00268 0.19621125364483505 +HG00269 0.23454471446401665 +HG00271 0.3856839957159506 +HG00272 -0.7613980713107256 +HG00273 0.5847564149458971 +HG00274 -2.200340761207475 +HG00275 0.9399114427840759 +HG00276 0.6208580988865791 +HG00277 1.9921086351108939 +HG00278 -0.7452347633051035 +HG00280 -0.04146856072515981 +HG00281 0.039852090431971354 +HG00282 -1.0473708422676937 +HG00284 1.210779477111332 +HG00285 -0.9212433227842408 +HG00288 -0.45267676876761587 +HG00290 0.21879672908691095 +HG00304 -1.4019032397917557 +HG00306 1.6882682875304393 +HG00308 -0.40454620804361807 +HG00309 0.14589913994153003 +HG00310 -0.8603828426604014 +HG00311 0.7859249725806237 +HG00313 -1.423515824608892 +HG00315 -0.2390376441256331 +HG00318 -0.23442109584659665 +HG00319 0.10717344738698581 +HG00320 -1.1879282057763563 +HG00321 2.010856476651868 +HG00323 1.9536925186312748 +HG00324 1.5630788196071053 +HG00325 -0.6746593314170943 +HG00326 0.645697774512938 +HG00327 0.27531983513078506 +HG00328 -0.3240679844184028 +HG00329 1.0858071408796113 +HG00330 -0.1380942934078231 +HG00331 -0.6617302418606026 +HG00332 -0.8196133472385092 +HG00334 1.8964153527428376 +HG00335 1.0979767394742055 +HG00336 0.45942599485848723 +HG00337 -0.29927846123948887 +HG00338 1.6589405080120647 +HG00339 1.0244855637755055 +HG00341 -0.1106888927519688 +HG00342 -1.6805177057993594 +HG00343 0.9484579844835173 +HG00344 -1.0120010079512851 +HG00345 -1.3587334871064143 +HG00346 -1.4503482183381182 +HG00349 -0.5961763016136395 +HG00350 -1.2248086186506415 +HG00351 0.14578426476484357 +HG00353 0.07659525915044929 +HG00355 1.112412479404136 +HG00356 -0.8814785938735148 +HG00357 -0.9926854251137129 +HG00358 -1.0394944051752386 +HG00360 0.2171639569494055 +HG00361 -0.8036340494724711 +HG00362 0.10075077066547061 +HG00364 0.4835962293815953 +HG00365 0.19382680907013505 +HG00366 0.886385556034768 +HG00367 -0.3880716375161831 +HG00368 0.47239284200124787 +HG00369 -0.7839477950837497 +HG00371 1.0910733414788725 +HG00372 0.8663702947127541 +HG00373 0.19311323966514193 +HG00375 1.0061648184902738 +HG00376 0.9748409248884979 +HG00378 -0.6743931986307602 +HG00379 -0.6408435039796199 +HG00380 1.7644488312594047 +HG00381 0.6343595917549711 +HG00382 1.5614824358020498 +HG00383 -0.010665982476151414 +HG00384 0.5756403538114765 +HG00403 -0.7644713441588533 +HG00404 -1.6355736051237253 +HG00406 0.034088299656111654 +HG00407 -0.06974035486253505 +HG00409 -0.3266224287338094 +HG00410 1.285133304190869 +HG00419 -0.17747468981685946 +HG00421 -0.5216809527310943 +HG00422 1.14580062711893 +HG00428 0.47079269261721013 +HG00436 -1.1991826901148706 +HG00437 1.0922802894213188 +HG00442 1.9524696339807277 +HG00443 -1.276527509624545 +HG00445 0.54720269777494 +HG00446 1.1167374950321065 +HG00448 
-0.9943615939542239 +HG00449 0.3015798433428688 +HG00451 0.4581203738623839 +HG00452 -0.3992703612852424 +HG00457 0.010823749383855108 +HG00458 -0.8098101517066841 +HG00463 -0.2448101536344522 +HG00464 -0.8733383667141027 +HG00472 0.7939345553658519 +HG00473 -1.0655863577921791 +HG00475 -0.7222165148083026 +HG00476 -0.9624957925891318 +HG00478 -0.0013715478358110045 +HG00479 -2.40327667661413 +HG00500 0.022552083785861265 +HG00513 -0.0032847044070901665 +HG00524 1.3696957342330283 +HG00525 0.09996072091096792 +HG00530 -0.9253087837168033 +HG00531 0.05387819253735748 +HG00533 0.7185769926770722 +HG00534 0.782101367150142 +HG00536 1.3953406773510117 +HG00537 1.307723084377591 +HG00542 -0.24537359254826419 +HG00543 -1.1289364983902215 +HG00551 -0.3586001725428226 +HG00553 -0.40832412475470525 +HG00554 3.675822203483924 +HG00556 -0.024706080231385097 +HG00557 1.4971197939347933 +HG00559 -0.04398445083603797 +HG00560 -0.16407710687711266 +HG00565 -0.7682990906804746 +HG00566 0.4678824762747546 +HG00580 -0.6669300269342622 +HG00581 -0.6700846349247087 +HG00583 0.5326664895129668 +HG00584 -0.2779053234796991 +HG00589 0.7781462516317879 +HG00590 -2.2572436631832327 +HG00592 -0.6433114427345645 +HG00593 -0.8295006793537826 +HG00595 0.0333143832658932 +HG00596 -0.9998587655976234 +HG00598 1.2013710076031352 +HG00599 0.5780529636331116 +HG00607 0.19702922956298496 +HG00608 -2.3574654810466606 +HG00610 0.42108704704301125 +HG00611 0.20412384859956612 +HG00613 1.2923109821657752 +HG00614 0.4287672171393162 +HG00619 -1.6119721910689218 +HG00620 -0.4329263648859035 +HG00622 0.5107996725772186 +HG00623 1.4808488027236744 +HG00625 -0.7980477107711863 +HG00626 1.8805724032304343 +HG00628 -0.07851834224053256 +HG00629 0.6951191348864068 +HG00631 0.5565444486212561 +HG00632 0.35114830609902814 +HG00634 -0.4048559861541365 +HG00637 -1.0590360908856513 +HG00638 0.4012517489601062 +HG00640 1.9041051366764545 +HG00641 -0.11858765354391748 +HG00650 -0.2720992279196343 +HG00651 -1.1782306581396977 +HG00653 1.666163243028577 +HG00654 0.5642021449803208 +HG00656 0.3597572204389563 +HG00657 -0.22112341590961582 +HG00662 -0.042381968527955505 +HG00663 -0.3884744651759044 +HG00671 -1.0650830457336684 +HG00672 -0.10252285397338956 +HG00674 2.585256684460668 +HG00675 -0.8129770930286141 +HG00683 0.2444522922101258 +HG00684 -0.43755239020277786 +HG00689 0.8245288974242369 +HG00690 0.5723664131283869 +HG00692 -0.3255448137258444 +HG00693 0.653983451319051 +HG00698 0.46514845909471514 +HG00699 -0.22295696340365445 +HG00701 0.5971173027939229 +HG00704 0.5093580294077722 +HG00705 -0.8307086637095085 +HG00707 0.5822457415535465 +HG00708 1.0270007475453435 +HG00717 0.05359074401090719 +HG00728 -0.3114946729690229 +HG00729 0.6751192883172805 +HG00731 -0.5150034836230196 +HG00732 -0.12798522904888415 +HG00734 1.013546450246632 +HG00736 0.6869753353027072 +HG00737 -0.43850863654157746 +HG00739 -0.12255652316791327 +HG00740 -0.527991999123023 +HG00742 -1.0608342352170212 +HG00743 0.5488493979377136 +HG00759 1.6224631515376773 +HG00766 0.7129815223518607 +HG00844 1.6689149744084937 +HG00851 3.4439936596775818 +HG00864 0.4702543572838877 +HG00867 -0.36112178875507417 +HG00879 -0.8401056803849913 +HG00881 0.479824583361269 +HG00956 -0.9284945721170539 +HG00978 -0.5651294761397843 +HG00982 -0.2462626268569882 +HG01028 1.3308648016408742 +HG01029 0.5024251758196058 +HG01031 -0.5355374532246513 +HG01046 -0.09355501647908948 +HG01047 -0.24342541106565116 +HG01048 -1.4135462455456238 +HG01049 0.3184435881775337 +HG01051 0.08654867357833768 
+HG01052 -0.9700670489633084 +HG01054 1.1559454140871412 +HG01055 0.954942760323882 +HG01058 -0.7945090781828414 +HG01060 -0.03160599031617728 +HG01061 1.1863047071089463 +HG01063 1.5466296361248897 +HG01064 2.3499793828226445 +HG01066 0.41661528341296594 +HG01067 -0.6671381843672244 +HG01069 -0.5700338006896557 +HG01070 -0.5356882437925449 +HG01072 1.2553552346346728 +HG01073 -0.08577863872989477 +HG01075 -0.5132933981410371 +HG01077 -0.20636577600968187 +HG01079 -0.2359923693120397 +HG01080 1.3485351237334335 +HG01082 -0.08055547871995244 +HG01083 -0.41481781990904054 +HG01085 0.6846503975446758 +HG01086 -0.5107365703121772 +HG01088 0.20198090541546027 +HG01089 0.8904506995124579 +HG01092 1.0944270476855125 +HG01094 -0.21981626608057178 +HG01095 -0.42575883800617836 +HG01097 -0.5421369768663082 +HG01098 1.4859840653728797 +HG01101 0.1594650524518848 +HG01102 -0.277404564506499 +HG01104 -0.3045459641613543 +HG01105 -0.9325085789807781 +HG01107 0.6424914557878557 +HG01108 1.3163034090697114 +HG01110 0.7228953081839569 +HG01111 1.446641348653603 +HG01112 2.758413479417553 +HG01113 0.6077301224018714 +HG01119 -1.0529642439135634 +HG01121 0.07800350107491305 +HG01122 -0.41451465734361315 +HG01124 0.7607002016290159 +HG01125 0.11432801847372626 +HG01130 -0.7547045432285291 +HG01131 -0.36652083695629256 +HG01133 -0.26931656477459875 +HG01134 -1.0605782936561656 +HG01136 -1.035459208762127 +HG01137 1.4085149809557134 +HG01139 1.5342969923703986 +HG01140 0.847378452146835 +HG01142 1.1869804368689174 +HG01148 -2.975475585672566 +HG01149 -1.4491824340278092 +HG01161 -0.27099060632372735 +HG01162 0.5608862674548538 +HG01164 -0.9619239427802997 +HG01167 0.03698832406871888 +HG01168 -0.27540547256201225 +HG01170 2.0834892540141996 +HG01171 1.5374771041833983 +HG01173 1.1906553399713435 +HG01174 2.0146504504029226 +HG01176 -0.3447339318710625 +HG01177 -0.8604036385814623 +HG01182 0.14568185748744056 +HG01183 0.6662993491207352 +HG01187 -1.6974530748263668 +HG01188 -0.4205852001293092 +HG01190 0.6589674340269704 +HG01191 0.5732249874049651 +HG01197 0.10351652834159647 +HG01198 -0.10986771760944436 +HG01200 -0.2660634544870619 +HG01204 -0.464694630514974 +HG01205 -0.649340330132664 +HG01241 0.7377542115358544 +HG01242 1.7102390787723925 +HG01247 -0.04887599452558372 +HG01248 0.6869072846148201 +HG01250 1.6122307271180065 +HG01251 -0.006269529786685435 +HG01253 -0.1535991560141787 +HG01254 0.06486385324861255 +HG01256 0.1624736133399305 +HG01257 0.12720126896016382 +HG01259 -0.5611041978979303 +HG01260 0.8452007910129891 +HG01269 0.5765633665009684 +HG01271 1.4559524762179028 +HG01272 0.11539500759997834 +HG01275 0.24790776340921294 +HG01277 -0.043472289345663884 +HG01280 -0.4625647482063142 +HG01281 -0.018876707413124416 +HG01284 -0.6796253368687399 +HG01286 -0.3044217529345804 +HG01302 -2.0884863350227922 +HG01303 -0.1895879405963826 +HG01305 -0.3639634579361043 +HG01308 -0.28096127211380345 +HG01311 -1.067006406153747 +HG01312 -0.9598078660623194 +HG01323 0.2975749948154106 +HG01325 1.2012638264667665 +HG01326 -0.8243348201288716 +HG01334 -0.1598622584409964 +HG01341 -0.19555366908325295 +HG01342 0.4205481991288811 +HG01344 -1.1857084242467257 +HG01345 0.18791668758807312 +HG01348 -0.24858056282419405 +HG01350 1.0647884665460232 +HG01351 0.3821792758340964 +HG01353 -0.7577895465340109 +HG01354 1.4973311894872399 +HG01356 0.08993696498914311 +HG01357 -0.07137276654248531 +HG01359 -0.24485034819562637 +HG01360 -0.19399268806109768 +HG01362 -0.3421934499181302 +HG01363 0.013594478789405562 +HG01365 
-1.1735883598885404 +HG01366 -0.40882640279430355 +HG01369 -0.6980593857929052 +HG01372 -0.14920281916494887 +HG01374 -0.22564984724333909 +HG01375 -1.9057224481945723 +HG01377 -0.3349897961833549 +HG01378 -0.6893113802602657 +HG01383 0.06399427332450408 +HG01384 2.0879822732079005 +HG01389 0.2905065963973552 +HG01390 -1.8161393350145354 +HG01392 0.21471482465500513 +HG01393 -0.30892617532578204 +HG01395 -2.3077217301646984 +HG01396 0.6514749358352425 +HG01398 0.4574977350848046 +HG01402 -0.45921566369595745 +HG01403 -1.1713628906570077 +HG01405 -1.1567176774100134 +HG01412 -0.21502671123675055 +HG01413 0.9924112201862092 +HG01414 -1.1139932031627564 +HG01431 0.061196914990173656 +HG01432 -0.6163207046627842 +HG01435 -0.6807933307860816 +HG01437 1.7868857357670271 +HG01438 0.9800844685974784 +HG01440 -1.0449999061114703 +HG01441 0.024951109934860716 +HG01443 0.6300148056020438 +HG01444 -0.21867101194856767 +HG01447 0.5770469936526864 +HG01455 1.0428562054430608 +HG01456 0.2710026628017298 +HG01459 0.34076449871031944 +HG01461 -0.6359349294097668 +HG01462 -1.6552750583804026 +HG01464 -0.0695625090644314 +HG01465 2.0255787662136453 +HG01468 0.26621377307968563 +HG01474 -0.2106567704185593 +HG01479 -0.44983305929705014 +HG01485 0.14488375652371077 +HG01486 0.2543365153611554 +HG01488 -0.213711743816126 +HG01489 1.4281623092837072 +HG01491 0.8024576594075794 +HG01492 0.23204715616732297 +HG01494 -0.371881854159044 +HG01495 0.8919333998823771 +HG01497 -1.4175503353096701 +HG01498 2.9357075502684578 +HG01500 -0.8874043521741435 +HG01501 -1.0720726873652815 +HG01503 -1.3571194490760414 +HG01504 -0.49892894310137426 +HG01506 -0.03931832059904905 +HG01507 -0.9412032420446552 +HG01509 0.31813781483934755 +HG01510 0.3508669576237833 +HG01512 1.4729719731022497 +HG01513 -0.984343032601463 +HG01515 0.46543101076195226 +HG01516 0.4460419126941211 +HG01518 -0.30959991646573015 +HG01519 -0.9398263010270295 +HG01521 -1.056578331271975 +HG01522 -1.4239638772246848 +HG01524 0.28144387350616684 +HG01525 0.7343342379454842 +HG01527 0.901922144462221 +HG01528 -0.08574761688772858 +HG01530 1.3662470368018946 +HG01531 0.9365544794916194 +HG01536 -1.423792342169354 +HG01537 1.975297967956009 +HG01550 -0.1595864688955843 +HG01551 0.3876305272480949 +HG01556 0.6011569906689535 +HG01565 -0.4401050318778242 +HG01566 -0.25402628945193184 +HG01571 -0.13603079578388413 +HG01572 -1.0867770031518553 +HG01577 -0.6069600379514265 +HG01578 -0.2527960568553665 +HG01583 -0.27998804624640916 +HG01586 0.3367042336962939 +HG01589 0.5992709593147879 +HG01593 -0.5782306881740438 +HG01595 0.6188787969948111 +HG01596 -0.5319272035027639 +HG01597 0.4633309062649581 +HG01598 -0.14212038865574916 +HG01599 -0.9693440536402446 +HG01600 1.329534410844258 +HG01602 -0.7027942179755869 +HG01603 1.3258050259785459 +HG01605 0.7583742247627854 +HG01606 -0.03806140482527687 +HG01607 0.5228190851369188 +HG01608 0.5614434937244929 +HG01610 -2.154321123984033 +HG01612 -0.770020813211133 +HG01613 -0.057682039549263764 +HG01615 0.17553713718651348 +HG01617 0.8680841503716705 +HG01618 -0.09365334577581458 +HG01619 0.7567266158704564 +HG01620 -0.15316680848114536 +HG01623 -0.07167547502727623 +HG01624 -0.7098691112479755 +HG01625 -0.5376565585030909 +HG01626 -1.1934123893983926 +HG01628 -1.3137578792784572 +HG01630 -0.3885220706417852 +HG01631 0.06380003793158934 +HG01632 0.9882096449012903 +HG01668 -1.2272834638886467 +HG01669 1.3148086098043614 +HG01670 -0.7194148905373262 +HG01672 -0.6727658059545862 +HG01673 1.104059086797995 +HG01675 
-0.05270769798165276 +HG01676 0.026817985814787115 +HG01678 -1.3430039189000706 +HG01679 2.26409616179076 +HG01680 -0.18251160976200353 +HG01682 -2.3800023943973114 +HG01684 -0.5838253170821261 +HG01685 -0.2624810541679711 +HG01686 0.6676106843315626 +HG01694 -1.4791573816281445 +HG01695 0.07375590012010846 +HG01697 -0.674513249152488 +HG01699 0.07580824435391786 +HG01700 -0.6064964653997095 +HG01702 0.46059787936145224 +HG01704 1.5242524603163903 +HG01705 -1.2629813567475658 +HG01707 -0.3341063588736711 +HG01708 0.26000648605015186 +HG01709 1.4530481156739392 +HG01710 1.3905686564870718 +HG01746 0.7199055806858283 +HG01747 -2.055455595648907 +HG01756 2.0605421164194477 +HG01757 1.0498093758574174 +HG01761 0.9819684152254962 +HG01762 -1.0015790396866084 +HG01765 1.3718804568385248 +HG01766 0.08083252031872545 +HG01767 -0.6699710837703073 +HG01768 0.32734826418883145 +HG01770 -0.29685780134204226 +HG01771 -0.7126405956209039 +HG01773 -0.1826718106288978 +HG01775 -0.3284833437322132 +HG01776 0.32753059127823597 +HG01777 -0.16288878224321937 +HG01779 -1.7150146316175758 +HG01781 1.1251339075733333 +HG01783 0.04474406200438674 +HG01784 -0.14884787663011023 +HG01785 0.3076492887242234 +HG01786 0.9799621694745567 +HG01789 -0.7007101666193455 +HG01790 -0.32123129675212875 +HG01791 0.7551301160304824 +HG01794 1.3310246063851228 +HG01795 0.9876089909632227 +HG01796 -1.7025563253400566 +HG01797 -0.20852960737783074 +HG01798 1.0381327312289705 +HG01799 -0.5763532553403252 +HG01800 0.6792764360942509 +HG01801 -1.0825184866495092 +HG01802 -0.8551577350061023 +HG01804 0.06947664819157517 +HG01805 -2.0075674260015806 +HG01806 -0.9769109706355256 +HG01807 2.235744016243585 +HG01808 1.1261728493320944 +HG01809 -0.16532081221444295 +HG01810 0.7993077856593014 +HG01811 1.008630911516867 +HG01812 0.7987820324660475 +HG01813 -1.1010309129607947 +HG01815 0.0941042900031055 +HG01816 0.07479391251206335 +HG01817 0.039881199213996577 +HG01840 0.0008107907883523335 +HG01841 -1.7886038967713078 +HG01842 2.057534865314539 +HG01843 1.1813704618564589 +HG01844 -1.3070428653364963 +HG01845 -0.10415054047715061 +HG01846 -1.6347736522854883 +HG01847 0.5981082049215186 +HG01848 0.49950714611498737 +HG01849 -0.5765966932048827 +HG01850 2.2319784225108634 +HG01851 -1.20065235030314 +HG01852 -2.693691429962721 +HG01853 -0.4551628489456577 +HG01855 0.010416563200790374 +HG01857 -0.872012578333183 +HG01858 -0.00406205758991629 +HG01859 0.1221464121021395 +HG01860 -1.21101586360234 +HG01861 -1.74648042284672 +HG01862 1.4403951528372323 +HG01863 0.6952367542086773 +HG01864 -0.5261923337560288 +HG01865 -1.7138386295283845 +HG01866 -0.9079108023365258 +HG01867 1.3602716532873491 +HG01868 -0.1417979917964054 +HG01869 -0.8708257438590182 +HG01870 -0.04784419231061443 +HG01871 -0.4825374843931246 +HG01872 -1.4039256375821905 +HG01873 -0.08833353399627719 +HG01874 0.876278638618728 +HG01878 0.7726110474957442 +HG01879 2.0191601784878435 +HG01880 -0.39752922689275405 +HG01882 0.7546151453166463 +HG01883 0.157970379027911 +HG01885 0.41436692431391553 +HG01886 -0.10523086231530376 +HG01889 -0.2814716907530671 +HG01890 -0.8412892446081937 +HG01892 -0.2663219512249372 +HG01893 -0.5026892598671168 +HG01894 -1.1154829810938125 +HG01896 -0.7017998643291423 +HG01912 0.2970673421544838 +HG01914 0.6118689092417687 +HG01915 -0.6479071868113228 +HG01917 0.07777768776857702 +HG01918 -0.14628277487012908 +HG01920 1.7681672680378433 +HG01921 -0.5881650245475535 +HG01923 0.760405468975605 +HG01924 -0.2889807756235889 +HG01926 0.43621226021683823 
+HG01927 -0.23730446674064543 +HG01932 -0.43699599580467485 +HG01933 0.6528533731003703 +HG01935 -0.7475526354879827 +HG01936 1.5094554510014961 +HG01938 0.27680408013140567 +HG01939 0.2055593358165455 +HG01941 2.1313080055667206 +HG01942 -0.588644766506938 +HG01944 1.4556856718913858 +HG01945 0.4734631906831199 +HG01947 -2.0126453915253872 +HG01948 -0.9430370879616923 +HG01950 2.057970260358122 +HG01951 0.3662051249610203 +HG01953 1.6481910566574476 +HG01954 -0.19638725549125463 +HG01956 -0.6689727087737826 +HG01958 0.191203691445877 +HG01961 0.5246736326505623 +HG01965 -0.24383069116823602 +HG01967 0.04427224701346849 +HG01968 -0.8908742825629983 +HG01970 -1.6463329063438394 +HG01971 0.9858644361877709 +HG01973 -0.10704467374751558 +HG01974 -0.9699953853969524 +HG01976 -0.062087789341763455 +HG01977 0.46202489031645044 +HG01979 1.0267036217537027 +HG01980 -0.4526578880597739 +HG01982 -0.8737852556329553 +HG01985 0.23359501037722807 +HG01986 3.1371193492918787 +HG01988 -0.5475025726786117 +HG01989 0.47435323460452494 +HG01990 -0.7465050935474141 +HG01991 -0.7788253215478809 +HG01992 0.08730436546292941 +HG01997 2.3062118697618788 +HG02002 0.10162781629363232 +HG02003 0.15859012616985357 +HG02006 1.6777132942618456 +HG02008 0.663470945207608 +HG02009 1.3060371169640814 +HG02010 -0.13844724424252397 +HG02012 1.2101340707437047 +HG02013 -0.8861603038151078 +HG02014 -0.04874235699909152 +HG02016 1.010456347263374 +HG02017 0.8203727150068628 +HG02019 0.04450042815337485 +HG02020 0.8754819935860505 +HG02023 -1.5485864250277281 +HG02025 -0.5231651040972453 +HG02026 1.306239127064317 +HG02028 1.1220011665698573 +HG02029 -1.3101376762530863 +HG02031 0.5318604881477949 +HG02032 -1.454155235116148 +HG02035 -0.1240485386500875 +HG02040 0.36235164987935087 +HG02047 0.037354545301426845 +HG02048 -0.8901052017075429 +HG02049 -0.8902400343183031 +HG02050 0.15737609529255614 +HG02051 0.36827034879898163 +HG02052 -1.1733976935691066 +HG02053 -0.1942191451687208 +HG02054 0.42557959365741316 +HG02057 0.6864527381317347 +HG02058 -0.7021161310752574 +HG02060 0.25330874383085666 +HG02061 1.0197142120800042 +HG02064 -0.39158335442623227 +HG02067 -0.8839840865164575 +HG02069 -1.0947804286230107 +HG02070 -0.4746157067039667 +HG02072 -0.5444193957629733 +HG02073 1.2416663295950512 +HG02075 -0.11171153408631879 diff --git a/tests/test_examples.py b/tests/test_examples.py index 5084f8b..1604532 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -684,6 +684,26 @@ def test_1000G_simulated(capfd): assert filecmp.cmp(hp_file, out_hp_file) +def test_1000G_simulated_multihap(capfd): + """ + Test using simulated data from the 1000G dataset + + In this case, more than one haplotype is causal + """ + gt_file = DATADIR / "19_45401409-46401409_1000G.pgen" + pt_file = DATADIR / "19_45401409-46401409_1000G.multi.pheno" + hp_file = DATADIR / "19_45401409-46401409_1000G.multi.hap" + out_hp_file = "test.hap" + + cmd = f"run --remove-SNPs --max-signals 3 --max-iterations 3 -o {out_hp_file} {gt_file} {pt_file}" + runner = CliRunner() + result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) + captured = capfd.readouterr() + assert captured.out == "" + assert result.exit_code == 0 + assert filecmp.cmp(hp_file, out_hp_file) + + def test_1000G_simulated_maf(capfd): """ Test MAF filtering using simulated data from the 1000G dataset From c937bb6e9ba17c6d5d4691d7006a1345b39fda3c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:02:18 -0700 Subject: 
[PATCH 02/41] use new config file for simulations

---
 analysis/config/config-simulations.yml | 84 ++++++++++++++++++++++++++
 analysis/config/config.yml             |  2 +-
 2 files changed, 85 insertions(+), 1 deletion(-)
 create mode 100644 analysis/config/config-simulations.yml

diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml
new file mode 100644
index 0000000..5b05122
--- /dev/null
+++ b/analysis/config/config-simulations.yml
@@ -0,0 +1,84 @@
+# This is the Snakemake configuration file that specifies paths and options
+# for the pipeline. Anybody wishing to use the provided snakemake pipeline
+# should first fill out this file with paths to their own data, as the
+# Snakefile requires it.
+# Every config option has reasonable defaults unless it is labeled as "required."
+# All paths are relative to the directory that Snakemake is executed in.
+# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/)
+
+
+# Paths to a SNP-STR haplotype reference panel
+# You can download this from http://gymreklab.com/2018/03/05/snpstr_imputation.html
+# If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}"
+# The VCF(s) must be sorted and indexed (with a .tbi file in the same directory)
+# required!
+snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen"
+# str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz"
+
+# Path to a list of samples to exclude from the analysis
+# There should be one sample ID per line
+# exclude_samples: data/ukb_random_samples_exclude.tsv
+
+# If SNPs are unphased, provide the path to a SHAPEIT4 map file like these:
+# https://github.com/odelaneau/shapeit4/tree/master/maps
+# The map file should use the same reference genome as the reference panel VCFs
+# phase_map: data/genetic_maps/chr{chr}.b37.gmap.gz
+
+# A "locus" is a string with a contig name, a colon, the start position, a dash, and
+# the end position, or a BED file with a ".bed" file ending
+# There are different simulation modes that you can use:
+# 1. "str" - a tandem repeat, specified as a string with a contig name, a colon, and the start position
+# 2. "snp" - a SNP, which follows the same format as "str"
+# 3. "hap" - a haplotype
+# 4. "ld_range" - creates random two-SNP haplotypes with a range of LD values between the alleles of each haplotype
+# 5. "run" - execute happler on a locus without simulating anything
+# The STR and SNP positions should be contained within the locus.
+# The positions should be provided in the same coordinate system as the reference
+# genome of the reference panel VCFs
+# The contig should correspond with the contig name from the {chr} wildcard in the VCF
+# required!
and unless otherwise noted, all attributes of each mode are required +locus: 19:45401409-46401409 # center: 45901409 (APOe4) +modes: + str: + pos: 19:45903857 # STR_691361 + snp: + pos: 19:45910672 # rs1046282 + hap: + alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] + ld_range: + reps: 1 + min_ld: 0 + max_ld: 1 + step: 0.1 + min_af: 0.25 + max_af: 0.75 + # beta: [0.35] + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] + alpha: [0.05] + random: false # whether to also produce random haplotypes + run: + pheno: data/geuvadis/phenos/{trait}.pheno + SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen + # pheno_matrix: data/geuvadis/EUR_converted_expr_hg38.csv # optional +mode: ld_range + +# Covariates to use if they're needed +# Otherwise, they're assumed to be regressed out of the phenotypes +# Note: the finemapping methods won't be able to handle these covariates +# covar: data/geuvadis/5PCs_sex.covar + +# Discard rare variants with a MAF below this number +# Defaults to 0 if not specified +min_maf: 0.05 + +# Sample sizes to use +# sample_size: [500, 1000, 1500, 2000, 2500] +sample_size: 777 + +# Whether to include the causal variant in the set of genotypes provided to the +# finemapping methods. Set this to true if you're interested in seeing how the +# methods perform when the causal variant is absent from the data. +# Defaults to false if not specified +# You can also provide a list of booleans, if you want to test both values +exclude_causal: [true, false] diff --git a/analysis/config/config.yml b/analysis/config/config.yml index 34462af..51c999d 120000 --- a/analysis/config/config.yml +++ b/analysis/config/config.yml @@ -1 +1 @@ -config-ukb.yml \ No newline at end of file +config-simulations.yml \ No newline at end of file From 0bcf1b292b4d7fa6f75154c0b9e915c182ff7ff1 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:11:06 -0700 Subject: [PATCH 03/41] use new --max-signals param in happler --- analysis/workflow/rules/happler.smk | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index 08cc773..10e879c 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -36,6 +36,8 @@ rule run: covar=lambda wildcards, input: ("--covar " + input["covar"] + " ") if check_config("covar") else "", maf = check_config("min_maf", 0), indep=lambda wildcards: 0.05 if "indep_alpha" not in wildcards else wildcards.indep_alpha, + max_signals=3, + max_iterations=3, output: hap=out + "/happler.hap", gz=out + "/happler.hap.gz", @@ -62,8 +64,10 @@ rule run: "happler" shell: "happler run -o {output.hap} --verbosity DEBUG --maf {params.maf} " - "--discard-multiallelic --region {params.region} {params.covar} --indep-thresh" - " {params.indep} -t {params.thresh} --show-tree {input.gts} {input.pts} &>{log}" + "--max-signals {params.max_signals} --max-iterations {params.max_iterations} " + "--discard-multiallelic --remove-SNPs --region {params.region}" + " {params.covar} --indep-thresh {params.indep} -t {params.thresh} " + "--show-tree {input.gts} {input.pts} &>{log}" " && haptools index -o {output.gz} {output.hap} &>>{log}" From 7a84684a3d95c0a316db5072543bfefa6e717b3c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:11:23 -0700 Subject: [PATCH 04/41] make 
simulate workflow reproducible --- analysis/workflow/rules/simulate.smk | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/analysis/workflow/rules/simulate.smk b/analysis/workflow/rules/simulate.smk index 7a9dbc3..e1071c1 100644 --- a/analysis/workflow/rules/simulate.smk +++ b/analysis/workflow/rules/simulate.smk @@ -31,13 +31,15 @@ checkpoint create_hap_ld_range: gts_pvar=Path(config["gts_snp_panel"]).with_suffix(".pvar"), gts_psam=Path(config["gts_snp_panel"]).with_suffix(".psam"), params: - reps = config["modes"]["ld_range"]["reps"], min_ld = config["modes"]["ld_range"]["min_ld"], max_ld = config["modes"]["ld_range"]["max_ld"], step = config["modes"]["ld_range"]["step"], min_af = config["modes"]["ld_range"]["min_af"], max_af = config["modes"]["ld_range"]["max_af"], - out = lambda wildcards: hap_ld_range_output, + out = lambda wildcards: expand( + hap_ld_range_output, **wildcards, allow_missing=True, + ), + seed = 42, output: hap=directory(out + "/create_ld_range") resources: @@ -49,9 +51,9 @@ checkpoint create_hap_ld_range: conda: "happler" shell: - "workflow/scripts/choose_different_ld.py -r {params.reps} " + "workflow/scripts/choose_different_ld.py " "--min-ld {params.min_ld} --max-ld {params.max_ld} " - "--step {params.step} --min-af {params.min_af} " + "--step {params.step} --min-af {params.min_af} --seed {params.seed} " "--max-af {params.max_af} {input.gts} {params.out} &> {log}" @@ -115,6 +117,7 @@ rule simphenotype: psam=rules.transform.output.psam, params: beta=lambda wildcards: wildcards.beta, + seed = 42, output: pheno=out + "/{beta}.pheno", resources: @@ -126,5 +129,5 @@ rule simphenotype: conda: "happler" shell: - "haptools simphenotype -o {output.pheno} {input.pgen} " + "haptools simphenotype --seed {params.seed} -o {output.pheno} {input.pgen} " "<( sed 's/\\t0.99$/\\t{params.beta}/' {input.hap} ) &>{log}" From 0431d4bf2abbd3ff10ef3371642dd178df654a8a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:12:36 -0700 Subject: [PATCH 05/41] output multiple haps in choose_different_ld --- .../workflow/scripts/choose_different_ld.py | 45 ++++++++++--------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/analysis/workflow/scripts/choose_different_ld.py b/analysis/workflow/scripts/choose_different_ld.py index 82f7fd1..befe50b 100755 --- a/analysis/workflow/scripts/choose_different_ld.py +++ b/analysis/workflow/scripts/choose_different_ld.py @@ -17,11 +17,11 @@ def find_haps( log: Logger, min_ld: float, max_ld: float, - reps: int = 1, + num_haps: int = 1, step: float = 0.1, seed: np.random.Generator = np.random.default_rng(None), ): - log.info(f"Using min_ld {min_ld}, max_ld {max_ld}, reps {reps}, and step {step}") + log.info(f"Using min_ld {min_ld}, max_ld {max_ld}, num_haps {num_haps}, and step {step}") sum_gts = gts.data.sum(axis=2) chrom = np.unique(gts.variants["chrom"]) @@ -29,8 +29,8 @@ def find_haps( chrom = chrom[0] intervals = np.arange(max(0, min_ld), min(1, max_ld), step) + step - ld_bins = {idx: 0 for idx in range(len(intervals))} - count = len(intervals) * reps + ld_bins = {idx: [] for idx in range(len(intervals))} + count = len(intervals) * num_haps combos = np.array(list(range(len(gts.variants)))) seed.shuffle(combos) @@ -51,10 +51,11 @@ def find_haps( combo_ld = np.abs(pearson_corr_ld(*tuple(sum_gts[:, snp_id] for snp_id in combo_id))) # which bin does this LD value fall in? 
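+            # `intervals` holds the right edge of each LD bin, so np.argmax
+            # below picks the first bin whose upper bound exceeds this LD value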
bin_idx = np.argmax(combo_ld < intervals) - if ld_bins[bin_idx] < reps: - log.info(f"Outputting {hp_id} with LD {combo_ld} for bin {intervals[bin_idx]}") - yield combo_ld, hp - ld_bins[bin_idx] += 1 + if len(ld_bins[bin_idx]) < num_haps: + log.info(f"Adding {hp_id} with LD {combo_ld} to bin {intervals[bin_idx]}") + ld_bins[bin_idx].append((combo_ld, hp)) + if len(ld_bins[bin_idx]) == num_haps: + yield ld_bins[bin_idx] count -= 1 break if count == 0: @@ -64,14 +65,6 @@ def find_haps( @click.command() @click.argument("gts", type=click.Path(exists=True, path_type=Path)) @click.argument("output", type=click.Path(path_type=Path)) -@click.option( - "-r", - "--reps", - type=int, - default=1, - show_default=True, - help="The number of replicates to perform within each LD bin", -) @click.option( "--min-ld", type=float, @@ -107,6 +100,13 @@ def find_haps( show_default=True, help="The maximum LD value to allow", ) +@click.option( + "--num-haps", + type=int, + default=1, + show_default=True, + help="The number of haplotypes to output", +) @click.option( "--seed", type=int, @@ -125,12 +125,12 @@ def find_haps( def main( gts: Path, output: Path, - reps: int = 1, min_ld: float = 0, max_ld: float = 1, step: float = 0.1, min_af: float = 0.25, max_af: float = 0.75, + num_haps: int = 1, seed: int = None, verbosity: str = "DEBUG", ): @@ -138,7 +138,7 @@ def main( Create haplotypes with a range of LD values between their alleles LD values are injected into the output from brace expressions - For example, if we are given a a path like "ld_{ld}/happler.hap", then + For example, if we are given a path like "ld_{ld}/happler.hap", then we will replace {ld} with the provided ld value """ log = getLogger("choose", verbosity) @@ -155,21 +155,22 @@ def main( seed = np.random.default_rng(seed) - for combo_ld, hp in find_haps( + for haps in find_haps( gts, log, min_ld=min_ld, max_ld=max_ld, - reps=reps, + num_haps=num_haps, step=step, seed=seed, ): - out_path = Path(str(output).format(ld=round(combo_ld, 2))) + avg_combo_ld = np.mean([combo_ld for combo_ld, hp in haps]) + out_path = Path(str(output).format(ld=round(avg_combo_ld, 2))) out_path.parent.mkdir(parents=True, exist_ok=True) hps = Haplotypes( out_path, log=log, haplotype=Haplotype, ) - hps.data = {hp.id: hp} + hps.data = {hp.id: hp for combo_ld, hp in haps} hps.write() From c1dbe5bcb62dd065cfde7427976714c32eb49741 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:22:02 -0700 Subject: [PATCH 06/41] try (and fail) to import graphviz trees TODO: fix this later --- happler/__main__.py | 2 +- happler/tree/forest_builder.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/happler/__main__.py b/happler/__main__.py index 4b7d83a..86db884 100644 --- a/happler/__main__.py +++ b/happler/__main__.py @@ -377,7 +377,7 @@ def run( dot_output = dot_output.with_suffix(".dot") log.info("Writing tree to dot file") with open(dot_output, "w") as dot_file: - dot_file.write(hap_tree.dot()) + dot_file.write(forest.dot()) @main.command(context_settings=CONTEXT_SETTINGS) diff --git a/happler/tree/forest_builder.py b/happler/tree/forest_builder.py index ee03c3e..e4d235b 100644 --- a/happler/tree/forest_builder.py +++ b/happler/tree/forest_builder.py @@ -105,3 +105,27 @@ def get_residuals(self, tree_idx: int) -> Phenotypes: self.log.debug(f"Computing residuals for tree {tree_idx}") resids.data = sm.OLS(self.phenotypes.data, sm.add_constant(effects)).fit().resid[:, np.newaxis] return 
resids
+
+    def __repr__(self):
+        return self.dot()
+
+    def dot(self) -> str:
+        """
+        Convert the trees to a representation in the dot language
+        This is useful for quickly viewing the forest on the command line
+
+        Returns
+        -------
+        str
+            A string representing the trees
+
+            Nodes are labeled by their variant ID and edges are labeled by their allele
+        """
+        trees_str = "strict digraph {\nforcelabels=true;\nrankdir=TB;\n"
+        for idx, tree in enumerate(self.trees):
+            if tree is None:
+                continue
+            trees_str += f"subgraph tree_{idx}"+" {\nlabel=\""+f"Tree {idx}"+"\";\n"
+            trees_str += "\n".join(tree.dot().split("\n")[2:])
+            trees_str += "}"
+        return trees_str

From 7a589bb9686156784fab0946f2cae13c50cf82c9 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Mon, 30 Sep 2024 14:22:58 -0700
Subject: [PATCH 07/41] simply output an empty file when all haps are SNPs

---
 happler/__main__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/happler/__main__.py b/happler/__main__.py
index 86db884..af01260 100644
--- a/happler/__main__.py
+++ b/happler/__main__.py
@@ -367,6 +367,11 @@ def run(
             hap.id = f"H{hap_id}"
             haplotypes.data[hap.id] = hap
             hap_id += 1
+    if haplotypes.data == {}:
+        log.critical("No haplotypes were found")
+        with open(output, "w") as hap_file:
+            pass
+        return 1
     haplotypes.index()
     haplotypes.write()
     if show_tree:

From 51766a5c44aafd55d1b76bfd37e8496041446d8e Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Mon, 30 Sep 2024 16:27:04 -0700
Subject: [PATCH 08/41] enable graphviz output for multiple trees

---
 happler/__main__.py            |  8 ++++----
 happler/tree/forest_builder.py | 21 +++++++++++++++++++--
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/happler/__main__.py b/happler/__main__.py
index af01260..ea0f171 100644
--- a/happler/__main__.py
+++ b/happler/__main__.py
@@ -371,9 +371,9 @@ def run(
         log.critical("No haplotypes were found")
         with open(output, "w") as hap_file:
             pass
-        return 1
-    haplotypes.index()
-    haplotypes.write()
+    else:
+        haplotypes.index()
+        haplotypes.write()
     if show_tree:
         dot_output = output
         if Path(output) != Path("/dev/stdout"):
@@ -382,7 +382,7 @@ def run(
             dot_output = dot_output.with_suffix(".dot")
         log.info("Writing tree to dot file")
         with open(dot_output, "w") as dot_file:
-            dot_file.write(forest.dot())
+            dot_file.write(forest.dot(remove_singletons=remove_snps))

diff --git a/happler/tree/forest_builder.py b/happler/tree/forest_builder.py
index e4d235b..cecce98 100644
--- a/happler/tree/forest_builder.py
+++ b/happler/tree/forest_builder.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+import re
 from logging import Logger
 
 import numpy as np
-    def dot(self) -> str:
+    def dot(self, remove_singletons: bool = False) -> str:
         """
         Convert the trees to a representation in the dot language
         This is useful for quickly viewing the forest on the command line
 
+        Parameters
+        ----------
+        remove_singletons: bool
+            Whether to ignore trees composed of only a single SNP
+
         Returns
         -------
         str
             A string representing the trees
 
             Nodes are labeled by their variant ID and edges are labeled by their allele
         """
+        # match the integer node IDs in the dot output so they can be renumbered
+        node_pattern = r"(?<=[\s;{])(\d+)(?=\s*(?:\[|->|;))"
+        max_node_id = 0
         trees_str = "strict digraph {\nforcelabels=true;\nrankdir=TB;\n"
         for idx, tree in enumerate(self.trees):
             if tree is None:
                 continue
+            tree_str = tree.dot().split("\n")[2:]
+            if
remove_singletons and len(tree_str) <= 5: + continue trees_str += f"subgraph tree_{idx}"+" {\nlabel=\""+f"Tree {idx}"+"\";\n" - trees_str += "\n".join(tree.dot().split("\n")[2:]) + tree_str = "\n".join(tree_str) + # increment the node IDs to ensure they remain unique across all trees + increment = lambda match: str(int(match.group(1))+max_node_id) + trees_str += re.sub(node_pattern, increment, tree_str) + # what was the greatest node ID in this tree? + # increment the next tree's IDs by that number + 1 if there were any nodes + try: + max_node_id = max(map(int, re.findall(node_pattern, tree_str))) + max_node_id + 1 + except ValueError: + pass trees_str += "}" return trees_str From 7cd90cb0d4df3c6ccf487ffebc7c5550ab3a85db Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 7 Oct 2024 12:51:23 -0700 Subject: [PATCH 09/41] simulate multiple haps as well --- analysis/config/config-simulations.yml | 1 + analysis/workflow/Snakefile | 8 +++- analysis/workflow/rules/happler.smk | 2 +- analysis/workflow/rules/plots.smk | 11 +++++- analysis/workflow/rules/simulate.smk | 11 +++--- .../workflow/scripts/choose_different_ld.py | 37 ++++++++++++++----- analysis/workflow/scripts/parameter_plot.py | 1 + 7 files changed, 51 insertions(+), 20 deletions(-) diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml index 5b05122..35bb959 100644 --- a/analysis/config/config-simulations.yml +++ b/analysis/config/config-simulations.yml @@ -57,6 +57,7 @@ modes: beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] alpha: [0.05] random: false # whether to also produce random haplotypes + num_haps: [1, 2, 3, 4] run: pheno: data/geuvadis/phenos/{trait}.pheno SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile index df56b8d..c9ffb76 100644 --- a/analysis/workflow/Snakefile +++ b/analysis/workflow/Snakefile @@ -59,6 +59,10 @@ else: config["gts_snp_panel"] = rules.genotypes_subset.input.pgen config["random"] = False +if mode == "ld_range" and check_config("num_haps", place=config["modes"][mode]): + if isinstance(config["modes"][mode]["num_haps"], int): + config["modes"][mode]["num_haps"] = [config["modes"][mode]["num_haps"],] + if mode != "run": module simulate: snakefile: "rules/simulate.smk" @@ -106,7 +110,7 @@ if mode in ("snp", "hap", "ld_range"): happler_config["snps"] = config["modes"][mode]["alleles"] happler_config["causal_gt"] = rules.simulate_transform.output if mode == "ld_range": - happler_config["out"] = config["out"] + "/happler/"+mode+"/ld_{ld}/beta_{beta}/alpha_{alpha}" + happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}" elif mode == "run": happler_config = { "pheno": config["modes"][mode]["pheno"], @@ -133,7 +137,7 @@ merged_happler = rules.happler_merge.output.pgen if mode == "ld_range" and config["modes"][mode]["random"]: happler_config["random"] = rules.simulate_random_transform.output happler_config["random_hap"] = rules.simulate_random_transform.input.hap - happler_config["out"] = config["out"] + "/happler/"+mode+"/random/ld_{ld}/beta_{beta}/alpha_{alpha}" + happler_config["out"] = config["out"] + "/happler/"+mode+"/random/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}" module happler_random: snakefile: "rules/happler.smk" config: happler_config diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index 10e879c..d6e1c4a 
100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -65,7 +65,7 @@ rule run: shell: "happler run -o {output.hap} --verbosity DEBUG --maf {params.maf} " "--max-signals {params.max_signals} --max-iterations {params.max_iterations} " - "--discard-multiallelic --remove-SNPs --region {params.region}" + "--discard-multiallelic --region {params.region}" " {params.covar} --indep-thresh {params.indep} -t {params.thresh} " "--show-tree {input.gts} {input.pts} &>{log}" " && haptools index -o {output.gz} {output.hap} &>>{log}" diff --git a/analysis/workflow/rules/plots.smk b/analysis/workflow/rules/plots.smk index 2ef6632..e4c67f0 100644 --- a/analysis/workflow/rules/plots.smk +++ b/analysis/workflow/rules/plots.smk @@ -1,3 +1,4 @@ +import sys from pathlib import Path out = config["out"] @@ -12,8 +13,8 @@ bench += "/" + mode def agg_ld(wildcards): """ a helper function for the other agg functions """ checkpoint_output = config["ld_range_checkpoint"].get(**wildcards).output.hap - ld_vals = glob_wildcards(Path(checkpoint_output) / "ld_{ld}/haplotype.hap").ld - return checkpoint_output, ld_vals + ld_vals = glob_wildcards(Path(checkpoint_output) / "{num_haps}_haps/ld_{ld}/haplotype.hap").ld + return checkpoint_output, sorted(list(set(ld_vals))) def agg_ld_range_obs(wildcards): """ return a list of hap files from happler """ @@ -24,12 +25,14 @@ def agg_ld_range_obs(wildcards): ld=ld_vals, alpha=config["mode_attrs"]["alpha"], beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], **wildcards, ) else: return expand( config["happler_hap"], beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], **wildcards, ) @@ -41,12 +44,14 @@ def agg_ld_range_causal(wildcards): str(checkpoint_output), ld=ld_vals, beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], **wildcards, ) else: return expand( config["causal_hap"], beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], **wildcards, ) @@ -58,6 +63,7 @@ def agg_ld_range_metrics(wildcards): config["happler_metrics"], ld=ld_vals, beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], alpha=config["mode_attrs"]["alpha"], **wildcards, ) @@ -65,6 +71,7 @@ def agg_ld_range_metrics(wildcards): return expand( config["happler_metrics"], beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], **wildcards, ) diff --git a/analysis/workflow/rules/simulate.smk b/analysis/workflow/rules/simulate.smk index e1071c1..a56aed3 100644 --- a/analysis/workflow/rules/simulate.smk +++ b/analysis/workflow/rules/simulate.smk @@ -22,7 +22,7 @@ def parse_locus(locus): start = locus.split("_")[1].split("-")[0] return chrom, start, end -hap_ld_range_output = out + "/create_ld_range/ld_{ld}/haplotype.hap" +hap_ld_range_output = out + "/create_ld_range/{num_haps}_haps/ld_{ld}/haplotype.hap" checkpoint create_hap_ld_range: """ create a hap file suitable for haptools transform and simphenotype """ @@ -36,6 +36,7 @@ checkpoint create_hap_ld_range: step = config["modes"]["ld_range"]["step"], min_af = config["modes"]["ld_range"]["min_af"], max_af = config["modes"]["ld_range"]["max_af"], + num_haps = "-n " + " -n ".join(map(str, config["modes"]["ld_range"]["num_haps"])), out = lambda wildcards: expand( hap_ld_range_output, **wildcards, allow_missing=True, ), @@ -51,7 +52,7 @@ checkpoint create_hap_ld_range: conda: "happler" shell: - "workflow/scripts/choose_different_ld.py " + "workflow/scripts/choose_different_ld.py 
{params.num_haps} " "--min-ld {params.min_ld} --max-ld {params.max_ld} " "--step {params.step} --min-af {params.min_af} --seed {params.seed} " "--max-af {params.max_af} {input.gts} {params.out} &> {log}" @@ -81,9 +82,9 @@ rule create_hap: "../scripts/create_hap_file.sh" if mode == "ld_range": - out += "/pheno/ld_{ld}" - logs += "/pheno/ld_{ld}" - bench += "/pheno/ld_{ld}" + out += "/pheno/{num_haps}_haps/ld_{ld}" + logs += "/pheno/{num_haps}_haps/ld_{ld}" + bench += "/pheno/{num_haps}_haps/ld_{ld}" rule transform: """ use the hap file to create a pgen of the haplotype """ diff --git a/analysis/workflow/scripts/choose_different_ld.py b/analysis/workflow/scripts/choose_different_ld.py index befe50b..f09dd9b 100755 --- a/analysis/workflow/scripts/choose_different_ld.py +++ b/analysis/workflow/scripts/choose_different_ld.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from typing import Tuple from pathlib import Path from logging import Logger from itertools import product @@ -29,6 +30,7 @@ def find_haps( chrom = chrom[0] intervals = np.arange(max(0, min_ld), min(1, max_ld), step) + step + # we return lists of haplotypes: one list for each bin ld_bins = {idx: [] for idx in range(len(intervals))} count = len(intervals) * num_haps @@ -101,11 +103,16 @@ def find_haps( help="The maximum LD value to allow", ) @click.option( + "-n", "--num-haps", type=int, - default=1, + multiple=True, + default=(1,), show_default=True, - help="The number of haplotypes to output", + help=( + "The number of haplotypes to output in each hap file. " + "If multiple, we will also match on the {num_haps} brace expression" + ), ) @click.option( "--seed", @@ -130,7 +137,7 @@ def main( step: float = 0.1, min_af: float = 0.25, max_af: float = 0.75, - num_haps: int = 1, + num_haps: Tuple[int] = (1,), seed: int = None, verbosity: str = "DEBUG", ): @@ -155,23 +162,33 @@ def main( seed = np.random.default_rng(seed) + if "{num_haps}" not in str(output) and len(num_haps) > 1: + log.warning( + "Multiple num_haps values provided, but {num_haps} brace expression is " + "absent from output path. Only the last --num-haps value will be used." 
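+            # in that case, every value of num_haps formats to the same path,
+            # so later values overwrite the files written by earlier ones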
+ ) + for haps in find_haps( gts, log, min_ld=min_ld, max_ld=max_ld, - num_haps=num_haps, + num_haps=max(num_haps), step=step, seed=seed, ): avg_combo_ld = np.mean([combo_ld for combo_ld, hp in haps]) - out_path = Path(str(output).format(ld=round(avg_combo_ld, 2))) - out_path.parent.mkdir(parents=True, exist_ok=True) - hps = Haplotypes( - out_path, log=log, haplotype=Haplotype, + out_path_temp = Path(str(output).format( + ld=round(avg_combo_ld, 2), num_haps="{num_haps}"), ) - hps.data = {hp.id: hp for combo_ld, hp in haps} - hps.write() + for num_hap in num_haps: + out_path = Path(str(out_path_temp).format(num_haps=num_hap)) + out_path.parent.mkdir(parents=True, exist_ok=True) + hps = Haplotypes( + out_path, log=log, haplotype=Haplotype, + ) + hps.data = {hp.id: hp for combo_ld, hp in haps[:num_hap]} + hps.write() if __name__ == "__main__": diff --git a/analysis/workflow/scripts/parameter_plot.py b/analysis/workflow/scripts/parameter_plot.py index 34ce946..2aaacc5 100755 --- a/analysis/workflow/scripts/parameter_plot.py +++ b/analysis/workflow/scripts/parameter_plot.py @@ -22,6 +22,7 @@ "gt": "U5", "alpha": np.float64, "samp": np.uint32, + "num_haps": np.uint8, } LOG_SCALE = {"alpha",} From 35eded450f4d5e51b1abef795242b2f3669eabf5 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 7 Oct 2024 12:53:44 -0700 Subject: [PATCH 10/41] match obs and causal haps by their LD --- analysis/workflow/rules/plots.smk | 2 +- analysis/workflow/scripts/parameter_plot.py | 184 ++++++++++---------- 2 files changed, 95 insertions(+), 91 deletions(-) diff --git a/analysis/workflow/rules/plots.smk b/analysis/workflow/rules/plots.smk index e4c67f0..246bc34 100644 --- a/analysis/workflow/rules/plots.smk +++ b/analysis/workflow/rules/plots.smk @@ -104,7 +104,7 @@ rule params: conda: "happler" shell: - "workflow/scripts/parameter_plot.py -o {output.png} " + "workflow/scripts/parameter_plot.py --show-extras -o {output.png} " "{input.gts} {params.observed_haps} {params.causal_hap} &> {log}" diff --git a/analysis/workflow/scripts/parameter_plot.py b/analysis/workflow/scripts/parameter_plot.py index 2aaacc5..905f677 100755 --- a/analysis/workflow/scripts/parameter_plot.py +++ b/analysis/workflow/scripts/parameter_plot.py @@ -10,7 +10,8 @@ from haptools import logging import matplotlib.pyplot as plt from haptools.ld import pearson_corr_ld -from haptools.data import GenotypesVCF, GenotypesPLINK, Haplotypes +from scipy.optimize import linear_sum_assignment +from haptools.data import Genotypes, GenotypesVCF, GenotypesPLINK, Haplotypes from snakemake_io import glob_wildcards @@ -28,12 +29,68 @@ LOG_SCALE = {"alpha",} -def get_num_haps(hap: Path, log: Logger = None): - hap = Haplotypes(hap, log=log) - hap.read() - return len(hap.data) +def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes, drop_extras: bool = True) -> tuple: + """ + Match the observed and causal haplotypes to maximize best pairwise LD + + This function uses scipy.optimize.linear_sum_assignment to find the optimal + match between causal and observed haplotypes to maximize the sum of the pairwise LD -def get_best_ld(gts: GenotypesVCF, observed_hap: Path, causal_hap: Path, region: str = None, observed_id: str = None, causal_id: str = None, log: Logger = None): + Parameters + ---------- + gts: Genotypes + The set of genotypes for each of the variants in the haplotypes + observed: Path + A path to a .hap file containing haplotypes output by happler + causal: Path + A path to a .hap file containing 
a simulated causal haplotype + drop_extras: bool, optional + Drop any observed haplotypes that don't have a match in the causal haps. + Otherwise, include a match for every observed haplotype! + + Returns + ------- + npt.NDArray + An array of the LD values for each pair of observed and causal haplotypes + npt.NDArray + An array of the row indices indicating the optimal match of causal haplotypes + to observed haplotypes + npt.NDArray + An array of the column indices indicating the optimal match of causal haplotypes + to observed haplotypes + """ + obs = observed.transform(gts).data.sum(axis=2) + exp = causal.transform(gts).data.sum(axis=2) + # Compute the pairwise LD between each observed and causal haplotype + # The rows and columns of ld_mat both correspond to the observed and causal haps in + # the order they were given. For example, if there is 1 observed hap and 3 causal + # haps, then there should be four rows and four columns in ld_mat + # Note that ld_mat is symmetric, so the upper triangle is always redundant + # TODO: use the latest haptools.ld.pearson_corr_ld to do this, instead + ld_mat = np.corrcoef(obs, exp, rowvar=False) + # Retrieve only the correlation of observed haps (as rows) vs causal haps (as cols) + ld_mat = np.abs(ld_mat[:obs.shape[1], obs.shape[1]:]) + # Return the ld_mat idxs of the optimal match of each observed hap to a causal hap + row_idx, col_idx = linear_sum_assignment(ld_mat, maximize=True) + if not drop_extras: + missing_rows = np.setdiff1d(range(ld_mat.shape[0]), row_idx) + # If there are more observed haps than causal haps, then we just assign the + # remaining haps to the best causal hap that matches for each of them + row_idx = np.concatenate((row_idx, missing_rows)) + col_idx = np.concatenate((col_idx, ld_mat[missing_rows].argmax(axis=1))) + return ld_mat, row_idx, col_idx + + +def get_best_ld( + gts: GenotypesVCF, + observed_hap: Path, + causal_hap: Path, + region: str = None, + observed_id: str = None, + causal_id: str = None, + show_extras: bool = False, + log: Logger = None + ): """ Compute the best LD between the observed haplotypes and the causal haplotype @@ -51,23 +108,24 @@ def get_best_ld(gts: GenotypesVCF, observed_hap: Path, causal_hap: Path, region: The ID to load from the observed_hap file (or just all of the IDs, otherwise) causal_id: str, optional The ID to load from the causal_hap file (or just the first hap, otherwise) + show_extras: bool, optional + Whether to show observed haps that don't match with a causal hap log: Logger, optional A logging module to pass to haptools """ # load the causal haplotype given by 'causal_id' or just the first hap causal_hap = Haplotypes(causal_hap, log=log) causal_hap.read(haplotypes=(set((causal_id,)) if causal_id is not None else None)) - if causal_id is None: - causal_id = list(causal_hap.data.keys())[0] - causal_hap.subset(haplotypes=(causal_id,), inplace=True) - causal_hap = causal_hap.data[causal_id] # load the observed haplotype given by 'observed_id' or just all of the haps observed_hap = Haplotypes(observed_hap, log=log) observed_hap.read(haplotypes=(set((observed_id,)) if observed_id is not None else None)) # load the genotypes - variants = {v.id for v in causal_hap.variants} + variants = { + v.id for hap in causal_hap.data + for v in causal_hap.data[hap].variants + } variants.update({ v.id for hap in observed_hap.data for v in observed_hap.data[hap].variants @@ -79,20 +137,11 @@ def get_best_ld(gts: GenotypesVCF, observed_hap: Path, causal_hap: Path, region: gts.check_biallelic() # compute 
LD between every observed haplotype and the causal haplotype - causal_gt = causal_hap.transform(gts).sum(axis=1) - observed_gt = observed_hap.transform(gts) - observed_ld = np.array([ - pearson_corr_ld(causal_gt, observed_gt.data[:, o].sum(axis=1)) - for o in range(observed_gt.data.shape[1]) - ]) - - # return the strongest LD among all haps - try: - best_observed_hap_idx = np.abs(observed_ld).argmax() - except ValueError: - # this can happen when there weren't any haplotypes output by happler - return 0 - return observed_ld[best_observed_hap_idx] + observed_ld, best_row_idx, best_col_idx = match_haps( + gts, observed_hap, causal_hap, drop_extras=(not show_extras), + ) + # return the strongest possible LD for each observed hap with a causal hap + return observed_ld[best_row_idx, best_col_idx] def get_finemap_metrics(metrics_path: Path, log: Logger = None): @@ -150,8 +199,6 @@ def plot_params( params: npt.NDArray, vals: npt.NDArray, val_title: str, - hap_counts: npt.NDArray = None, - sign: bool = False, ): """ Plot vals against parameter values @@ -164,46 +211,37 @@ def plot_params( A numpy array containing the values of the plot val_title: str The name of the value that we are plotting - hap_counts: npt.NDArray, optional - The number of haplotypes present in each hap file - sign: bool, optional - If True, depict positive LD values via a special marker symbol for the point """ - also_plot_counts = hap_counts is not None figsize = matplotlib.rcParams["figure.figsize"] if len(params) > 75: figsize[0] = len(params) / 15 if len(params.dtype) > 5: figsize[1] = len(params.dtype) * 1.25 fig, axs = plt.subplots( - nrows=len(params.dtype)+1+also_plot_counts, ncols=1, + nrows=len(params.dtype)+1, ncols=1, sharex=True, figsize=figsize, ) fig.subplots_adjust(hspace=0) # create a plot for the vals, first - axs[0].plot(np.abs(vals), "g-") - if sign: - axs[0].scatter( - np.squeeze(np.where(vals > 0)), vals[vals > 0], c="g", marker="o", - ) + x_vals = [j for j, arr in enumerate(vals) for i in arr] + axs[0].plot(x_vals, np.concatenate(vals), "go", markersize=2) axs[0].set_ylabel(val_title, rotation="horizontal", ha="right") axs[0].set_xticks(range(0, len(vals))) axs[0].set_xticklabels([]) axs[0].grid(axis="x") axs[0].set_ylim(None, 1) - if also_plot_counts: - axs[1].plot(hap_counts, "m-") - axs[1].set_ylabel("num_haps", rotation="horizontal", ha="right") - axs[1].grid(axis="x") # now, plot each of the parameter values on the other axes for idx, param in enumerate(params.dtype.names): val_title = param + if len(np.unique(params[param])) == 1: + axs[idx+1].remove() + continue if param in LOG_SCALE: params[param] = -np.log10(params[param]) val_title = "-log " + val_title - axs[idx+1+also_plot_counts].plot(params[param], "-") - axs[idx+1+also_plot_counts].set_ylabel(val_title, rotation="horizontal", ha="right") - axs[idx+1+also_plot_counts].grid(axis="x") + axs[idx+1].plot(params[param], "-") + axs[idx+1].set_ylabel(val_title, rotation="horizontal", ha="right") + axs[idx+1].grid(axis="x") return fig @@ -211,8 +249,6 @@ def plot_params_simple( params: npt.NDArray, vals: npt.NDArray, val_title: str, - hap_counts: npt.NDArray = None, - sign: bool = False ): """ Plot vals against only a single column of parameter values @@ -225,31 +261,17 @@ def plot_params_simple( A numpy array containing the values of the plot val_title: str The name of the value that we are plotting - hap_counts: npt.NDArray, optional - The number of haplotypes present in each hap file - sign: bool, optional - If True, depict positive LD values 
via a special marker symbol for the point """ fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True) val_xlabel = params.dtype.names[0] if val_xlabel in LOG_SCALE: params = -np.log10(params) val_xlabel = "-log " + val_xlabel + params = [j for j, arr in zip(params, vals) for i in arr] # plot params against vals - ax.plot(params, np.abs(vals), "g-") - if sign: - ax.scatter( - params[vals > 0], vals[vals > 0], c="g", marker="o", - ) + ax.plot(params, np.concatenate(vals), "go", markersize=2) ax.set_ylabel(val_title, color="g") ax.set_xlabel(val_xlabel) - # also plot hap_counts as a second y-axis - if hap_counts is not None: - ax = ax.twinx() - ax.plot(params, hap_counts, "m-") - ax.set_ylabel("Number of observed haplotypes", color="m") - ax.set_yticks(list(range(max(hap_counts)+1))) - ax.set_yticklabels(list(range(max(hap_counts)+1))) return fig @@ -289,19 +311,11 @@ def plot_params_simple( help="A haplotype ID from the causal .hap file", ) @click.option( - "-n", - "--num-haps", - is_flag=True, - default=True, - show_default=True, - help="Whether to also depict the total number of haplotypes in the file", -) -@click.option( - "--sign", + "--show-extras", is_flag=True, default=False, show_default=True, - help="Depict the sign (pos vs neg) of the LD via the shape of the points", + help="Whether to show observed haps that don't match with a causal hap", ) @click.option( "-o", @@ -327,8 +341,7 @@ def main( region: str = None, observed_id: str = None, causal_id: str = None, - num_haps: bool = True, - sign: bool = False, + show_extras: bool = False, output: Path = Path("/dev/stdout"), verbosity: str = "ERROR", ): @@ -359,7 +372,7 @@ def main( get_hap_fname = lambda hap_path, param_set: Path(str(hap_path).format(**dict(zip(dtypes.keys(), param_set)))) # compute LD between the causal hap and the best observed hap across param vals - ld_vals = np.array([ + ld_vals = [ get_best_ld( gts, get_hap_fname(observed_hap, params[idx]), @@ -367,10 +380,11 @@ def main( region=region, observed_id=observed_id, causal_id=causal_id, + show_extras=show_extras, log=log ) for idx in range(len(params)) - ]) + ] # extract fine-mapping metrics for the observed hap if metrics is not None: @@ -382,28 +396,18 @@ def main( for idx in range(len(params)) ]) - hap_counts = None - if num_haps: - hap_counts = np.array([ - get_num_haps(get_hap_fname(observed_hap, params[idx])) - for idx in range(len(params)) - ]) - # if they're all 1, then there's no reason to depict it - if (hap_counts == 1).all(): - hap_counts = None - if len(dtypes) > 1 or metrics is not None: merged = params if metrics is not None: merged = np.lib.recfunctions.merge_arrays([params, metrics], flatten=True) - fig = plot_params(merged, ld_vals, "causal LD", hap_counts, sign) + fig = plot_params(merged, ld_vals, "causal LD") elif len(dtypes) == 1: - fig = plot_params_simple(params, ld_vals, "causal LD", hap_counts, sign) + fig = plot_params_simple(params, ld_vals, "causal LD") else: raise ValueError("No parameter values found") fig.tight_layout() - fig.savefig(output) + fig.savefig(output, bbox_inches="tight", pad_inches=0.05) if __name__ == "__main__": main() From f28dda577d67244e4d9477d7ccb3ec2da89cb702 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 15 Oct 2024 20:22:51 -0700 Subject: [PATCH 11/41] implement a final haplotype threshold too --- happler/__main__.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/happler/__main__.py b/happler/__main__.py index ea0f171..ef6e565 100644 --- 
a/happler/__main__.py +++ b/happler/__main__.py @@ -141,6 +141,13 @@ def main(): show_default=True, help="The LD threshold used to prune leaf nodes based on LD with their siblings", ) +@click.option( + "--out-thresh", + type=float, + default=5e-8, + show_default=True, + help="Threshold used to determine whether to output haplotypes (single-SNP)", +) @click.option( "--show-tree", is_flag=True, @@ -203,6 +210,7 @@ def run( max_signals: int = 1, max_iterations: int = 1, threshold: float = 0.05, + out_thresh: float = 5e-8, indep_thresh: float = 0.1, ld_prune_thresh: float = None, show_tree: bool = False, @@ -223,6 +231,7 @@ def run( Ex: happler run tests/data/simple.vcf tests/data/simple.tsv > simple.hap """ from . import tree + import numpy as np from haptools import data from haptools import logging @@ -361,6 +370,11 @@ def run( for haps in forest.run(): if haps is None: continue + if len(haps.data) == 1: + hap = next(iter(haps.data.values())) + if hap.pval < -np.log10(out_thresh): + log.info("Ignoring haplotype with low pval") + continue for hap in haps.data.values(): if len(hap.variants) <= 1 and remove_snps: continue From 635079e28e10deaf9038c5000df30554fb7f86f0 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 15 Oct 2024 20:23:36 -0700 Subject: [PATCH 12/41] color each point of the parameter plot by whether it is an extra --- analysis/workflow/scripts/parameter_plot.py | 41 +++++++++++++++------ 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/analysis/workflow/scripts/parameter_plot.py b/analysis/workflow/scripts/parameter_plot.py index 905f677..5a01a16 100755 --- a/analysis/workflow/scripts/parameter_plot.py +++ b/analysis/workflow/scripts/parameter_plot.py @@ -58,6 +58,9 @@ def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes, drop_ex npt.NDArray An array of the column indices indicating the optimal match of causal haplotypes to observed haplotypes + npt.NDArray + An array of boolean values indicating whether each observed haplotype has a match + in the causal haplotypes """ obs = observed.transform(gts).data.sum(axis=2) exp = causal.transform(gts).data.sum(axis=2) @@ -72,13 +75,17 @@ def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes, drop_ex ld_mat = np.abs(ld_mat[:obs.shape[1], obs.shape[1]:]) # Return the ld_mat idxs of the optimal match of each observed hap to a causal hap row_idx, col_idx = linear_sum_assignment(ld_mat, maximize=True) + extras_labels = np.zeros( + len(row_idx) if drop_extras else ld_mat.shape[0], dtype=np.bool_, + ) if not drop_extras: missing_rows = np.setdiff1d(range(ld_mat.shape[0]), row_idx) + extras_labels[missing_rows] = True # If there are more observed haps than causal haps, then we just assign the # remaining haps to the best causal hap that matches for each of them row_idx = np.concatenate((row_idx, missing_rows)) col_idx = np.concatenate((col_idx, ld_mat[missing_rows].argmax(axis=1))) - return ld_mat, row_idx, col_idx + return ld_mat, row_idx, col_idx, extras_labels def get_best_ld( @@ -121,6 +128,10 @@ def get_best_ld( observed_hap = Haplotypes(observed_hap, log=log) observed_hap.read(haplotypes=(set((observed_id,)) if observed_id is not None else None)) + # if there are no observed haplotypes, then we can't compute LD + if len(observed_hap.data) == 0: + return np.array([np.nan,]), np.array([False,]) + # load the genotypes variants = { v.id for hap in causal_hap.data @@ -137,11 +148,11 @@ def get_best_ld( gts.check_biallelic() # compute LD between 
every observed haplotype and the causal haplotype - observed_ld, best_row_idx, best_col_idx = match_haps( + observed_ld, best_row_idx, best_col_idx, extras_labels = match_haps( gts, observed_hap, causal_hap, drop_extras=(not show_extras), ) # return the strongest possible LD for each observed hap with a causal hap - return observed_ld[best_row_idx, best_col_idx] + return observed_ld[best_row_idx, best_col_idx], extras_labels def get_finemap_metrics(metrics_path: Path, log: Logger = None): @@ -199,6 +210,7 @@ def plot_params( params: npt.NDArray, vals: npt.NDArray, val_title: str, + val_color: npt.NDArray, ): """ Plot vals against parameter values @@ -207,10 +219,12 @@ def plot_params( ---------- params: npt.NDArray A numpy array of mixed dtype. Each column contains the values of a parameter - vals: npt.NDArray + vals: list[npt.NDArray] A numpy array containing the values of the plot val_title: str The name of the value that we are plotting + val_color: list[npt.NDArray] + Whether to color each point corresponding to this value """ figsize = matplotlib.rcParams["figure.figsize"] if len(params) > 75: @@ -224,7 +238,8 @@ def plot_params( fig.subplots_adjust(hspace=0) # create a plot for the vals, first x_vals = [j for j, arr in enumerate(vals) for i in arr] - axs[0].plot(x_vals, np.concatenate(vals), "go", markersize=2) + val_color = ["red" if j else "green" for i in val_color for j in i] + axs[0].scatter(x_vals, np.concatenate(vals), marker="o", color=val_color, s=5) axs[0].set_ylabel(val_title, rotation="horizontal", ha="right") axs[0].set_xticks(range(0, len(vals))) axs[0].set_xticklabels([]) @@ -249,6 +264,7 @@ def plot_params_simple( params: npt.NDArray, vals: npt.NDArray, val_title: str, + val_color: npt.NDArray, ): """ Plot vals against only a single column of parameter values @@ -257,10 +273,12 @@ def plot_params_simple( ---------- params: npt.NDArray A numpy array containing the values of a parameter - vals: npt.NDArray + vals: list[npt.NDArray] A numpy array containing the values of the plot val_title: str The name of the value that we are plotting + val_color: list[npt.NDArray] + Whether to color each point corresponding to this value """ fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True) val_xlabel = params.dtype.names[0] @@ -268,8 +286,9 @@ def plot_params_simple( params = -np.log10(params) val_xlabel = "-log " + val_xlabel params = [j for j, arr in zip(params, vals) for i in arr] + val_color = ["red" if j else "green" for i in val_color for j in i] # plot params against vals - ax.plot(params, np.concatenate(vals), "go", markersize=2) + ax.scatter(params, np.concatenate(vals), marker="o", color=val_color, s=5) ax.set_ylabel(val_title, color="g") ax.set_xlabel(val_xlabel) return fig @@ -372,7 +391,7 @@ def main( get_hap_fname = lambda hap_path, param_set: Path(str(hap_path).format(**dict(zip(dtypes.keys(), param_set)))) # compute LD between the causal hap and the best observed hap across param vals - ld_vals = [ + ld_vals, ld_extras_labels = zip(*tuple( get_best_ld( gts, get_hap_fname(observed_hap, params[idx]), @@ -384,7 +403,7 @@ def main( log=log ) for idx in range(len(params)) - ] + )) # extract fine-mapping metrics for the observed hap if metrics is not None: @@ -400,9 +419,9 @@ def main( merged = params if metrics is not None: merged = np.lib.recfunctions.merge_arrays([params, metrics], flatten=True) - fig = plot_params(merged, ld_vals, "causal LD") + fig = plot_params(merged, ld_vals, "causal LD", ld_extras_labels) elif len(dtypes) == 1: - fig = 
plot_params_simple(params, ld_vals, "causal LD") + fig = plot_params_simple(params, ld_vals, "causal LD", ld_extras_labels) else: raise ValueError("No parameter values found") From c89f6e5073d2cff9b19542943a569c77f3b7c8ee Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 17 Oct 2024 13:30:17 -0700 Subject: [PATCH 13/41] enable simulation replicates --- analysis/config/config-simulations.yml | 2 +- analysis/workflow/Snakefile | 4 +- analysis/workflow/rules/happler.smk | 5 +- analysis/workflow/rules/plots.smk | 4 +- analysis/workflow/rules/simulate.smk | 4 +- analysis/workflow/scripts/parameter_plot.py | 173 +++++++++++++++----- happler/__main__.py | 4 - 7 files changed, 141 insertions(+), 55 deletions(-) diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml index 35bb959..59fd987 100644 --- a/analysis/config/config-simulations.yml +++ b/analysis/config/config-simulations.yml @@ -47,7 +47,7 @@ modes: alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] ld_range: - reps: 1 + reps: 10 min_ld: 0 max_ld: 1 step: 0.1 diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile index c9ffb76..93d4640 100644 --- a/analysis/workflow/Snakefile +++ b/analysis/workflow/Snakefile @@ -110,7 +110,7 @@ if mode in ("snp", "hap", "ld_range"): happler_config["snps"] = config["modes"][mode]["alleles"] happler_config["causal_gt"] = rules.simulate_transform.output if mode == "ld_range": - happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}" + happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}/rep_{rep}" elif mode == "run": happler_config = { "pheno": config["modes"][mode]["pheno"], @@ -137,7 +137,7 @@ merged_happler = rules.happler_merge.output.pgen if mode == "ld_range" and config["modes"][mode]["random"]: happler_config["random"] = rules.simulate_random_transform.output happler_config["random_hap"] = rules.simulate_random_transform.input.hap - happler_config["out"] = config["out"] + "/happler/"+mode+"/random/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}" + happler_config["out"] = config["out"] + "/happler/"+mode+"/random/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}/rep_{rep}" module happler_random: snakefile: "rules/happler.smk" config: happler_config diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index d6e1c4a..2c75df5 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -38,6 +38,7 @@ rule run: indep=lambda wildcards: 0.05 if "indep_alpha" not in wildcards else wildcards.indep_alpha, max_signals=3, max_iterations=3, + rep=lambda wildcards: int(wildcards.rep), output: hap=out + "/happler.hap", gz=out + "/happler.hap.gz", @@ -65,8 +66,8 @@ rule run: shell: "happler run -o {output.hap} --verbosity DEBUG --maf {params.maf} " "--max-signals {params.max_signals} --max-iterations {params.max_iterations} " - "--discard-multiallelic --region {params.region}" - " {params.covar} --indep-thresh {params.indep} -t {params.thresh} " + "--discard-multiallelic --region {params.region} --pheno {params.rep} " + "{params.covar}--indep-thresh {params.indep} -t {params.thresh} " "--show-tree {input.gts} {input.pts} &>{log}" " && haptools index -o {output.gz} {output.hap} &>>{log}" diff --git a/analysis/workflow/rules/plots.smk b/analysis/workflow/rules/plots.smk index 
246bc34..403f175 100644 --- a/analysis/workflow/rules/plots.smk +++ b/analysis/workflow/rules/plots.smk @@ -26,6 +26,7 @@ def agg_ld_range_obs(wildcards): alpha=config["mode_attrs"]["alpha"], beta=config["mode_attrs"]["beta"], num_haps=config["mode_attrs"]["num_haps"], + rep=range(config["mode_attrs"]["reps"]), **wildcards, ) else: @@ -65,6 +66,7 @@ def agg_ld_range_metrics(wildcards): beta=config["mode_attrs"]["beta"], num_haps=config["mode_attrs"]["num_haps"], alpha=config["mode_attrs"]["alpha"], + rep=range(config["mode_attrs"]["reps"]), **wildcards, ) else: @@ -104,7 +106,7 @@ rule params: conda: "happler" shell: - "workflow/scripts/parameter_plot.py --show-extras -o {output.png} " + "workflow/scripts/parameter_plot.py -o {output.png} " "{input.gts} {params.observed_haps} {params.causal_hap} &> {log}" diff --git a/analysis/workflow/rules/simulate.smk b/analysis/workflow/rules/simulate.smk index a56aed3..249e64d 100644 --- a/analysis/workflow/rules/simulate.smk +++ b/analysis/workflow/rules/simulate.smk @@ -119,6 +119,7 @@ rule simphenotype: params: beta=lambda wildcards: wildcards.beta, seed = 42, + reps = config["modes"]["ld_range"]["reps"], output: pheno=out + "/{beta}.pheno", resources: @@ -130,5 +131,6 @@ rule simphenotype: conda: "happler" shell: - "haptools simphenotype --seed {params.seed} -o {output.pheno} {input.pgen} " + "haptools simphenotype --seed {params.seed} -o {output.pheno} " + "--replications {params.reps} {input.pgen} " "<( sed 's/\\t0.99$/\\t{params.beta}/' {input.hap} ) &>{log}" diff --git a/analysis/workflow/scripts/parameter_plot.py b/analysis/workflow/scripts/parameter_plot.py index 5a01a16..bea84d2 100755 --- a/analysis/workflow/scripts/parameter_plot.py +++ b/analysis/workflow/scripts/parameter_plot.py @@ -7,6 +7,7 @@ import numpy as np matplotlib.use('Agg') import numpy.typing as npt +from scipy.stats import sem from haptools import logging import matplotlib.pyplot as plt from haptools.ld import pearson_corr_ld @@ -28,8 +29,10 @@ LOG_SCALE = {"alpha",} +plt.rcParams['figure.dpi'] = 400 # Set the figure DPI to 400 +plt.rcParams['savefig.dpi'] = plt.rcParams['figure.dpi'] # Set the DPI for saving figures -def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes, drop_extras: bool = True) -> tuple: +def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes) -> tuple: """ Match the observed and causal haplotypes to maximize best pairwise LD @@ -44,9 +47,6 @@ def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes, drop_ex A path to a .hap file containing haplotypes output by happler causal: Path A path to a .hap file containing a simulated causal haplotype - drop_extras: bool, optional - Drop any observed haplotypes that don't have a match in the causal haps. - Otherwise, include a match for every observed haplotype!
Returns ------- @@ -58,6 +58,8 @@ def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes, drop_ex npt.NDArray An array of the column indices indicating the optimal match of causal haplotypes to observed haplotypes + npt.NDArray + The index of the causal haplotype that each observed haplotype is matched to npt.NDArray An array of boolean values indicating whether each observed haplotype has a match in the causal haplotypes @@ -75,17 +77,20 @@ def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes, drop_ex ld_mat = np.abs(ld_mat[:obs.shape[1], obs.shape[1]:]) # Return the ld_mat idxs of the optimal match of each observed hap to a causal hap row_idx, col_idx = linear_sum_assignment(ld_mat, maximize=True) - extras_labels = np.zeros( - len(row_idx) if drop_extras else ld_mat.shape[0], dtype=np.bool_, - ) - if not drop_extras: - missing_rows = np.setdiff1d(range(ld_mat.shape[0]), row_idx) - extras_labels[missing_rows] = True - # If there are more observed haps than causal haps, then we just assign the - # remaining haps to the best causal hap that matches for each of them - row_idx = np.concatenate((row_idx, missing_rows)) - col_idx = np.concatenate((col_idx, ld_mat[missing_rows].argmax(axis=1))) - return ld_mat, row_idx, col_idx, extras_labels + # now deal with the extras (observed haps with no causal hap match) + extras_labels = np.zeros(ld_mat.shape[0], dtype=np.uint8) + extras_labels_bool = np.zeros(ld_mat.shape[0], dtype=bool) + missing_rows = np.setdiff1d(range(ld_mat.shape[0]), row_idx) + extras_labels[row_idx] = col_idx + extras_labels_bool[row_idx] = True + # If there are more observed haps than causal haps, then we just assign the + # remaining haps to the best causal hap that matches for each of them + row_idx = np.concatenate((row_idx, missing_rows)) + missing_cols = ld_mat[missing_rows].argmax(axis=1) + col_idx = np.concatenate((col_idx, missing_cols)) + extras_labels[missing_rows] = missing_cols + extras_labels_bool[missing_rows] = False + return ld_mat, row_idx, col_idx, extras_labels, extras_labels_bool def get_best_ld( @@ -95,7 +100,6 @@ def get_best_ld( region: str = None, observed_id: str = None, causal_id: str = None, - show_extras: bool = False, log: Logger = None ): """ @@ -115,8 +119,6 @@ def get_best_ld( The ID to load from the observed_hap file (or just all of the IDs, otherwise) causal_id: str, optional The ID to load from the causal_hap file (or just the first hap, otherwise) - show_extras: bool, optional - Whether to show observed haps that don't match with a causal hap log: Logger, optional A logging module to pass to haptools """ @@ -130,7 +132,11 @@ def get_best_ld( # if there are no observed haplotypes, then we can't compute LD if len(observed_hap.data) == 0: - return np.array([np.nan,]), np.array([False,]) + return ( + np.array([np.nan,]), + np.array([np.iinfo(np.uint8).max,], dtype=np.uint8), + np.array([False,], dtype=bool), + ) # load the genotypes variants = { @@ -148,11 +154,11 @@ def get_best_ld( gts.check_biallelic() # compute LD between every observed haplotype and the causal haplotype - observed_ld, best_row_idx, best_col_idx, extras_labels = match_haps( - gts, observed_hap, causal_hap, drop_extras=(not show_extras), + observed_ld, best_row_idx, best_col_idx, extras_labels, labels_bool = match_haps( + gts, observed_hap, causal_hap, ) # return the strongest possible LD for each observed hap with a causal hap - return observed_ld[best_row_idx, best_col_idx], extras_labels + return observed_ld[best_row_idx, best_col_idx], 
extras_labels, labels_bool def get_finemap_metrics(metrics_path: Path, log: Logger = None): @@ -209,6 +215,7 @@ def count_shared(observed, causal, log, observed_id: str = None, causal_id: str def plot_params( params: npt.NDArray, vals: npt.NDArray, + vals_sem: npt.NDArray, val_title: str, val_color: npt.NDArray, ): @@ -219,11 +226,13 @@ def plot_params( ---------- params: npt.NDArray A numpy array of mixed dtype. Each column contains the values of a parameter - vals: list[npt.NDArray] - A numpy array containing the values of the plot + vals: npt.NDArray + A numpy array of numpy arrays containing the values of the plot + vals_sem: npt.NDArray + A numpy array of numpy arrays containing the standard error of the values val_title: str The name of the value that we are plotting - val_color: list[npt.NDArray] + val_color: npt.NDArray Whether to color each point corresponding to this value """ figsize = matplotlib.rcParams["figure.figsize"] @@ -238,10 +247,13 @@ def plot_params( fig.subplots_adjust(hspace=0) # create a plot for the vals, first x_vals = [j for j, arr in enumerate(vals) for i in arr] - val_color = ["red" if j else "green" for i in val_color for j in i] - axs[0].scatter(x_vals, np.concatenate(vals), marker="o", color=val_color, s=5) + val_color = ["green" if j else "red" for i in val_color for j in i] + vals = np.concatenate(vals) + vals_sem = np.concatenate(vals_sem) + for v in zip(x_vals, vals, vals_sem, val_color): + axs[0].errorbar(v[0], v[1], yerr=v[2], marker="o", c=v[3], markersize=3) axs[0].set_ylabel(val_title, rotation="horizontal", ha="right") - axs[0].set_xticks(range(0, len(vals))) + axs[0].set_xticks(range(0, len(x_vals))) axs[0].set_xticklabels([]) axs[0].grid(axis="x") axs[0].set_ylim(None, 1) @@ -263,6 +275,7 @@ def plot_params( def plot_params_simple( params: npt.NDArray, vals: npt.NDArray, + val_sem: npt.NDArray, val_title: str, val_color: npt.NDArray, ): @@ -273,11 +286,13 @@ def plot_params_simple( ---------- params: npt.NDArray A numpy array containing the values of a parameter - vals: list[npt.NDArray] - A numpy array containing the values of the plot + vals: npt.NDArray + A numpy array of numpy arrays containing the values of the plot + val_sem: npt.NDArray + A numpy array of numpy arrays containing the standard error of the values val_title: str The name of the value that we are plotting - val_color: list[npt.NDArray] + val_color: npt.NDArray Whether to color each point corresponding to this value """ fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True) @@ -286,7 +301,7 @@ def plot_params_simple( params = -np.log10(params) val_xlabel = "-log " + val_xlabel params = [j for j, arr in zip(params, vals) for i in arr] - val_color = ["red" if j else "green" for i in val_color for j in i] + val_color = ["green" if j else "red" for i in val_color for j in i] # plot params against vals ax.scatter(params, np.concatenate(vals), marker="o", color=val_color, s=5) ax.set_ylabel(val_title, color="g") @@ -294,6 +309,77 @@ def plot_params_simple( return fig +def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArray, bools: npt.NDArray): + """ + Group replicates with identical parameter values + + Compute mean and standard error of the values for each group + + Parameters + ---------- + params: npt.NDArray + A numpy array of mixed dtype. 
Each column contains the values of a parameter + vals: npt.NDArray + A numpy array containing the values of the plot + causal_idxs: npt.NDArray + The index of the causal haplotype that each observed haplotype belongs to + bools: npt.NDArray + An array of boolean values indicating whether each observed haplotype has a match + + Returns + ------- + npt.NDArray + A numpy array of the unique parameter values + npt.NDArray + A numpy array containing the mean and std of the values over the replicates + """ + other_param_names = [name for name in params.dtype.names if name != "rep"] + grouped_params = np.unique(params[other_param_names]) + get_mean_std = lambda x: (np.mean(x), sem(x) if len(x) > 1 else np.nan) + filter_causal_idxs = lambda x: np.unique(x[x != np.iinfo(np.uint8).max]) + grouped_vals = [] + grouped_sem = [] + grouped_bools = [] + for group in grouped_params: + subgrouped_vals = [] + subgrouped_sem = [] + subgrouped_bools = [] + curr_vals = vals[params[other_param_names] == group] + curr_causal_idxs = causal_idxs[params[other_param_names] == group] + curr_bools = bools[params[other_param_names] == group] + # compute mean and std err for each causal match + for causal_idx in filter_causal_idxs(np.concatenate(curr_causal_idxs)): + try: + curr_vals_causal_idxs = np.concatenate([ + val[(indices == causal_idx) & curr_bool] + for val, indices, curr_bool in zip(curr_vals, curr_causal_idxs, curr_bools) + ]) + except ValueError: + continue + val_mean, val_sem = get_mean_std(curr_vals_causal_idxs) + subgrouped_vals.append(val_mean) + subgrouped_sem.append(val_sem) + subgrouped_bools.append(True) + # compute mean and std err for each unmatched observed hap + for unmatched_idx in range(max([(~i).sum() for i in curr_bools])): + try: + curr_vals_unmatched_idxs = np.array([ + val[(indices == causal_idx) & ~curr_bool][unmatched_idx] + for val, indices, curr_bool in zip(curr_vals, curr_causal_idxs, curr_bools) + if unmatched_idx < ((indices == causal_idx) & ~curr_bool).sum() + ]) + except ValueError: + continue + val_mean, val_sem = get_mean_std(curr_vals_unmatched_idxs) + subgrouped_vals.append(val_mean) + subgrouped_sem.append(val_sem) + subgrouped_bools.append(False) + grouped_vals.append(np.array(subgrouped_vals, dtype=object)) + grouped_sem.append(np.array(subgrouped_sem, dtype=object)) + grouped_bools.append(np.array(subgrouped_bools, dtype=object)) + return grouped_params, grouped_vals, grouped_sem, grouped_bools + + @click.command() @click.argument("genotypes", type=click.Path(exists=True, path_type=Path)) @click.argument("observed_hap", type=click.Path(path_type=Path)) @@ -329,13 +415,6 @@ def plot_params_simple( show_default="the first haplotype in the file", help="A haplotype ID from the causal .hap file", ) -@click.option( - "--show-extras", - is_flag=True, - default=False, - show_default=True, - help="Whether to show observed haps that don't match with a causal hap", -) @click.option( "-o", "--output", @@ -360,7 +439,6 @@ def main( region: str = None, observed_id: str = None, causal_id: str = None, - show_extras: bool = False, output: Path = Path("/dev/stdout"), verbosity: str = "ERROR", ): @@ -391,7 +469,7 @@ def main( get_hap_fname = lambda hap_path, param_set: Path(str(hap_path).format(**dict(zip(dtypes.keys(), param_set)))) # compute LD between the causal hap and the best observed hap across param vals - ld_vals, ld_extras_labels = zip(*tuple( + ld_vals, ld_extras_idxs, ld_extras_bool = zip(*tuple( get_best_ld( gts, get_hap_fname(observed_hap, params[idx]), @@ -399,11 +477,13 @@ def 
main( region=region, observed_id=observed_id, causal_id=causal_id, - show_extras=show_extras, log=log ) for idx in range(len(params)) )) + ld_vals = np.array(ld_vals, dtype=object) + ld_extras_idxs = np.array(ld_extras_idxs, dtype=object) + ld_extras_bool = np.array(ld_extras_bool, dtype=object) # extract fine-mapping metrics for the observed hap if metrics is not None: @@ -415,13 +495,18 @@ def main( for idx in range(len(params)) ]) + # TODO: also handle the metrics in group_by_rep() + params, ld_vals, ld_sem, ld_extras_bool = group_by_rep( + params, ld_vals, ld_extras_idxs, ld_extras_bool, + ) + if len(dtypes) > 1 or metrics is not None: merged = params if metrics is not None: merged = np.lib.recfunctions.merge_arrays([params, metrics], flatten=True) - fig = plot_params(merged, ld_vals, "causal LD", ld_extras_labels) + fig = plot_params(merged, ld_vals, ld_sem, "causal LD", ld_extras_bool) elif len(dtypes) == 1: - fig = plot_params_simple(params, ld_vals, "causal LD", ld_extras_labels) + fig = plot_params_simple(params, ld_vals, ld_sem, "causal LD", ld_extras_bool) else: raise ValueError("No parameter values found") diff --git a/happler/__main__.py b/happler/__main__.py index ef6e565..2ed1739 100644 --- a/happler/__main__.py +++ b/happler/__main__.py @@ -297,10 +297,6 @@ def run( log.info("There are {} samples and {} variants".format(*gt.data.shape)) # subset to just one phenotype - if len(ph.names) > 1: - log.warning("Ignoring all but the first trait in the phenotypes file") - ph.names = ph.names[:1] - ph.data = ph.data[:, :1] # also reorder and subset samples in Phenotypes to match those in the Genotypes ph.subset(samples=tuple(gt.samples), names=(pheno,), inplace=True) From 8070c4e60045ed6f39cc18652d86665524ff230a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 10:02:40 -0700 Subject: [PATCH 14/41] output finemapping metrics for multiple haps --- analysis/workflow/rules/happler.smk | 4 +- analysis/workflow/scripts/susie_metrics.R | 79 +++++++++++------------ analysis/workflow/scripts/utils.R | 13 ++++ 3 files changed, 52 insertions(+), 44 deletions(-) diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index 2c75df5..a83481e 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -392,11 +392,11 @@ rule metrics: input: finemap=expand(rules.finemapper.output.susie, ex="in", allow_missing=True), obs_hap=rules.run.output.hap, - caus_hap=config["hap_file"], + # caus_hap=config["hap_file"], output: metrics=out + "/susie_metrics.tsv", resources: - runtime=5, + runtime=10, threads: 6, log: logs + "/metrics", diff --git a/analysis/workflow/scripts/susie_metrics.R b/analysis/workflow/scripts/susie_metrics.R index 6147b76..d02a3df 100755 --- a/analysis/workflow/scripts/susie_metrics.R +++ b/analysis/workflow/scripts/susie_metrics.R @@ -4,7 +4,6 @@ # param1: The path to a RDS file containing the output of SuSiE # param2: The path to a .hap file containing the observed haplotype -# param3: The path to a .hap file containing the causal haplotype # Output will be written in tab-separated format to stdout # To debug this script, add a browser() call somewhere in it. Then run it with its command line args but replace the script name with "R --args". 
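The rewritten script below loops over every observed haplotype in the .hap file and emits one row of metrics per haplotype. As a rough Python sketch of those per-haplotype metrics (not part of the patch; `pips` and `credible_sets` are hypothetical stand-ins for SuSiE's `fitted$pip` vector and `fitted$sets$cs` list, and purity is omitted since it requires `fitted$sets$purity`):

def susie_metrics(pips: dict, credible_sets: dict, hap_id: str) -> tuple:
    # R indices are 1-based, so offset the haplotype's position accordingly
    hap_idx = list(pips).index(hap_id) + 1
    # 1) the PIP of the observed hap
    obs_pip = pips[hap_id]
    # 2) does the observed hap get the highest PIP?
    has_highest_pip = int(all(obs_pip >= p for p in pips.values()))
    # 3) the best PIP among the other variables
    best_variant_pip = max(p for v, p in pips.items() if v != hap_id)
    # 4) and 6) membership in a credible set, and that credible set's length
    cs_name = next((n for n, cs in credible_sets.items() if hap_idx in cs), None)
    in_credible_set = int(cs_name is not None)
    cs_length = len(credible_sets[cs_name]) if cs_name is not None else 0
    return obs_pip, has_highest_pip, best_variant_pip, in_credible_set, cs_length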
@@ -13,25 +12,12 @@ args = commandArgs(trailingOnly = TRUE) susie_res = args[1] obs_hap = args[2] -caus_hap = args[3] -write("Parsing observed hap file", stderr()) -for (line in readLines(obs_hap)) { - # we assume the first line in the hap file that begins with H denotes the haplotype - if (grepl("^H\\t", line)) break -} -# extract the start, end, and ID of the haplotype -happler_hap = as.integer(strsplit(line, "\t")[[1]][c(3,4)]) -happler_hap_id = strsplit(line, "\t")[[1]][c(5)] +# load functions to help read various file types +source("workflow/scripts/utils.R") -write("Parsing causal hap file", stderr()) -for (line in readLines(caus_hap)) { - # we assume the first line in the hap file that begins with H denotes the haplotype - if (grepl("^H\\t", line)) break -} -# extract the start, end, and ID of the haplotype -causal_hap = as.integer(strsplit(line, "\t")[[1]][c(3,4)]) -causal_hap_id = strsplit(line, "\t")[[1]][c(5)] +write("Parsing observed hap file", stderr()) +happler_hap = readHap(obs_hap) write("Parsing SuSiE results", stderr()) # parse the susie data: @@ -40,30 +26,39 @@ write("Parsing SuSiE results", stderr()) susie_res = readRDS(susie_res) fitted = susie_res$fitted susie_pip = fitted$pip -happler_hap_idx = which(names(susie_pip) %in% happler_hap_id) -write("Computing metrics", stderr()) -# The metrics are: -# 1) What is the PIP of the observed hap? -# 2) Does the observed hap get the highest PIP? -# 3) What is the best PIP among the variants? -# 4) Is the observed hap in a credible set? -# 5) What is the purity of the credible set? -obs_pip = susie_pip[happler_hap_id] -has_highest_pip = as.integer(sum(obs_pip < susie_pip) == 0) -best_variant_pip = max(susie_pip[names(susie_pip) != names(obs_pip)]) -credible_set = NULL -for (cs in names(fitted$sets$cs)) { - if (happler_hap_idx %in% fitted$sets$cs[cs]) { - credible_set = cs - break +for (happler_hap_file_idx in seq_len(nrow(happler_hap))) { + + happler_hap_id = happler_hap[happler_hap_file_idx, "id"] + happler_hap_idx = which(names(susie_pip) %in% happler_hap_id) + + write(paste("Computing metrics for", happler_hap_id), stderr()) + # The metrics are: + # 1) What is the PIP of the observed hap? + # 2) Does the observed hap get the highest PIP? + # 3) What is the best PIP among the variants? + # 4) Is the observed hap in a credible set? + # 5) What is the purity of the credible set? + # 6) What is the length of the credible set?
+  obs_pip = susie_pip[happler_hap_id] +  has_highest_pip = as.integer(sum(obs_pip < susie_pip) == 0) +  best_variant_pip = max(susie_pip[names(susie_pip) != names(obs_pip)]) +  credible_set = NULL +  for (cs in names(fitted$sets$cs)) { +    if (happler_hap_idx %in% fitted$sets$cs[[cs]]) { +      credible_set = cs +      break +    } } -purity = 0 -if (!is.null(credible_set)) { -  purity = fitted$sets$purity[cs, "mean.abs.corr"] -} -in_credible_set = as.integer(!is.null(credible_set)) +  purity = 0 +  cs_len = 0 +  if (!is.null(credible_set)) { +    purity = fitted$sets$purity[cs, "mean.abs.corr"] +    cs_length = length(fitted$sets$cs[[credible_set]]) +  } +  in_credible_set = as.integer(!is.null(credible_set)) -write("Outputting metrics", stderr()) -write(paste(obs_pip, has_highest_pip, best_variant_pip, in_credible_set, purity), stdout()) +  write(paste("Outputting metrics for", happler_hap_id), stderr()) +  write(paste(obs_pip, has_highest_pip, best_variant_pip, in_credible_set, purity, cs_length), stdout()) + +} diff --git a/analysis/workflow/scripts/utils.R b/analysis/workflow/scripts/utils.R index ba54c1f..b3a8219 100644 --- a/analysis/workflow/scripts/utils.R +++ b/analysis/workflow/scripts/utils.R @@ -97,3 +97,16 @@ save_curr_env = function(env = parent.frame(), filePath = "myenv.RData") { save(list = ls(all.names = TRUE, envir = env), file = filePath, envir = env) } + +readHap = function(hapfile) { + # Return a data.frame with the start, end, and ID columns of the haplotype lines + # in the .hap file + # Parameters: + # hapfile: The path to the .hap file + data <- read.table(hapfile, header = FALSE, fill = TRUE, stringsAsFactors = FALSE) + # Filter rows that start with "H" and select columns 3 to 5 + h_data <- data[grep("^H", data$V1), c(3, 4, 5)] + # Rename the columns for clarity (optional) + colnames(h_data) <- c("start", "end", "id") + h_data +} From 4edfd019a93daffe4782e42fdf4b71cf6acbc2bf Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:43:05 -0700 Subject: [PATCH 15/41] allow for setting output threshold as a param in the config --- analysis/config/config-simulations.yml | 5 ++++- analysis/workflow/Snakefile | 1 + analysis/workflow/rules/happler.smk | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml index 59fd987..a48ae14 100644 --- a/analysis/config/config-simulations.yml +++ b/analysis/config/config-simulations.yml @@ -74,9 +74,12 @@ mode: ld_range min_maf: 0.05 # Sample sizes to use -# sample_size: [500, 1000, 1500, 2000, 2500] +# sample_size: [777, 800, 1600, 2400, 3200] sample_size: 777 +# A threshold on the pvalues of the haplotypes +out_thresh: 5e-5 + # Whether to include the causal variant in the set of genotypes provided to the # finemapping methods. Set this to true if you're interested in seeing how the # methods perform when the causal variant is absent from the data.
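The new out_thresh setting is forwarded to happler's --out-thresh flag (added in PATCH 11 above). A minimal sketch of the filter it controls, assuming, as in the happler/__main__.py hunk above, that a haplotype's pval attribute stores a -log10(p) value:

import numpy as np

def passes_out_thresh(hap_pval: float, out_thresh: float = 5e-5) -> bool:
    # hap_pval is assumed to be -log10(p); e.g. out_thresh=5e-5 gives a
    # cutoff of -log10(5e-5) ~= 4.3, so only more significant haps are kept
    return hap_pval >= -np.log10(out_thresh)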
diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile index 93d4640..2747744 100644 --- a/analysis/workflow/Snakefile +++ b/analysis/workflow/Snakefile @@ -103,6 +103,7 @@ if mode in ("snp", "hap", "ld_range"): "random": None, "covar": covar_file, "min_maf": check_config("min_maf", 0), + "out_thresh": check_config("out_thresh", 5e-8), } if mode in ("hap", "ld_range"): happler_config["snps"] = [] diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index a83481e..11c5b24 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -39,6 +39,7 @@ rule run: max_signals=3, max_iterations=3, rep=lambda wildcards: int(wildcards.rep), + out_thresh=check_config("out_thresh", 5e-8), output: hap=out + "/happler.hap", gz=out + "/happler.hap.gz", @@ -68,7 +69,7 @@ rule run: "--max-signals {params.max_signals} --max-iterations {params.max_iterations} " "--discard-multiallelic --region {params.region} --pheno {params.rep} " "{params.covar}--indep-thresh {params.indep} -t {params.thresh} " - "--show-tree {input.gts} {input.pts} &>{log}" + "--out-thresh {params.out_thresh} --show-tree {input.gts} {input.pts} &>{log}" " && haptools index -o {output.gz} {output.hap} &>>{log}" From 8371c80ccec013eb03f7cefd8089630a5a79a57b Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:44:13 -0700 Subject: [PATCH 16/41] clean up and update scripts README --- analysis/workflow/scripts/README.md | 12 +++-- analysis/workflow/scripts/compute_pgen_ld.py | 2 +- analysis/workflow/scripts/haplotype_ld.py | 49 -------------------- 3 files changed, 10 insertions(+), 53 deletions(-) delete mode 100755 analysis/workflow/scripts/haplotype_ld.py diff --git a/analysis/workflow/scripts/README.md b/analysis/workflow/scripts/README.md index 24538c9..99f7f4f 100644 --- a/analysis/workflow/scripts/README.md +++ b/analysis/workflow/scripts/README.md @@ -6,6 +6,12 @@ All python scripts implement the --help argument. For bash, awk, and R scripts, ## [choose_different_ld.py](choose_different_ld.py) Choose haplotype alleles such that their SNPs have a range of strengths of LD with the causal haplotype. +## [compute_pgen_ld.py](compute_pgen_ld.py) +Quickly compute the LD between a single variant and all other variants in a PGEN file. + +## [conditional_regression_plots.py](conditional_regression_plots.py) +Make manhattan plots for a region after conditioning on 1) haplotypes and 2) their alleles together and 3) separately. Also, 4) show the original manhattan plot without any conditioning. + ## [create_hap_file.sh](create_hap_file.sh) Create a [.hap file](https://haptools.readthedocs.io/en/stable/formats/haplotypes.html) for use by haptools when simulating causal variables. @@ -33,12 +39,12 @@ Create a manhattan plot to visualize the results of a GWAS. ## [merge_plink.py](merge_plink.py) Merge variants from two PGEN files that have the same set of samples. -## [midway_manhattan.bash](midway_manhattan.bash) -Create a "midway" manhattan plot depicting the p-values on a branch of a tree mid-way through happler's tree-building process. - ## [midway_manhattan_summary.py](midway_manhattan_summary.py) Summarize the results of multiple midway manhattan plots via an ROC curve +## [midway_manhattan.bash](midway_manhattan.bash) +Create a "midway" manhattan plot depicting the p-values on a branch of a tree mid-way through happler's tree-building process. 
+ ## [parameter_plot.py](parameter_plot.py) Plot the LD between a causal haplotype and a set of observed haplotypes across a range of hyper-parameters. diff --git a/analysis/workflow/scripts/compute_pgen_ld.py b/analysis/workflow/scripts/compute_pgen_ld.py index bc95a39..075af17 100755 --- a/analysis/workflow/scripts/compute_pgen_ld.py +++ b/analysis/workflow/scripts/compute_pgen_ld.py @@ -7,7 +7,7 @@ from haptools.logging import getLogger from haptools.ld import pearson_corr_ld -from haptools.data import Data, Genotypes, GenotypesPLINK +from haptools.data import Data, GenotypesPLINK def corr(a, b): diff --git a/analysis/workflow/scripts/haplotype_ld.py b/analysis/workflow/scripts/haplotype_ld.py deleted file mode 100755 index b4f9012..0000000 --- a/analysis/workflow/scripts/haplotype_ld.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -import pandas as pd -import seaborn as sns -from matplotlib import pyplot as plt - - -sns.set(rc={'figure.figsize':(11.7,8.27)}) - -phens = pd.read_csv("../phens.tsv.gz", sep="\t", index_col=0, header=0) -gens = pd.read_csv("genotypes.tsv", sep="\t", index_col=1) -gt = gens.iloc[:,3:] -alleles = gens['alleles'] - -SNPs = gt.iloc[~gt.index.str.startswith('STR')] -STR = gt.iloc[gt.index.str.startswith('STR')] - -samples = phens.index.astype(str) -num_samp = len(samples) - -lens = {str(i): len(al) for i, al in enumerate(gens.loc[STR.index[0],'alleles'].split(','))} -str_gt = STR.iloc[0].str.split('|', expand=True).applymap(lens.__getitem__) -str_gt = str_gt.loc[samples] -str_gts = pd.concat((str_gt[0], str_gt[1]), axis=0, ignore_index=True) -split_chroms = pd.DataFrame({STR.index[0]: str_gts}) - -fig, axes = plt.subplots(1, len(SNPs.index)) -for idx, snp in enumerate(SNPs.index): - snp_gt = gt.loc[snp].str.split('|', expand=True) - snp_gt = snp_gt.loc[samples] - snp_gts = pd.concat((snp_gt[0], snp_gt[1]), axis=0, ignore_index=True).astype(int) - split_chroms[snp] = snp_gts - sns.violinplot(ax=axes[idx], x=snp, y=STR.index[0], data=split_chroms, order=[0,1]) - -plt.savefig('snps.png') -plt.clf() - -fig, axes = plt.subplots(1, 2) -hap_alleles = {snp: 1 for snp in SNPs.index} -hap = pd.DataFrame([(split_chroms[snp] == hap_alleles[snp]).astype(bool) for snp in SNPs.index]).T -hap['haplotype'] = hap.all(axis=1).astype(int) -hap[STR.index[0]] = split_chroms[STR.index[0]] -sns.violinplot(ax=axes[0], x='haplotype', y=STR.index[0], data=hap, order=[0,1]) - -hap_sum = pd.DataFrame({ - 'haplotype': (hap['haplotype'].iloc[:num_samp].to_numpy() + hap['haplotype'].iloc[num_samp:].to_numpy()), - STR.index[0]: (hap[STR.index[0]].iloc[:num_samp].to_numpy() + hap[STR.index[0]].iloc[num_samp:].to_numpy()), -}) -sns.regplot(ax=axes[1], x='haplotype', y=STR.index[0], data=hap_sum) -plt.savefig('haplotype.png') From 1b5a9edca3633fa3974833a870a1ea4f6768364a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:47:45 -0700 Subject: [PATCH 17/41] handle empty hap files in utils.R --- analysis/workflow/scripts/susie_metrics.R | 2 +- analysis/workflow/scripts/utils.R | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/analysis/workflow/scripts/susie_metrics.R b/analysis/workflow/scripts/susie_metrics.R index d02a3df..3d1733e 100755 --- a/analysis/workflow/scripts/susie_metrics.R +++ b/analysis/workflow/scripts/susie_metrics.R @@ -51,7 +51,7 @@ for (happler_hap_file_idx in 1:nrow(happler_hap)) { } } purity = 0 - cs_len = 0 + cs_length = 0 if (!is.null(credible_set)) { purity = 
fitted$sets$purity[cs, "mean.abs.corr"] cs_length = length(fitted$sets$cs[[credible_set]]) diff --git a/analysis/workflow/scripts/utils.R b/analysis/workflow/scripts/utils.R index b3a8219..52c0886 100644 --- a/analysis/workflow/scripts/utils.R +++ b/analysis/workflow/scripts/utils.R @@ -103,10 +103,14 @@ readHap = function(hapfile) { # in the .hap file # Parameters: # hapfile: The path to the .hap file - data <- read.table(hapfile, header = FALSE, fill = TRUE, stringsAsFactors = FALSE) - # Filter rows that start with "H" and select columns 3 to 5 - h_data <- data[grep("^H", data$V1), c(3, 4, 5)] - # Rename the columns for clarity (optional) - colnames(h_data) <- c("start", "end", "id") - h_data + data <- tryCatch(read.table(hapfile, header = FALSE, fill = TRUE, stringsAsFactors = FALSE), error=function(e) data.frame()) + if (nrow(data) != 0) { + # Filter rows that start with "H" and select columns 3 to 5 + h_data <- data[grep("^H", data$V1), c(3, 4, 5)] + # Rename the columns for clarity (optional) + colnames(h_data) <- c("start", "end", "id") + h_data + } else { + data + } } From c8e2eb5bfe9114c4a4be621c3e8c0e70f41cb8fa Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:50:02 -0700 Subject: [PATCH 18/41] allow for reordering rows of parameter_plot --- analysis/workflow/rules/plots.smk | 6 ++- analysis/workflow/scripts/parameter_plot.py | 51 +++++++++++++++++++-- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/analysis/workflow/rules/plots.smk b/analysis/workflow/rules/plots.smk index 403f175..890cbff 100644 --- a/analysis/workflow/rules/plots.smk +++ b/analysis/workflow/rules/plots.smk @@ -98,7 +98,7 @@ rule params: output: png=out + "/happler_params.png", resources: - runtime=10, + runtime=20, log: logs + "/plot_params", benchmark: @@ -107,6 +107,7 @@ rule params: "happler" shell: "workflow/scripts/parameter_plot.py -o {output.png} " + "--order num_haps,beta,ld " "{input.gts} {params.observed_haps} {params.causal_hap} &> {log}" @@ -126,7 +127,7 @@ rule metrics: output: png=out + "/finemapping_metrics.png", resources: - runtime=10, + runtime=20, log: logs + "/metrics", benchmark: @@ -135,4 +136,5 @@ rule metrics: "happler" shell: "workflow/scripts/parameter_plot.py -o {output.png} -m {params.metrics} " + "--order num_haps,beta,ld " "{input.gts} {params.observed_haps} {params.causal_hap} &> {log}" diff --git a/analysis/workflow/scripts/parameter_plot.py b/analysis/workflow/scripts/parameter_plot.py index bea84d2..af1db31 100755 --- a/analysis/workflow/scripts/parameter_plot.py +++ b/analysis/workflow/scripts/parameter_plot.py @@ -218,6 +218,7 @@ def plot_params( vals_sem: npt.NDArray, val_title: str, val_color: npt.NDArray, + hide_extras: bool = False, ): """ Plot vals against parameter values @@ -234,6 +235,8 @@ def plot_params( The name of the value that we are plotting val_color: npt.NDArray Whether to color each point corresponding to this value + hide_extras: bool + Whether to hide the observed haps that have no causal hap match """ figsize = matplotlib.rcParams["figure.figsize"] if len(params) > 75: @@ -251,6 +254,8 @@ def plot_params( vals = np.concatenate(vals) vals_sem = np.concatenate(vals_sem) for v in zip(x_vals, vals, vals_sem, val_color): + if hide_extras and v[3] != "green": + continue axs[0].errorbar(v[0], v[1], yerr=v[2], marker="o", c=v[3], markersize=3) axs[0].set_ylabel(val_title, rotation="horizontal", ha="right") axs[0].set_xticks(range(0, len(x_vals))) @@ -275,9 +280,10 @@
def plot_params_simple( params: npt.NDArray, vals: npt.NDArray, - val_sem: npt.NDArray, + vals_sem: npt.NDArray, val_title: str, val_color: npt.NDArray, + hide_extras: bool = False, ): """ Plot vals against only a single column of parameter values @@ -288,12 +294,14 @@ def plot_params_simple( A numpy array containing the values of a parameter vals: npt.NDArray A numpy array of numpy arrays containing the values of the plot - val_sem: npt.NDArray + vals_sem: npt.NDArray A numpy array of numpy arrays containing the standard error of the values val_title: str The name of the value that we are plotting val_color: npt.NDArray Whether to color each point corresponding to this value + hide_extras: bool + Whether to hide the observed haps that have no causal hap match """ fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True) val_xlabel = params.dtype.names[0] @@ -302,8 +310,13 @@ def plot_params_simple( val_xlabel = "-log " + val_xlabel params = [j for j, arr in zip(params, vals) for i in arr] val_color = ["green" if j else "red" for i in val_color for j in i] + vals = np.concatenate(vals) + vals_sem = np.concatenate(vals_sem) # plot params against vals - ax.scatter(params, np.concatenate(vals), marker="o", color=val_color, s=5) + for v in zip(params, vals, vals_sem, val_color): + if hide_extras and v[3] != "green": + continue + ax.errorbar(v[0], v[1], yerr=v[2], marker="o", c=v[3], markersize=5) ax.set_ylabel(val_title, color="g") ax.set_xlabel(val_xlabel) return fig @@ -415,6 +428,20 @@ def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArra show_default="the first haplotype in the file", help="A haplotype ID from the causal .hap file", ) +@click.option( + "--hide-extras", + is_flag=True, + default=False, + show_default=True, + help="Hide the observed haps that have no causal hap match", +) +@click.option( + "--order", + type=str, + default=None, + show_default="The order of the wildcards in the observed_hap path", + help="The order of the parameters in the plot, as a comma-separated list", +) @click.option( "-o", "--output", @@ -439,6 +466,8 @@ def main( region: str = None, observed_id: str = None, causal_id: str = None, + hide_extras: bool = False, + order: str = None, output: Path = Path("/dev/stdout"), verbosity: str = "ERROR", ): @@ -463,6 +492,17 @@ def main( params = dict(glob_wildcards(observed_hap)._asdict()) # convert the dictionary to a numpy mixed dtype array dtypes = {k: DTYPES[k] for k in params.keys()} + # if the user specified an order, then we will sort the parameters by that order + if order is not None: + new_params = {} + for k in order.split(",")[::-1]: + if k not in dtypes: + log.warning(f"Parameter {k} not found in the observed hap path") + continue + dtypes = {k: dtypes.pop(k), **dtypes} + for k in dtypes: + new_params[k] = params.pop(k) + params = new_params params = np.array(list(zip(*params.values())), dtype=list(dtypes.items())) params.sort() @@ -499,14 +539,15 @@ def main( params, ld_vals, ld_sem, ld_extras_bool = group_by_rep( params, ld_vals, ld_extras_idxs, ld_extras_bool, ) + del dtypes["rep"] if len(dtypes) > 1 or metrics is not None: merged = params if metrics is not None: merged = np.lib.recfunctions.merge_arrays([params, metrics], flatten=True) - fig = plot_params(merged, ld_vals, ld_sem, "causal LD", ld_extras_bool) + fig = plot_params(merged, ld_vals, ld_sem, "causal LD", ld_extras_bool, hide_extras) elif len(dtypes) == 1: - fig = plot_params_simple(params, ld_vals, ld_sem, "causal LD", ld_extras_bool) + fig = 
plot_params_simple(params, ld_vals, ld_sem, "causal LD", ld_extras_bool, hide_extras) else: raise ValueError("No parameter values found") From 4f5b24b6cabefa175a095ee3e2dc874b252076b7 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:50:49 -0700 Subject: [PATCH 19/41] allow for leniency when estimating pvals in happler --- happler/tree/assoc_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/happler/tree/assoc_test.py b/happler/tree/assoc_test.py index f1e6f95..fa9c02d 100644 --- a/happler/tree/assoc_test.py +++ b/happler/tree/assoc_test.py @@ -146,7 +146,7 @@ def pval_as_decimal(t_stat: float, df: int, precision: int = 1000) -> Decimal: ------ ValueError This function is only valid when the t distribution is approximately - equivalent to the normal distribution. Thus, if df < 10000, we raise a + equivalent to the normal distribution. Thus, if df < 1000, we raise a ValueError to indicate that the function will not return an accurate value Returns @@ -154,8 +154,8 @@ def pval_as_decimal(t_stat: float, df: int, precision: int = 1000) -> Decimal: Decimal An approximate, higher precision p-value for the provided t statistic """ - if df < 10000: - raise ValueError("You need a larger sample size to approximate this p-value") + if df < 1000: + log.warning("You need a larger sample size to approximate this p-value") log10_pval = stats.norm.logsf(np.abs(t_stat)) / np.log(10) + np.log10(2) # set the desired precision getcontext().prec = precision From 46fe5ca8048a5359e098f7ac182b593b0aff3099 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 21:35:19 -0700 Subject: [PATCH 20/41] make thresh more lenient --- analysis/config/config-simulations.yml | 2 +- happler/__main__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml index a48ae14..d983344 100644 --- a/analysis/config/config-simulations.yml +++ b/analysis/config/config-simulations.yml @@ -78,7 +78,7 @@ min_maf: 0.05 sample_size: 777 # A threshold on the pvalues of the haplotypes -out_thresh: 5e-5 +out_thresh: 5e-3 # Whether to include the causal variant in the set of genotypes provided to the # finemapping methods. 
Set this to true if you're interested in seeing how the diff --git a/happler/__main__.py b/happler/__main__.py index 2ed1739..5748e97 100644 --- a/happler/__main__.py +++ b/happler/__main__.py @@ -369,7 +369,7 @@ def run( if len(haps.data) == 1: hap = next(iter(haps.data.values())) if hap.pval < -np.log10(out_thresh): - log.info("Ignoring haplotype with low pval") + log.info(f"Ignoring haplotype with low pval {hap.pval}") continue for hap in haps.data.values(): if len(hap.variants) <= 1 and remove_snps: From 15848901e0c8883ba7757b5c95634a0a0431a94b Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 22 Oct 2024 21:36:20 -0700 Subject: [PATCH 21/41] split pheno for happler rules --- analysis/workflow/rules/happler.smk | 43 +++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index 11c5b24..f98f666 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -24,11 +24,36 @@ def parse_locus(locus): return chrom, start, end +rule sub_pheno: + """subset the phenotype file to include only the desired replicate""" + input: + pheno=config["pheno"], + params: + rep=lambda wildcards: int(wildcards.rep)+2, + output: + pheno=out + "/phen.pheno", + resources: + runtime=1, + threads: 1, + log: + logs + "/sub_pheno", + benchmark: + bench + "/sub_pheno", + conda: + "../envs/default.yml" + shell: + "cut -f 1,{params.rep} {input.pheno} | " + "(echo -e \"#IID\\thap\" && tail -n+2) >{output.pheno} 2>{log}" + + +pheno = rules.sub_pheno.output.pheno if "{rep}" in out else config["pheno"] + + rule run: """ execute happler! """ input: gts=config["snp_panel"], - pts=config["pheno"], + pts=pheno, covar=config["covar"], params: thresh=lambda wildcards: 0.05 if "alpha" not in wildcards else wildcards.alpha, @@ -106,7 +131,7 @@ rule cond_linreg: pvar=Path(config["snp_panel"]).with_suffix(".pvar"), psam=Path(config["snp_panel"]).with_suffix(".psam"), hap=rules.run.output.hap, - pts=config["pheno"], + pts=pheno, params: maf = check_config("min_maf", 0), region=lambda wildcards: wildcards.locus.replace("_", ":"), @@ -138,7 +163,7 @@ rule heatmap: pvar=Path(config["snp_panel"]).with_suffix(".pvar"), psam=Path(config["snp_panel"]).with_suffix(".psam"), hap=rules.run.output.hap, - pts=config["pheno"], + pts=pheno, params: region=lambda wildcards: wildcards.locus.replace("_", ":"), output: @@ -202,7 +227,7 @@ rule transform: pgen=config["snp_panel"], pvar=Path(config["snp_panel"]).with_suffix(".pvar"), psam=Path(config["snp_panel"]).with_suffix(".psam"), - pts=config["pheno"], + pts=pheno, params: region=lambda wildcards: wildcards.locus.replace("_", ":"), output: @@ -229,7 +254,7 @@ rule linreg: hap=rules.transform.output.pgen, hap_pvar=rules.transform.output.pvar, hap_psam=rules.transform.output.psam, - pts=config["pheno"], + pts=pheno, params: region=lambda wildcards: wildcards.locus.replace("_", ":"), output: @@ -339,7 +364,7 @@ rule finemapper: gt=lambda wildcards: finemapper_input(wildcards).pgen, gt_pvar=lambda wildcards: finemapper_input(wildcards).pvar, gt_psam=lambda wildcards: finemapper_input(wildcards).psam, - phen=config["pheno"], + phen=pheno, params: outdir=lambda wildcards, output: Path(output.susie).parent, exclude_causal="NULL", @@ -347,7 +372,7 @@ rule finemapper: # num_signals=lambda wildcards: get_num_haplotypes( # expand(rules.run.output.hap, **wildcards)[0] # ) + 1, - num_signals=5, + num_signals=10, # TODO: 
also add MAF filter output: susie=out + "/{ex}clude/susie.rds", @@ -433,7 +458,7 @@ rule results: """ input: gt=rules.finemapper.input.gt, - phen=config["pheno"], + phen=pheno, susie=rules.finemapper.output.susie, happler_hap=results_happler_hap_input, causal_gt=config["causal_gt"].pgen if "causal_gt" in config else [], @@ -491,7 +516,7 @@ rule gwas: pgen=(rules.merge_SVs if check_config("SVs") else rules.merge).output.pgen, pvar=(rules.merge_SVs if check_config("SVs") else rules.merge).output.pvar, psam=(rules.merge_SVs if check_config("SVs") else rules.merge).output.psam, - pts=config["pheno"], + pts=pheno, covar=config["covar"], params: maf = check_config("min_maf", 0), From 63bd60be7846353ab3e45e6de37bad93333efe37 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 23 Oct 2024 20:43:45 -0700 Subject: [PATCH 22/41] set up workflow for running on geuvadis again --- analysis/workflow/Snakefile | 1 + analysis/workflow/rules/happler.smk | 28 +++++++++++----------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile index 2747744..5a9672c 100644 --- a/analysis/workflow/Snakefile +++ b/analysis/workflow/Snakefile @@ -121,6 +121,7 @@ elif mode == "run": "random": None, "covar": covar_file, "min_maf": check_config("min_maf", 0), + "out_thresh": check_config("out_thresh", 5e-8), } if check_config("SVs", place=config["modes"][mode]): happler_config["SVs"] = config["modes"][mode]["SVs"] diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index f98f666..6bd4b65 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -33,7 +33,7 @@ rule sub_pheno: output: pheno=out + "/phen.pheno", resources: - runtime=1, + runtime=5, threads: 1, log: logs + "/sub_pheno", @@ -63,8 +63,8 @@ rule run: indep=lambda wildcards: 0.05 if "indep_alpha" not in wildcards else wildcards.indep_alpha, max_signals=3, max_iterations=3, - rep=lambda wildcards: int(wildcards.rep), out_thresh=check_config("out_thresh", 5e-8), + keep_SNPs="--remove-SNPs " if "{rep}" in out else "", output: hap=out + "/happler.hap", gz=out + "/happler.hap.gz", @@ -72,15 +72,12 @@ rule run: dot=out + "/happler.dot", resources: runtime=lambda wildcards, input: ( - Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 0.5454649806119475 + 0.6935132147046765 - if Path(input.gts).suffix == ".pgen" and check_config("dynamic_resources") else 45 + max(45, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 0.5454649806119475 + 0.6935132147046765) ), # slurm_partition="hotel", # slurm_extra="--qos=hotel", - # mem_mb=lambda wildcards, threads: threads*4.57, mem_mb=lambda wildcards, input: ( - Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 14.90840694595845 + 401.8011129664152 - if Path(input.gts).suffix == ".pgen" and check_config("dynamic_resources") else 5000 + max(5000, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 14.90840694595845 + 401.8011129664152) ), threads: 1 log: @@ -92,7 +89,7 @@ rule run: shell: "happler run -o {output.hap} --verbosity DEBUG --maf {params.maf} " "--max-signals {params.max_signals} --max-iterations {params.max_iterations} " - "--discard-multiallelic --region {params.region} --pheno {params.rep} " + "--discard-multiallelic --region {params.region} {params.keep_SNPs}" "{params.covar}--indep-thresh {params.indep} -t {params.thresh} " "--out-thresh {params.out_thresh} --show-tree {input.gts} {input.pts} 
&>{log}" " && haptools index -o {output.gz} {output.hap} &>>{log}" @@ -139,9 +136,9 @@ rule cond_linreg: png=out + "/cond_linreg.pdf", resources: runtime=lambda wildcards, input: ( - 1.5 * Path(input.pvar).stat().st_size/1000 * ( + max(50, 1.5 * Path(input.pvar).stat().st_size/1000 * ( 0.2400978997329614 + get_num_variants(input.hap) * 0.045194464826048095 - ) if Path(input.pgen).suffix == ".pgen" and check_config("dynamic_resources") else 50 + )) ), log: logs + "/cond_linreg", @@ -335,8 +332,7 @@ rule merge: psam=temp(out + "/{ex}clude/merged.psam"), resources: runtime=lambda wildcards, input: ( - Path(input.gts_pvar).stat().st_size/1000 * 0.05652315368728583 + 2.0888654705656844 - if Path(input.gts).suffix == ".pgen" and check_config("dynamic_resources") else 10 + max(10, Path(input.gts_pvar).stat().st_size/1000 * 0.05652315368728583 + 2.0888654705656844) ), log: logs + "/{ex}clude/merge", @@ -377,11 +373,9 @@ rule finemapper: output: susie=out + "/{ex}clude/susie.rds", resources: -# runtime=lambda wildcards, input: ( -# Path(input.gt_pvar).stat().st_size/1000 * 0.08 -# if Path(input.gt).suffix == ".pgen" and check_config("dynamic_resources") else 75 -# ), - runtime=120, + runtime=lambda wildcards, input: ( + max(120, Path(input.gt_pvar).stat().st_size/1000 * 0.08) + ), mem_mb = 7000, threads: 1, log: From eeb4659b9c276dfb04783c9ec87a18076a966122 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 23 Oct 2024 20:44:41 -0700 Subject: [PATCH 23/41] switch to geuvadis config --- analysis/config/config-geuvadis.yml | 3 +++ analysis/config/config.yml | 2 +- happler/tree/assoc_test.py | 1 - happler/tree/terminator.py | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/analysis/config/config-geuvadis.yml b/analysis/config/config-geuvadis.yml index b5f23d9..6089e55 100644 --- a/analysis/config/config-geuvadis.yml +++ b/analysis/config/config-geuvadis.yml @@ -79,6 +79,9 @@ min_maf: 0.1 # sample_size: [500, 1000, 1500, 2000, 2500] # sample_size: 777 +# A threshold on the pvalues of the haplotypes +out_thresh: 5e-6 + # Whether to include the causal variant in the set of genotypes provided to the # finemapping methods. Set this to true if you're interested in seeing how the # methods perform when the causal variant is absent from the data. 
diff --git a/analysis/config/config.yml b/analysis/config/config.yml index 51c999d..c7dda66 120000 --- a/analysis/config/config.yml +++ b/analysis/config/config.yml @@ -1 +1 @@ -config-simulations.yml \ No newline at end of file +config-geuvadis.yml \ No newline at end of file diff --git a/happler/tree/assoc_test.py b/happler/tree/assoc_test.py index fa9c02d..f17f202 100644 --- a/happler/tree/assoc_test.py +++ b/happler/tree/assoc_test.py @@ -7,7 +7,6 @@ from scipy import stats import numpy.typing as npt import statsmodels.api as sm -from scipy.stats import t as t_dist # We declare this class to be a dataclass to automatically define __init__ and a few diff --git a/happler/tree/terminator.py b/happler/tree/terminator.py index 13daee3..105ce2a 100644 --- a/happler/tree/terminator.py +++ b/happler/tree/terminator.py @@ -132,7 +132,7 @@ def compute_val( # terminate if the effect sizes have gone in the opposite direction self.log.debug("Terminated b/c effect size did not improve") return True - # perform a two tailed, two-sample t-test using the difference of the effect sizes + # perform a one tailed, two-sample t-test using the difference of the effect sizes # first, we compute the standard error of the difference of the effect sizes std_err = np.sqrt( ((results.data["stderr"] ** 2) + (parent_res.stderr**2)) / 2 From 1dbe559733493114831ae0588f2f51a471b4b29a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 23 Oct 2024 20:45:34 -0700 Subject: [PATCH 24/41] enable pickling as a parameter plot option --- analysis/workflow/scripts/parameter_plot.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/analysis/workflow/scripts/parameter_plot.py b/analysis/workflow/scripts/parameter_plot.py index af1db31..58a18aa 100755 --- a/analysis/workflow/scripts/parameter_plot.py +++ b/analysis/workflow/scripts/parameter_plot.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +import pickle from pathlib import Path from logging import Logger @@ -303,7 +304,7 @@ def plot_params_simple( hide_extras: bool Whether to hide the observed haps that have no causal hap match """ - fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True) + fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(3.5, 3)) val_xlabel = params.dtype.names[0] if val_xlabel in LOG_SCALE: params = -np.log10(params) @@ -319,6 +320,7 @@ def plot_params_simple( ax.errorbar(v[0], v[1], yerr=v[2], marker="o", c=v[3], markersize=5) ax.set_ylabel(val_title, color="g") ax.set_xlabel(val_xlabel) + ax.set_ylim(0, 1.03) return fig @@ -435,6 +437,13 @@ def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArra show_default=True, help="Hide the observed haps that have no causal hap match", ) +@click.option( + "--pickle-out", + is_flag=True, + default=False, + show_default=True, + help="Save the output as a pickle file as well", +) @click.option( "--order", type=str, @@ -467,6 +476,7 @@ def main( observed_id: str = None, causal_id: str = None, hide_extras: bool = False, + pickle_out: bool = False, order: str = None, output: Path = Path("/dev/stdout"), verbosity: str = "ERROR", @@ -541,6 +551,10 @@ def main( ) del dtypes["rep"] + if pickle_out: + with open(output.with_suffix(".pickle"), "wb") as f: + pickle.dump([params, ld_vals, ld_sem, ld_extras_bool], f) + if len(dtypes) > 1 or metrics is not None: merged = params if metrics is not None: From 4383e3f7f97dbe9201408651b318023e4c469dee Mon Sep 17 00:00:00 2001 From: Arya Massarat 
<23412689+aryarm@users.noreply.github.com> Date: Fri, 25 Oct 2024 06:43:34 -0700 Subject: [PATCH 25/41] start to try to handle multiple haps in summarize_results.R --- analysis/workflow/scripts/summarize_results.R | 42 ++++++------------- analysis/workflow/scripts/susie_metrics.R | 1 + 2 files changed, 13 insertions(+), 30 deletions(-) diff --git a/analysis/workflow/scripts/summarize_results.R b/analysis/workflow/scripts/summarize_results.R index b31cdf9..5a92017 100755 --- a/analysis/workflow/scripts/summarize_results.R +++ b/analysis/workflow/scripts/summarize_results.R @@ -28,9 +28,10 @@ X = readPGEN(snakemake@input[["gt"]], region=region, samples=samples[,2]) # also load the positions of each of the variants pos = readPVAR(snakemake@input[["gt"]], region=region) if (length(snakemake@input[["causal_gt"]])) { + write("Loading causal genotype data too", stderr()) causal_gt_file = snakemake@input[["causal_gt"]] causal_gt_samples = readPSAM(causal_gt_file, samples=readPheno(phen)[,1]) - causal_gt = readPGEN(causal_gt_file, region=region, samples=causal_gt_samples) + causal_gt = readPGEN(causal_gt_file, region=region, samples=causal_gt_samples[,2]) X = cbind(X, causal_gt) causal_variant = colnames(causal_gt)[1] # also load the positions of each of the variants @@ -52,43 +53,24 @@ hap = FALSE causal_hap_is_provided = FALSE if (length(snakemake@input[["happler_hap"]]) > 0) { hap = TRUE - happler_haplotype = file(snakemake@input[["happler_hap"]], "rt") + happler_haplotype = snakemake@input[["happler_hap"]] } if (length(snakemake@params[["causal_hap"]]) > 0) { causal_hap_is_provided = TRUE - causal_haplotype = file(snakemake@params[["causal_hap"]], "rt") + causal_haplotype = snakemake@params[["causal_hap"]] } if (hap) { write("Parsing happler hap file", stderr()) - while(TRUE) { - line = readLines(happler_haplotype, 1) - # we assume the first line in the hap file that begins with H denotes the haplotype - if (grepl("^H\\t", line)) break - } - # extract the start, end, and ID of the haplotype - happler_hap = as.integer(strsplit(line, "\t")[[1]][c(3,4)]) - happler_hap_id = strsplit(line, "\t")[[1]][c(5)] + happler_hap = readHap(happler_haplotype) } if (causal_hap_is_provided) { write("Parsing causal hap file", stderr()) - while(TRUE) { - line = readLines(causal_haplotype, 1) - # we assume the first line in the hap file that begins with H denotes the haplotype - if (grepl("^H\\t", line)) break - } - # extract the start, end, and ID of the haplotype - causal_hap = as.integer(strsplit(line, "\t")[[1]][c(3,4)]) - causal_hap_id = strsplit(line, "\t")[[1]][c(5)] - stopifnot(causal_hap_id == causal_variant) - haplotypes = t(data.frame( - causal_hap = c("start"=causal_hap[1], "end"=causal_hap[2], "id"=causal_hap_id), - happler_hap = c("start"=happler_hap[1], "end"=happler_hap[2], "id"=happler_hap_id) - )) -} else { - haplotypes = t(data.frame( - happler_hap = c("start"=happler_hap[1], "end"=happler_hap[2], "id"=happler_hap_id) - )) + causal_hap = readHap(causal_haplotype) + stopifnot(causal_variant %in% causal_hap["id"]) + haplotypes = cbind(causal_hap, happler_hap) +} else if (hap) { + haplotypes = happler_hap } write("Formatting genotypes and creating output dir", stderr()) @@ -253,7 +235,7 @@ if (hap) { } else { pip_plot(susie_pip, X, b, pos, susie_cs=names(susie_pip[fitted$sets$cs[['L1']]])) } -ggsave(paste0(out,'/susie.pdf'), width=10, height=5, device='pdf') +ggsave(paste0(out,'/susie.pdf'), width=6, height=5, device='pdf') dev.off() if (!is.null(finemap_results)) { @@ -263,6 +245,6 @@ if 
(!is.null(finemap_results)) { } else { pip_plot(finemap_pip, X, b, pos) } - ggsave(paste0(out,'/finemap.pdf'), width=10, height=5, device='pdf') + ggsave(paste0(out,'/finemap.pdf'), width=6, height=5, device='pdf') while (!is.null(dev.list())) dev.off() } diff --git a/analysis/workflow/scripts/susie_metrics.R b/analysis/workflow/scripts/susie_metrics.R index 3d1733e..f3fd402 100755 --- a/analysis/workflow/scripts/susie_metrics.R +++ b/analysis/workflow/scripts/susie_metrics.R @@ -42,6 +42,7 @@ for (happler_hap_file_idx in 1:nrow(happler_hap)) { # 6) What is the length of the credible set? obs_pip = susie_pip[happler_hap_id] has_highest_pip = as.integer(sum(obs_pip < susie_pip) == 0) + # TODO: fix this so that it also excludes other haplotypes besides obs_pip best_variant_pip = max(susie_pip[names(susie_pip) != names(obs_pip)]) credible_set = NULL for (cs in names(fitted$sets$cs)) { From 23735cfe6deb72dcae756a2517652573127b03d9 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 25 Oct 2024 06:44:27 -0700 Subject: [PATCH 26/41] enable computing mean and stderr of finemapping metrics in parameter_plot --- analysis/workflow/scripts/parameter_plot.py | 221 +++++++++++++++++--- 1 file changed, 187 insertions(+), 34 deletions(-) diff --git a/analysis/workflow/scripts/parameter_plot.py b/analysis/workflow/scripts/parameter_plot.py index 58a18aa..107482d 100755 --- a/analysis/workflow/scripts/parameter_plot.py +++ b/analysis/workflow/scripts/parameter_plot.py @@ -179,17 +179,59 @@ def get_finemap_metrics(metrics_path: Path, log: Logger = None): # 3) What is the best PIP among the variants? # 4) Is the observed hap in a credible set? # 5) What is the purity of the credible set? - return np.loadtxt( - fname=metrics_path, - delimiter=" ", - dtype=[ - ("pip", np.float64), - ("has_highest_pip", np.bool_), - ("best_variant_pip", np.float64), - ("in_credible_set", np.bool_), - ("cs_purity", np.float64), - ], + # 6) What is the length of the credible set? 
+    # If you add or remove any metrics here, make sure to also change the
+    # "num_expected_vals" in get_metrics_mean_std so it knows the number of metrics
+    dtype = [
+        ("pip", np.float64),
+        ("has_highest_pip", np.bool_),
+        ("best_variant_pip", np.float64),
+        ("in_credible_set", np.bool_),
+        ("cs_purity", np.float64),
+        ("cs_length", np.float64)
+    ]
+    null_val = np.array([
+        (np.nan, False, np.nan, False, np.nan, np.nan),
+    ], dtype=dtype)
+    # if there are no observed haplotypes, then we can't compute LD
+    if not metrics_path.exists():
+        return null_val
+    metrics = np.loadtxt(fname=metrics_path, delimiter=" ", dtype=dtype)
+    if not metrics.shape:
+        log.debug(f"No metrics found in the metrics file {metrics_path}")
+        return null_val
+    return metrics
+
+
+def get_metrics_mean_std(metrics_vals: npt.NDArray, num_expected_vals: int = 6):
+    """
+    Compute the mean and standard error of the metrics
+
+    Parameters
+    ----------
+    metrics_vals: npt.NDArray
+        A numpy array containing the metrics for each observed hap
+    num_expected_vals: int, optional
+        The number of expected metrics
+
+    Returns
+    -------
+    npt.NDArray
+        A numpy array containing the mean of the metrics
+    npt.NDArray
+        A numpy array containing the standard error of the metrics
+    """
+    if len(metrics_vals) == 0:
+        return np.array([np.nan,]*num_expected_vals), np.array([np.nan,]*num_expected_vals)
+    nan_vals = np.isnan(metrics_vals["pip"])
+    metrics_vals = metrics_vals[~nan_vals]
+    mean_vals = np.array(
+        [np.nanmean(metrics_vals[field]) for field in metrics_vals.dtype.names],
+    )
+    sem_vals = np.array(
+        [sem(metrics_vals[field], nan_policy="omit") for field in metrics_vals.dtype.names],
    )
+    return mean_vals, sem_vals


def count_shared(observed, causal, log, observed_id: str = None, causal_id: str = None):
@@ -219,6 +261,7 @@ def plot_params(
    vals_sem: npt.NDArray,
    val_title: str,
    val_color: npt.NDArray,
+    metrics: dict = None,
    hide_extras: bool = False,
):
    """
@@ -236,16 +279,22 @@ def plot_params(
        The name of the value that we are plotting
    val_color: npt.NDArray
        Whether to color each point corresponding to this value
-    hide_extras: bool
+    metrics: dict, optional
+        A dict mapping fine-mapping metrics to tuples of two lists of numpy arrays
+        containing means and standard errors of fine-mapping metrics for each observed hap
+    hide_extras: bool, optional
        Whether to hide the observed haps that have no causal hap match
    """
    figsize = matplotlib.rcParams["figure.figsize"]
    if len(params) > 75:
        figsize[0] = len(params) / 15
-    if len(params.dtype) > 5:
-        figsize[1] = len(params.dtype) * 1.25
+    num_rows = len(params.dtype)
+    if metrics is not None:
+        num_rows = len(params.dtype) + len(metrics)
+    if num_rows > 5:
+        figsize[1] = num_rows * 1.25
    fig, axs = plt.subplots(
-        nrows=len(params.dtype)+1, ncols=1,
+        nrows=num_rows+1, ncols=1,
        sharex=True, figsize=figsize,
    )
    fig.subplots_adjust(hspace=0)
@@ -275,6 +324,21 @@ def plot_params(
        axs[idx+1].plot(params[param], "-")
        axs[idx+1].set_ylabel(val_title, rotation="horizontal", ha="right")
        axs[idx+1].grid(axis="x")
+    if metrics is not None:
+        for idx, metric in enumerate(metrics.keys()):
+            curr_ax = axs[idx+len(params.dtype)+1]
+            val_title = metric
+            vals = np.concatenate(metrics[metric][0])
+            if len(np.unique(vals)) == 1:
+                curr_ax.remove()
+                continue
+            vals_sem = np.concatenate(metrics[metric][1])
+            for v in zip(x_vals, vals, vals_sem, val_color):
+                if hide_extras and v[3] != "green":
+                    continue
+                curr_ax.errorbar(v[0], v[1], yerr=v[2], marker="o", c=v[3], markersize=3)
+            curr_ax.set_ylabel(val_title, rotation="horizontal", 
ha="right") + curr_ax.grid(axis="x") return fig @@ -301,7 +365,7 @@ def plot_params_simple( The name of the value that we are plotting val_color: npt.NDArray Whether to color each point corresponding to this value - hide_extras: bool + hide_extras: bool, optional Whether to hide the observed haps that have no causal hap match """ fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(3.5, 3)) @@ -324,7 +388,13 @@ def plot_params_simple( return fig -def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArray, bools: npt.NDArray): +def group_by_rep( + params: npt.NDArray, + vals: npt.NDArray, + causal_idxs: npt.NDArray, + bools: npt.NDArray, + metrics: npt.NDArray = None, + ): """ Group replicates with identical parameter values @@ -340,6 +410,8 @@ def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArra The index of the causal haplotype that each observed haplotype belongs to bools: npt.NDArray An array of boolean values indicating whether each observed haplotype has a match + metrics: npt.NDArray + A numpy array of numpy arrays containing the metrics for each observed hap Returns ------- @@ -350,18 +422,25 @@ def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArra """ other_param_names = [name for name in params.dtype.names if name != "rep"] grouped_params = np.unique(params[other_param_names]) - get_mean_std = lambda x: (np.mean(x), sem(x) if len(x) > 1 else np.nan) + get_mean_std = lambda x: (np.nanmean(x), sem(x, nan_policy="omit") if len(x) > 1 else np.nan) filter_causal_idxs = lambda x: np.unique(x[x != np.iinfo(np.uint8).max]) grouped_vals = [] grouped_sem = [] grouped_bools = [] + grouped_metrics = [] + grouped_metrics_sem = [] for group in grouped_params: subgrouped_vals = [] subgrouped_sem = [] subgrouped_bools = [] + subgrouped_metrics = [] + subgrouped_metrics_sem = [] curr_vals = vals[params[other_param_names] == group] curr_causal_idxs = causal_idxs[params[other_param_names] == group] curr_bools = bools[params[other_param_names] == group] + curr_metrics = None + if metrics is not None: + curr_metrics = metrics[params[other_param_names] == group] # compute mean and std err for each causal match for causal_idx in filter_causal_idxs(np.concatenate(curr_causal_idxs)): try: @@ -375,6 +454,14 @@ def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArra subgrouped_vals.append(val_mean) subgrouped_sem.append(val_sem) subgrouped_bools.append(True) + if curr_metrics is not None: + curr_metrics_causal_idxs = np.concatenate([ + val[(indices == causal_idx) & curr_bool] + for val, indices, curr_bool in zip(curr_metrics, curr_causal_idxs, curr_bools) + ]) + metrics_mean, metrics_sem = get_metrics_mean_std(curr_metrics_causal_idxs) + subgrouped_metrics.append(metrics_mean) + subgrouped_metrics_sem.append(metrics_sem) # compute mean and std err for each unmatched observed hap for unmatched_idx in range(max([(~i).sum() for i in curr_bools])): try: @@ -389,9 +476,23 @@ def group_by_rep(params: npt.NDArray, vals: npt.NDArray, causal_idxs: npt.NDArra subgrouped_vals.append(val_mean) subgrouped_sem.append(val_sem) subgrouped_bools.append(False) + if curr_metrics is not None: + curr_metrics_unmatched_idxs = np.array([ + val[(indices == causal_idx) & ~curr_bool][unmatched_idx] + for val, indices, curr_bool in zip(curr_metrics, curr_causal_idxs, curr_bools) + if unmatched_idx < ((indices == causal_idx) & ~curr_bool).sum() + ]) + metrics_mean, metrics_sem = 
get_metrics_mean_std(curr_metrics_unmatched_idxs)
+                subgrouped_metrics.append(metrics_mean)
+                subgrouped_metrics_sem.append(metrics_sem)
        grouped_vals.append(np.array(subgrouped_vals, dtype=object))
        grouped_sem.append(np.array(subgrouped_sem, dtype=object))
        grouped_bools.append(np.array(subgrouped_bools, dtype=object))
+        if curr_metrics is not None:
+            grouped_metrics.append(np.array(subgrouped_metrics, dtype=object))
+            grouped_metrics_sem.append(np.array(subgrouped_metrics_sem, dtype=object))
+    if curr_metrics is not None:
+        return grouped_params, grouped_vals, grouped_sem, grouped_bools, grouped_metrics, grouped_metrics_sem
    return grouped_params, grouped_vals, grouped_sem, grouped_bools
@@ -407,6 +508,13 @@ def group_by_rep(
    show_default=True,
    help="Plot fine-mapping metrics, as well",
)
+@click.option(
+    "--use-metric",
+    type=str,
+    default=None,
+    show_default="observed LD",
+    help="Use this fine-mapping metric to color the plot",
+)
@click.option(
    "--region",
    type=str,
@@ -472,6 +580,7 @@ def main(
    observed_hap: Path,
    causal_hap: Path,
    metrics: Path,
+    use_metric: str = None,
    region: str = None,
    observed_id: str = None,
    causal_id: str = None,
@@ -543,30 +652,74 @@ def main(
                log=log
            ) for idx in range(len(params))
-        ])
-
-    # TODO: also handle the metrics in group_by_rep()
-    params, ld_vals, ld_sem, ld_extras_bool = group_by_rep(
-        params, ld_vals, ld_extras_idxs, ld_extras_bool,
-    )
+        ], dtype=object)
+        params, ld_vals, ld_sem, ld_extras_bool, metrics_vals, metrics_vals_sem = group_by_rep(
+            params, ld_vals, ld_extras_idxs, ld_extras_bool, metrics,
+        )
+        extract_metrics_vals = lambda x, idx: np.array([i[:, idx] for i in x], dtype=object)
+        metrics = {
+            name: (
+                extract_metrics_vals(metrics_vals, idx),
+                extract_metrics_vals(metrics_vals_sem, idx)
+            ) for idx, name in enumerate(metrics[0].dtype.names)
+        }
+    else:
+        params, ld_vals, ld_sem, ld_extras_bool = group_by_rep(
+            params, ld_vals, ld_extras_idxs, ld_extras_bool,
+        )

    del dtypes["rep"]

    if pickle_out:
        with open(output.with_suffix(".pickle"), "wb") as f:
-            pickle.dump([params, ld_vals, ld_sem, ld_extras_bool], f)
-
-    if len(dtypes) > 1 or metrics is not None:
-        merged = params
-        if metrics is not None:
-            merged = np.lib.recfunctions.merge_arrays([params, metrics], flatten=True)
-        fig = plot_params(merged, ld_vals, ld_sem, "causal LD", ld_extras_bool, hide_extras)
-    elif len(dtypes) == 1:
-        fig = plot_params_simple(params, ld_vals, ld_sem, "causal LD", ld_extras_bool, hide_extras)
+            if metrics is not None:
+                pickle.dump([params, ld_vals, ld_sem, ld_extras_bool, metrics], f)
+            else:
+                pickle.dump([params, ld_vals, ld_sem, ld_extras_bool], f)
+
+    if use_metric is None:
+        if len(dtypes) > 1:
+            fig = plot_params(
+                params,
+                ld_vals,
+                ld_sem,
+                "Observed LD",
+                ld_extras_bool,
+                metrics=metrics,
+                hide_extras=hide_extras
+            )
+        elif len(dtypes) == 1:
+            fig = plot_params_simple(
+                params,
+                ld_vals,
+                ld_sem,
+                "Observed LD",
+                ld_extras_bool,
+                hide_extras=hide_extras
+            )
    else:
-        raise ValueError("No parameter values found")
+        if len(dtypes) > 1:
+            fig = plot_params(
+                params,
+                metrics[use_metric][0],
+                metrics[use_metric][1],
+                use_metric,
+                ld_extras_bool,
+                hide_extras=hide_extras
+            )
+        elif len(dtypes) == 1:
+            fig = plot_params_simple(
+                params,
+                metrics[use_metric][0],
+                metrics[use_metric][1],
+                use_metric,
+                ld_extras_bool,
+                hide_extras=hide_extras
+            )

    fig.tight_layout()
-    fig.savefig(output, bbox_inches="tight", pad_inches=0.05)
+    
fig.savefig(output, + bbox_inches="tight", pad_inches=0.05) if __name__ == "__main__": main() From b9a1c22fda8914042fd188ccb3fcc9b898dee3da Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 25 Oct 2024 06:46:32 -0700 Subject: [PATCH 27/41] adjust the amount of time for the multiline rule --- analysis/workflow/Snakefile | 2 +- analysis/workflow/rules/genotypes.smk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile index 5a9672c..1cefc93 100644 --- a/analysis/workflow/Snakefile +++ b/analysis/workflow/Snakefile @@ -196,7 +196,7 @@ elif mode == "run": output: hps = orig_out + "/multiline.tsv", resources: - runtime=3, + runtime=10, log: orig_out + "/logs/multiline", benchmark: diff --git a/analysis/workflow/rules/genotypes.smk b/analysis/workflow/rules/genotypes.smk index 8520d46..bfbc555 100644 --- a/analysis/workflow/rules/genotypes.smk +++ b/analysis/workflow/rules/genotypes.smk @@ -163,7 +163,7 @@ rule vcf2plink: rule subset_str: - """ subset samples from a STR VCF """ + """ subset samples from an STR VCF """ input: vcf=lambda wildcards: expand(config["str_panel"], chr=parse_locus(wildcards.locus)[0])[0], vcf_idx=lambda wildcards: expand(config["str_panel"] + ".tbi", chr=parse_locus(wildcards.locus)[0]), From e1f01c3a5bd105987a6c0081dad43d2937997217 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 25 Oct 2024 06:47:05 -0700 Subject: [PATCH 28/41] start on STR analysis --- analysis/config/config-simulations.STR.yml | 89 ++++++++++++++++++++++ analysis/config/config-simulations.yml | 1 + 2 files changed, 90 insertions(+) create mode 100644 analysis/config/config-simulations.STR.yml diff --git a/analysis/config/config-simulations.STR.yml b/analysis/config/config-simulations.STR.yml new file mode 100644 index 0000000..e346e6d --- /dev/null +++ b/analysis/config/config-simulations.STR.yml @@ -0,0 +1,89 @@ +# This is the Snakemake configuration file that specifies paths and +# and options for the pipeline. Anybody wishing to use +# the provided snakemake pipeline should first fill out this file with paths to +# their own data, as the Snakefile requires it. +# Every config option has reasonable defaults unless it is labeled as "required." +# All paths are relative to the directory that Snakemake is executed in. +# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/) + + +# Paths to a SNP-STR haplotype reference panel +# You can download this from http://gymreklab.com/2018/03/05/snpstr_imputation.html +# If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}" +# The VCF(s) must be sorted and indexed (with a .tbi file in the same directory) +# required! 
+snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen" +# str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz" +str_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.STRs.pgen" + +# Path to a list of samples to exclude from the analysis +# There should be one sample ID per line +# exclude_samples: data/ukb_random_samples_exclude.tsv + +# If SNPs are unphased, provide the path to a SHAPEIT4 map file like these: +# https://github.com/odelaneau/shapeit4/tree/master/maps +# The map file should use the same reference genome as the reference panel VCFs +# phase_map: data/genetic_maps/chr{chr}.b37.gmap.gz + +# A "locus" is a string with a contig name, a colon, the start position, a dash, and +# the end position or a BED file with a ".bed" file ending +# There are different simulation modes that you can use: +# 1. "str" - a tandem repeat is a string with a contig name, a colon, and the start position +# 2. "snp" - a SNP follows the same format as "str" +# 3. "hap" - a haplotype +# 4. "ld_range" - creates random two-SNP haplotypes with a range of LD values between the alleles of each haplotype +# 5. "run" - execute happler on a locus without simulating anything +# The STR and SNP positions should be contained within the locus. +# The positions should be provided in the same coordinate system as the reference +# genome of the reference panel VCFs +# The contig should correspond with the contig name from the {chr} wildcard in the VCF +# required! and unless otherwise noted, all attributes of each mode are required +locus: 19:45401409-46401409 # center: 45901409 (APOe4) +modes: + str: + pos: 19:45903859 + snp: + pos: 19:45910672 # rs1046282 + hap: + alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] + ld_range: + reps: 10 + min_ld: 0 + max_ld: 1 + step: 0.1 + min_af: 0.25 + max_af: 0.75 + # beta: [0.35] + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] + alpha: [0.05] + random: false # whether to also produce random haplotypes + num_haps: [1, 2, 3, 4] + run: + pheno: data/geuvadis/phenos/{trait}.pheno + SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen + # pheno_matrix: data/geuvadis/EUR_converted_expr_hg38.csv # optional +mode: str + +# Covariates to use if they're needed +# Otherwise, they're assumed to be regressed out of the phenotypes +# Note: the finemapping methods won't be able to handle these covariates +# covar: data/geuvadis/5PCs_sex.covar + +# Discard rare variants with a MAF below this number +# Defaults to 0 if not specified +min_maf: 0.05 + +# Sample sizes to use +# sample_size: [777, 800, 1600, 2400, 3200] +sample_size: 777 + +# A threshold on the pvalues of the haplotypes +out_thresh: 5e-3 + +# Whether to include the causal variant in the set of genotypes provided to the +# finemapping methods. Set this to true if you're interested in seeing how the +# methods perform when the causal variant is absent from the data. +# Defaults to false if not specified +# You can also provide a list of booleans, if you want to test both values +exclude_causal: [true, false] diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml index d983344..156c1fe 100644 --- a/analysis/config/config-simulations.yml +++ b/analysis/config/config-simulations.yml @@ -14,6 +14,7 @@ # required! 
snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen" # str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz" +# str_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.pgen" # Path to a list of samples to exclude from the analysis # There should be one sample ID per line From 321ced6e0ba07cfb50291cc00c734dc40cb49a21 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 25 Oct 2024 06:47:29 -0700 Subject: [PATCH 29/41] revert to min for now. need to come up with a better way of doing this eventually --- analysis/workflow/rules/happler.smk | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index 6bd4b65..f185191 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -64,7 +64,7 @@ rule run: max_signals=3, max_iterations=3, out_thresh=check_config("out_thresh", 5e-8), - keep_SNPs="--remove-SNPs " if "{rep}" in out else "", + keep_SNPs="--remove-SNPs " if not ("{rep}" in out) else "", output: hap=out + "/happler.hap", gz=out + "/happler.hap.gz", @@ -72,12 +72,12 @@ rule run: dot=out + "/happler.dot", resources: runtime=lambda wildcards, input: ( - max(45, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 0.5454649806119475 + 0.6935132147046765) + min(45, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 0.5454649806119475 + 0.6935132147046765) ), # slurm_partition="hotel", # slurm_extra="--qos=hotel", mem_mb=lambda wildcards, input: ( - max(5000, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 14.90840694595845 + 401.8011129664152) + min(5000, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 14.90840694595845 + 401.8011129664152) ), threads: 1 log: @@ -136,7 +136,7 @@ rule cond_linreg: png=out + "/cond_linreg.pdf", resources: runtime=lambda wildcards, input: ( - max(50, 1.5 * Path(input.pvar).stat().st_size/1000 * ( + min(50, 1.5 * Path(input.pvar).stat().st_size/1000 * ( 0.2400978997329614 + get_num_variants(input.hap) * 0.045194464826048095 )) ), @@ -280,7 +280,7 @@ rule sv_ld: sv_pvar=lambda wildcards: Path(config["SVs"]).with_suffix(".pvar"), sv_psam=lambda wildcards: Path(config["SVs"]).with_suffix(".psam"), params: - start=lambda wildcards: max(0, int(parse_locus(wildcards.locus)[1])-1000000), + start=lambda wildcards: min(0, int(parse_locus(wildcards.locus)[1])-1000000), end=lambda wildcards: int(parse_locus(wildcards.locus)[2])+1000000, chrom=lambda wildcards: parse_locus(wildcards.locus)[0], maf = check_config("min_maf", 0), @@ -332,7 +332,7 @@ rule merge: psam=temp(out + "/{ex}clude/merged.psam"), resources: runtime=lambda wildcards, input: ( - max(10, Path(input.gts_pvar).stat().st_size/1000 * 0.05652315368728583 + 2.0888654705656844) + min(10, Path(input.gts_pvar).stat().st_size/1000 * 0.05652315368728583 + 2.0888654705656844) ), log: logs + "/{ex}clude/merge", @@ -374,7 +374,7 @@ rule finemapper: susie=out + "/{ex}clude/susie.rds", resources: runtime=lambda wildcards, input: ( - max(120, Path(input.gt_pvar).stat().st_size/1000 * 0.08) + min(120, Path(input.gt_pvar).stat().st_size/1000 * 0.08) ), mem_mb = 7000, threads: 1, @@ -455,8 +455,8 @@ rule results: phen=pheno, susie=rules.finemapper.output.susie, happler_hap=results_happler_hap_input, - causal_gt=config["causal_gt"].pgen if "causal_gt" in config else [], - causal_hap=results_causal_hap_input, + # 
causal_gt=config["causal_gt"].pgen if "causal_gt" in config else [], + # causal_hap=results_causal_hap_input, params: outdir=lambda wildcards, output: Path(output.susie_pdf).parent, region=lambda wildcards: wildcards.locus.replace("_", ":"), From f0b5b38a0355a72e40d4287033618004f03245ea Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 30 Oct 2024 13:31:35 -0700 Subject: [PATCH 30/41] allow for pickled output for explained_variances.png --- analysis/workflow/scripts/variance_explained_plot.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/analysis/workflow/scripts/variance_explained_plot.py b/analysis/workflow/scripts/variance_explained_plot.py index 451536a..57fbf95 100755 --- a/analysis/workflow/scripts/variance_explained_plot.py +++ b/analysis/workflow/scripts/variance_explained_plot.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +import pickle from pathlib import Path from logging import Logger @@ -300,6 +301,9 @@ def main( f, (ax1, ax2) = plt.subplots(1, 2) + with open(output.with_suffix(".pickle"), "wb") as picklef: + pickle.dump((explained_variances, rsquareds), picklef) + ax1.scatter(explained_variances[:, 1], explained_variances[:, 0]) ax1.axline([0, 0], [max_ev_val, max_ev_val]) ax1.set_title("Variance Explained") From 03f9eef34e5690fd5f927184902df8660a99632f Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 22 Nov 2024 12:51:26 -0800 Subject: [PATCH 31/41] upgrade to newest susie --- analysis/workflow/envs/susie.post-deploy.sh | 4 ---- analysis/workflow/envs/susie.yml | 20 +++++++++++--------- 2 files changed, 11 insertions(+), 13 deletions(-) delete mode 100644 analysis/workflow/envs/susie.post-deploy.sh diff --git a/analysis/workflow/envs/susie.post-deploy.sh b/analysis/workflow/envs/susie.post-deploy.sh deleted file mode 100644 index 272cfdb..0000000 --- a/analysis/workflow/envs/susie.post-deploy.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!env bash -set -o pipefail - -R -e 'install.packages("pgenlibr",repos="http://cran.us.r-project.org")' diff --git a/analysis/workflow/envs/susie.yml b/analysis/workflow/envs/susie.yml index fbd91b7..8e704f3 100644 --- a/analysis/workflow/envs/susie.yml +++ b/analysis/workflow/envs/susie.yml @@ -2,15 +2,17 @@ channels: - conda-forge - nodefaults dependencies: - - conda-forge::r-susier==0.11.42 + - conda-forge::r-susier==0.12.35 - conda-forge::r-abind==1.4_5 - - conda-forge::r-r.utils==2.12.0 - - conda-forge::r-dplyr==1.0.7 - - conda-forge::r-readr==1.4.0 + - conda-forge::r-r.utils==2.12.3 + - conda-forge::r-dplyr==1.1.4 + - conda-forge::r-readr==2.1.5 - aryarm::r-dscrutils==0.4.3 - - conda-forge::r-irkernel==1.2 - - conda-forge::sos-notebook==0.22.3 - - conda-forge::coreutils==8.31 + - aryarm::r-pgenlibr==0.3.7 + - conda-forge::r-irkernel==1.3.2 + - conda-forge::sos-notebook==0.24.4 + - conda-forge::coreutils=9.5 - conda-forge::sed==4.8 - - conda-forge::bash==5.0.018 - - conda-forge::gzip==1.10 + - conda-forge::bash==5.2.21 + - conda-forge::gzip==1.13 + - conda-forge::r-rfast==2.1.0 From 32f1a32bb530c35e72fbfae21095dd904ade3393 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 22 Nov 2024 12:55:08 -0800 Subject: [PATCH 32/41] recalibrate timing and mem --- analysis/.gitignore | 4 ++-- analysis/workflow/rules/happler.smk | 22 ++++++++++++---------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/analysis/.gitignore b/analysis/.gitignore index 034afe0..06a969f 100644 --- 
a/analysis/.gitignore +++ b/analysis/.gitignore @@ -1,4 +1,4 @@ midway_manhattans-indepterminator !midway_manhattans-indepterminator/LINKS -old-outs -HAPFILES \ No newline at end of file +outs +HAPFILES diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index f185191..d34f643 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -47,6 +47,8 @@ rule sub_pheno: pheno = rules.sub_pheno.output.pheno if "{rep}" in out else config["pheno"] +# if the pvar size is larger than 0.5 GB, just use the default memory instead +rsrc_func = lambda x: max if .5 > Path(x).with_suffix(".pvar").stat().st_size/1000/1000/1000 else min rule run: @@ -72,12 +74,12 @@ rule run: dot=out + "/happler.dot", resources: runtime=lambda wildcards, input: ( - min(45, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 0.5454649806119475 + 0.6935132147046765) + rsrc_func(input.gts)(45, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 2.5379343786643838 + 20.878965342140603) ), # slurm_partition="hotel", # slurm_extra="--qos=hotel", mem_mb=lambda wildcards, input: ( - min(5000, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 14.90840694595845 + 401.8011129664152) + rsrc_func(input.gts)(2500, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 7.5334226167661384 + 22.471377010118147) ), threads: 1 log: @@ -136,7 +138,7 @@ rule cond_linreg: png=out + "/cond_linreg.pdf", resources: runtime=lambda wildcards, input: ( - min(50, 1.5 * Path(input.pvar).stat().st_size/1000 * ( + rsrc_func(input.pgen)(50, 1.5 * Path(input.pvar).stat().st_size/1000 * ( 0.2400978997329614 + get_num_variants(input.hap) * 0.045194464826048095 )) ), @@ -166,7 +168,7 @@ rule heatmap: output: png=out + "/heatmap.pdf", resources: - runtime=4, + runtime=10, log: logs + "/heatmap", benchmark: @@ -232,7 +234,7 @@ rule transform: pvar=temp(out + "/happler.pvar"), psam=temp(out + "/happler.psam"), resources: - runtime=4, + runtime=10, log: logs + "/transform", benchmark: @@ -332,7 +334,7 @@ rule merge: psam=temp(out + "/{ex}clude/merged.psam"), resources: runtime=lambda wildcards, input: ( - min(10, Path(input.gts_pvar).stat().st_size/1000 * 0.05652315368728583 + 2.0888654705656844) + rsrc_func(input.gts)(10, Path(input.gts_pvar).stat().st_size/1000 * 0.05652315368728583 + 2.0888654705656844) ), log: logs + "/{ex}clude/merge", @@ -374,9 +376,9 @@ rule finemapper: susie=out + "/{ex}clude/susie.rds", resources: runtime=lambda wildcards, input: ( - min(120, Path(input.gt_pvar).stat().st_size/1000 * 0.08) + rsrc_func(input.gt)(120, Path(input.gt_pvar).stat().st_size/1000 * 0.3) ), - mem_mb = 7000, + mem_mb = 8500, threads: 1, log: logs + "/{ex}clude/finemapper", @@ -395,7 +397,7 @@ rule pips: output: tsv=out + "/{ex}clude/susie_pips.tsv", resources: - runtime=5, + runtime=10, threads: 3, log: logs + "/{ex}clude/pips", @@ -465,7 +467,7 @@ rule results: susie_pdf = out + "/{ex}clude/susie.pdf", # susie_eff_pdf=temp(out + "/susie_eff.pdf"), resources: - runtime=5, + runtime=10, log: logs + "/{ex}clude/results", benchmark: From f0164e0b98f74bbc4abf858eaf5be6627da41973 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 22 Nov 2024 13:02:22 -0800 Subject: [PATCH 33/41] collapse sim configs --- analysis/config/config-geuvadis.yml | 18 ----- analysis/config/config-simulations.STR.yml | 89 ---------------------- analysis/config/config-simulations.yml | 12 ++- analysis/config/config.yml | 2 +- 4 files changed, 9 
insertions(+), 112 deletions(-) delete mode 100644 analysis/config/config-simulations.STR.yml diff --git a/analysis/config/config-geuvadis.yml b/analysis/config/config-geuvadis.yml index 6089e55..2229a2c 100644 --- a/analysis/config/config-geuvadis.yml +++ b/analysis/config/config-geuvadis.yml @@ -42,24 +42,6 @@ snp_panel: "data/geuvadis/geuvadis_ensemble_phasing.pgen" # locus: 19:45401409-46401409 # center: 45901409 (APOe4) locus: data/geuvadis/geuvadis_eqtl_genes.full.liftover.bed modes: - str: - pos: 19:45903857 # STR_691361 - snp: - pos: 19:45910672 # rs1046282 - hap: - alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 - beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] - ld_range: - reps: 1 - min_ld: 0 - max_ld: 1 - step: 0.1 - min_af: 0.25 - max_af: 0.75 - # beta: [0.35] - beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] - alpha: [0.05] - random: false # whether to also produce random haplotypes run: pheno: data/geuvadis/phenos/{trait}.pheno SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen diff --git a/analysis/config/config-simulations.STR.yml b/analysis/config/config-simulations.STR.yml deleted file mode 100644 index e346e6d..0000000 --- a/analysis/config/config-simulations.STR.yml +++ /dev/null @@ -1,89 +0,0 @@ -# This is the Snakemake configuration file that specifies paths and -# and options for the pipeline. Anybody wishing to use -# the provided snakemake pipeline should first fill out this file with paths to -# their own data, as the Snakefile requires it. -# Every config option has reasonable defaults unless it is labeled as "required." -# All paths are relative to the directory that Snakemake is executed in. -# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/) - - -# Paths to a SNP-STR haplotype reference panel -# You can download this from http://gymreklab.com/2018/03/05/snpstr_imputation.html -# If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}" -# The VCF(s) must be sorted and indexed (with a .tbi file in the same directory) -# required! -snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen" -# str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz" -str_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.STRs.pgen" - -# Path to a list of samples to exclude from the analysis -# There should be one sample ID per line -# exclude_samples: data/ukb_random_samples_exclude.tsv - -# If SNPs are unphased, provide the path to a SHAPEIT4 map file like these: -# https://github.com/odelaneau/shapeit4/tree/master/maps -# The map file should use the same reference genome as the reference panel VCFs -# phase_map: data/genetic_maps/chr{chr}.b37.gmap.gz - -# A "locus" is a string with a contig name, a colon, the start position, a dash, and -# the end position or a BED file with a ".bed" file ending -# There are different simulation modes that you can use: -# 1. "str" - a tandem repeat is a string with a contig name, a colon, and the start position -# 2. "snp" - a SNP follows the same format as "str" -# 3. "hap" - a haplotype -# 4. "ld_range" - creates random two-SNP haplotypes with a range of LD values between the alleles of each haplotype -# 5. "run" - execute happler on a locus without simulating anything -# The STR and SNP positions should be contained within the locus. 
-# The positions should be provided in the same coordinate system as the reference -# genome of the reference panel VCFs -# The contig should correspond with the contig name from the {chr} wildcard in the VCF -# required! and unless otherwise noted, all attributes of each mode are required -locus: 19:45401409-46401409 # center: 45901409 (APOe4) -modes: - str: - pos: 19:45903859 - snp: - pos: 19:45910672 # rs1046282 - hap: - alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 - beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] - ld_range: - reps: 10 - min_ld: 0 - max_ld: 1 - step: 0.1 - min_af: 0.25 - max_af: 0.75 - # beta: [0.35] - beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] - alpha: [0.05] - random: false # whether to also produce random haplotypes - num_haps: [1, 2, 3, 4] - run: - pheno: data/geuvadis/phenos/{trait}.pheno - SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen - # pheno_matrix: data/geuvadis/EUR_converted_expr_hg38.csv # optional -mode: str - -# Covariates to use if they're needed -# Otherwise, they're assumed to be regressed out of the phenotypes -# Note: the finemapping methods won't be able to handle these covariates -# covar: data/geuvadis/5PCs_sex.covar - -# Discard rare variants with a MAF below this number -# Defaults to 0 if not specified -min_maf: 0.05 - -# Sample sizes to use -# sample_size: [777, 800, 1600, 2400, 3200] -sample_size: 777 - -# A threshold on the pvalues of the haplotypes -out_thresh: 5e-3 - -# Whether to include the causal variant in the set of genotypes provided to the -# finemapping methods. Set this to true if you're interested in seeing how the -# methods perform when the causal variant is absent from the data. -# Defaults to false if not specified -# You can also provide a list of booleans, if you want to test both values -exclude_causal: [true, false] diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml index 156c1fe..dfc35d7 100644 --- a/analysis/config/config-simulations.yml +++ b/analysis/config/config-simulations.yml @@ -12,9 +12,11 @@ # If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}" # The VCF(s) must be sorted and indexed (with a .tbi file in the same directory) # required! 
-snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen" +# snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen" # str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz" -# str_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.pgen" +snp_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.SNPs.pgen" +str_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.STRs.pgen" + # Path to a list of samples to exclude from the analysis # There should be one sample ID per line @@ -41,7 +43,9 @@ snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen" locus: 19:45401409-46401409 # center: 45901409 (APOe4) modes: str: - pos: 19:45903857 # STR_691361 + pos: 19:45903859 + id: 19:45903859 + reps: 10 snp: pos: 19:45910672 # rs1046282 hap: @@ -63,7 +67,7 @@ modes: pheno: data/geuvadis/phenos/{trait}.pheno SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen # pheno_matrix: data/geuvadis/EUR_converted_expr_hg38.csv # optional -mode: ld_range +mode: str # Covariates to use if they're needed # Otherwise, they're assumed to be regressed out of the phenotypes diff --git a/analysis/config/config.yml b/analysis/config/config.yml index c7dda66..51c999d 120000 --- a/analysis/config/config.yml +++ b/analysis/config/config.yml @@ -1 +1 @@ -config-geuvadis.yml \ No newline at end of file +config-simulations.yml \ No newline at end of file From 2ac2c270d4773aa3f73c75cbae7ba7f8e093fb09 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 26 Nov 2024 13:46:59 -0800 Subject: [PATCH 34/41] switch to dev version of haptools --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 31b780b..c620062 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,8 @@ networkx = ">=2.6.3" scipy = ">=1.7.3" pydot = ">=1.4.2" pysam = ">=0.19.0" -haptools = ">=0.4.0" +haptools = {git = "https://github.com/cast-genomics/haptools.git", , rev = "f0ba9a3a464725fe66b9bbf31726af886c516374"} # switch to 0.5.0 once released +# haptools = ">=0.5.0" statsmodels = ">=0.13.4" [tool.poetry.group.docs.dependencies] From f6d5a931c253bcefd5e198ede5a6b4e82e45c90e Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 26 Nov 2024 21:57:03 +0000 Subject: [PATCH 35/41] ci: handle deprecations in gh actions and devcontainer --- .devcontainer.json | 5 +---- .github/workflows/conventional-prs.yml | 2 +- .github/workflows/tests.yml | 7 +++---- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/.devcontainer.json b/.devcontainer.json index 253be95..d20cbbf 100644 --- a/.devcontainer.json +++ b/.devcontainer.json @@ -5,10 +5,7 @@ // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile "image": "mcr.microsoft.com/devcontainers/base:jammy", "features": { - "ghcr.io/rocker-org/devcontainer-features/miniforge:1": { - "version": "latest", - "variant": "Mambaforge" - } + "ghcr.io/rocker-org/devcontainer-features/miniforge:2": {} }, // Features to add to the dev container. More info: https://containers.dev/features. 
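A note on the haptools pin introduced in PATCH 34 above: the doubled comma in {git = "...", , rev = "..."} is not valid TOML, so Poetry cannot parse pyproject.toml at this point in the series; the follow-up fix in PATCH 36 below removes the stray comma. For reference — this mirrors that later corrected line rather than introducing anything new — a Poetry git dependency pinned to a commit takes this shape:

    [tool.poetry.dependencies]
    # exactly one comma between inline-table entries; the rev pins the
    # dependency to a single commit until 0.5.0 is released
    haptools = {git = "https://github.com/cast-genomics/haptools.git", rev = "f0ba9a3a464725fe66b9bbf31726af886c516374"}

Anything that runs "poetry lock" or "poetry install" against the tree as of PATCH 34 alone will fail with a TOML parse error, which is worth knowing when bisecting through this range of commits.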
diff --git a/.github/workflows/conventional-prs.yml b/.github/workflows/conventional-prs.yml index d1bc280..8dc0113 100644 --- a/.github/workflows/conventional-prs.yml +++ b/.github/workflows/conventional-prs.yml @@ -17,6 +17,6 @@ jobs: statuses: write # for amannn/action-semantic-pull-request to mark status of analyzed PR runs-on: ubuntu-latest steps: - - uses: amannn/action-semantic-pull-request@v5.0.2 + - uses: amannn/action-semantic-pull-request@v5 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 00cb76b..1238bd1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -29,11 +29,10 @@ jobs: - name: Check out the repository uses: actions/checkout@v4 - - name: Setup Mambaforge + - name: Setup Miniforge uses: conda-incubator/setup-miniconda@v3 with: activate-environment: happler - miniforge-variant: Mambaforge auto-activate-base: false miniforge-version: latest use-mamba: true @@ -44,7 +43,7 @@ jobs: shell: bash - name: Cache Conda env - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ env.CONDA }}/envs key: @@ -94,7 +93,7 @@ jobs: uses: actions/checkout@v4 - name: Check for large files - uses: actionsdesk/lfs-warning@v3.2 + uses: ppremk/lfs-warning@v3.3 with: token: ${{ secrets.GITHUB_TOKEN }} # Optional filesizelimit: 500000b From 786db1d2d4029b601e21bf4bcb2dd24059cf6f14 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 26 Nov 2024 21:57:22 +0000 Subject: [PATCH 36/41] update lock file for latest haptools --- poetry.lock | 16 ++++++++++------ pyproject.toml | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index 9148d8f..2bc94f4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -673,14 +673,12 @@ woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] [[package]] name = "haptools" -version = "0.4.0" +version = "0.4.2" description = "Ancestry and haplotype aware simulation of genotypes and phenotypes for complex trait analysis" optional = false python-versions = ">=3.7,<4.0" -files = [ - {file = "haptools-0.4.0-py3-none-any.whl", hash = "sha256:73fa3c4b6f6119f71e59b84de226c34808646075f61a7fa033d84f434ec59738"}, - {file = "haptools-0.4.0.tar.gz", hash = "sha256:85dba0bde58e33ac71c298eb37e1272ca83276d06714d6516f8bc722c3455fe0"}, -] +files = [] +develop = false [package.dependencies] click = ">=8.0.3" @@ -690,6 +688,12 @@ numpy = ">=1.20.0" Pgenlib = ">=0.90.1" pysam = ">=0.19.0" +[package.source] +type = "git" +url = "https://github.com/cast-genomics/haptools.git" +reference = "f0ba9a3a464725fe66b9bbf31726af886c516374" +resolved_reference = "f0ba9a3a464725fe66b9bbf31726af886c516374" + [[package]] name = "humanfriendly" version = "10.0" @@ -2127,4 +2131,4 @@ test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-it [metadata] lock-version = "2.0" python-versions = ">=3.8,<4.0" -content-hash = "1bb0c82732e8ba5ac5ac5ab616189cede6a4136e3652496196d6ba629fbf1172" +content-hash = "269cf6b4b6f2ca34cd1b9b3483109a94fa95a3748ccd2a3051786d5e26b39091" diff --git a/pyproject.toml b/pyproject.toml index c620062..5c95a4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ networkx = ">=2.6.3" scipy = ">=1.7.3" pydot = ">=1.4.2" pysam = ">=0.19.0" -haptools = {git = "https://github.com/cast-genomics/haptools.git", , rev = "f0ba9a3a464725fe66b9bbf31726af886c516374"} # switch to 0.5.0 once released +haptools = {git = 
"https://github.com/cast-genomics/haptools.git", rev = "f0ba9a3a464725fe66b9bbf31726af886c516374"} # switch to 0.5.0 once released # haptools = ">=0.5.0" statsmodels = ">=0.13.4" From 592bf889cfc13fa16275e6b08f395185f9b10575 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 26 Nov 2024 22:09:07 +0000 Subject: [PATCH 37/41] update min python to 3.9 and add 3.13 ci test --- .github/workflows/tests.yml | 6 +++--- .readthedocs.yaml | 2 +- dev-env.yml | 2 +- noxfile.py | 4 ++-- pyproject.toml | 2 +- recipe/meta.yaml | 34 ---------------------------------- 6 files changed, 8 insertions(+), 42 deletions(-) delete mode 100644 recipe/meta.yaml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1238bd1..16c605a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -10,15 +10,15 @@ jobs: fail-fast: false matrix: include: - - { python: "3.8", os: "ubuntu-latest", session: "lint" } - - { python: "3.8", os: "ubuntu-latest", session: "tests" } + - { python: "3.9", os: "ubuntu-latest", session: "lint" } - { python: "3.9", os: "ubuntu-latest", session: "tests" } - { python: "3.10", os: "ubuntu-latest", session: "tests" } - { python: "3.11", os: "ubuntu-latest", session: "tests" } - { python: "3.12", os: "ubuntu-latest", session: "tests" } + - { python: "3.13", os: "ubuntu-latest", session: "tests" } # - { python: "3.11", os: "windows-latest", session: "tests" } # - { python: "3.9", os: "macos-latest", session: "tests" } - - { python: "3.8", os: "ubuntu-latest", session: "size" } + - { python: "3.9", os: "ubuntu-latest", session: "size" } env: NOXSESSION: ${{ matrix.session }} diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 07c4eb8..968a06e 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -9,7 +9,7 @@ version: 2 build: os: "ubuntu-22.04" tools: - python: "3.8" + python: "3.9" jobs: post_create_environment: # Install poetry diff --git a/dev-env.yml b/dev-env.yml index 985b69e..3c4bfea 100644 --- a/dev-env.yml +++ b/dev-env.yml @@ -3,7 +3,7 @@ channels: - bioconda - nodefaults dependencies: - - conda-forge::python=3.8 # the lowest version of python that we formally support + - conda-forge::python=3.9 # the lowest version of python that we formally support - conda-forge::pip==23.3.2 - conda-forge::poetry==1.7.1 - conda-forge::nox==2023.04.22 diff --git a/noxfile.py b/noxfile.py index a1772ad..4955926 100644 --- a/noxfile.py +++ b/noxfile.py @@ -10,8 +10,8 @@ package = "happler" -python_versions = ["3.8", "3.9", "3.10", "3.11", "3.12"] -locked_python_version = "3.8" +python_versions = ["3.9", "3.10", "3.11", "3.12", "3.13"] +locked_python_version = "3.9" nox.needs_version = ">= 2022.11.21" nox.options.sessions = ( "docs", diff --git a/pyproject.toml b/pyproject.toml index 5c95a4b..54f9a17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ classifiers = [ ] [tool.poetry.dependencies] -python = ">=3.8,<4.0" +python = ">=3.9,<4.0" click = ">=8.0.4" networkx = ">=2.6.3" scipy = ">=1.7.3" diff --git a/recipe/meta.yaml b/recipe/meta.yaml deleted file mode 100644 index c7b0d4d..0000000 --- a/recipe/meta.yaml +++ /dev/null @@ -1,34 +0,0 @@ -package: - name: happler - version: v0.0.0 - -source: - path: '..' - -build: - number: 0 - script: "{{ PYTHON }} -m pip install . 
--no-deps --ignore-installed --no-cache-dir -vvv" - entry_points: - - happler = happler.__main__:main - noarch: python - -requirements: - host: - - pip >=19.0.3 - - python >=3.7 - - poetry-core >=1.0.0 - run: - - python >=3.7 - - statsmodels >=0.13.2 - - click >=8.0.4 - - networkx >=2.6.3 - - scipy >=1.7.3 - - pydot >=1.4.2 - -about: - home: https://github.com/gymrek-lab/happler - license: MIT - summary: 'A haplotype-based fine-mapping method' - dev_url: https://github.com/gymrek-lab/happler - doc_url: https://happler.readthedocs.io/ - doc_source_url: https://github.com/shibukawa/imagesize_py/blob/master/README.rst From b08162fde042f880ebb4d2856165647a3e63f347 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 26 Nov 2024 22:34:34 +0000 Subject: [PATCH 38/41] fix failing example tests with --out-thresh --- noxfile.py | 2 +- tests/test_examples.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/noxfile.py b/noxfile.py index 4955926..f127938 100644 --- a/noxfile.py +++ b/noxfile.py @@ -46,7 +46,7 @@ def install_handle_python_numpy(session): handle incompatibilities with python and numpy versions see https://github.com/cjolowicz/nox-poetry/issues/1116 """ - if session._session.python in ["3.11", "3.12"]: + if session._session.python in ["3.11", "3.12", "3.13"]: session._session.install(".") else: session.install(".") diff --git a/tests/test_examples.py b/tests/test_examples.py index 1604532..5f25730 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -716,7 +716,7 @@ def test_1000G_simulated_maf(capfd): out_vars = ("rs1046282", "rs36046716") for maf in (0.05, 0.30, 0.31, 0.38): - cmd = f"run --maf {maf} -o {out_hp_file} {gt_file} {pt_file}" + cmd = f"run --out-thresh 1 --maf {maf} -o {out_hp_file} {gt_file} {pt_file}" runner = CliRunner() result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) captured = capfd.readouterr() @@ -758,7 +758,7 @@ def test_1000G_real(capfd, caplog): caplog.set_level(logging.INFO) for maf in (0.05, 0.14, 0.22, 0.23, 0.35): - cmd = f"run --maf {maf} -o {out_hp_file} {gt_file} {pt_file}" + cmd = f"run --out-thresh 1 --maf {maf} -o {out_hp_file} {gt_file} {pt_file}" runner = CliRunner() result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) captured = capfd.readouterr() From b2912646f063154b21395117d9e7b4cfdd7add1d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 26 Nov 2024 22:39:52 +0000 Subject: [PATCH 39/41] refmt with black --- happler/tree/assoc_test.py | 4 +++- happler/tree/forest_builder.py | 41 ++++++++++++++++++++++++---------- happler/tree/terminator.py | 14 ++++++++++-- happler/tree/tree_builder.py | 2 +- tests/test_tree.py | 2 +- 5 files changed, 46 insertions(+), 17 deletions(-) diff --git a/happler/tree/assoc_test.py b/happler/tree/assoc_test.py index f17f202..32a7324 100644 --- a/happler/tree/assoc_test.py +++ b/happler/tree/assoc_test.py @@ -325,7 +325,9 @@ def perform_test( # handle cases where our p-values are too powerful if pval == 0: # retrieve the pval at a higher precision - pval = AssocTestSimpleSM.pval_as_decimal(res.tvalues[-1], res.df_resid, precision=10) + pval = AssocTestSimpleSM.pval_as_decimal( + res.tvalues[-1], res.df_resid, precision=10 + ) if self.with_bic: return param, pval, stderr, bic else: diff --git a/happler/tree/forest_builder.py b/happler/tree/forest_builder.py index cecce98..d588d6a 100644 --- a/happler/tree/forest_builder.py +++ b/happler/tree/forest_builder.py 
@@ -34,8 +34,8 @@ def __init__( ): self.tree_builder = tree_builder self.max_iterations = max_iterations - self.trees = [None]*num_bins - self.haplotypes = [None]*num_bins + self.trees = [None] * num_bins + self.haplotypes = [None] * num_bins self.genotypes = self.tree_builder.gens self.phenotypes = self.tree_builder.phens self.log = log or getLogger(self.__class__.__name__) @@ -47,12 +47,17 @@ def run(self): iterations = self.max_iterations tree_idx = 0 while iterations: - self.log.debug(f"Iteration {self.max_iterations - iterations}: tree {tree_idx}") + self.log.debug( + f"Iteration {self.max_iterations - iterations}: tree {tree_idx}" + ) self.tree_builder.phens = self.get_residuals(tree_idx) self.tree_builder.run() self.trees[tree_idx] = self.tree_builder.tree self.haplotypes[tree_idx] = Haplotypes.from_tree( - None, self.trees[tree_idx], self.genotypes, self.log, + None, + self.trees[tree_idx], + self.genotypes, + self.log, ) # increment tree_idx until it reaches the end tree_idx += 1 @@ -89,22 +94,32 @@ def get_residuals(self, tree_idx: int) -> Phenotypes: if total_num_effects == 0: return self.phenotypes num_samps = len(self.phenotypes.samples) - effects = np.empty((num_samps, total_num_effects), dtype=self.phenotypes.data.dtype) + effects = np.empty( + (num_samps, total_num_effects), dtype=self.phenotypes.data.dtype + ) effect_arr_idx = 0 # iterate through every tree's Haplotypes obj and transform the haps into effects - self.log.debug(f"Extracting {total_num_effects} effects from {total_num_effects} total effects") + self.log.debug( + f"Extracting {total_num_effects} effects from {total_num_effects} total effects" + ) for hap_idx, hap in enumerate(self.haplotypes): if hap is None or hap_idx == tree_idx: continue new_idx = effect_arr_idx + len(hap.data) - effects[:, effect_arr_idx:new_idx] = hap.transform(self.genotypes).data[:,:,:2].sum(axis=2) + effects[:, effect_arr_idx:new_idx] = ( + hap.transform(self.genotypes).data[:, :, :2].sum(axis=2) + ) effect_arr_idx = new_idx resids = Phenotypes(fname=None, log=self.phenotypes.log) resids.samples = self.phenotypes.samples # get residuals with effects as covariates resids.names = self.phenotypes.names self.log.debug(f"Computing residuals for tree {tree_idx}") - resids.data = sm.OLS(self.phenotypes.data, sm.add_constant(effects)).fit().resid[:, np.newaxis] + resids.data = ( + sm.OLS(self.phenotypes.data, sm.add_constant(effects)) + .fit() + .resid[:, np.newaxis] + ) return resids def __repr__(self): @@ -124,7 +139,7 @@ def dot(self, remove_singletons: bool = False) -> str: remove_singletons: bool Whether to ignore trees composed of only a single SNP """ - node_pattern = r'(?)' + node_pattern = r"(?)" max_node_id = 0 trees_str = "strict digraph {\nforcelabels=true;\nrankdir=TB;\n" for idx, tree in enumerate(self.trees): @@ -133,15 +148,17 @@ def dot(self, remove_singletons: bool = False) -> str: tree_str = tree.dot().split("\n")[2:] if remove_singletons and len(tree_str) <= 5: continue - trees_str += f"subgraph tree_{idx}"+" {\nlabel=\""+f"Tree {idx}"+"\";\n" + trees_str += f"subgraph tree_{idx}" + ' {\nlabel="' + f"Tree {idx}" + '";\n' tree_str = "\n".join(tree_str) # increment the node IDs to ensure they remain unique across all trees - increment = lambda match: str(int(match.group(1))+max_node_id) + increment = lambda match: str(int(match.group(1)) + max_node_id) trees_str += re.sub(node_pattern, increment, tree_str) # what was the greatest node ID in this tree? 
# increment the next tree's IDs by that number + 1 if there were any nodes try: - max_node_id = max(map(int, re.findall(node_pattern, tree_str))) + max_node_id + 1 + max_node_id = ( + max(map(int, re.findall(node_pattern, tree_str))) + max_node_id + 1 + ) except ValueError: pass trees_str += "}" diff --git a/happler/tree/terminator.py b/happler/tree/terminator.py index 105ce2a..d696450 100644 --- a/happler/tree/terminator.py +++ b/happler/tree/terminator.py @@ -178,7 +178,12 @@ def check( num_tests: int, ) -> bool: computed_val = self.compute_val( - parent_res, node_res, results, best_idx, num_samps, num_tests, + parent_res, + node_res, + results, + best_idx, + num_samps, + num_tests, ) if isinstance(computed_val, bool): return computed_val @@ -256,7 +261,12 @@ def check( num_tests: int, ): computed_val = self.compute_val( - parent_res, node_res, results, best_idx, num_samps, num_tests, + parent_res, + node_res, + results, + best_idx, + num_samps, + num_tests, ) if isinstance(computed_val, bool): return computed_val diff --git a/happler/tree/tree_builder.py b/happler/tree/tree_builder.py index 41bc440..2db22e2 100644 --- a/happler/tree/tree_builder.py +++ b/happler/tree/tree_builder.py @@ -474,7 +474,7 @@ def _find_split_rigid( self.log.debug( f"The haplotype had a pval of {hap_indep_effect} < {self.indep_thresh}" " in an additive model with the allele and parent" - ) + ) best_allele_idx = best_res_idx[best_allele] best_results = results[allele].data[best_allele_idx] node_res = self.results_type.from_np(best_results) diff --git a/tests/test_tree.py b/tests/test_tree.py index 4bf8c76..c0f0278 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -230,7 +230,7 @@ def test_haplotype_transform(): if idx != variant_idx: np.testing.assert_allclose( hap.transform(gens, allele, idx), - gens_without_variant[:,(idx-1,)], + gens_without_variant[:, (idx - 1,)], ) From c4f1dcf2291654779e7e287f9de0d88d0b22da71 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 5 Dec 2024 13:08:09 -0800 Subject: [PATCH 40/41] adapt workflow to simulate STRs --- analysis/config/config-simulations.yml | 7 ++- analysis/workflow/Snakefile | 54 +++++++++----------- analysis/workflow/rules/genotypes.smk | 22 -------- analysis/workflow/rules/happler.smk | 2 +- analysis/workflow/rules/simulate.smk | 29 ++++++----- analysis/workflow/scripts/create_hap_file.sh | 40 +++++++++------ 6 files changed, 69 insertions(+), 85 deletions(-) diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml index dfc35d7..86830a2 100644 --- a/analysis/config/config-simulations.yml +++ b/analysis/config/config-simulations.yml @@ -46,8 +46,10 @@ modes: pos: 19:45903859 id: 19:45903859 reps: 10 + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] snp: pos: 19:45910672 # rs1046282 + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] hap: alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] @@ -80,10 +82,7 @@ min_maf: 0.05 # Sample sizes to use # sample_size: [777, 800, 1600, 2400, 3200] -sample_size: 777 - -# A threshold on the pvalues of the haplotypes -out_thresh: 5e-3 +sample_size: 3200 # Whether to include the causal variant in the set of genotypes provided to the # finemapping methods. 
Set this to true if you're interested in seeing how the diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile index 1cefc93..2f33839 100644 --- a/analysis/workflow/Snakefile +++ b/analysis/workflow/Snakefile @@ -50,7 +50,8 @@ module genotypes: config: config use rule * from genotypes as genotypes_* -config["gts_str_panel"] = rules.genotypes_subset_str.output.vcf +if mode == "str": + config["gts_str_panel"] = config["str_panel"] if check_config("sample_size"): sampsize = config["sample_size"] config["gts_snp_panel"] = rules.genotypes_subset.output.pgen @@ -94,24 +95,32 @@ if check_config("covar"): if "pheno_matrix" in covar_config: covar_file = rules.covariates_merge.output.covar -if mode in ("snp", "hap", "ld_range"): +if mode in ("str", "snp", "hap", "ld_range"): happler_config = { "pheno": rules.simulate_simphenotype.output.pheno, "hap_file": rules.simulate_transform.input.hap, "snp_panel": config["gts_snp_panel"], - "out": config["out"] + "/happler/"+mode+"/{beta}", + "out": config["out"] + "/happler/"+mode+"/beta_{beta}", "random": None, "covar": covar_file, "min_maf": check_config("min_maf", 0), - "out_thresh": check_config("out_thresh", 5e-8), + "out_thresh": 1, # artificially set to 1 to avoid filtering } if mode in ("hap", "ld_range"): happler_config["snps"] = [] + happler_config["causal_gt"] = rules.simulate_transform.output if mode == "hap": happler_config["snps"] = config["modes"][mode]["alleles"] - happler_config["causal_gt"] = rules.simulate_transform.output - if mode == "ld_range": - happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}/rep_{rep}" + elif mode == "ld_range": + happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}" + elif mode in ("str", "snp"): + # also provide the SNP and STR IDs so that they can be used in the manhattan plot + happler_config["snps"] = (config["modes"][mode]["id"],) + # TODO: consider uncommenting this to compute LD with causal STR + # if mode == "str": + # happler_config["causal_gt"] = config["gts_str_panel"] + if check_config("reps", place=config["modes"][mode]): + happler_config["out"] = happler_config["out"] + "/rep_{rep}" elif mode == "run": happler_config = { "pheno": config["modes"][mode]["pheno"], @@ -125,8 +134,6 @@ elif mode == "run": } if check_config("SVs", place=config["modes"][mode]): happler_config["SVs"] = config["modes"][mode]["SVs"] -elif "str" == mode: - pass else: raise ValueError("Unsupported operating 'mode' in config") @@ -145,25 +152,6 @@ if mode == "ld_range" and config["modes"][mode]["random"]: config: happler_config use rule * from happler_random as happler_random_* -# if mode in ("hap", "str", "ld_range"): -# finemappers_config = { -# "pheno": rules.simulate_simphenotype.output.pheno, -# "out": config["out"] + "/finemappers/"+mode+"/{beta}", -# "snp_panel": config["gts_snp_panel"], -# } -# if mode in ("hap", "ld_range"): -# finemappers_config["causal_gt"] = rules.simulate_transform.output.pgen -# if mode == "ld_range": -# finemappers_config["out"] = config["out"] + "/finemappers/"+mode+"/ld_{ld}/beta_{beta}/alpha_{alpha}" -# else: -# raise ValueError("Not yet implemented operating mode 'str' in config") -# module finemappers: -# snakefile: "rules/finemappers.smk" -# config: finemappers_config -# use rule * from finemappers as finemappers_* -# else: -# raise ValueError(f"Unsupported operating mode '{mode}' in config") - plots_config = { "out": config["out"] + "/plots", "mode": mode, @@ 
-237,16 +225,22 @@ elif mode == "run": else: FINAL_OUTPUT = expand( [ + rules.happler_cond_linreg.output.png, rules.happler_tree.output.png, - rules.happler_manhattan.output.png, # rules.finemappers_results.output.susie_pdf, rules.happler_results.output.susie_pdf, - rules.plots_params.output.png, + # rules.plots_params.output.png, ], ex=(("in",) if mode == "hap" else ("ex", "in")), beta=config["modes"]["hap"]["beta"], allow_missing=True, ) + if check_config("reps", place=config["modes"][mode]): + FINAL_OUTPUT = expand( + FINAL_OUTPUT, + rep=range(config["modes"][mode]["reps"]), + allow_missing=True, + ) # If '--config debug=true' is specified, we won't try to build the entire DAG # This can be helpful when you want to play around with a specific target output diff --git a/analysis/workflow/rules/genotypes.smk b/analysis/workflow/rules/genotypes.smk index bfbc555..4eb126e 100644 --- a/analysis/workflow/rules/genotypes.smk +++ b/analysis/workflow/rules/genotypes.smk @@ -162,28 +162,6 @@ rule vcf2plink: "--threads {threads}{params.samps} --out {params.prefix} &>{log}" -rule subset_str: - """ subset samples from an STR VCF """ - input: - vcf=lambda wildcards: expand(config["str_panel"], chr=parse_locus(wildcards.locus)[0])[0], - vcf_idx=lambda wildcards: expand(config["str_panel"] + ".tbi", chr=parse_locus(wildcards.locus)[0]), - samples=rules.keep_samps.output.samples, - output: - vcf=out+"/strs.bcf", - vcf_idx=out+"/strs.bcf.csi", - resources: - runtime=30, - log: - logs + "/subset_str", - benchmark: - bench + "/subset_str", - conda: - "../envs/default.yml" - shell: - "bcftools view -S {input.samples} --write-index " - "-Ob -o {output.vcf} {input.vcf}" - - def subset_input(): if check_config("phase_map") or check_config("exclude_samples") or not config["snp_panel"].endswith(".pgen"): return { diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index d34f643..f3b430c 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -68,7 +68,7 @@ rule run: out_thresh=check_config("out_thresh", 5e-8), keep_SNPs="--remove-SNPs " if not ("{rep}" in out) else "", output: - hap=out + "/happler.hap", + hap=ensure(out + "/happler.hap", non_empty=True), gz=out + "/happler.hap.gz", idx=out + "/happler.hap.gz.tbi", dot=out + "/happler.dot", diff --git a/analysis/workflow/rules/simulate.smk b/analysis/workflow/rules/simulate.smk index 249e64d..72d92e3 100644 --- a/analysis/workflow/rules/simulate.smk +++ b/analysis/workflow/rules/simulate.smk @@ -24,6 +24,10 @@ def parse_locus(locus): hap_ld_range_output = out + "/create_ld_range/{num_haps}_haps/ld_{ld}/haplotype.hap" +gts_panel = Path( + config["gts_str_panel"] if mode == "str" else config["gts_snp_panel"] +) + checkpoint create_hap_ld_range: """ create a hap file suitable for haptools transform and simphenotype """ input: @@ -31,12 +35,12 @@ checkpoint create_hap_ld_range: gts_pvar=Path(config["gts_snp_panel"]).with_suffix(".pvar"), gts_psam=Path(config["gts_snp_panel"]).with_suffix(".psam"), params: - min_ld = config["modes"]["ld_range"]["min_ld"], - max_ld = config["modes"]["ld_range"]["max_ld"], - step = config["modes"]["ld_range"]["step"], - min_af = config["modes"]["ld_range"]["min_af"], - max_af = config["modes"]["ld_range"]["max_af"], - num_haps = "-n " + " -n ".join(map(str, config["modes"]["ld_range"]["num_haps"])), + min_ld = lambda wildcards: config["modes"]["ld_range"]["min_ld"], + max_ld = lambda wildcards: config["modes"]["ld_range"]["max_ld"], + step = lambda wildcards: 
config["modes"]["ld_range"]["step"], + min_af = lambda wildcards: config["modes"]["ld_range"]["min_af"], + max_af = lambda wildcards: config["modes"]["ld_range"]["max_af"], + num_haps = lambda wildcards: "-n " + " -n ".join(map(str, config["modes"]["ld_range"]["num_haps"])), out = lambda wildcards: expand( hap_ld_range_output, **wildcards, allow_missing=True, ), @@ -61,13 +65,14 @@ checkpoint create_hap_ld_range: rule create_hap: """ create a hap file suitable for haptools transform and simphenotype """ input: - gts=Path(config["gts_snp_panel"]).with_suffix(".pvar"), + gts=gts_panel.with_suffix(".pvar"), params: ignore="this", # the first parameter is always ignored for some reason chrom=lambda wildcards: parse_locus(wildcards.locus)[0], locus=lambda wildcards: wildcards.locus.split("_")[1].replace('-', '\t'), beta=0.99, - alleles=lambda wildcards: config["modes"]["hap"]["alleles"], + alleles=lambda wildcards: config["modes"]["hap"]["alleles"] if mode == "hap" else [], + repeat=lambda wildcards: config["modes"]["str"]["id"] if mode == "str" else 0, output: hap=out + "/haplotype.hap" resources: @@ -113,13 +118,13 @@ rule simphenotype: """ use the hap file to create simulated phenotypes for the haplotype """ input: hap=rules.transform.input.hap, - pgen=rules.transform.output.pgen, - pvar=rules.transform.output.pvar, - psam=rules.transform.output.psam, + pgen=rules.transform.output.pgen if mode != "str" else config["gts_str_panel"], + pvar=rules.transform.output.pvar if mode != "str" else Path(config["gts_str_panel"]).with_suffix(".pvar"), + psam=rules.transform.output.psam if mode != "str" else Path(config["gts_str_panel"]).with_suffix(".psam"), params: beta=lambda wildcards: wildcards.beta, seed = 42, - reps = config["modes"]["ld_range"]["reps"], + reps = lambda wildcards: config["modes"]["ld_range"]["reps"], output: pheno=out + "/{beta}.pheno", resources: diff --git a/analysis/workflow/scripts/create_hap_file.sh b/analysis/workflow/scripts/create_hap_file.sh index ae46e5d..cc32072 100755 --- a/analysis/workflow/scripts/create_hap_file.sh +++ b/analysis/workflow/scripts/create_hap_file.sh @@ -2,23 +2,31 @@ exec 2> "${snakemake_log}" -echo -e "#\torderH\tbeta" > "${snakemake_output[hap]}" -echo -e "#\tversion\t0.1.0" >> "${snakemake_output[hap]}" -echo -e "#H\tbeta\t.2f\tEffect size in linear model" >> "${snakemake_output[hap]}" +echo -e "#\tversion\t0.2.0" >> "${snakemake_output[hap]}" +echo -e "#R\tbeta\t.2f\tEffect size in linear model" >> "${snakemake_output[hap]}" -starts=() -ends=() -for snp in ${snakemake_params[4]}; do - snp_id="$(echo "$snp" | sed 's/\(.*\):/\1\t/')" - start_pos="$(grep -m1 "$(echo "$snp_id" | cut -f1)" "${snakemake_input[gts]}" | cut -f2)" - end_pos=$((start_pos+1)) - echo -e "V\thap\t$start_pos\t$end_pos\t$snp_id" >> "${snakemake_output[hap]}" - starts+=("$start_pos") - ends+=("$end_pos") -done +if [ "${snakemake_params[repeat]}" -eq "0" ]; then + starts=() + ends=() + for snp in ${snakemake_params[4]}; do + snp_id="$(echo "$snp" | sed 's/\(.*\):/\1\t/')" + start_pos="$(grep -Ev '^#' -m1 "$(echo "$snp_id" | cut -f1)" "${snakemake_input[gts]}" | cut -f2)" + end_pos=$((start_pos+1)) + echo -e "V\thap\t$start_pos\t$end_pos\t$snp_id" >> "${snakemake_output[hap]}" + starts+=("$start_pos") + ends+=("$end_pos") + done -start_pos="$(echo "${starts[@]}" | tr ' ' '\n' | sort -n | head -n1)" -end_pos="$(echo "${ends[@]}" | tr ' ' '\n' | sort -rn | head -n1)" -echo -e "H\t${snakemake_params[1]}\t$start_pos\t$end_pos\thap\t${snakemake_params[3]}" >> "${snakemake_output[hap]}" 
+ start_pos="$(echo "${starts[@]}" | tr ' ' '\n' | sort -n | head -n1)" + end_pos="$(echo "${ends[@]}" | tr ' ' '\n' | sort -rn | head -n1)" + echo -e "H\t${snakemake_params[1]}\t$start_pos\t$end_pos\thap\t${snakemake_params[3]}" >> "${snakemake_output[hap]}" +else + for str in ${snakemake_params[5]}; do + str_id=$'\t'"$str"$'\t' + start_pos="$(grep --line-buffered -Ev '^#' "${snakemake_input[gts]}" | grep -Pm1 "${start_pos}${str_id}" | cut -f2)" + end_pos="$(grep --line-buffered -Ev '^#' "${snakemake_input[gts]}" | grep -Pm1 "${start_pos}${str_id}" | grep -Po '(?<=;END=)\d+(?=;PERIOD)')" + echo -e "R\t${snakemake_params[1]}\t$start_pos\t$end_pos\t$str\t${snakemake_params[3]}" >> "${snakemake_output[hap]}" + done +fi LC_ALL=C sort -k1,4 -o "${snakemake_output[hap]}" "${snakemake_output[hap]}" From e0dec77f13e0ef673ec84d97bfb802608822a273 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 5 Dec 2024 21:22:24 +0000 Subject: [PATCH 41/41] update lock file --- poetry.lock | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2bc94f4..c554e7f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -51,9 +51,6 @@ files = [ {file = "babel-2.15.0.tar.gz", hash = "sha256:8daf0e265d05768bc6c7a314cf1321e9a123afc328cc635c18622a2f30a04413"}, ] -[package.dependencies] -pytz = {version = ">=2015.7", markers = "python_version < \"3.9\""} - [package.extras] dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] @@ -2130,5 +2127,5 @@ test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-it [metadata] lock-version = "2.0" -python-versions = ">=3.8,<4.0" -content-hash = "269cf6b4b6f2ca34cd1b9b3483109a94fa95a3748ccd2a3051786d5e26b39091" +python-versions = ">=3.9,<4.0" +content-hash = "a7bd4a4e3824ae27cb1cee5bf0599c747842d5c29e81c05e22be81ea45b71490"
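A closing note on the lock-file churn in this series: poetry.lock's content-hash is derived from the dependency specification in pyproject.toml, so both the haptools git pin (PATCH 34/36) and the Python floor bump to 3.9 (PATCH 37) force the hash to change, which is all that PATCH 41 records. A sketch of the refresh-and-verify loop that produces these updates, assuming the Poetry 1.x CLI pinned in dev-env.yml (poetry==1.7.1) — the flag names differ in Poetry 2.x:

    # re-resolve only what the pyproject.toml edit changed, keeping all
    # other pinned versions in poetry.lock intact
    poetry lock --no-update

    # confirm the refreshed lock's content-hash matches pyproject.toml
    poetry check --lock

    # install from the refreshed lock to prove it still resolves
    poetry install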