diff --git a/.devcontainer.json b/.devcontainer.json index 253be95..d20cbbf 100644 --- a/.devcontainer.json +++ b/.devcontainer.json @@ -5,10 +5,7 @@ // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile "image": "mcr.microsoft.com/devcontainers/base:jammy", "features": { - "ghcr.io/rocker-org/devcontainer-features/miniforge:1": { - "version": "latest", - "variant": "Mambaforge" - } + "ghcr.io/rocker-org/devcontainer-features/miniforge:2": {} }, // Features to add to the dev container. More info: https://containers.dev/features. diff --git a/.github/workflows/conventional-prs.yml b/.github/workflows/conventional-prs.yml index d1bc280..8dc0113 100644 --- a/.github/workflows/conventional-prs.yml +++ b/.github/workflows/conventional-prs.yml @@ -17,6 +17,6 @@ jobs: statuses: write # for amannn/action-semantic-pull-request to mark status of analyzed PR runs-on: ubuntu-latest steps: - - uses: amannn/action-semantic-pull-request@v5.0.2 + - uses: amannn/action-semantic-pull-request@v5 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 00cb76b..16c605a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -10,15 +10,15 @@ jobs: fail-fast: false matrix: include: - - { python: "3.8", os: "ubuntu-latest", session: "lint" } - - { python: "3.8", os: "ubuntu-latest", session: "tests" } + - { python: "3.9", os: "ubuntu-latest", session: "lint" } - { python: "3.9", os: "ubuntu-latest", session: "tests" } - { python: "3.10", os: "ubuntu-latest", session: "tests" } - { python: "3.11", os: "ubuntu-latest", session: "tests" } - { python: "3.12", os: "ubuntu-latest", session: "tests" } + - { python: "3.13", os: "ubuntu-latest", session: "tests" } # - { python: "3.11", os: "windows-latest", session: "tests" } # - { python: "3.9", os: "macos-latest", session: "tests" } - - { python: "3.8", os: "ubuntu-latest", session: "size" } + - { python: "3.9", os: "ubuntu-latest", session: "size" } env: NOXSESSION: ${{ matrix.session }} @@ -29,11 +29,10 @@ jobs: - name: Check out the repository uses: actions/checkout@v4 - - name: Setup Mambaforge + - name: Setup Miniforge uses: conda-incubator/setup-miniconda@v3 with: activate-environment: happler - miniforge-variant: Mambaforge auto-activate-base: false miniforge-version: latest use-mamba: true @@ -44,7 +43,7 @@ jobs: shell: bash - name: Cache Conda env - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ env.CONDA }}/envs key: @@ -94,7 +93,7 @@ jobs: uses: actions/checkout@v4 - name: Check for large files - uses: actionsdesk/lfs-warning@v3.2 + uses: ppremk/lfs-warning@v3.3 with: token: ${{ secrets.GITHUB_TOKEN }} # Optional filesizelimit: 500000b diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 07c4eb8..968a06e 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -9,7 +9,7 @@ version: 2 build: os: "ubuntu-22.04" tools: - python: "3.8" + python: "3.9" jobs: post_create_environment: # Install poetry diff --git a/analysis/.gitignore b/analysis/.gitignore index 034afe0..06a969f 100644 --- a/analysis/.gitignore +++ b/analysis/.gitignore @@ -1,4 +1,4 @@ midway_manhattans-indepterminator !midway_manhattans-indepterminator/LINKS -old-outs -HAPFILES \ No newline at end of file +outs +HAPFILES diff --git a/analysis/config/config-geuvadis.yml b/analysis/config/config-geuvadis.yml index b5f23d9..2229a2c 100644 --- a/analysis/config/config-geuvadis.yml +++ b/analysis/config/config-geuvadis.yml @@ -42,24 +42,6 @@ 
snp_panel: "data/geuvadis/geuvadis_ensemble_phasing.pgen" # locus: 19:45401409-46401409 # center: 45901409 (APOe4) locus: data/geuvadis/geuvadis_eqtl_genes.full.liftover.bed modes: - str: - pos: 19:45903857 # STR_691361 - snp: - pos: 19:45910672 # rs1046282 - hap: - alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 - beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] - ld_range: - reps: 1 - min_ld: 0 - max_ld: 1 - step: 0.1 - min_af: 0.25 - max_af: 0.75 - # beta: [0.35] - beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] - alpha: [0.05] - random: false # whether to also produce random haplotypes run: pheno: data/geuvadis/phenos/{trait}.pheno SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen @@ -79,6 +61,9 @@ min_maf: 0.1 # sample_size: [500, 1000, 1500, 2000, 2500] # sample_size: 777 +# A threshold on the pvalues of the haplotypes +out_thresh: 5e-6 + # Whether to include the causal variant in the set of genotypes provided to the # finemapping methods. Set this to true if you're interested in seeing how the # methods perform when the causal variant is absent from the data. diff --git a/analysis/config/config-simulations.yml b/analysis/config/config-simulations.yml new file mode 100644 index 0000000..86830a2 --- /dev/null +++ b/analysis/config/config-simulations.yml @@ -0,0 +1,92 @@ +# This is the Snakemake configuration file that specifies paths and +# and options for the pipeline. Anybody wishing to use +# the provided snakemake pipeline should first fill out this file with paths to +# their own data, as the Snakefile requires it. +# Every config option has reasonable defaults unless it is labeled as "required." +# All paths are relative to the directory that Snakemake is executed in. +# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/) + + +# Paths to a SNP-STR haplotype reference panel +# You can download this from http://gymreklab.com/2018/03/05/snpstr_imputation.html +# If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}" +# The VCF(s) must be sorted and indexed (with a .tbi file in the same directory) +# required! +# snp_panel: "data/simulations/1000G/19_45401409-46401409.pgen" +# str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz" +snp_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.SNPs.pgen" +str_panel: "data/simulations/1000G.SNP-STR/19_45401409-46401409.STRs.pgen" + + +# Path to a list of samples to exclude from the analysis +# There should be one sample ID per line +# exclude_samples: data/ukb_random_samples_exclude.tsv + +# If SNPs are unphased, provide the path to a SHAPEIT4 map file like these: +# https://github.com/odelaneau/shapeit4/tree/master/maps +# The map file should use the same reference genome as the reference panel VCFs +# phase_map: data/genetic_maps/chr{chr}.b37.gmap.gz + +# A "locus" is a string with a contig name, a colon, the start position, a dash, and +# the end position or a BED file with a ".bed" file ending +# There are different simulation modes that you can use: +# 1. "str" - a tandem repeat is a string with a contig name, a colon, and the start position +# 2. "snp" - a SNP follows the same format as "str" +# 3. "hap" - a haplotype +# 4. "ld_range" - creates random two-SNP haplotypes with a range of LD values between the alleles of each haplotype +# 5. 
"run" - execute happler on a locus without simulating anything +# The STR and SNP positions should be contained within the locus. +# The positions should be provided in the same coordinate system as the reference +# genome of the reference panel VCFs +# The contig should correspond with the contig name from the {chr} wildcard in the VCF +# required! and unless otherwise noted, all attributes of each mode are required +locus: 19:45401409-46401409 # center: 45901409 (APOe4) +modes: + str: + pos: 19:45903859 + id: 19:45903859 + reps: 10 + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] + snp: + pos: 19:45910672 # rs1046282 + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] + hap: + alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] + ld_range: + reps: 10 + min_ld: 0 + max_ld: 1 + step: 0.1 + min_af: 0.25 + max_af: 0.75 + # beta: [0.35] + beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] + alpha: [0.05] + random: false # whether to also produce random haplotypes + num_haps: [1, 2, 3, 4] + run: + pheno: data/geuvadis/phenos/{trait}.pheno + SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen + # pheno_matrix: data/geuvadis/EUR_converted_expr_hg38.csv # optional +mode: str + +# Covariates to use if they're needed +# Otherwise, they're assumed to be regressed out of the phenotypes +# Note: the finemapping methods won't be able to handle these covariates +# covar: data/geuvadis/5PCs_sex.covar + +# Discard rare variants with a MAF below this number +# Defaults to 0 if not specified +min_maf: 0.05 + +# Sample sizes to use +# sample_size: [777, 800, 1600, 2400, 3200] +sample_size: 3200 + +# Whether to include the causal variant in the set of genotypes provided to the +# finemapping methods. Set this to true if you're interested in seeing how the +# methods perform when the causal variant is absent from the data. 
+# Defaults to false if not specified +# You can also provide a list of booleans, if you want to test both values +exclude_causal: [true, false] diff --git a/analysis/config/config.yml b/analysis/config/config.yml index 34462af..51c999d 120000 --- a/analysis/config/config.yml +++ b/analysis/config/config.yml @@ -1 +1 @@ -config-ukb.yml \ No newline at end of file +config-simulations.yml \ No newline at end of file diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile index df56b8d..2f33839 100644 --- a/analysis/workflow/Snakefile +++ b/analysis/workflow/Snakefile @@ -50,7 +50,8 @@ module genotypes: config: config use rule * from genotypes as genotypes_* -config["gts_str_panel"] = rules.genotypes_subset_str.output.vcf +if mode == "str": + config["gts_str_panel"] = config["str_panel"] if check_config("sample_size"): sampsize = config["sample_size"] config["gts_snp_panel"] = rules.genotypes_subset.output.pgen @@ -59,6 +60,10 @@ else: config["gts_snp_panel"] = rules.genotypes_subset.input.pgen config["random"] = False +if mode == "ld_range" and check_config("num_haps", place=config["modes"][mode]): + if isinstance(config["modes"][mode]["num_haps"], int): + config["modes"][mode]["num_haps"] = [config["modes"][mode]["num_haps"],] + if mode != "run": module simulate: snakefile: "rules/simulate.smk" @@ -90,23 +95,32 @@ if check_config("covar"): if "pheno_matrix" in covar_config: covar_file = rules.covariates_merge.output.covar -if mode in ("snp", "hap", "ld_range"): +if mode in ("str", "snp", "hap", "ld_range"): happler_config = { "pheno": rules.simulate_simphenotype.output.pheno, "hap_file": rules.simulate_transform.input.hap, "snp_panel": config["gts_snp_panel"], - "out": config["out"] + "/happler/"+mode+"/{beta}", + "out": config["out"] + "/happler/"+mode+"/beta_{beta}", "random": None, "covar": covar_file, "min_maf": check_config("min_maf", 0), + "out_thresh": 1, # artificially set to 1 to avoid filtering } if mode in ("hap", "ld_range"): happler_config["snps"] = [] + happler_config["causal_gt"] = rules.simulate_transform.output if mode == "hap": happler_config["snps"] = config["modes"][mode]["alleles"] - happler_config["causal_gt"] = rules.simulate_transform.output - if mode == "ld_range": - happler_config["out"] = config["out"] + "/happler/"+mode+"/ld_{ld}/beta_{beta}/alpha_{alpha}" + elif mode == "ld_range": + happler_config["out"] = config["out"] + "/happler/"+mode+"/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}" + elif mode in ("str", "snp"): + # also provide the SNP and STR IDs so that they can be used in the manhattan plot + happler_config["snps"] = (config["modes"][mode]["id"],) + # TODO: consider uncommenting this to compute LD with causal STR + # if mode == "str": + # happler_config["causal_gt"] = config["gts_str_panel"] + if check_config("reps", place=config["modes"][mode]): + happler_config["out"] = happler_config["out"] + "/rep_{rep}" elif mode == "run": happler_config = { "pheno": config["modes"][mode]["pheno"], @@ -116,11 +130,10 @@ elif mode == "run": "random": None, "covar": covar_file, "min_maf": check_config("min_maf", 0), + "out_thresh": check_config("out_thresh", 5e-8), } if check_config("SVs", place=config["modes"][mode]): happler_config["SVs"] = config["modes"][mode]["SVs"] -elif "str" == mode: - pass else: raise ValueError("Unsupported operating 'mode' in config") @@ -133,31 +146,12 @@ merged_happler = rules.happler_merge.output.pgen if mode == "ld_range" and config["modes"][mode]["random"]: happler_config["random"] = 
rules.simulate_random_transform.output happler_config["random_hap"] = rules.simulate_random_transform.input.hap - happler_config["out"] = config["out"] + "/happler/"+mode+"/random/ld_{ld}/beta_{beta}/alpha_{alpha}" + happler_config["out"] = config["out"] + "/happler/"+mode+"/random/{num_haps}_haps/ld_{ld}/beta_{beta}/alpha_{alpha}/rep_{rep}" module happler_random: snakefile: "rules/happler.smk" config: happler_config use rule * from happler_random as happler_random_* -# if mode in ("hap", "str", "ld_range"): -# finemappers_config = { -# "pheno": rules.simulate_simphenotype.output.pheno, -# "out": config["out"] + "/finemappers/"+mode+"/{beta}", -# "snp_panel": config["gts_snp_panel"], -# } -# if mode in ("hap", "ld_range"): -# finemappers_config["causal_gt"] = rules.simulate_transform.output.pgen -# if mode == "ld_range": -# finemappers_config["out"] = config["out"] + "/finemappers/"+mode+"/ld_{ld}/beta_{beta}/alpha_{alpha}" -# else: -# raise ValueError("Not yet implemented operating mode 'str' in config") -# module finemappers: -# snakefile: "rules/finemappers.smk" -# config: finemappers_config -# use rule * from finemappers as finemappers_* -# else: -# raise ValueError(f"Unsupported operating mode '{mode}' in config") - plots_config = { "out": config["out"] + "/plots", "mode": mode, @@ -190,7 +184,7 @@ elif mode == "run": output: hps = orig_out + "/multiline.tsv", resources: - runtime=3, + runtime=10, log: orig_out + "/logs/multiline", benchmark: @@ -231,16 +225,22 @@ elif mode == "run": else: FINAL_OUTPUT = expand( [ + rules.happler_cond_linreg.output.png, rules.happler_tree.output.png, - rules.happler_manhattan.output.png, # rules.finemappers_results.output.susie_pdf, rules.happler_results.output.susie_pdf, - rules.plots_params.output.png, + # rules.plots_params.output.png, ], ex=(("in",) if mode == "hap" else ("ex", "in")), beta=config["modes"]["hap"]["beta"], allow_missing=True, ) + if check_config("reps", place=config["modes"][mode]): + FINAL_OUTPUT = expand( + FINAL_OUTPUT, + rep=range(config["modes"][mode]["reps"]), + allow_missing=True, + ) # If '--config debug=true' is specified, we won't try to build the entire DAG # This can be helpful when you want to play around with a specific target output diff --git a/analysis/workflow/envs/susie.post-deploy.sh b/analysis/workflow/envs/susie.post-deploy.sh deleted file mode 100644 index 272cfdb..0000000 --- a/analysis/workflow/envs/susie.post-deploy.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!env bash -set -o pipefail - -R -e 'install.packages("pgenlibr",repos="http://cran.us.r-project.org")' diff --git a/analysis/workflow/envs/susie.yml b/analysis/workflow/envs/susie.yml index fbd91b7..8e704f3 100644 --- a/analysis/workflow/envs/susie.yml +++ b/analysis/workflow/envs/susie.yml @@ -2,15 +2,17 @@ channels: - conda-forge - nodefaults dependencies: - - conda-forge::r-susier==0.11.42 + - conda-forge::r-susier==0.12.35 - conda-forge::r-abind==1.4_5 - - conda-forge::r-r.utils==2.12.0 - - conda-forge::r-dplyr==1.0.7 - - conda-forge::r-readr==1.4.0 + - conda-forge::r-r.utils==2.12.3 + - conda-forge::r-dplyr==1.1.4 + - conda-forge::r-readr==2.1.5 - aryarm::r-dscrutils==0.4.3 - - conda-forge::r-irkernel==1.2 - - conda-forge::sos-notebook==0.22.3 - - conda-forge::coreutils==8.31 + - aryarm::r-pgenlibr==0.3.7 + - conda-forge::r-irkernel==1.3.2 + - conda-forge::sos-notebook==0.24.4 + - conda-forge::coreutils=9.5 - conda-forge::sed==4.8 - - conda-forge::bash==5.0.018 - - conda-forge::gzip==1.10 + - conda-forge::bash==5.2.21 + - conda-forge::gzip==1.13 + - 
conda-forge::r-rfast==2.1.0 diff --git a/analysis/workflow/rules/genotypes.smk b/analysis/workflow/rules/genotypes.smk index 8520d46..4eb126e 100644 --- a/analysis/workflow/rules/genotypes.smk +++ b/analysis/workflow/rules/genotypes.smk @@ -162,28 +162,6 @@ rule vcf2plink: "--threads {threads}{params.samps} --out {params.prefix} &>{log}" -rule subset_str: - """ subset samples from a STR VCF """ - input: - vcf=lambda wildcards: expand(config["str_panel"], chr=parse_locus(wildcards.locus)[0])[0], - vcf_idx=lambda wildcards: expand(config["str_panel"] + ".tbi", chr=parse_locus(wildcards.locus)[0]), - samples=rules.keep_samps.output.samples, - output: - vcf=out+"/strs.bcf", - vcf_idx=out+"/strs.bcf.csi", - resources: - runtime=30, - log: - logs + "/subset_str", - benchmark: - bench + "/subset_str", - conda: - "../envs/default.yml" - shell: - "bcftools view -S {input.samples} --write-index " - "-Ob -o {output.vcf} {input.vcf}" - - def subset_input(): if check_config("phase_map") or check_config("exclude_samples") or not config["snp_panel"].endswith(".pgen"): return { diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index 08cc773..f3b430c 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -24,11 +24,38 @@ def parse_locus(locus): return chrom, start, end +rule sub_pheno: + """subset the phenotype file to include only the desired replicate""" + input: + pheno=config["pheno"], + params: + rep=lambda wildcards: int(wildcards.rep)+2, + output: + pheno=out + "/phen.pheno", + resources: + runtime=5, + threads: 1, + log: + logs + "/sub_pheno", + benchmark: + bench + "/sub_pheno", + conda: + "../envs/default.yml" + shell: + "cut -f 1,{params.rep} {input.pheno} | " + "(echo -e \"#IID\\thap\" && tail -n+2) >{output.pheno} 2>{log}" + + +pheno = rules.sub_pheno.output.pheno if "{rep}" in out else config["pheno"] +# if the pvar size is larger than 0.5 GB, just use the default memory instead +rsrc_func = lambda x: max if .5 > Path(x).with_suffix(".pvar").stat().st_size/1000/1000/1000 else min + + rule run: """ execute happler! 
""" input: gts=config["snp_panel"], - pts=config["pheno"], + pts=pheno, covar=config["covar"], params: thresh=lambda wildcards: 0.05 if "alpha" not in wildcards else wildcards.alpha, @@ -36,22 +63,23 @@ rule run: covar=lambda wildcards, input: ("--covar " + input["covar"] + " ") if check_config("covar") else "", maf = check_config("min_maf", 0), indep=lambda wildcards: 0.05 if "indep_alpha" not in wildcards else wildcards.indep_alpha, + max_signals=3, + max_iterations=3, + out_thresh=check_config("out_thresh", 5e-8), + keep_SNPs="--remove-SNPs " if not ("{rep}" in out) else "", output: - hap=out + "/happler.hap", + hap=ensure(out + "/happler.hap", non_empty=True), gz=out + "/happler.hap.gz", idx=out + "/happler.hap.gz.tbi", dot=out + "/happler.dot", resources: runtime=lambda wildcards, input: ( - Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 0.5454649806119475 + 0.6935132147046765 - if Path(input.gts).suffix == ".pgen" and check_config("dynamic_resources") else 45 + rsrc_func(input.gts)(45, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 2.5379343786643838 + 20.878965342140603) ), # slurm_partition="hotel", # slurm_extra="--qos=hotel", - # mem_mb=lambda wildcards, threads: threads*4.57, mem_mb=lambda wildcards, input: ( - Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 14.90840694595845 + 401.8011129664152 - if Path(input.gts).suffix == ".pgen" and check_config("dynamic_resources") else 5000 + rsrc_func(input.gts)(2500, Path(input.gts).with_suffix(".pvar").stat().st_size/1000 * 7.5334226167661384 + 22.471377010118147) ), threads: 1 log: @@ -62,8 +90,10 @@ rule run: "happler" shell: "happler run -o {output.hap} --verbosity DEBUG --maf {params.maf} " - "--discard-multiallelic --region {params.region} {params.covar} --indep-thresh" - " {params.indep} -t {params.thresh} --show-tree {input.gts} {input.pts} &>{log}" + "--max-signals {params.max_signals} --max-iterations {params.max_iterations} " + "--discard-multiallelic --region {params.region} {params.keep_SNPs}" + "{params.covar}--indep-thresh {params.indep} -t {params.thresh} " + "--out-thresh {params.out_thresh} --show-tree {input.gts} {input.pts} &>{log}" " && haptools index -o {output.gz} {output.hap} &>>{log}" @@ -100,7 +130,7 @@ rule cond_linreg: pvar=Path(config["snp_panel"]).with_suffix(".pvar"), psam=Path(config["snp_panel"]).with_suffix(".psam"), hap=rules.run.output.hap, - pts=config["pheno"], + pts=pheno, params: maf = check_config("min_maf", 0), region=lambda wildcards: wildcards.locus.replace("_", ":"), @@ -108,9 +138,9 @@ rule cond_linreg: png=out + "/cond_linreg.pdf", resources: runtime=lambda wildcards, input: ( - 1.5 * Path(input.pvar).stat().st_size/1000 * ( + rsrc_func(input.pgen)(50, 1.5 * Path(input.pvar).stat().st_size/1000 * ( 0.2400978997329614 + get_num_variants(input.hap) * 0.045194464826048095 - ) if Path(input.pgen).suffix == ".pgen" and check_config("dynamic_resources") else 50 + )) ), log: logs + "/cond_linreg", @@ -132,13 +162,13 @@ rule heatmap: pvar=Path(config["snp_panel"]).with_suffix(".pvar"), psam=Path(config["snp_panel"]).with_suffix(".psam"), hap=rules.run.output.hap, - pts=config["pheno"], + pts=pheno, params: region=lambda wildcards: wildcards.locus.replace("_", ":"), output: png=out + "/heatmap.pdf", resources: - runtime=4, + runtime=10, log: logs + "/heatmap", benchmark: @@ -196,7 +226,7 @@ rule transform: pgen=config["snp_panel"], pvar=Path(config["snp_panel"]).with_suffix(".pvar"), psam=Path(config["snp_panel"]).with_suffix(".psam"), - pts=config["pheno"], + 
pts=pheno,
     params:
         region=lambda wildcards: wildcards.locus.replace("_", ":"),
     output:
         pgen=temp(out + "/happler.pgen"),
         pvar=temp(out + "/happler.pvar"),
         psam=temp(out + "/happler.psam"),
     resources:
-        runtime=4,
+        runtime=10,
     log:
         logs + "/transform",
     benchmark:
@@ -223,7 +253,7 @@ rule linreg:
         hap=rules.transform.output.pgen,
         hap_pvar=rules.transform.output.pvar,
         hap_psam=rules.transform.output.psam,
-        pts=config["pheno"],
+        pts=pheno,
     params:
         region=lambda wildcards: wildcards.locus.replace("_", ":"),
     output:
@@ -252,7 +282,7 @@ rule sv_ld:
         sv_pvar=lambda wildcards: Path(config["SVs"]).with_suffix(".pvar"),
         sv_psam=lambda wildcards: Path(config["SVs"]).with_suffix(".psam"),
     params:
-        start=lambda wildcards: min(0, int(parse_locus(wildcards.locus)[1])-1000000),
+        start=lambda wildcards: max(0, int(parse_locus(wildcards.locus)[1])-1000000),
         end=lambda wildcards: int(parse_locus(wildcards.locus)[2])+1000000,
         chrom=lambda wildcards: parse_locus(wildcards.locus)[0],
         maf = check_config("min_maf", 0),
@@ -304,8 +334,7 @@ rule merge:
         psam=temp(out + "/{ex}clude/merged.psam"),
     resources:
         runtime=lambda wildcards, input: (
-            Path(input.gts_pvar).stat().st_size/1000 * 0.05652315368728583 + 2.0888654705656844
-            if Path(input.gts).suffix == ".pgen" and check_config("dynamic_resources") else 10
+            rsrc_func(input.gts)(10, Path(input.gts_pvar).stat().st_size/1000 * 0.05652315368728583 + 2.0888654705656844)
         ),
     log:
         logs + "/{ex}clude/merge",
@@ -333,7 +362,7 @@ rule finemapper:
         gt=lambda wildcards: finemapper_input(wildcards).pgen,
         gt_pvar=lambda wildcards: finemapper_input(wildcards).pvar,
         gt_psam=lambda wildcards: finemapper_input(wildcards).psam,
-        phen=config["pheno"],
+        phen=pheno,
     params:
         outdir=lambda wildcards, output: Path(output.susie).parent,
         exclude_causal="NULL",
@@ -341,17 +370,15 @@
         # num_signals=lambda wildcards: get_num_haplotypes(
         #     expand(rules.run.output.hap, **wildcards)[0]
         # ) + 1,
-        num_signals=5,
+        num_signals=10,
         # TODO: also add MAF filter
     output:
         susie=out + "/{ex}clude/susie.rds",
     resources:
-#        runtime=lambda wildcards, input: (
-#            Path(input.gt_pvar).stat().st_size/1000 * 0.08
-#            if Path(input.gt).suffix == ".pgen" and check_config("dynamic_resources") else 75
-#        ),
-        runtime=120,
-        mem_mb = 7000,
+        runtime=lambda wildcards, input: (
+            rsrc_func(input.gt)(120, Path(input.gt_pvar).stat().st_size/1000 * 0.3)
+        ),
+        mem_mb = 8500,
     threads: 1,
     log:
         logs + "/{ex}clude/finemapper",
@@ -370,7 +397,7 @@ rule pips:
     output:
         tsv=out + "/{ex}clude/susie_pips.tsv",
     resources:
-        runtime=5,
+        runtime=10,
     threads: 3,
     log:
         logs + "/{ex}clude/pips",
@@ -387,11 +414,11 @@ rule metrics:
     input:
         finemap=expand(rules.finemapper.output.susie, ex="in", allow_missing=True),
         obs_hap=rules.run.output.hap,
-        caus_hap=config["hap_file"],
+        # caus_hap=config["hap_file"],
     output:
         metrics=out + "/susie_metrics.tsv",
     resources:
-        runtime=5,
+        runtime=10,
     threads: 6,
     log:
         logs + "/metrics",
@@ -427,11 +454,11 @@ rule results:
     """
     input:
         gt=rules.finemapper.input.gt,
-        phen=config["pheno"],
+        phen=pheno,
         susie=rules.finemapper.output.susie,
         happler_hap=results_happler_hap_input,
-        causal_gt=config["causal_gt"].pgen if "causal_gt" in config else [],
-        causal_hap=results_causal_hap_input,
+        # causal_gt=config["causal_gt"].pgen if "causal_gt" in config else [],
+        # causal_hap=results_causal_hap_input,
     params:
         outdir=lambda wildcards, output: Path(output.susie_pdf).parent,
         region=lambda wildcards: wildcards.locus.replace("_", ":"),
     output:
         susie_pdf = out + 
"/{ex}clude/susie.pdf", # susie_eff_pdf=temp(out + "/susie_eff.pdf"), resources: - runtime=5, + runtime=10, log: logs + "/{ex}clude/results", benchmark: @@ -485,7 +512,7 @@ rule gwas: pgen=(rules.merge_SVs if check_config("SVs") else rules.merge).output.pgen, pvar=(rules.merge_SVs if check_config("SVs") else rules.merge).output.pvar, psam=(rules.merge_SVs if check_config("SVs") else rules.merge).output.psam, - pts=config["pheno"], + pts=pheno, covar=config["covar"], params: maf = check_config("min_maf", 0), diff --git a/analysis/workflow/rules/plots.smk b/analysis/workflow/rules/plots.smk index 2ef6632..890cbff 100644 --- a/analysis/workflow/rules/plots.smk +++ b/analysis/workflow/rules/plots.smk @@ -1,3 +1,4 @@ +import sys from pathlib import Path out = config["out"] @@ -12,8 +13,8 @@ bench += "/" + mode def agg_ld(wildcards): """ a helper function for the other agg functions """ checkpoint_output = config["ld_range_checkpoint"].get(**wildcards).output.hap - ld_vals = glob_wildcards(Path(checkpoint_output) / "ld_{ld}/haplotype.hap").ld - return checkpoint_output, ld_vals + ld_vals = glob_wildcards(Path(checkpoint_output) / "{num_haps}_haps/ld_{ld}/haplotype.hap").ld + return checkpoint_output, sorted(list(set(ld_vals))) def agg_ld_range_obs(wildcards): """ return a list of hap files from happler """ @@ -24,12 +25,15 @@ def agg_ld_range_obs(wildcards): ld=ld_vals, alpha=config["mode_attrs"]["alpha"], beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], + rep=range(config["mode_attrs"]["reps"]), **wildcards, ) else: return expand( config["happler_hap"], beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], **wildcards, ) @@ -41,12 +45,14 @@ def agg_ld_range_causal(wildcards): str(checkpoint_output), ld=ld_vals, beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], **wildcards, ) else: return expand( config["causal_hap"], beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], **wildcards, ) @@ -58,13 +64,16 @@ def agg_ld_range_metrics(wildcards): config["happler_metrics"], ld=ld_vals, beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], alpha=config["mode_attrs"]["alpha"], + rep=range(config["mode_attrs"]["reps"]), **wildcards, ) else: return expand( config["happler_metrics"], beta=config["mode_attrs"]["beta"], + num_haps=config["mode_attrs"]["num_haps"], **wildcards, ) @@ -89,7 +98,7 @@ rule params: output: png=out + "/happler_params.png", resources: - runtime=10, + runtime=20, log: logs + "/plot_params", benchmark: @@ -98,6 +107,7 @@ rule params: "happler" shell: "workflow/scripts/parameter_plot.py -o {output.png} " + "--order num_haps,beta,ld " "{input.gts} {params.observed_haps} {params.causal_hap} &> {log}" @@ -117,7 +127,7 @@ rule metrics: output: png=out + "/finemapping_metrics.png", resources: - runtime=10, + runtime=20, log: logs + "/metrics", benchmark: @@ -126,4 +136,5 @@ rule metrics: "happler" shell: "workflow/scripts/parameter_plot.py -o {output.png} -m {params.metrics} " + "--order num_haps,beta,ld " "{input.gts} {params.observed_haps} {params.causal_hap} &> {log}" diff --git a/analysis/workflow/rules/simulate.smk b/analysis/workflow/rules/simulate.smk index 7a9dbc3..72d92e3 100644 --- a/analysis/workflow/rules/simulate.smk +++ b/analysis/workflow/rules/simulate.smk @@ -22,7 +22,11 @@ def parse_locus(locus): start = locus.split("_")[1].split("-")[0] return chrom, start, end -hap_ld_range_output = out + "/create_ld_range/ld_{ld}/haplotype.hap" 
+hap_ld_range_output = out + "/create_ld_range/{num_haps}_haps/ld_{ld}/haplotype.hap" + +gts_panel = Path( + config["gts_str_panel"] if mode == "str" else config["gts_snp_panel"] +) checkpoint create_hap_ld_range: """ create a hap file suitable for haptools transform and simphenotype """ @@ -31,13 +35,16 @@ checkpoint create_hap_ld_range: gts_pvar=Path(config["gts_snp_panel"]).with_suffix(".pvar"), gts_psam=Path(config["gts_snp_panel"]).with_suffix(".psam"), params: - reps = config["modes"]["ld_range"]["reps"], - min_ld = config["modes"]["ld_range"]["min_ld"], - max_ld = config["modes"]["ld_range"]["max_ld"], - step = config["modes"]["ld_range"]["step"], - min_af = config["modes"]["ld_range"]["min_af"], - max_af = config["modes"]["ld_range"]["max_af"], - out = lambda wildcards: hap_ld_range_output, + min_ld = lambda wildcards: config["modes"]["ld_range"]["min_ld"], + max_ld = lambda wildcards: config["modes"]["ld_range"]["max_ld"], + step = lambda wildcards: config["modes"]["ld_range"]["step"], + min_af = lambda wildcards: config["modes"]["ld_range"]["min_af"], + max_af = lambda wildcards: config["modes"]["ld_range"]["max_af"], + num_haps = lambda wildcards: "-n " + " -n ".join(map(str, config["modes"]["ld_range"]["num_haps"])), + out = lambda wildcards: expand( + hap_ld_range_output, **wildcards, allow_missing=True, + ), + seed = 42, output: hap=directory(out + "/create_ld_range") resources: @@ -49,22 +56,23 @@ checkpoint create_hap_ld_range: conda: "happler" shell: - "workflow/scripts/choose_different_ld.py -r {params.reps} " + "workflow/scripts/choose_different_ld.py {params.num_haps} " "--min-ld {params.min_ld} --max-ld {params.max_ld} " - "--step {params.step} --min-af {params.min_af} " + "--step {params.step} --min-af {params.min_af} --seed {params.seed} " "--max-af {params.max_af} {input.gts} {params.out} &> {log}" rule create_hap: """ create a hap file suitable for haptools transform and simphenotype """ input: - gts=Path(config["gts_snp_panel"]).with_suffix(".pvar"), + gts=gts_panel.with_suffix(".pvar"), params: ignore="this", # the first parameter is always ignored for some reason chrom=lambda wildcards: parse_locus(wildcards.locus)[0], locus=lambda wildcards: wildcards.locus.split("_")[1].replace('-', '\t'), beta=0.99, - alleles=lambda wildcards: config["modes"]["hap"]["alleles"], + alleles=lambda wildcards: config["modes"]["hap"]["alleles"] if mode == "hap" else [], + repeat=lambda wildcards: config["modes"]["str"]["id"] if mode == "str" else 0, output: hap=out + "/haplotype.hap" resources: @@ -79,9 +87,9 @@ rule create_hap: "../scripts/create_hap_file.sh" if mode == "ld_range": - out += "/pheno/ld_{ld}" - logs += "/pheno/ld_{ld}" - bench += "/pheno/ld_{ld}" + out += "/pheno/{num_haps}_haps/ld_{ld}" + logs += "/pheno/{num_haps}_haps/ld_{ld}" + bench += "/pheno/{num_haps}_haps/ld_{ld}" rule transform: """ use the hap file to create a pgen of the haplotype """ @@ -110,11 +118,13 @@ rule simphenotype: """ use the hap file to create simulated phenotypes for the haplotype """ input: hap=rules.transform.input.hap, - pgen=rules.transform.output.pgen, - pvar=rules.transform.output.pvar, - psam=rules.transform.output.psam, + pgen=rules.transform.output.pgen if mode != "str" else config["gts_str_panel"], + pvar=rules.transform.output.pvar if mode != "str" else Path(config["gts_str_panel"]).with_suffix(".pvar"), + psam=rules.transform.output.psam if mode != "str" else Path(config["gts_str_panel"]).with_suffix(".psam"), params: beta=lambda wildcards: wildcards.beta, + seed = 42, + reps = 
lambda wildcards: config["modes"]["ld_range"]["reps"], output: pheno=out + "/{beta}.pheno", resources: @@ -126,5 +136,6 @@ rule simphenotype: conda: "happler" shell: - "haptools simphenotype -o {output.pheno} {input.pgen} " + "haptools simphenotype --seed {params.seed} -o {output.pheno} " + "--replications {params.reps} {input.pgen} " "<( sed 's/\\t0.99$/\\t{params.beta}/' {input.hap} ) &>{log}" diff --git a/analysis/workflow/scripts/README.md b/analysis/workflow/scripts/README.md index 24538c9..99f7f4f 100644 --- a/analysis/workflow/scripts/README.md +++ b/analysis/workflow/scripts/README.md @@ -6,6 +6,12 @@ All python scripts implement the --help argument. For bash, awk, and R scripts, ## [choose_different_ld.py](choose_different_ld.py) Choose haplotype alleles such that their SNPs have a range of strengths of LD with the causal haplotype. +## [compute_pgen_ld.py](compute_pgen_ld.py) +Quickly compute the LD between a single variant and all other variants in a PGEN file. + +## [conditional_regression_plots.py](conditional_regression_plots.py) +Make manhattan plots for a region after conditioning on 1) haplotypes and 2) their alleles together and 3) separately. Also, 4) show the original manhattan plot without any conditioning. + ## [create_hap_file.sh](create_hap_file.sh) Create a [.hap file](https://haptools.readthedocs.io/en/stable/formats/haplotypes.html) for use by haptools when simulating causal variables. @@ -33,12 +39,12 @@ Create a manhattan plot to visualize the results of a GWAS. ## [merge_plink.py](merge_plink.py) Merge variants from two PGEN files that have the same set of samples. -## [midway_manhattan.bash](midway_manhattan.bash) -Create a "midway" manhattan plot depicting the p-values on a branch of a tree mid-way through happler's tree-building process. - ## [midway_manhattan_summary.py](midway_manhattan_summary.py) Summarize the results of multiple midway manhattan plots via an ROC curve +## [midway_manhattan.bash](midway_manhattan.bash) +Create a "midway" manhattan plot depicting the p-values on a branch of a tree mid-way through happler's tree-building process. + ## [parameter_plot.py](parameter_plot.py) Plot the LD between a causal haplotype and a set of observed haplotypes across a range of hyper-parameters. 
diff --git a/analysis/workflow/scripts/choose_different_ld.py b/analysis/workflow/scripts/choose_different_ld.py index 82f7fd1..f09dd9b 100755 --- a/analysis/workflow/scripts/choose_different_ld.py +++ b/analysis/workflow/scripts/choose_different_ld.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from typing import Tuple from pathlib import Path from logging import Logger from itertools import product @@ -17,11 +18,11 @@ def find_haps( log: Logger, min_ld: float, max_ld: float, - reps: int = 1, + num_haps: int = 1, step: float = 0.1, seed: np.random.Generator = np.random.default_rng(None), ): - log.info(f"Using min_ld {min_ld}, max_ld {max_ld}, reps {reps}, and step {step}") + log.info(f"Using min_ld {min_ld}, max_ld {max_ld}, num_haps {num_haps}, and step {step}") sum_gts = gts.data.sum(axis=2) chrom = np.unique(gts.variants["chrom"]) @@ -29,8 +30,9 @@ def find_haps( chrom = chrom[0] intervals = np.arange(max(0, min_ld), min(1, max_ld), step) + step - ld_bins = {idx: 0 for idx in range(len(intervals))} - count = len(intervals) * reps + # we return lists of haplotypes: one list for each bin + ld_bins = {idx: [] for idx in range(len(intervals))} + count = len(intervals) * num_haps combos = np.array(list(range(len(gts.variants)))) seed.shuffle(combos) @@ -51,10 +53,11 @@ def find_haps( combo_ld = np.abs(pearson_corr_ld(*tuple(sum_gts[:, snp_id] for snp_id in combo_id))) # which bin does this LD value fall in? bin_idx = np.argmax(combo_ld < intervals) - if ld_bins[bin_idx] < reps: - log.info(f"Outputting {hp_id} with LD {combo_ld} for bin {intervals[bin_idx]}") - yield combo_ld, hp - ld_bins[bin_idx] += 1 + if len(ld_bins[bin_idx]) < num_haps: + log.info(f"Adding {hp_id} with LD {combo_ld} to bin {intervals[bin_idx]}") + ld_bins[bin_idx].append((combo_ld, hp)) + if len(ld_bins[bin_idx]) == num_haps: + yield ld_bins[bin_idx] count -= 1 break if count == 0: @@ -64,14 +67,6 @@ def find_haps( @click.command() @click.argument("gts", type=click.Path(exists=True, path_type=Path)) @click.argument("output", type=click.Path(path_type=Path)) -@click.option( - "-r", - "--reps", - type=int, - default=1, - show_default=True, - help="The number of replicates to perform within each LD bin", -) @click.option( "--min-ld", type=float, @@ -107,6 +102,18 @@ def find_haps( show_default=True, help="The maximum LD value to allow", ) +@click.option( + "-n", + "--num-haps", + type=int, + multiple=True, + default=(1,), + show_default=True, + help=( + "The number of haplotypes to output in each hap file. 
" + "If multiple, we will also match on the {num_haps} brace expression" + ), +) @click.option( "--seed", type=int, @@ -125,12 +132,12 @@ def find_haps( def main( gts: Path, output: Path, - reps: int = 1, min_ld: float = 0, max_ld: float = 1, step: float = 0.1, min_af: float = 0.25, max_af: float = 0.75, + num_haps: Tuple[int] = (1,), seed: int = None, verbosity: str = "DEBUG", ): @@ -138,7 +145,7 @@ def main( Create haplotypes with a range of LD values between their alleles LD values are injected into the output from brace expressions - For example, if we are given a a path like "ld_{ld}/happler.hap", then + For example, if we are given a path like "ld_{ld}/happler.hap", then we will replace {ld} with the provided ld value """ log = getLogger("choose", verbosity) @@ -155,22 +162,33 @@ def main( seed = np.random.default_rng(seed) - for combo_ld, hp in find_haps( + if "{num_haps}" not in str(output) and len(num_haps) > 1: + log.warning( + "Multiple num_haps values provided, but {num_haps} brace expression is " + "absent from output path. Only the last --num-haps value will be used." + ) + + for haps in find_haps( gts, log, min_ld=min_ld, max_ld=max_ld, - reps=reps, + num_haps=max(num_haps), step=step, seed=seed, ): - out_path = Path(str(output).format(ld=round(combo_ld, 2))) - out_path.parent.mkdir(parents=True, exist_ok=True) - hps = Haplotypes( - out_path, log=log, haplotype=Haplotype, + avg_combo_ld = np.mean([combo_ld for combo_ld, hp in haps]) + out_path_temp = Path(str(output).format( + ld=round(avg_combo_ld, 2), num_haps="{num_haps}"), ) - hps.data = {hp.id: hp} - hps.write() + for num_hap in num_haps: + out_path = Path(str(out_path_temp).format(num_haps=num_hap)) + out_path.parent.mkdir(parents=True, exist_ok=True) + hps = Haplotypes( + out_path, log=log, haplotype=Haplotype, + ) + hps.data = {hp.id: hp for combo_ld, hp in haps[:num_hap]} + hps.write() if __name__ == "__main__": diff --git a/analysis/workflow/scripts/compute_pgen_ld.py b/analysis/workflow/scripts/compute_pgen_ld.py index bc95a39..075af17 100755 --- a/analysis/workflow/scripts/compute_pgen_ld.py +++ b/analysis/workflow/scripts/compute_pgen_ld.py @@ -7,7 +7,7 @@ from haptools.logging import getLogger from haptools.ld import pearson_corr_ld -from haptools.data import Data, Genotypes, GenotypesPLINK +from haptools.data import Data, GenotypesPLINK def corr(a, b): diff --git a/analysis/workflow/scripts/create_hap_file.sh b/analysis/workflow/scripts/create_hap_file.sh index ae46e5d..cc32072 100755 --- a/analysis/workflow/scripts/create_hap_file.sh +++ b/analysis/workflow/scripts/create_hap_file.sh @@ -2,23 +2,31 @@ exec 2> "${snakemake_log}" -echo -e "#\torderH\tbeta" > "${snakemake_output[hap]}" -echo -e "#\tversion\t0.1.0" >> "${snakemake_output[hap]}" -echo -e "#H\tbeta\t.2f\tEffect size in linear model" >> "${snakemake_output[hap]}" +echo -e "#\tversion\t0.2.0" >> "${snakemake_output[hap]}" +echo -e "#R\tbeta\t.2f\tEffect size in linear model" >> "${snakemake_output[hap]}" -starts=() -ends=() -for snp in ${snakemake_params[4]}; do - snp_id="$(echo "$snp" | sed 's/\(.*\):/\1\t/')" - start_pos="$(grep -m1 "$(echo "$snp_id" | cut -f1)" "${snakemake_input[gts]}" | cut -f2)" - end_pos=$((start_pos+1)) - echo -e "V\thap\t$start_pos\t$end_pos\t$snp_id" >> "${snakemake_output[hap]}" - starts+=("$start_pos") - ends+=("$end_pos") -done +if [ "${snakemake_params[repeat]}" -eq "0" ]; then + starts=() + ends=() + for snp in ${snakemake_params[4]}; do + snp_id="$(echo "$snp" | sed 's/\(.*\):/\1\t/')" + start_pos="$(grep -Ev '^#' -m1 
"$(echo "$snp_id" | cut -f1)" "${snakemake_input[gts]}" | cut -f2)" + end_pos=$((start_pos+1)) + echo -e "V\thap\t$start_pos\t$end_pos\t$snp_id" >> "${snakemake_output[hap]}" + starts+=("$start_pos") + ends+=("$end_pos") + done -start_pos="$(echo "${starts[@]}" | tr ' ' '\n' | sort -n | head -n1)" -end_pos="$(echo "${ends[@]}" | tr ' ' '\n' | sort -rn | head -n1)" -echo -e "H\t${snakemake_params[1]}\t$start_pos\t$end_pos\thap\t${snakemake_params[3]}" >> "${snakemake_output[hap]}" + start_pos="$(echo "${starts[@]}" | tr ' ' '\n' | sort -n | head -n1)" + end_pos="$(echo "${ends[@]}" | tr ' ' '\n' | sort -rn | head -n1)" + echo -e "H\t${snakemake_params[1]}\t$start_pos\t$end_pos\thap\t${snakemake_params[3]}" >> "${snakemake_output[hap]}" +else + for str in ${snakemake_params[5]}; do + str_id=$'\t'"$str"$'\t' + start_pos="$(grep --line-buffered -Ev '^#' "${snakemake_input[gts]}" | grep -Pm1 "${start_pos}${str_id}" | cut -f2)" + end_pos="$(grep --line-buffered -Ev '^#' "${snakemake_input[gts]}" | grep -Pm1 "${start_pos}${str_id}" | grep -Po '(?<=;END=)\d+(?=;PERIOD)')" + echo -e "R\t${snakemake_params[1]}\t$start_pos\t$end_pos\t$str\t${snakemake_params[3]}" >> "${snakemake_output[hap]}" + done +fi LC_ALL=C sort -k1,4 -o "${snakemake_output[hap]}" "${snakemake_output[hap]}" diff --git a/analysis/workflow/scripts/haplotype_ld.py b/analysis/workflow/scripts/haplotype_ld.py deleted file mode 100755 index b4f9012..0000000 --- a/analysis/workflow/scripts/haplotype_ld.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -import pandas as pd -import seaborn as sns -from matplotlib import pyplot as plt - - -sns.set(rc={'figure.figsize':(11.7,8.27)}) - -phens = pd.read_csv("../phens.tsv.gz", sep="\t", index_col=0, header=0) -gens = pd.read_csv("genotypes.tsv", sep="\t", index_col=1) -gt = gens.iloc[:,3:] -alleles = gens['alleles'] - -SNPs = gt.iloc[~gt.index.str.startswith('STR')] -STR = gt.iloc[gt.index.str.startswith('STR')] - -samples = phens.index.astype(str) -num_samp = len(samples) - -lens = {str(i): len(al) for i, al in enumerate(gens.loc[STR.index[0],'alleles'].split(','))} -str_gt = STR.iloc[0].str.split('|', expand=True).applymap(lens.__getitem__) -str_gt = str_gt.loc[samples] -str_gts = pd.concat((str_gt[0], str_gt[1]), axis=0, ignore_index=True) -split_chroms = pd.DataFrame({STR.index[0]: str_gts}) - -fig, axes = plt.subplots(1, len(SNPs.index)) -for idx, snp in enumerate(SNPs.index): - snp_gt = gt.loc[snp].str.split('|', expand=True) - snp_gt = snp_gt.loc[samples] - snp_gts = pd.concat((snp_gt[0], snp_gt[1]), axis=0, ignore_index=True).astype(int) - split_chroms[snp] = snp_gts - sns.violinplot(ax=axes[idx], x=snp, y=STR.index[0], data=split_chroms, order=[0,1]) - -plt.savefig('snps.png') -plt.clf() - -fig, axes = plt.subplots(1, 2) -hap_alleles = {snp: 1 for snp in SNPs.index} -hap = pd.DataFrame([(split_chroms[snp] == hap_alleles[snp]).astype(bool) for snp in SNPs.index]).T -hap['haplotype'] = hap.all(axis=1).astype(int) -hap[STR.index[0]] = split_chroms[STR.index[0]] -sns.violinplot(ax=axes[0], x='haplotype', y=STR.index[0], data=hap, order=[0,1]) - -hap_sum = pd.DataFrame({ - 'haplotype': (hap['haplotype'].iloc[:num_samp].to_numpy() + hap['haplotype'].iloc[num_samp:].to_numpy()), - STR.index[0]: (hap[STR.index[0]].iloc[:num_samp].to_numpy() + hap[STR.index[0]].iloc[num_samp:].to_numpy()), -}) -sns.regplot(ax=axes[1], x='haplotype', y=STR.index[0], data=hap_sum) -plt.savefig('haplotype.png') diff --git a/analysis/workflow/scripts/parameter_plot.py 
b/analysis/workflow/scripts/parameter_plot.py
index 34ce946..107482d 100755
--- a/analysis/workflow/scripts/parameter_plot.py
+++ b/analysis/workflow/scripts/parameter_plot.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+import pickle
 from pathlib import Path
 from logging import Logger

@@ -7,10 +8,12 @@
 import numpy as np
 matplotlib.use('Agg')
 import numpy.typing as npt
+from scipy.stats import sem
 from haptools import logging
 import matplotlib.pyplot as plt
 from haptools.ld import pearson_corr_ld
-from haptools.data import GenotypesVCF, GenotypesPLINK, Haplotypes
+from scipy.optimize import linear_sum_assignment
+from haptools.data import Genotypes, GenotypesVCF, GenotypesPLINK, Haplotypes

 from snakemake_io import glob_wildcards

@@ -22,17 +25,84 @@
     "gt": "U5",
     "alpha": np.float64,
     "samp": np.uint32,
+    "num_haps": np.uint8,
 }

 LOG_SCALE = {"alpha",}

+plt.rcParams['figure.dpi'] = 400 # Set the figure DPI to 400
+plt.rcParams['savefig.dpi'] = plt.rcParams['figure.dpi'] # Set the DPI for saving figures

-def get_num_haps(hap: Path, log: Logger = None):
-    hap = Haplotypes(hap, log=log)
-    hap.read()
-    return len(hap.data)

+def match_haps(gts: Genotypes, observed: Haplotypes, causal: Haplotypes) -> tuple:
+    """
+    Match the observed and causal haplotypes to maximize the summed pairwise LD
+
+    This function uses scipy.optimize.linear_sum_assignment to find the optimal
+    match between causal and observed haplotypes to maximize the sum of the pairwise LD
+
+    Parameters
+    ----------
+    gts: Genotypes
+        The set of genotypes for each of the variants in the haplotypes
+    observed: Haplotypes
+        The haplotypes output by happler
+    causal: Haplotypes
+        The simulated causal haplotypes
+
+    Returns
+    -------
+    npt.NDArray
+        An array of the LD values for each pair of observed and causal haplotypes
+    npt.NDArray
+        An array of the row indices indicating the optimal match of causal haplotypes
+        to observed haplotypes
+    npt.NDArray
+        An array of the column indices indicating the optimal match of causal haplotypes
+        to observed haplotypes
+    npt.NDArray
+        The index of the causal haplotype that each observed haplotype is matched to
+    npt.NDArray
+        An array of boolean values indicating whether each observed haplotype has a match
+        in the causal haplotypes
+    """
+    obs = observed.transform(gts).data.sum(axis=2)
+    exp = causal.transform(gts).data.sum(axis=2)
+    # Compute the pairwise LD between each observed and causal haplotype
+    # The rows and columns of ld_mat both correspond to the observed and causal haps in
+    # the order they were given. 
For example, if there is 1 observed hap and 3 causal + # haps, then there should be four rows and four columns in ld_mat + # Note that ld_mat is symmetric, so the upper triangle is always redundant + # TODO: use the latest haptools.ld.pearson_corr_ld to do this, instead + ld_mat = np.corrcoef(obs, exp, rowvar=False) + # Retrieve only the correlation of observed haps (as rows) vs causal haps (as cols) + ld_mat = np.abs(ld_mat[:obs.shape[1], obs.shape[1]:]) + # Return the ld_mat idxs of the optimal match of each observed hap to a causal hap + row_idx, col_idx = linear_sum_assignment(ld_mat, maximize=True) + # now deal with the extras (observed haps with no causal hap match) + extras_labels = np.zeros(ld_mat.shape[0], dtype=np.uint8) + extras_labels_bool = np.zeros(ld_mat.shape[0], dtype=bool) + missing_rows = np.setdiff1d(range(ld_mat.shape[0]), row_idx) + extras_labels[row_idx] = col_idx + extras_labels_bool[row_idx] = True + # If there are more observed haps than causal haps, then we just assign the + # remaining haps to the best causal hap that matches for each of them + row_idx = np.concatenate((row_idx, missing_rows)) + missing_cols = ld_mat[missing_rows].argmax(axis=1) + col_idx = np.concatenate((col_idx, missing_cols)) + extras_labels[missing_rows] = missing_cols + extras_labels_bool[missing_rows] = False + return ld_mat, row_idx, col_idx, extras_labels, extras_labels_bool + + +def get_best_ld( + gts: GenotypesVCF, + observed_hap: Path, + causal_hap: Path, + region: str = None, + observed_id: str = None, + causal_id: str = None, + log: Logger = None + ): """ Compute the best LD between the observed haplotypes and the causal haplotype @@ -56,17 +126,24 @@ def get_best_ld(gts: GenotypesVCF, observed_hap: Path, causal_hap: Path, region: # load the causal haplotype given by 'causal_id' or just the first hap causal_hap = Haplotypes(causal_hap, log=log) causal_hap.read(haplotypes=(set((causal_id,)) if causal_id is not None else None)) - if causal_id is None: - causal_id = list(causal_hap.data.keys())[0] - causal_hap.subset(haplotypes=(causal_id,), inplace=True) - causal_hap = causal_hap.data[causal_id] # load the observed haplotype given by 'observed_id' or just all of the haps observed_hap = Haplotypes(observed_hap, log=log) observed_hap.read(haplotypes=(set((observed_id,)) if observed_id is not None else None)) + # if there are no observed haplotypes, then we can't compute LD + if len(observed_hap.data) == 0: + return ( + np.array([np.nan,]), + np.array([np.iinfo(np.uint8).max,], dtype=np.uint8), + np.array([False,], dtype=bool), + ) + # load the genotypes - variants = {v.id for v in causal_hap.variants} + variants = { + v.id for hap in causal_hap.data + for v in causal_hap.data[hap].variants + } variants.update({ v.id for hap in observed_hap.data for v in observed_hap.data[hap].variants @@ -78,20 +155,11 @@ def get_best_ld(gts: GenotypesVCF, observed_hap: Path, causal_hap: Path, region: gts.check_biallelic() # compute LD between every observed haplotype and the causal haplotype - causal_gt = causal_hap.transform(gts).sum(axis=1) - observed_gt = observed_hap.transform(gts) - observed_ld = np.array([ - pearson_corr_ld(causal_gt, observed_gt.data[:, o].sum(axis=1)) - for o in range(observed_gt.data.shape[1]) - ]) - - # return the strongest LD among all haps - try: - best_observed_hap_idx = np.abs(observed_ld).argmax() - except ValueError: - # this can happen when there weren't any haplotypes output by happler - return 0 - return observed_ld[best_observed_hap_idx] + observed_ld, 
best_row_idx, best_col_idx, extras_labels, labels_bool = match_haps( + gts, observed_hap, causal_hap, + ) + # return the strongest possible LD for each observed hap with a causal hap + return observed_ld[best_row_idx, best_col_idx], extras_labels, labels_bool def get_finemap_metrics(metrics_path: Path, log: Logger = None): @@ -111,17 +179,59 @@ def get_finemap_metrics(metrics_path: Path, log: Logger = None): # 3) What is the best PIP among the variants? # 4) Is the observed hap in a credible set? # 5) What is the purity of the credible set? - return np.loadtxt( - fname=metrics_path, - delimiter=" ", - dtype=[ - ("pip", np.float64), - ("has_highest_pip", np.bool_), - ("best_variant_pip", np.float64), - ("in_credible_set", np.bool_), - ("cs_purity", np.float64), - ], + # 6) What is the length of the credible set? + # If you add or remove any metrics here, make sure to also change the + # "num_expected_vals" in get_metrics_mean_std so it knows the number of metrics + dtype = [ + ("pip", np.float64), + ("has_highest_pip", np.bool_), + ("best_variant_pip", np.float64), + ("in_credible_set", np.bool_), + ("cs_purity", np.float64), + ("cs_length", np.float64) + ] + null_val = np.array([ + (np.nan, False, np.nan, False, np.nan, np.nan), + ], dtype=dtype) + # if there are no observed haplotypes, then we can't compute LD + if not metrics_path.exists(): + return null_val + metrics = np.loadtxt(fname=metrics_path, delimiter=" ", dtype=dtype) + if not metrics.shape: + log.debug(f"No metrics found in the metrics file {metrics_path}") + return null_val + return metrics + + +def get_metrics_mean_std(metrics_vals: npt.NDArray, num_expected_vals: int = 6): + """ + Compute the mean and standard error of the metrics + + Parameters + ---------- + metrics_vals: npt.NDArray + A numpy array containing the metrics for each observed hap + num_expected_vals: int, optional + The number of expected metrics + + Returns + ------- + npt.NDArray + A numpy array containing the mean of the metrics + npt.NDArray + A numpy array containing the standard error of the metrics + """ + if len(metrics_vals) == 0: + return np.array([np.nan,]*6), np.array([np.nan,]*6) + nan_vals = np.isnan(metrics_vals["pip"]) + metrics_vals = metrics_vals[~nan_vals] + mean_vals = np.array( + [np.nanmean(metrics_vals[field]) for field in metrics_vals.dtype.names], ) + sem_vals = np.array( + [sem(metrics_vals[field], nan_policy="omit") for field in metrics_vals.dtype.names], + ) + return mean_vals, sem_vals def count_shared(observed, causal, log, observed_id: str = None, causal_id: str = None): @@ -148,9 +258,11 @@ def count_shared(observed, causal, log, observed_id: str = None, causal_id: str def plot_params( params: npt.NDArray, vals: npt.NDArray, + vals_sem: npt.NDArray, val_title: str, - hap_counts: npt.NDArray = None, - sign: bool = False, + val_color: npt.NDArray, + metrics: dict = None, + hide_extras: bool = False, ): """ Plot vals against parameter values @@ -160,58 +272,83 @@ def plot_params( params: npt.NDArray A numpy array of mixed dtype. 
Each column contains the values of a parameter vals: npt.NDArray - A numpy array containing the values of the plot + A numpy array of numpy arrays containing the values of the plot + vals_sem: npt.NDArray + A numpy array of numpy arrays containing the standard error of the values val_title: str The name of the value that we are plotting - hap_counts: npt.NDArray, optional - The number of haplotypes present in each hap file - sign: bool, optional - If True, depict positive LD values via a special marker symbol for the point + val_color: npt.NDArray + Whether to color each point corresponding to this value + metrics: dict, optional + A dict mapping fine-mapping metrics to tuples of two lists of numpy arrays + containing means and standard errors of fine-mapping metrics for each observed hap + hide_extras: bool, optional + Whether to hide the observed haps that have no causal hap match """ - also_plot_counts = hap_counts is not None figsize = matplotlib.rcParams["figure.figsize"] if len(params) > 75: figsize[0] = len(params) / 15 - if len(params.dtype) > 5: - figsize[1] = len(params.dtype) * 1.25 + num_rows = len(params.dtype) + if metrics is not None: + num_rows = len(params.dtype) + len(metrics) + if num_rows > 5: + figsize[1] = num_rows * 1.25 fig, axs = plt.subplots( - nrows=len(params.dtype)+1+also_plot_counts, ncols=1, + nrows=num_rows+1, ncols=1, sharex=True, figsize=figsize, ) fig.subplots_adjust(hspace=0) # create a plot for the vals, first - axs[0].plot(np.abs(vals), "g-") - if sign: - axs[0].scatter( - np.squeeze(np.where(vals > 0)), vals[vals > 0], c="g", marker="o", - ) + x_vals = [j for j, arr in enumerate(vals) for i in arr] + val_color = ["green" if j else "red" for i in val_color for j in i] + vals = np.concatenate(vals) + vals_sem = np.concatenate(vals_sem) + for v in zip(x_vals, vals, vals_sem, val_color): + if hide_extras and v[3] != "green": + continue + axs[0].errorbar(v[0], v[1], yerr=v[2], marker="o", c=v[3], markersize=3) axs[0].set_ylabel(val_title, rotation="horizontal", ha="right") - axs[0].set_xticks(range(0, len(vals))) + axs[0].set_xticks(range(0, len(x_vals))) axs[0].set_xticklabels([]) axs[0].grid(axis="x") axs[0].set_ylim(None, 1) - if also_plot_counts: - axs[1].plot(hap_counts, "m-") - axs[1].set_ylabel("num_haps", rotation="horizontal", ha="right") - axs[1].grid(axis="x") # now, plot each of the parameter values on the other axes for idx, param in enumerate(params.dtype.names): val_title = param + if len(np.unique(params[param])) == 1: + axs[idx+1].remove() + continue if param in LOG_SCALE: params[param] = -np.log10(params[param]) val_title = "-log " + val_title - axs[idx+1+also_plot_counts].plot(params[param], "-") - axs[idx+1+also_plot_counts].set_ylabel(val_title, rotation="horizontal", ha="right") - axs[idx+1+also_plot_counts].grid(axis="x") + axs[idx+1].plot(params[param], "-") + axs[idx+1].set_ylabel(val_title, rotation="horizontal", ha="right") + axs[idx+1].grid(axis="x") + if metrics is not None: + for idx, metric in enumerate(metrics.keys()): + curr_ax = axs[idx+len(params.dtype)+1] + val_title = metric + vals = np.concatenate(metrics[metric][0]) + if len(np.unique(vals)) == 1: + curr_ax.remove() + continue + vals_sem = np.concatenate(metrics[metric][1]) + for v in zip(x_vals, vals, vals_sem, val_color): + if hide_extras and v[3] != "green": + continue + curr_ax.errorbar(v[0], v[1], yerr=v[2], marker="o", c=v[3], markersize=3) + curr_ax.set_ylabel(val_title, rotation="horizontal", ha="right") + curr_ax.grid(axis="x") return fig def 
plot_params_simple( params: npt.NDArray, vals: npt.NDArray, + vals_sem: npt.NDArray, val_title: str, - hap_counts: npt.NDArray = None, - sign: bool = False + val_color: npt.NDArray, + hide_extras: bool = False, ): """ Plot vals against only a single column of parameter values @@ -221,37 +358,144 @@ def plot_params_simple( params: npt.NDArray A numpy array containing the values of a parameter vals: npt.NDArray - A numpy array containing the values of the plot + A numpy array of numpy arrays containing the values of the plot + vals_sem: npt.NDArray + A numpy array of numpy arrays containing the standard error of the values val_title: str The name of the value that we are plotting - hap_counts: npt.NDArray, optional - The number of haplotypes present in each hap file - sign: bool, optional - If True, depict positive LD values via a special marker symbol for the point + val_color: npt.NDArray + Whether to color each point corresponding to this value + hide_extras: bool, optional + Whether to hide the observed haps that have no causal hap match """ - fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True) + fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(3.5, 3)) val_xlabel = params.dtype.names[0] if val_xlabel in LOG_SCALE: params = -np.log10(params) val_xlabel = "-log " + val_xlabel + params = [j for j, arr in zip(params, vals) for i in arr] + val_color = ["green" if j else "red" for i in val_color for j in i] + vals = np.concatenate(vals) + vals_sem = np.concatenate(vals_sem) # plot params against vals - ax.plot(params, np.abs(vals), "g-") - if sign: - ax.scatter( - params[vals > 0], vals[vals > 0], c="g", marker="o", - ) + for v in zip(params, vals, vals_sem, val_color): + if hide_extras and v[3] != "green": + continue + ax.errorbar(v[0], v[1], yerr=v[2], marker="o", c=v[3], markersize=5) ax.set_ylabel(val_title, color="g") ax.set_xlabel(val_xlabel) - # also plot hap_counts as a second y-axis - if hap_counts is not None: - ax = ax.twinx() - ax.plot(params, hap_counts, "m-") - ax.set_ylabel("Number of observed haplotypes", color="m") - ax.set_yticks(list(range(max(hap_counts)+1))) - ax.set_yticklabels(list(range(max(hap_counts)+1))) + ax.set_ylim(0, 1.03) return fig +def group_by_rep( + params: npt.NDArray, + vals: npt.NDArray, + causal_idxs: npt.NDArray, + bools: npt.NDArray, + metrics: npt.NDArray = None, + ): + """ + Group replicates with identical parameter values + + Compute mean and standard error of the values for each group + + Parameters + ---------- + params: npt.NDArray + A numpy array of mixed dtype. 
Each column contains the values of a parameter + vals: npt.NDArray + A numpy array containing the values of the plot + causal_idxs: npt.NDArray + The index of the causal haplotype that each observed haplotype belongs to + bools: npt.NDArray + An array of boolean values indicating whether each observed haplotype has a match + metrics: npt.NDArray + A numpy array of numpy arrays containing the metrics for each observed hap + + Returns + ------- + npt.NDArray + A numpy array of the unique parameter values + npt.NDArray + A numpy array containing the mean and std of the values over the replicates + """ + other_param_names = [name for name in params.dtype.names if name != "rep"] + grouped_params = np.unique(params[other_param_names]) + get_mean_std = lambda x: (np.nanmean(x), sem(x, nan_policy="omit") if len(x) > 1 else np.nan) + filter_causal_idxs = lambda x: np.unique(x[x != np.iinfo(np.uint8).max]) + grouped_vals = [] + grouped_sem = [] + grouped_bools = [] + grouped_metrics = [] + grouped_metrics_sem = [] + for group in grouped_params: + subgrouped_vals = [] + subgrouped_sem = [] + subgrouped_bools = [] + subgrouped_metrics = [] + subgrouped_metrics_sem = [] + curr_vals = vals[params[other_param_names] == group] + curr_causal_idxs = causal_idxs[params[other_param_names] == group] + curr_bools = bools[params[other_param_names] == group] + curr_metrics = None + if metrics is not None: + curr_metrics = metrics[params[other_param_names] == group] + # compute mean and std err for each causal match + for causal_idx in filter_causal_idxs(np.concatenate(curr_causal_idxs)): + try: + curr_vals_causal_idxs = np.concatenate([ + val[(indices == causal_idx) & curr_bool] + for val, indices, curr_bool in zip(curr_vals, curr_causal_idxs, curr_bools) + ]) + except ValueError: + continue + val_mean, val_sem = get_mean_std(curr_vals_causal_idxs) + subgrouped_vals.append(val_mean) + subgrouped_sem.append(val_sem) + subgrouped_bools.append(True) + if curr_metrics is not None: + curr_metrics_causal_idxs = np.concatenate([ + val[(indices == causal_idx) & curr_bool] + for val, indices, curr_bool in zip(curr_metrics, curr_causal_idxs, curr_bools) + ]) + metrics_mean, metrics_sem = get_metrics_mean_std(curr_metrics_causal_idxs) + subgrouped_metrics.append(metrics_mean) + subgrouped_metrics_sem.append(metrics_sem) + # compute mean and std err for each unmatched observed hap + for unmatched_idx in range(max([(~i).sum() for i in curr_bools])): + try: + curr_vals_unmatched_idxs = np.array([ + val[(indices == causal_idx) & ~curr_bool][unmatched_idx] + for val, indices, curr_bool in zip(curr_vals, curr_causal_idxs, curr_bools) + if unmatched_idx < ((indices == causal_idx) & ~curr_bool).sum() + ]) + except ValueError: + continue + val_mean, val_sem = get_mean_std(curr_vals_unmatched_idxs) + subgrouped_vals.append(val_mean) + subgrouped_sem.append(val_sem) + subgrouped_bools.append(False) + if curr_metrics is not None: + curr_metrics_unmatched_idxs = np.array([ + val[(indices == causal_idx) & ~curr_bool][unmatched_idx] + for val, indices, curr_bool in zip(curr_metrics, curr_causal_idxs, curr_bools) + if unmatched_idx < ((indices == causal_idx) & ~curr_bool).sum() + ]) + metrics_mean, metrics_sem = get_metrics_mean_std(curr_metrics_unmatched_idxs) + subgrouped_metrics.append(metrics_mean) + subgrouped_metrics_sem.append(metrics_sem) + grouped_vals.append(np.array(subgrouped_vals, dtype=object)) + grouped_sem.append(np.array(subgrouped_sem, dtype=object)) + grouped_bools.append(np.array(subgrouped_bools, dtype=object)) + if 
curr_metrics is not None: + grouped_metrics.append(np.array(subgrouped_metrics, dtype=object)) + grouped_metrics_sem.append(np.array(subgrouped_metrics_sem, dtype=object)) + if curr_metrics is not None: + return grouped_params, grouped_vals, grouped_sem, grouped_bools, grouped_metrics, grouped_metrics_sem + return grouped_params, grouped_vals, grouped_sem, grouped_bools + + @click.command() @click.argument("genotypes", type=click.Path(exists=True, path_type=Path)) @click.argument("observed_hap", type=click.Path(path_type=Path)) @@ -264,6 +508,13 @@ def plot_params_simple( show_default=True, help="Plot fine-mapping metrics, as well", ) +@click.option( + "--use-metric", + type=str, + default=None, + show_default="observed LD", + help="Use this fine-mapping metric to color the plot", +) @click.option( "--region", type=str, @@ -288,19 +539,25 @@ def plot_params_simple( help="A haplotype ID from the causal .hap file", ) @click.option( - "-n", - "--num-haps", + "--hide-extras", is_flag=True, - default=True, + default=False, show_default=True, - help="Whether to also depict the total number of haplotypes in the file", + help="Hide the observed haps that have no causal hap match", ) @click.option( - "--sign", + "--pickle-out", is_flag=True, default=False, show_default=True, - help="Depict the sign (pos vs neg) of the LD via the shape of the points", + help="Save the output as a pickle file as well", +) +@click.option( + "--order", + type=str, + default=None, + show_default="The order of the wildcards in the observed_hap path", + help="The order of the parameters in the plot, as a comma-separated list", ) @click.option( "-o", @@ -323,11 +580,13 @@ def main( observed_hap: Path, causal_hap: Path, metrics: Path, + use_metric: str = None, region: str = None, observed_id: str = None, causal_id: str = None, - num_haps: bool = True, - sign: bool = False, + hide_extras: bool = False, + pickle_out: bool = False, + order: str = None, output: Path = Path("/dev/stdout"), verbosity: str = "ERROR", ): @@ -352,13 +611,24 @@ def main( params = dict(glob_wildcards(observed_hap)._asdict()) # convert the dictionary to a numpy mixed dtype array dtypes = {k: DTYPES[k] for k in params.keys()} + # if the user specified an order, then we will sort the parameters by that order + if order is not None: + new_params = {} + for k in order.split(",")[::-1]: + if k not in dtypes: + log.warning(f"Parameter {k} not found in the observed hap path") + continue + dtypes = {k: dtypes.pop(k), **dtypes} + for k in dtypes: + new_params[k] = params.pop(k) + params = new_params params = np.array(list(zip(*params.values())), dtype=list(dtypes.items())) params.sort() get_hap_fname = lambda hap_path, param_set: Path(str(hap_path).format(**dict(zip(dtypes.keys(), param_set)))) # compute LD between the causal hap and the best observed hap across param vals - ld_vals = np.array([ + ld_vals, ld_extras_idxs, ld_extras_bool = zip(*tuple( get_best_ld( gts, get_hap_fname(observed_hap, params[idx]), @@ -369,7 +639,10 @@ def main( log=log ) for idx in range(len(params)) - ]) + )) + ld_vals = np.array(ld_vals, dtype=object) + ld_extras_idxs = np.array(ld_extras_idxs, dtype=object) + ld_extras_bool = np.array(ld_extras_bool, dtype=object) # extract fine-mapping metrics for the observed hap if metrics is not None: @@ -379,30 +652,74 @@ def main( log=log ) for idx in range(len(params)) - ]) - - hap_counts = None - if num_haps: - hap_counts = np.array([ - get_num_haps(get_hap_fname(observed_hap, params[idx])) - for idx in range(len(params)) - ]) - # if 
they're all 1, then there's no reason to depict it
-        if (hap_counts == 1).all():
-            hap_counts = None
-
-    if len(dtypes) > 1 or metrics is not None:
-        merged = params
-        if metrics is not None:
-            merged = np.lib.recfunctions.merge_arrays([params, metrics], flatten=True)
-        fig = plot_params(merged, ld_vals, "causal LD", hap_counts, sign)
-    elif len(dtypes) == 1:
-        fig = plot_params_simple(params, ld_vals, "causal LD", hap_counts, sign)
+        ], dtype=object)
+        params, ld_vals, ld_sem, ld_extras_bool, metrics_vals, metrics_vals_sem = group_by_rep(
+            params, ld_vals, ld_extras_idxs, ld_extras_bool, metrics,
+        )
+        extract_metrics_vals = lambda x, idx: np.array([i[:, idx] for i in x], dtype=object)
+        metrics = {
+            name: (
+                extract_metrics_vals(metrics_vals, idx),
+                extract_metrics_vals(metrics_vals_sem, idx)
+            ) for idx, name in enumerate(metrics[0].dtype.names)
+        }
     else:
-        raise ValueError("No parameter values found")
+        params, ld_vals, ld_sem, ld_extras_bool = group_by_rep(
+            params, ld_vals, ld_extras_idxs, ld_extras_bool,
+        )
+    del dtypes["rep"]
+
+    if pickle_out:
+        with open(output.with_suffix(".pickle"), "wb") as f:
+            if metrics is not None:
+                pickle.dump([params, ld_vals, ld_sem, ld_extras_bool, metrics], f)
+            else:
+                pickle.dump([params, ld_vals, ld_sem, ld_extras_bool], f)
+
+    if use_metric is None:
+        if len(dtypes) > 1:
+            fig = plot_params(
+                params,
+                ld_vals,
+                ld_sem,
+                "Observed LD",
+                ld_extras_bool,
+                metrics=metrics,
+                hide_extras=hide_extras
+            )
+        elif len(dtypes) == 1:
+            fig = plot_params_simple(
+                params,
+                ld_vals,
+                ld_sem,
+                "Observed LD",
+                ld_extras_bool,
+                hide_extras=hide_extras
+            )
+    else:
+        if len(dtypes) > 1:
+            fig = plot_params(
+                params,
+                metrics[use_metric][0],
+                metrics[use_metric][1],
+                use_metric,
+                ld_extras_bool,
+                hide_extras=hide_extras
+            )
+        elif len(dtypes) == 1:
+            fig = plot_params_simple(
+                params,
+                metrics[use_metric][0],
+                metrics[use_metric][1],
+                use_metric,
+                ld_extras_bool,
+                hide_extras=hide_extras
+            )
     fig.tight_layout()
-    fig.savefig(output)
+    fig.savefig(output,
+        bbox_inches="tight", pad_inches=0.05)


 if __name__ == "__main__":
     main()
diff --git a/analysis/workflow/scripts/summarize_results.R b/analysis/workflow/scripts/summarize_results.R
index b31cdf9..5a92017 100755
--- a/analysis/workflow/scripts/summarize_results.R
+++ b/analysis/workflow/scripts/summarize_results.R
@@ -28,9 +28,10 @@ X = readPGEN(snakemake@input[["gt"]], region=region, samples=samples[,2])
 # also load the positions of each of the variants
 pos = readPVAR(snakemake@input[["gt"]], region=region)
 if (length(snakemake@input[["causal_gt"]])) {
+    write("Loading causal genotype data too", stderr())
     causal_gt_file = snakemake@input[["causal_gt"]]
     causal_gt_samples = readPSAM(causal_gt_file, samples=readPheno(phen)[,1])
-    causal_gt = readPGEN(causal_gt_file, region=region, samples=causal_gt_samples)
+    causal_gt = readPGEN(causal_gt_file, region=region, samples=causal_gt_samples[,2])
     X = cbind(X, causal_gt)
     causal_variant = colnames(causal_gt)[1]
     # also load the positions of each of the variants
@@ -52,43 +53,24 @@ hap = FALSE
 causal_hap_is_provided = FALSE
 if (length(snakemake@input[["happler_hap"]]) > 0) {
     hap = TRUE
-    happler_haplotype = file(snakemake@input[["happler_hap"]], "rt")
+    happler_haplotype = snakemake@input[["happler_hap"]]
 }
 if (length(snakemake@params[["causal_hap"]]) > 0) {
     causal_hap_is_provided = TRUE
-    causal_haplotype = file(snakemake@params[["causal_hap"]], "rt")
+    causal_haplotype = snakemake@params[["causal_hap"]]
 }

 if (hap) {
happler hap file", stderr()) - while(TRUE) { - line = readLines(happler_haplotype, 1) - # we assume the first line in the hap file that begins with H denotes the haplotype - if (grepl("^H\\t", line)) break - } - # extract the start, end, and ID of the haplotype - happler_hap = as.integer(strsplit(line, "\t")[[1]][c(3,4)]) - happler_hap_id = strsplit(line, "\t")[[1]][c(5)] + happler_hap = readHap(happler_haplotype) } if (causal_hap_is_provided) { write("Parsing causal hap file", stderr()) - while(TRUE) { - line = readLines(causal_haplotype, 1) - # we assume the first line in the hap file that begins with H denotes the haplotype - if (grepl("^H\\t", line)) break - } - # extract the start, end, and ID of the haplotype - causal_hap = as.integer(strsplit(line, "\t")[[1]][c(3,4)]) - causal_hap_id = strsplit(line, "\t")[[1]][c(5)] - stopifnot(causal_hap_id == causal_variant) - haplotypes = t(data.frame( - causal_hap = c("start"=causal_hap[1], "end"=causal_hap[2], "id"=causal_hap_id), - happler_hap = c("start"=happler_hap[1], "end"=happler_hap[2], "id"=happler_hap_id) - )) -} else { - haplotypes = t(data.frame( - happler_hap = c("start"=happler_hap[1], "end"=happler_hap[2], "id"=happler_hap_id) - )) + causal_hap = readHap(causal_haplotype) + stopifnot(causal_variant %in% causal_hap["id"]) + haplotypes = cbind(causal_hap, happler_hap) +} else if (hap) { + haplotypes = happler_hap } write("Formatting genotypes and creating output dir", stderr()) @@ -253,7 +235,7 @@ if (hap) { } else { pip_plot(susie_pip, X, b, pos, susie_cs=names(susie_pip[fitted$sets$cs[['L1']]])) } -ggsave(paste0(out,'/susie.pdf'), width=10, height=5, device='pdf') +ggsave(paste0(out,'/susie.pdf'), width=6, height=5, device='pdf') dev.off() if (!is.null(finemap_results)) { @@ -263,6 +245,6 @@ if (!is.null(finemap_results)) { } else { pip_plot(finemap_pip, X, b, pos) } - ggsave(paste0(out,'/finemap.pdf'), width=10, height=5, device='pdf') + ggsave(paste0(out,'/finemap.pdf'), width=6, height=5, device='pdf') while (!is.null(dev.list())) dev.off() } diff --git a/analysis/workflow/scripts/susie_metrics.R b/analysis/workflow/scripts/susie_metrics.R index 6147b76..f3fd402 100755 --- a/analysis/workflow/scripts/susie_metrics.R +++ b/analysis/workflow/scripts/susie_metrics.R @@ -4,7 +4,6 @@ # param1: The path to a RDS file containing the output of SuSiE # param2: The path to a .hap file containing the observed haplotype -# param3: The path to a .hap file containing the causal haplotype # Output will be written in tab-separated format to stdout # To debug this script, add a browser() call somewhere in it. Then run it with its command line args but replace the script name with "R --args". 
@@ -13,25 +12,12 @@
 args = commandArgs(trailingOnly = TRUE)
 susie_res = args[1]
 obs_hap = args[2]
-caus_hap = args[3]

-write("Parsing observed hap file", stderr())
-for (line in readLines(obs_hap)) {
-    # we assume the first line in the hap file that begins with H denotes the haplotype
-    if (grepl("^H\\t", line)) break
-}
-# extract the start, end, and ID of the haplotype
-happler_hap = as.integer(strsplit(line, "\t")[[1]][c(3,4)])
-happler_hap_id = strsplit(line, "\t")[[1]][c(5)]
+# load functions to help read various file types
+source("workflow/scripts/utils.R")

-write("Parsing causal hap file", stderr())
-for (line in readLines(caus_hap)) {
-    # we assume the first line in the hap file that begins with H denotes the haplotype
-    if (grepl("^H\\t", line)) break
-}
-# extract the start, end, and ID of the haplotype
-causal_hap = as.integer(strsplit(line, "\t")[[1]][c(3,4)])
-causal_hap_id = strsplit(line, "\t")[[1]][c(5)]
+write("Parsing observed hap file", stderr())
+happler_hap = readHap(obs_hap)

 write("Parsing SuSiE results", stderr())
 # parse the susie data:
@@ -40,30 +26,40 @@ write("Parsing SuSiE results", stderr())
 susie_res = readRDS(susie_res)
 fitted = susie_res$fitted
 susie_pip = fitted$pip
-happler_hap_idx = which(names(susie_pip) %in% happler_hap_id)
-write("Computing metrics", stderr())
-# The metrics are:
-# 1) What is the PIP of the observed hap?
-# 2) Does the observed hap get the highest PIP?
-# 3) What is the best PIP among the variants?
-# 4) Is the observed hap in a credible set?
-# 5) What is the purity of the credible set?
-obs_pip = susie_pip[happler_hap_id]
-has_highest_pip = as.integer(sum(obs_pip < susie_pip) == 0)
-best_variant_pip = max(susie_pip[names(susie_pip) != names(obs_pip)])
-credible_set = NULL
-for (cs in names(fitted$sets$cs)) {
-    if (happler_hap_idx %in% fitted$sets$cs[cs]) {
-        credible_set = cs
-        break
+for (happler_hap_file_idx in seq_len(nrow(happler_hap))) {

+    happler_hap_id = happler_hap[happler_hap_file_idx, "id"]
+    happler_hap_idx = which(names(susie_pip) %in% happler_hap_id)

+    write(paste("Computing metrics for", happler_hap_id), stderr())
+    # The metrics are:
+    # 1) What is the PIP of the observed hap?
+    # 2) Does the observed hap get the highest PIP?
+    # 3) What is the best PIP among the variants?
+    # 4) Is the observed hap in a credible set?
+    # 5) What is the purity of the credible set?
+    # 6) What is the length of the credible set?
+    obs_pip = susie_pip[happler_hap_id]
+    has_highest_pip = as.integer(sum(obs_pip < susie_pip) == 0)
+    # TODO: fix this so that it also excludes other haplotypes besides obs_pip
+    best_variant_pip = max(susie_pip[names(susie_pip) != names(obs_pip)])
+    credible_set = NULL
+    for (cs in names(fitted$sets$cs)) {
+        if (happler_hap_idx %in% fitted$sets$cs[[cs]]) {
+            credible_set = cs
+            break
+        }
     }
-}
-purity = 0
-if (!is.null(credible_set)) {
-    purity = fitted$sets$purity[cs, "mean.abs.corr"]
-}
-in_credible_set = as.integer(!is.null(credible_set))
+    purity = 0
+    cs_length = 0
+    if (!is.null(credible_set)) {
+        purity = fitted$sets$purity[credible_set, "mean.abs.corr"]
+        cs_length = length(fitted$sets$cs[[credible_set]])
+    }
+    in_credible_set = as.integer(!is.null(credible_set))

-write("Outputting metrics", stderr())
-write(paste(obs_pip, has_highest_pip, best_variant_pip, in_credible_set, purity), stdout())
+    write(paste("Outputting metrics for", happler_hap_id), stderr())
+    write(paste(obs_pip, has_highest_pip, best_variant_pip, in_credible_set, purity, cs_length), stdout())
+
+}
diff --git a/analysis/workflow/scripts/utils.R b/analysis/workflow/scripts/utils.R
index ba54c1f..52c0886 100644
--- a/analysis/workflow/scripts/utils.R
+++ b/analysis/workflow/scripts/utils.R
@@ -97,3 +97,20 @@ save_curr_env = function(env = parent.frame(), filePath = "myenv.RData") {
     save(list = ls(all.names = TRUE, envir = env), file = filePath, envir = env)
 }
+
+readHap = function(hapfile) {
+    # Return a data.frame with the start, end, and ID columns of the haplotype lines
+    # in the .hap file
+    # Parameters:
+    #   hapfile: The path to the .hap file
+    data <- tryCatch(read.table(hapfile, header = FALSE, fill = TRUE, stringsAsFactors = FALSE), error=function(e) data.frame())
+    if (nrow(data) != 0) {
+        # Filter rows that start with "H" and select columns 3 to 5
+        h_data <- data[grep("^H", data$V1), c(3, 4, 5)]
+        # Rename the columns for clarity (optional)
+        colnames(h_data) <- c("start", "end", "id")
+        h_data
+    } else {
+        data
+    }
+}
diff --git a/analysis/workflow/scripts/variance_explained_plot.py b/analysis/workflow/scripts/variance_explained_plot.py
index 451536a..57fbf95 100755
--- a/analysis/workflow/scripts/variance_explained_plot.py
+++ b/analysis/workflow/scripts/variance_explained_plot.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+import pickle
 from pathlib import Path
 from logging import Logger
@@ -300,6 +301,9 @@ def main(

     f, (ax1, ax2) = plt.subplots(1, 2)

+    with open(output.with_suffix(".pickle"), "wb") as picklef:
+        pickle.dump((explained_variances, rsquareds), picklef)
+
     ax1.scatter(explained_variances[:, 1], explained_variances[:, 0])
     ax1.axline([0, 0], [max_ev_val, max_ev_val])
     ax1.set_title("Variance Explained")
diff --git a/dev-env.yml b/dev-env.yml
index 985b69e..3c4bfea 100644
--- a/dev-env.yml
+++ b/dev-env.yml
@@ -3,7 +3,7 @@ channels:
   - bioconda
   - nodefaults
 dependencies:
-  - conda-forge::python=3.8 # the lowest version of python that we formally support
+  - conda-forge::python=3.9 # the lowest version of python that we formally support
   - conda-forge::pip==23.3.2
   - conda-forge::poetry==1.7.1
   - conda-forge::nox==2023.04.22
diff --git a/happler/__main__.py b/happler/__main__.py
index f596afa..5748e97 100644
--- a/happler/__main__.py
+++ b/happler/__main__.py
@@ -103,6 +103,20 @@ def main():
     hidden=True,
     help="Do not check that variants are phased.
Saves time and memory.", ) +@click.option( + "--max-signals", + type=int, + default=1, + show_default=True, + help="The maximum number of expected causal signals", +) +@click.option( + "--max-iterations", + type=int, + default=1, + show_default=True, + help="The max number of times to repeat the tree building", +) @click.option( "-t", "--threshold", @@ -127,6 +141,13 @@ def main(): show_default=True, help="The LD threshold used to prune leaf nodes based on LD with their siblings", ) +@click.option( + "--out-thresh", + type=float, + default=5e-8, + show_default=True, + help="Threshold used to determine whether to output haplotypes (single-SNP)", +) @click.option( "--show-tree", is_flag=True, @@ -150,6 +171,13 @@ def main(): "no correction (n)" ), ) +@click.option( + "--remove-SNPs", + is_flag=True, + show_default=True, + default=False, + help="Remove haplotypes with only a single variant", +) @click.option( "-o", "--output", @@ -179,12 +207,16 @@ def run( chunk_size: int = None, maf: float = None, phased: bool = False, + max_signals: int = 1, + max_iterations: int = 1, threshold: float = 0.05, + out_thresh: float = 5e-8, indep_thresh: float = 0.1, ld_prune_thresh: float = None, show_tree: bool = False, covars: Path = None, corrector: str = "n", + remove_snps: bool = False, output: Path = Path("/dev/stdout"), verbosity: str = "INFO", ): @@ -199,6 +231,7 @@ def run( Ex: happler run tests/data/simple.vcf tests/data/simple.tsv > simple.hap """ from . import tree + import numpy as np from haptools import data from haptools import logging @@ -264,10 +297,6 @@ def run( log.info("There are {} samples and {} variants".format(*gt.data.shape)) # subset to just one phenotype - if len(ph.names) > 1: - log.warning("Ignoring all but the first trait in the phenotypes file") - ph.names = ph.names[:1] - ph.data = ph.data[:, :1] # also reorder and subset samples in Phenotypes to match those in the Genotypes ph.subset(samples=tuple(gt.samples), names=(pheno,), inplace=True) @@ -316,9 +345,45 @@ def run( indep_thresh=indep_thresh, ld_prune_thresh=ld_prune_thresh, log=log, - ).run() + ) + forest = tree.ForestBuilder( + hap_tree, + num_bins=max_signals, + max_iterations=max_iterations, + log=log, + ) log.info("Outputting haplotypes") - tree.Haplotypes.from_tree(fname=output, tree=hap_tree, gts=gt, log=log).write() + haplotypes = data.Haplotypes( + fname=output, + haplotype=tree.haplotypes.HapplerHaplotype, + variant=tree.haplotypes.HapplerVariant, + log=log, + ) + haplotypes.data = {} + hap_id = 0 + # merge the Haplotypes objects + # TODO: use a method of the Haplotypes class + for haps in forest.run(): + if haps is None: + continue + if len(haps.data) == 1: + hap = next(iter(haps.data.values())) + if hap.pval < -np.log10(out_thresh): + log.info(f"Ignoring haplotype with low pval {hap.pval}") + continue + for hap in haps.data.values(): + if len(hap.variants) <= 1 and remove_snps: + continue + hap.id = f"H{hap_id}" + haplotypes.data[hap.id] = hap + hap_id += 1 + if haplotypes.data == {}: + log.critical("No haplotypes were found") + with open(output, "w") as hap_file: + pass + else: + haplotypes.index() + haplotypes.write() if show_tree: dot_output = output if Path(output) != Path("/dev/stdout"): @@ -327,7 +392,7 @@ def run( dot_output = dot_output.with_suffix(".dot") log.info("Writing tree to dot file") with open(dot_output, "w") as dot_file: - dot_file.write(hap_tree.dot()) + dot_file.write(forest.dot(remove_singletons=remove_snps)) @main.command(context_settings=CONTEXT_SETTINGS) diff --git 
a/happler/tree/__init__.py b/happler/tree/__init__.py index e325bab..2bc2dc2 100644 --- a/happler/tree/__init__.py +++ b/happler/tree/__init__.py @@ -1,5 +1,6 @@ from .tree import Tree from .tree_builder import TreeBuilder +from .forest_builder import ForestBuilder from .variant import VariantType, Variant from .haplotypes import Haplotype, Haplotypes from .corrector import Corrector, BH, Bonferroni, BHSM diff --git a/happler/tree/assoc_test.py b/happler/tree/assoc_test.py index f1e6f95..32a7324 100644 --- a/happler/tree/assoc_test.py +++ b/happler/tree/assoc_test.py @@ -7,7 +7,6 @@ from scipy import stats import numpy.typing as npt import statsmodels.api as sm -from scipy.stats import t as t_dist # We declare this class to be a dataclass to automatically define __init__ and a few @@ -146,7 +145,7 @@ def pval_as_decimal(t_stat: float, df: int, precision: int = 1000) -> Decimal: ------ ValueError This function is only valid when the t distribution is approximately - equivalent to the normal distribution. Thus, if df < 10000, we raise a + equivalent to the normal distribution. Thus, if df < 1000, we raise a ValueError to indicate that the function will not return an accurate value Returns @@ -154,8 +153,8 @@ def pval_as_decimal(t_stat: float, df: int, precision: int = 1000) -> Decimal: Decimal An approximate, higher precision p-value for the provided t statistic """ - if df < 10000: - raise ValueError("You need a larger sample size to approximate this p-value") + if df < 1000: + log.warning("You need a larger sample size to approximate this p-value") log10_pval = stats.norm.logsf(np.abs(t_stat)) / np.log(10) + np.log10(2) # set the desired precision getcontext().prec = precision @@ -326,7 +325,9 @@ def perform_test( # handle cases where our p-values are too powerful if pval == 0: # retrieve the pval at a higher precision - pval = AssocTestSimpleSM.pval_as_decimal(res.tvalues[-1], res.df_resid, precision=10) + pval = AssocTestSimpleSM.pval_as_decimal( + res.tvalues[-1], res.df_resid, precision=10 + ) if self.with_bic: return param, pval, stderr, bic else: diff --git a/happler/tree/forest_builder.py b/happler/tree/forest_builder.py new file mode 100644 index 0000000..d588d6a --- /dev/null +++ b/happler/tree/forest_builder.py @@ -0,0 +1,165 @@ +from __future__ import annotations +import re +from logging import Logger + +import numpy as np +import numpy.typing as npt +import statsmodels.api as sm +from haptools.logging import getLogger +from haptools.data import Genotypes, Phenotypes + +from .tree import Tree +from .haplotypes import Haplotypes +from .tree_builder import TreeBuilder + + +class ForestBuilder: + """ + Creates a ForestBuilder object from a provided TreeBuilder + + Attributes + ---------- + tree_builder: TreeBuilder + A TreeBuilder instance that we can run repeatedly + log: Logger + A logging instance for recording debug statements. 
+ """ + + def __init__( + self, + tree_builder: TreeBuilder, + num_bins: int, + max_iterations: int = None, + log: Logger = None, + ): + self.tree_builder = tree_builder + self.max_iterations = max_iterations + self.trees = [None] * num_bins + self.haplotypes = [None] * num_bins + self.genotypes = self.tree_builder.gens + self.phenotypes = self.tree_builder.phens + self.log = log or getLogger(self.__class__.__name__) + + def run(self): + """ + Build a bunch of trees + """ + iterations = self.max_iterations + tree_idx = 0 + while iterations: + self.log.debug( + f"Iteration {self.max_iterations - iterations}: tree {tree_idx}" + ) + self.tree_builder.phens = self.get_residuals(tree_idx) + self.tree_builder.run() + self.trees[tree_idx] = self.tree_builder.tree + self.haplotypes[tree_idx] = Haplotypes.from_tree( + None, + self.trees[tree_idx], + self.genotypes, + self.log, + ) + # increment tree_idx until it reaches the end + tree_idx += 1 + if tree_idx >= len(self.trees): + # TODO: implement a test to check whether to stop iterating + if iterations is not None: + iterations -= 1 + tree_idx = 0 + return self.haplotypes + + def get_residuals(self, tree_idx: int) -> Phenotypes: + """ + Retrieve the current phenotype values, computed after regressing out the + haplotype effects from each tree except the one indicated by tree_idx + + Parameters + ---------- + tree_idx: int + The index of the tree whose haplotypes to exclude from consideration + + Returns + ------- + Phenotypes + The residuals after regressing out the effects of the haplotypes from all + other trees + """ + # first: count the number of effects amongst all haplotypes + total_num_effects = sum( + len(hps.data) if hps is not None else 0 + for hps_idx, hps in enumerate(self.haplotypes) + if hps_idx != tree_idx + ) + # base case: if there are no effects to regress out + if total_num_effects == 0: + return self.phenotypes + num_samps = len(self.phenotypes.samples) + effects = np.empty( + (num_samps, total_num_effects), dtype=self.phenotypes.data.dtype + ) + effect_arr_idx = 0 + # iterate through every tree's Haplotypes obj and transform the haps into effects + self.log.debug( + f"Extracting {total_num_effects} effects from {total_num_effects} total effects" + ) + for hap_idx, hap in enumerate(self.haplotypes): + if hap is None or hap_idx == tree_idx: + continue + new_idx = effect_arr_idx + len(hap.data) + effects[:, effect_arr_idx:new_idx] = ( + hap.transform(self.genotypes).data[:, :, :2].sum(axis=2) + ) + effect_arr_idx = new_idx + resids = Phenotypes(fname=None, log=self.phenotypes.log) + resids.samples = self.phenotypes.samples + # get residuals with effects as covariates + resids.names = self.phenotypes.names + self.log.debug(f"Computing residuals for tree {tree_idx}") + resids.data = ( + sm.OLS(self.phenotypes.data, sm.add_constant(effects)) + .fit() + .resid[:, np.newaxis] + ) + return resids + + def __repr__(self): + return self.dot() + + def dot(self, remove_singletons: bool = False) -> str: + """ + Convert the trees to a representation in the dot language + This is useful for quickly viewing the forest on the command line + + Returns + ------- + str + A string representing the trees + + Nodes are labeled by their variant ID and edges are labeled by their allele + remove_singletons: bool + Whether to ignore trees composed of only a single SNP + """ + node_pattern = r"(?)" + max_node_id = 0 + trees_str = "strict digraph {\nforcelabels=true;\nrankdir=TB;\n" + for idx, tree in enumerate(self.trees): + if tree is None: + continue + 
tree_str = tree.dot().split("\n")[2:] + if remove_singletons and len(tree_str) <= 5: + continue + trees_str += f"subgraph tree_{idx}" + ' {\nlabel="' + f"Tree {idx}" + '";\n' + tree_str = "\n".join(tree_str) + # increment the node IDs to ensure they remain unique across all trees + increment = lambda match: str(int(match.group(1)) + max_node_id) + trees_str += re.sub(node_pattern, increment, tree_str) + # what was the greatest node ID in this tree? + # increment the next tree's IDs by that number + 1 if there were any nodes + try: + max_node_id = ( + max(map(int, re.findall(node_pattern, tree_str))) + max_node_id + 1 + ) + except ValueError: + pass + trees_str += "}" + return trees_str diff --git a/happler/tree/terminator.py b/happler/tree/terminator.py index 13daee3..d696450 100644 --- a/happler/tree/terminator.py +++ b/happler/tree/terminator.py @@ -132,7 +132,7 @@ def compute_val( # terminate if the effect sizes have gone in the opposite direction self.log.debug("Terminated b/c effect size did not improve") return True - # perform a two tailed, two-sample t-test using the difference of the effect sizes + # perform a one tailed, two-sample t-test using the difference of the effect sizes # first, we compute the standard error of the difference of the effect sizes std_err = np.sqrt( ((results.data["stderr"] ** 2) + (parent_res.stderr**2)) / 2 @@ -178,7 +178,12 @@ def check( num_tests: int, ) -> bool: computed_val = self.compute_val( - parent_res, node_res, results, best_idx, num_samps, num_tests, + parent_res, + node_res, + results, + best_idx, + num_samps, + num_tests, ) if isinstance(computed_val, bool): return computed_val @@ -256,7 +261,12 @@ def check( num_tests: int, ): computed_val = self.compute_val( - parent_res, node_res, results, best_idx, num_samps, num_tests, + parent_res, + node_res, + results, + best_idx, + num_samps, + num_tests, ) if isinstance(computed_val, bool): return computed_val diff --git a/happler/tree/tree_builder.py b/happler/tree/tree_builder.py index c060ed8..2db22e2 100644 --- a/happler/tree/tree_builder.py +++ b/happler/tree/tree_builder.py @@ -89,10 +89,6 @@ def run(self): """ Run the tree builder and create a tree rooted at the provided variant """ - if self.tree is not None: - raise AssertionError( - "A tree already exists for this TreeBuilder. Please create a new one." 
- ) # step one: initialize the tree self.tree = Tree(log=self.log) # step two: create a haplotype @@ -478,7 +474,7 @@ def _find_split_rigid( self.log.debug( f"The haplotype had a pval of {hap_indep_effect} < {self.indep_thresh}" " in an additive model with the allele and parent" - ) + ) best_allele_idx = best_res_idx[best_allele] best_results = results[allele].data[best_allele_idx] node_res = self.results_type.from_np(best_results) diff --git a/noxfile.py b/noxfile.py index a1772ad..f127938 100644 --- a/noxfile.py +++ b/noxfile.py @@ -10,8 +10,8 @@ package = "happler" -python_versions = ["3.8", "3.9", "3.10", "3.11", "3.12"] -locked_python_version = "3.8" +python_versions = ["3.9", "3.10", "3.11", "3.12", "3.13"] +locked_python_version = "3.9" nox.needs_version = ">= 2022.11.21" nox.options.sessions = ( "docs", @@ -46,7 +46,7 @@ def install_handle_python_numpy(session): handle incompatibilities with python and numpy versions see https://github.com/cjolowicz/nox-poetry/issues/1116 """ - if session._session.python in ["3.11", "3.12"]: + if session._session.python in ["3.11", "3.12", "3.13"]: session._session.install(".") else: session.install(".") diff --git a/poetry.lock b/poetry.lock index 9148d8f..c554e7f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -51,9 +51,6 @@ files = [ {file = "babel-2.15.0.tar.gz", hash = "sha256:8daf0e265d05768bc6c7a314cf1321e9a123afc328cc635c18622a2f30a04413"}, ] -[package.dependencies] -pytz = {version = ">=2015.7", markers = "python_version < \"3.9\""} - [package.extras] dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] @@ -673,14 +670,12 @@ woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] [[package]] name = "haptools" -version = "0.4.0" +version = "0.4.2" description = "Ancestry and haplotype aware simulation of genotypes and phenotypes for complex trait analysis" optional = false python-versions = ">=3.7,<4.0" -files = [ - {file = "haptools-0.4.0-py3-none-any.whl", hash = "sha256:73fa3c4b6f6119f71e59b84de226c34808646075f61a7fa033d84f434ec59738"}, - {file = "haptools-0.4.0.tar.gz", hash = "sha256:85dba0bde58e33ac71c298eb37e1272ca83276d06714d6516f8bc722c3455fe0"}, -] +files = [] +develop = false [package.dependencies] click = ">=8.0.3" @@ -690,6 +685,12 @@ numpy = ">=1.20.0" Pgenlib = ">=0.90.1" pysam = ">=0.19.0" +[package.source] +type = "git" +url = "https://github.com/cast-genomics/haptools.git" +reference = "f0ba9a3a464725fe66b9bbf31726af886c516374" +resolved_reference = "f0ba9a3a464725fe66b9bbf31726af886c516374" + [[package]] name = "humanfriendly" version = "10.0" @@ -2126,5 +2127,5 @@ test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-it [metadata] lock-version = "2.0" -python-versions = ">=3.8,<4.0" -content-hash = "1bb0c82732e8ba5ac5ac5ab616189cede6a4136e3652496196d6ba629fbf1172" +python-versions = ">=3.9,<4.0" +content-hash = "a7bd4a4e3824ae27cb1cee5bf0599c747842d5c29e81c05e22be81ea45b71490" diff --git a/pyproject.toml b/pyproject.toml index 31b780b..54f9a17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,13 +19,14 @@ classifiers = [ ] [tool.poetry.dependencies] -python = ">=3.8,<4.0" +python = ">=3.9,<4.0" click = ">=8.0.4" networkx = ">=2.6.3" scipy = ">=1.7.3" pydot = ">=1.4.2" pysam = ">=0.19.0" -haptools = ">=0.4.0" +haptools = {git = "https://github.com/cast-genomics/haptools.git", rev = "f0ba9a3a464725fe66b9bbf31726af886c516374"} # switch to 0.5.0 once released +# haptools = ">=0.5.0" statsmodels = ">=0.13.4" [tool.poetry.group.docs.dependencies] diff --git 
a/recipe/meta.yaml b/recipe/meta.yaml deleted file mode 100644 index c7b0d4d..0000000 --- a/recipe/meta.yaml +++ /dev/null @@ -1,34 +0,0 @@ -package: - name: happler - version: v0.0.0 - -source: - path: '..' - -build: - number: 0 - script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed --no-cache-dir -vvv" - entry_points: - - happler = happler.__main__:main - noarch: python - -requirements: - host: - - pip >=19.0.3 - - python >=3.7 - - poetry-core >=1.0.0 - run: - - python >=3.7 - - statsmodels >=0.13.2 - - click >=8.0.4 - - networkx >=2.6.3 - - scipy >=1.7.3 - - pydot >=1.4.2 - -about: - home: https://github.com/gymrek-lab/happler - license: MIT - summary: 'A haplotype-based fine-mapping method' - dev_url: https://github.com/gymrek-lab/happler - doc_url: https://happler.readthedocs.io/ - doc_source_url: https://github.com/shibukawa/imagesize_py/blob/master/README.rst diff --git a/tests/data/19_45401409-46401409_1000G.multi.hap b/tests/data/19_45401409-46401409_1000G.multi.hap new file mode 100644 index 0000000..6fc8474 --- /dev/null +++ b/tests/data/19_45401409-46401409_1000G.multi.hap @@ -0,0 +1,12 @@ +# orderH beta pval +# orderV score +# version 0.2.0 +#H beta .2f Effect size in linear model +#H pval .2f -log(pval) in linear model +#V score .2f -log(pval) assigned to this variant +H 19 45410444 46310808 H0 0.36 30.13 +H 19 45892145 45910673 H1 0.36 30.24 +V H0 46310807 46310808 rs12984785 C 9.52 +V H0 45410444 45410445 rs769450 A 30.13 +V H1 45892145 45892146 rs36046716 G 6.30 +V H1 45910672 45910673 rs1046282 G 30.24 diff --git a/tests/data/19_45401409-46401409_1000G.multi.pheno b/tests/data/19_45401409-46401409_1000G.multi.pheno new file mode 100644 index 0000000..93d2808 --- /dev/null +++ b/tests/data/19_45401409-46401409_1000G.multi.pheno @@ -0,0 +1,777 @@ +#IID H0-H1 +HG00096 1.3017394225909524 +HG00097 -0.27455070497718187 +HG00099 0.9223961552249307 +HG00100 -0.4444101713652624 +HG00101 0.8260910716500838 +HG00102 -2.192160408889623 +HG00103 1.937411234430382 +HG00105 -0.6766475045485754 +HG00106 1.2938113228688546 +HG00107 -0.7601234658999931 +HG00108 1.3340426734082755 +HG00109 -0.3702278392333091 +HG00110 -0.657796336291472 +HG00111 0.7163642258408758 +HG00112 -0.5119745970171313 +HG00113 -0.7490020951497627 +HG00114 1.0517322599183143 +HG00115 0.03970553035468305 +HG00116 0.9382503170083054 +HG00117 -0.48446533442500284 +HG00118 0.1605062768449892 +HG00119 0.6650392570450632 +HG00120 -1.2239557284874587 +HG00121 -0.28155916154351257 +HG00122 -0.694600223401998 +HG00123 0.016002165837225357 +HG00125 0.6610124959978709 +HG00126 -1.8824897934414873 +HG00127 -0.0022863086650215525 +HG00128 -0.07817244477032392 +HG00129 -0.060144088248919036 +HG00130 0.20815469441867915 +HG00131 -0.4434759299302991 +HG00132 -0.3862605139969777 +HG00133 -0.01584928114183487 +HG00136 -0.5947118164891941 +HG00137 0.6007178043843452 +HG00138 1.341960465226173 +HG00139 -1.732650802300685 +HG00140 -0.06574912378085612 +HG00141 -1.1173921209561168 +HG00142 -0.8313815665565609 +HG00143 1.4673031969056392 +HG00145 -0.21654238430913353 +HG00146 0.8559756623313552 +HG00148 -0.6547746894235088 +HG00149 0.6512205590579918 +HG00150 0.1771917767565636 +HG00151 0.2835877343172233 +HG00154 -1.3223836301651573 +HG00155 -0.4867863486518235 +HG00157 -0.311207123560055 +HG00158 -0.7591601736064473 +HG00159 -0.46909876113819504 +HG00160 -0.45228839698679396 +HG00171 0.05534744773817596 +HG00173 -0.05728526482670221 +HG00174 0.11912007184337481 +HG00176 -0.5635001943233273 +HG00177 0.1927763795549795 
+HG00178 1.5693430077698323 +HG00179 0.8442480505447929 +HG00180 0.5563159735566817 +HG00181 0.25739949585716204 +HG00182 0.20070166807808176 +HG00183 0.6315534221299418 +HG00185 0.2532479084681132 +HG00186 -1.538315230397934 +HG00187 0.4205562997540997 +HG00188 -0.3671931147770918 +HG00189 -0.682298236222033 +HG00190 1.1334295789662376 +HG00231 -1.6518714633900213 +HG00232 -0.5946064662889645 +HG00233 1.456274458550981 +HG00234 -0.673410561447305 +HG00235 0.7324998961620618 +HG00236 -0.7476782340145596 +HG00237 1.3742564614367883 +HG00238 -0.42666365031731934 +HG00239 0.8359563911296215 +HG00240 -0.8395670312438419 +HG00242 -1.926006404324073 +HG00243 0.8588343808065176 +HG00244 -1.0365863223317784 +HG00245 0.1843317854385289 +HG00246 0.732871711072655 +HG00250 -0.7025024579392856 +HG00251 0.2084041227613298 +HG00252 -1.4545955830939332 +HG00253 2.945009837767178 +HG00254 0.503912008880156 +HG00255 -1.3334829520303777 +HG00256 -0.5480397928115349 +HG00257 -0.938556923042386 +HG00258 -1.9106942563377778 +HG00259 1.6559216835838548 +HG00260 0.07826046127315978 +HG00261 0.09915001009411378 +HG00262 -0.03674349382971431 +HG00263 0.13961998399334552 +HG00264 -0.7313470923718625 +HG00265 -0.4638843982228111 +HG00266 1.883097199158406 +HG00267 0.6482813187326648 +HG00268 0.19621125364483505 +HG00269 0.23454471446401665 +HG00271 0.3856839957159506 +HG00272 -0.7613980713107256 +HG00273 0.5847564149458971 +HG00274 -2.200340761207475 +HG00275 0.9399114427840759 +HG00276 0.6208580988865791 +HG00277 1.9921086351108939 +HG00278 -0.7452347633051035 +HG00280 -0.04146856072515981 +HG00281 0.039852090431971354 +HG00282 -1.0473708422676937 +HG00284 1.210779477111332 +HG00285 -0.9212433227842408 +HG00288 -0.45267676876761587 +HG00290 0.21879672908691095 +HG00304 -1.4019032397917557 +HG00306 1.6882682875304393 +HG00308 -0.40454620804361807 +HG00309 0.14589913994153003 +HG00310 -0.8603828426604014 +HG00311 0.7859249725806237 +HG00313 -1.423515824608892 +HG00315 -0.2390376441256331 +HG00318 -0.23442109584659665 +HG00319 0.10717344738698581 +HG00320 -1.1879282057763563 +HG00321 2.010856476651868 +HG00323 1.9536925186312748 +HG00324 1.5630788196071053 +HG00325 -0.6746593314170943 +HG00326 0.645697774512938 +HG00327 0.27531983513078506 +HG00328 -0.3240679844184028 +HG00329 1.0858071408796113 +HG00330 -0.1380942934078231 +HG00331 -0.6617302418606026 +HG00332 -0.8196133472385092 +HG00334 1.8964153527428376 +HG00335 1.0979767394742055 +HG00336 0.45942599485848723 +HG00337 -0.29927846123948887 +HG00338 1.6589405080120647 +HG00339 1.0244855637755055 +HG00341 -0.1106888927519688 +HG00342 -1.6805177057993594 +HG00343 0.9484579844835173 +HG00344 -1.0120010079512851 +HG00345 -1.3587334871064143 +HG00346 -1.4503482183381182 +HG00349 -0.5961763016136395 +HG00350 -1.2248086186506415 +HG00351 0.14578426476484357 +HG00353 0.07659525915044929 +HG00355 1.112412479404136 +HG00356 -0.8814785938735148 +HG00357 -0.9926854251137129 +HG00358 -1.0394944051752386 +HG00360 0.2171639569494055 +HG00361 -0.8036340494724711 +HG00362 0.10075077066547061 +HG00364 0.4835962293815953 +HG00365 0.19382680907013505 +HG00366 0.886385556034768 +HG00367 -0.3880716375161831 +HG00368 0.47239284200124787 +HG00369 -0.7839477950837497 +HG00371 1.0910733414788725 +HG00372 0.8663702947127541 +HG00373 0.19311323966514193 +HG00375 1.0061648184902738 +HG00376 0.9748409248884979 +HG00378 -0.6743931986307602 +HG00379 -0.6408435039796199 +HG00380 1.7644488312594047 +HG00381 0.6343595917549711 +HG00382 1.5614824358020498 +HG00383 -0.010665982476151414 +HG00384 
0.5756403538114765 +HG00403 -0.7644713441588533 +HG00404 -1.6355736051237253 +HG00406 0.034088299656111654 +HG00407 -0.06974035486253505 +HG00409 -0.3266224287338094 +HG00410 1.285133304190869 +HG00419 -0.17747468981685946 +HG00421 -0.5216809527310943 +HG00422 1.14580062711893 +HG00428 0.47079269261721013 +HG00436 -1.1991826901148706 +HG00437 1.0922802894213188 +HG00442 1.9524696339807277 +HG00443 -1.276527509624545 +HG00445 0.54720269777494 +HG00446 1.1167374950321065 +HG00448 -0.9943615939542239 +HG00449 0.3015798433428688 +HG00451 0.4581203738623839 +HG00452 -0.3992703612852424 +HG00457 0.010823749383855108 +HG00458 -0.8098101517066841 +HG00463 -0.2448101536344522 +HG00464 -0.8733383667141027 +HG00472 0.7939345553658519 +HG00473 -1.0655863577921791 +HG00475 -0.7222165148083026 +HG00476 -0.9624957925891318 +HG00478 -0.0013715478358110045 +HG00479 -2.40327667661413 +HG00500 0.022552083785861265 +HG00513 -0.0032847044070901665 +HG00524 1.3696957342330283 +HG00525 0.09996072091096792 +HG00530 -0.9253087837168033 +HG00531 0.05387819253735748 +HG00533 0.7185769926770722 +HG00534 0.782101367150142 +HG00536 1.3953406773510117 +HG00537 1.307723084377591 +HG00542 -0.24537359254826419 +HG00543 -1.1289364983902215 +HG00551 -0.3586001725428226 +HG00553 -0.40832412475470525 +HG00554 3.675822203483924 +HG00556 -0.024706080231385097 +HG00557 1.4971197939347933 +HG00559 -0.04398445083603797 +HG00560 -0.16407710687711266 +HG00565 -0.7682990906804746 +HG00566 0.4678824762747546 +HG00580 -0.6669300269342622 +HG00581 -0.6700846349247087 +HG00583 0.5326664895129668 +HG00584 -0.2779053234796991 +HG00589 0.7781462516317879 +HG00590 -2.2572436631832327 +HG00592 -0.6433114427345645 +HG00593 -0.8295006793537826 +HG00595 0.0333143832658932 +HG00596 -0.9998587655976234 +HG00598 1.2013710076031352 +HG00599 0.5780529636331116 +HG00607 0.19702922956298496 +HG00608 -2.3574654810466606 +HG00610 0.42108704704301125 +HG00611 0.20412384859956612 +HG00613 1.2923109821657752 +HG00614 0.4287672171393162 +HG00619 -1.6119721910689218 +HG00620 -0.4329263648859035 +HG00622 0.5107996725772186 +HG00623 1.4808488027236744 +HG00625 -0.7980477107711863 +HG00626 1.8805724032304343 +HG00628 -0.07851834224053256 +HG00629 0.6951191348864068 +HG00631 0.5565444486212561 +HG00632 0.35114830609902814 +HG00634 -0.4048559861541365 +HG00637 -1.0590360908856513 +HG00638 0.4012517489601062 +HG00640 1.9041051366764545 +HG00641 -0.11858765354391748 +HG00650 -0.2720992279196343 +HG00651 -1.1782306581396977 +HG00653 1.666163243028577 +HG00654 0.5642021449803208 +HG00656 0.3597572204389563 +HG00657 -0.22112341590961582 +HG00662 -0.042381968527955505 +HG00663 -0.3884744651759044 +HG00671 -1.0650830457336684 +HG00672 -0.10252285397338956 +HG00674 2.585256684460668 +HG00675 -0.8129770930286141 +HG00683 0.2444522922101258 +HG00684 -0.43755239020277786 +HG00689 0.8245288974242369 +HG00690 0.5723664131283869 +HG00692 -0.3255448137258444 +HG00693 0.653983451319051 +HG00698 0.46514845909471514 +HG00699 -0.22295696340365445 +HG00701 0.5971173027939229 +HG00704 0.5093580294077722 +HG00705 -0.8307086637095085 +HG00707 0.5822457415535465 +HG00708 1.0270007475453435 +HG00717 0.05359074401090719 +HG00728 -0.3114946729690229 +HG00729 0.6751192883172805 +HG00731 -0.5150034836230196 +HG00732 -0.12798522904888415 +HG00734 1.013546450246632 +HG00736 0.6869753353027072 +HG00737 -0.43850863654157746 +HG00739 -0.12255652316791327 +HG00740 -0.527991999123023 +HG00742 -1.0608342352170212 +HG00743 0.5488493979377136 +HG00759 1.6224631515376773 +HG00766 0.7129815223518607 
+HG00844 1.6689149744084937 +HG00851 3.4439936596775818 +HG00864 0.4702543572838877 +HG00867 -0.36112178875507417 +HG00879 -0.8401056803849913 +HG00881 0.479824583361269 +HG00956 -0.9284945721170539 +HG00978 -0.5651294761397843 +HG00982 -0.2462626268569882 +HG01028 1.3308648016408742 +HG01029 0.5024251758196058 +HG01031 -0.5355374532246513 +HG01046 -0.09355501647908948 +HG01047 -0.24342541106565116 +HG01048 -1.4135462455456238 +HG01049 0.3184435881775337 +HG01051 0.08654867357833768 +HG01052 -0.9700670489633084 +HG01054 1.1559454140871412 +HG01055 0.954942760323882 +HG01058 -0.7945090781828414 +HG01060 -0.03160599031617728 +HG01061 1.1863047071089463 +HG01063 1.5466296361248897 +HG01064 2.3499793828226445 +HG01066 0.41661528341296594 +HG01067 -0.6671381843672244 +HG01069 -0.5700338006896557 +HG01070 -0.5356882437925449 +HG01072 1.2553552346346728 +HG01073 -0.08577863872989477 +HG01075 -0.5132933981410371 +HG01077 -0.20636577600968187 +HG01079 -0.2359923693120397 +HG01080 1.3485351237334335 +HG01082 -0.08055547871995244 +HG01083 -0.41481781990904054 +HG01085 0.6846503975446758 +HG01086 -0.5107365703121772 +HG01088 0.20198090541546027 +HG01089 0.8904506995124579 +HG01092 1.0944270476855125 +HG01094 -0.21981626608057178 +HG01095 -0.42575883800617836 +HG01097 -0.5421369768663082 +HG01098 1.4859840653728797 +HG01101 0.1594650524518848 +HG01102 -0.277404564506499 +HG01104 -0.3045459641613543 +HG01105 -0.9325085789807781 +HG01107 0.6424914557878557 +HG01108 1.3163034090697114 +HG01110 0.7228953081839569 +HG01111 1.446641348653603 +HG01112 2.758413479417553 +HG01113 0.6077301224018714 +HG01119 -1.0529642439135634 +HG01121 0.07800350107491305 +HG01122 -0.41451465734361315 +HG01124 0.7607002016290159 +HG01125 0.11432801847372626 +HG01130 -0.7547045432285291 +HG01131 -0.36652083695629256 +HG01133 -0.26931656477459875 +HG01134 -1.0605782936561656 +HG01136 -1.035459208762127 +HG01137 1.4085149809557134 +HG01139 1.5342969923703986 +HG01140 0.847378452146835 +HG01142 1.1869804368689174 +HG01148 -2.975475585672566 +HG01149 -1.4491824340278092 +HG01161 -0.27099060632372735 +HG01162 0.5608862674548538 +HG01164 -0.9619239427802997 +HG01167 0.03698832406871888 +HG01168 -0.27540547256201225 +HG01170 2.0834892540141996 +HG01171 1.5374771041833983 +HG01173 1.1906553399713435 +HG01174 2.0146504504029226 +HG01176 -0.3447339318710625 +HG01177 -0.8604036385814623 +HG01182 0.14568185748744056 +HG01183 0.6662993491207352 +HG01187 -1.6974530748263668 +HG01188 -0.4205852001293092 +HG01190 0.6589674340269704 +HG01191 0.5732249874049651 +HG01197 0.10351652834159647 +HG01198 -0.10986771760944436 +HG01200 -0.2660634544870619 +HG01204 -0.464694630514974 +HG01205 -0.649340330132664 +HG01241 0.7377542115358544 +HG01242 1.7102390787723925 +HG01247 -0.04887599452558372 +HG01248 0.6869072846148201 +HG01250 1.6122307271180065 +HG01251 -0.006269529786685435 +HG01253 -0.1535991560141787 +HG01254 0.06486385324861255 +HG01256 0.1624736133399305 +HG01257 0.12720126896016382 +HG01259 -0.5611041978979303 +HG01260 0.8452007910129891 +HG01269 0.5765633665009684 +HG01271 1.4559524762179028 +HG01272 0.11539500759997834 +HG01275 0.24790776340921294 +HG01277 -0.043472289345663884 +HG01280 -0.4625647482063142 +HG01281 -0.018876707413124416 +HG01284 -0.6796253368687399 +HG01286 -0.3044217529345804 +HG01302 -2.0884863350227922 +HG01303 -0.1895879405963826 +HG01305 -0.3639634579361043 +HG01308 -0.28096127211380345 +HG01311 -1.067006406153747 +HG01312 -0.9598078660623194 +HG01323 0.2975749948154106 +HG01325 1.2012638264667665 +HG01326 
-0.8243348201288716 +HG01334 -0.1598622584409964 +HG01341 -0.19555366908325295 +HG01342 0.4205481991288811 +HG01344 -1.1857084242467257 +HG01345 0.18791668758807312 +HG01348 -0.24858056282419405 +HG01350 1.0647884665460232 +HG01351 0.3821792758340964 +HG01353 -0.7577895465340109 +HG01354 1.4973311894872399 +HG01356 0.08993696498914311 +HG01357 -0.07137276654248531 +HG01359 -0.24485034819562637 +HG01360 -0.19399268806109768 +HG01362 -0.3421934499181302 +HG01363 0.013594478789405562 +HG01365 -1.1735883598885404 +HG01366 -0.40882640279430355 +HG01369 -0.6980593857929052 +HG01372 -0.14920281916494887 +HG01374 -0.22564984724333909 +HG01375 -1.9057224481945723 +HG01377 -0.3349897961833549 +HG01378 -0.6893113802602657 +HG01383 0.06399427332450408 +HG01384 2.0879822732079005 +HG01389 0.2905065963973552 +HG01390 -1.8161393350145354 +HG01392 0.21471482465500513 +HG01393 -0.30892617532578204 +HG01395 -2.3077217301646984 +HG01396 0.6514749358352425 +HG01398 0.4574977350848046 +HG01402 -0.45921566369595745 +HG01403 -1.1713628906570077 +HG01405 -1.1567176774100134 +HG01412 -0.21502671123675055 +HG01413 0.9924112201862092 +HG01414 -1.1139932031627564 +HG01431 0.061196914990173656 +HG01432 -0.6163207046627842 +HG01435 -0.6807933307860816 +HG01437 1.7868857357670271 +HG01438 0.9800844685974784 +HG01440 -1.0449999061114703 +HG01441 0.024951109934860716 +HG01443 0.6300148056020438 +HG01444 -0.21867101194856767 +HG01447 0.5770469936526864 +HG01455 1.0428562054430608 +HG01456 0.2710026628017298 +HG01459 0.34076449871031944 +HG01461 -0.6359349294097668 +HG01462 -1.6552750583804026 +HG01464 -0.0695625090644314 +HG01465 2.0255787662136453 +HG01468 0.26621377307968563 +HG01474 -0.2106567704185593 +HG01479 -0.44983305929705014 +HG01485 0.14488375652371077 +HG01486 0.2543365153611554 +HG01488 -0.213711743816126 +HG01489 1.4281623092837072 +HG01491 0.8024576594075794 +HG01492 0.23204715616732297 +HG01494 -0.371881854159044 +HG01495 0.8919333998823771 +HG01497 -1.4175503353096701 +HG01498 2.9357075502684578 +HG01500 -0.8874043521741435 +HG01501 -1.0720726873652815 +HG01503 -1.3571194490760414 +HG01504 -0.49892894310137426 +HG01506 -0.03931832059904905 +HG01507 -0.9412032420446552 +HG01509 0.31813781483934755 +HG01510 0.3508669576237833 +HG01512 1.4729719731022497 +HG01513 -0.984343032601463 +HG01515 0.46543101076195226 +HG01516 0.4460419126941211 +HG01518 -0.30959991646573015 +HG01519 -0.9398263010270295 +HG01521 -1.056578331271975 +HG01522 -1.4239638772246848 +HG01524 0.28144387350616684 +HG01525 0.7343342379454842 +HG01527 0.901922144462221 +HG01528 -0.08574761688772858 +HG01530 1.3662470368018946 +HG01531 0.9365544794916194 +HG01536 -1.423792342169354 +HG01537 1.975297967956009 +HG01550 -0.1595864688955843 +HG01551 0.3876305272480949 +HG01556 0.6011569906689535 +HG01565 -0.4401050318778242 +HG01566 -0.25402628945193184 +HG01571 -0.13603079578388413 +HG01572 -1.0867770031518553 +HG01577 -0.6069600379514265 +HG01578 -0.2527960568553665 +HG01583 -0.27998804624640916 +HG01586 0.3367042336962939 +HG01589 0.5992709593147879 +HG01593 -0.5782306881740438 +HG01595 0.6188787969948111 +HG01596 -0.5319272035027639 +HG01597 0.4633309062649581 +HG01598 -0.14212038865574916 +HG01599 -0.9693440536402446 +HG01600 1.329534410844258 +HG01602 -0.7027942179755869 +HG01603 1.3258050259785459 +HG01605 0.7583742247627854 +HG01606 -0.03806140482527687 +HG01607 0.5228190851369188 +HG01608 0.5614434937244929 +HG01610 -2.154321123984033 +HG01612 -0.770020813211133 +HG01613 -0.057682039549263764 +HG01615 0.17553713718651348 +HG01617 
0.8680841503716705 +HG01618 -0.09365334577581458 +HG01619 0.7567266158704564 +HG01620 -0.15316680848114536 +HG01623 -0.07167547502727623 +HG01624 -0.7098691112479755 +HG01625 -0.5376565585030909 +HG01626 -1.1934123893983926 +HG01628 -1.3137578792784572 +HG01630 -0.3885220706417852 +HG01631 0.06380003793158934 +HG01632 0.9882096449012903 +HG01668 -1.2272834638886467 +HG01669 1.3148086098043614 +HG01670 -0.7194148905373262 +HG01672 -0.6727658059545862 +HG01673 1.104059086797995 +HG01675 -0.05270769798165276 +HG01676 0.026817985814787115 +HG01678 -1.3430039189000706 +HG01679 2.26409616179076 +HG01680 -0.18251160976200353 +HG01682 -2.3800023943973114 +HG01684 -0.5838253170821261 +HG01685 -0.2624810541679711 +HG01686 0.6676106843315626 +HG01694 -1.4791573816281445 +HG01695 0.07375590012010846 +HG01697 -0.674513249152488 +HG01699 0.07580824435391786 +HG01700 -0.6064964653997095 +HG01702 0.46059787936145224 +HG01704 1.5242524603163903 +HG01705 -1.2629813567475658 +HG01707 -0.3341063588736711 +HG01708 0.26000648605015186 +HG01709 1.4530481156739392 +HG01710 1.3905686564870718 +HG01746 0.7199055806858283 +HG01747 -2.055455595648907 +HG01756 2.0605421164194477 +HG01757 1.0498093758574174 +HG01761 0.9819684152254962 +HG01762 -1.0015790396866084 +HG01765 1.3718804568385248 +HG01766 0.08083252031872545 +HG01767 -0.6699710837703073 +HG01768 0.32734826418883145 +HG01770 -0.29685780134204226 +HG01771 -0.7126405956209039 +HG01773 -0.1826718106288978 +HG01775 -0.3284833437322132 +HG01776 0.32753059127823597 +HG01777 -0.16288878224321937 +HG01779 -1.7150146316175758 +HG01781 1.1251339075733333 +HG01783 0.04474406200438674 +HG01784 -0.14884787663011023 +HG01785 0.3076492887242234 +HG01786 0.9799621694745567 +HG01789 -0.7007101666193455 +HG01790 -0.32123129675212875 +HG01791 0.7551301160304824 +HG01794 1.3310246063851228 +HG01795 0.9876089909632227 +HG01796 -1.7025563253400566 +HG01797 -0.20852960737783074 +HG01798 1.0381327312289705 +HG01799 -0.5763532553403252 +HG01800 0.6792764360942509 +HG01801 -1.0825184866495092 +HG01802 -0.8551577350061023 +HG01804 0.06947664819157517 +HG01805 -2.0075674260015806 +HG01806 -0.9769109706355256 +HG01807 2.235744016243585 +HG01808 1.1261728493320944 +HG01809 -0.16532081221444295 +HG01810 0.7993077856593014 +HG01811 1.008630911516867 +HG01812 0.7987820324660475 +HG01813 -1.1010309129607947 +HG01815 0.0941042900031055 +HG01816 0.07479391251206335 +HG01817 0.039881199213996577 +HG01840 0.0008107907883523335 +HG01841 -1.7886038967713078 +HG01842 2.057534865314539 +HG01843 1.1813704618564589 +HG01844 -1.3070428653364963 +HG01845 -0.10415054047715061 +HG01846 -1.6347736522854883 +HG01847 0.5981082049215186 +HG01848 0.49950714611498737 +HG01849 -0.5765966932048827 +HG01850 2.2319784225108634 +HG01851 -1.20065235030314 +HG01852 -2.693691429962721 +HG01853 -0.4551628489456577 +HG01855 0.010416563200790374 +HG01857 -0.872012578333183 +HG01858 -0.00406205758991629 +HG01859 0.1221464121021395 +HG01860 -1.21101586360234 +HG01861 -1.74648042284672 +HG01862 1.4403951528372323 +HG01863 0.6952367542086773 +HG01864 -0.5261923337560288 +HG01865 -1.7138386295283845 +HG01866 -0.9079108023365258 +HG01867 1.3602716532873491 +HG01868 -0.1417979917964054 +HG01869 -0.8708257438590182 +HG01870 -0.04784419231061443 +HG01871 -0.4825374843931246 +HG01872 -1.4039256375821905 +HG01873 -0.08833353399627719 +HG01874 0.876278638618728 +HG01878 0.7726110474957442 +HG01879 2.0191601784878435 +HG01880 -0.39752922689275405 +HG01882 0.7546151453166463 +HG01883 0.157970379027911 +HG01885 0.41436692431391553 
+HG01886 -0.10523086231530376
+HG01889 -0.2814716907530671
+HG01890 -0.8412892446081937
+HG01892 -0.2663219512249372
+HG01893 -0.5026892598671168
+HG01894 -1.1154829810938125
+HG01896 -0.7017998643291423
+HG01912 0.2970673421544838
+HG01914 0.6118689092417687
+HG01915 -0.6479071868113228
+HG01917 0.07777768776857702
+HG01918 -0.14628277487012908
+HG01920 1.7681672680378433
+HG01921 -0.5881650245475535
+HG01923 0.760405468975605
+HG01924 -0.2889807756235889
+HG01926 0.43621226021683823
+HG01927 -0.23730446674064543
+HG01932 -0.43699599580467485
+HG01933 0.6528533731003703
+HG01935 -0.7475526354879827
+HG01936 1.5094554510014961
+HG01938 0.27680408013140567
+HG01939 0.2055593358165455
+HG01941 2.1313080055667206
+HG01942 -0.588644766506938
+HG01944 1.4556856718913858
+HG01945 0.4734631906831199
+HG01947 -2.0126453915253872
+HG01948 -0.9430370879616923
+HG01950 2.057970260358122
+HG01951 0.3662051249610203
+HG01953 1.6481910566574476
+HG01954 -0.19638725549125463
+HG01956 -0.6689727087737826
+HG01958 0.191203691445877
+HG01961 0.5246736326505623
+HG01965 -0.24383069116823602
+HG01967 0.04427224701346849
+HG01968 -0.8908742825629983
+HG01970 -1.6463329063438394
+HG01971 0.9858644361877709
+HG01973 -0.10704467374751558
+HG01974 -0.9699953853969524
+HG01976 -0.062087789341763455
+HG01977 0.46202489031645044
+HG01979 1.0267036217537027
+HG01980 -0.4526578880597739
+HG01982 -0.8737852556329553
+HG01985 0.23359501037722807
+HG01986 3.1371193492918787
+HG01988 -0.5475025726786117
+HG01989 0.47435323460452494
+HG01990 -0.7465050935474141
+HG01991 -0.7788253215478809
+HG01992 0.08730436546292941
+HG01997 2.3062118697618788
+HG02002 0.10162781629363232
+HG02003 0.15859012616985357
+HG02006 1.6777132942618456
+HG02008 0.663470945207608
+HG02009 1.3060371169640814
+HG02010 -0.13844724424252397
+HG02012 1.2101340707437047
+HG02013 -0.8861603038151078
+HG02014 -0.04874235699909152
+HG02016 1.010456347263374
+HG02017 0.8203727150068628
+HG02019 0.04450042815337485
+HG02020 0.8754819935860505
+HG02023 -1.5485864250277281
+HG02025 -0.5231651040972453
+HG02026 1.306239127064317
+HG02028 1.1220011665698573
+HG02029 -1.3101376762530863
+HG02031 0.5318604881477949
+HG02032 -1.454155235116148
+HG02035 -0.1240485386500875
+HG02040 0.36235164987935087
+HG02047 0.037354545301426845
+HG02048 -0.8901052017075429
+HG02049 -0.8902400343183031
+HG02050 0.15737609529255614
+HG02051 0.36827034879898163
+HG02052 -1.1733976935691066
+HG02053 -0.1942191451687208
+HG02054 0.42557959365741316
+HG02057 0.6864527381317347
+HG02058 -0.7021161310752574
+HG02060 0.25330874383085666
+HG02061 1.0197142120800042
+HG02064 -0.39158335442623227
+HG02067 -0.8839840865164575
+HG02069 -1.0947804286230107
+HG02070 -0.4746157067039667
+HG02072 -0.5444193957629733
+HG02073 1.2416663295950512
+HG02075 -0.11171153408631879
diff --git a/tests/test_examples.py b/tests/test_examples.py
index 5084f8b..5f25730 100644
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -684,6 +684,26 @@ def test_1000G_simulated(capfd):
     assert filecmp.cmp(hp_file, out_hp_file)
 
 
+def test_1000G_simulated_multihap(capfd):
+    """
+    Test using simulated data from the 1000G dataset
+
+    In this case, more than one haplotype is causal
+    """
+    gt_file = DATADIR / "19_45401409-46401409_1000G.pgen"
+    pt_file = DATADIR / "19_45401409-46401409_1000G.multi.pheno"
+    hp_file = DATADIR / "19_45401409-46401409_1000G.multi.hap"
+    out_hp_file = "test.hap"
+
+    cmd = f"run --remove-SNPs --max-signals 3 --max-iterations 3 -o {out_hp_file} {gt_file} {pt_file}"
+    runner = CliRunner()
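+    # run happler in-process via click's CliRunner; --max-signals/--max-iterations
+    # (above) presumably bound how many independent haplotype signals are sought,
+    # so the expected .multi.hap file can contain more than one haplotype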
+    result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
+    captured = capfd.readouterr()
+    assert captured.out == ""
+    assert result.exit_code == 0
+    assert filecmp.cmp(hp_file, out_hp_file)
+
+
 def test_1000G_simulated_maf(capfd):
     """
     Test MAF filtering using simulated data from the 1000G dataset
@@ -696,7 +716,7 @@
     out_vars = ("rs1046282", "rs36046716")
 
     for maf in (0.05, 0.30, 0.31, 0.38):
-        cmd = f"run --maf {maf} -o {out_hp_file} {gt_file} {pt_file}"
+        cmd = f"run --out-thresh 1 --maf {maf} -o {out_hp_file} {gt_file} {pt_file}"
         runner = CliRunner()
         result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
         captured = capfd.readouterr()
@@ -738,7 +758,7 @@
     caplog.set_level(logging.INFO)
 
    for maf in (0.05, 0.14, 0.22, 0.23, 0.35):
-        cmd = f"run --maf {maf} -o {out_hp_file} {gt_file} {pt_file}"
+        cmd = f"run --out-thresh 1 --maf {maf} -o {out_hp_file} {gt_file} {pt_file}"
         runner = CliRunner()
         result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
         captured = capfd.readouterr()
diff --git a/tests/test_tree.py b/tests/test_tree.py
index 4bf8c76..c0f0278 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -230,7 +230,7 @@ def test_haplotype_transform():
         if idx != variant_idx:
             np.testing.assert_allclose(
                 hap.transform(gens, allele, idx),
-                gens_without_variant[:,(idx-1,)],
+                gens_without_variant[:, (idx - 1,)],
             )
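
Note on the hunks above: the --out-thresh 1 flag added to the MAF tests presumably sets the output p-value threshold to 1 so that every discovered haplotype is written regardless of significance, keeping those tests focused on MAF filtering alone. The test_tree.py change is whitespace-only; the behavior it preserves is the tuple index (idx - 1,), which keeps the sliced genotype matrix two-dimensional where a bare integer index would drop the column axis. A minimal numpy sketch of that indexing distinction, using a made-up array rather than the real test fixtures:

    import numpy as np

    # stand-in for gens_without_variant: 3 samples x 2 variants
    gens_without_variant = np.arange(6).reshape(3, 2)
    print(gens_without_variant[:, (0,)].shape)  # (3, 1): tuple index keeps a 2-D column
    print(gens_without_variant[:, 0].shape)     # (3,): integer index drops the axis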