Skip to content

Commit

Permalink
Merge pull request #8 from logsdon-lab/feature/chrom-specific-params
Browse files Browse the repository at this point in the history
feat: Chromosome specific parameters
  • Loading branch information
koisland authored Jan 9, 2025
2 parents 681b6a4 + 429e295 commit 942d656
Show file tree
Hide file tree
Showing 8 changed files with 109 additions and 23 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ test/cdr/input/CHM13_intersect.bed filter=lfs diff=lfs merge=lfs -text
test/cdr/input/HG00731_intersect.bed filter=lfs diff=lfs merge=lfs -text
test/cdr/input/NA19331_intersect.bed filter=lfs diff=lfs merge=lfs -text
test/cdr/expected/NA19331_cdr.bed filter=lfs diff=lfs merge=lfs -text
test/cdr/expected/CHM13_cdr_chr8_params.bed filter=lfs diff=lfs merge=lfs -text
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,4 +142,4 @@ make singularity
```

## Cite
* Glennis A Logsdon, F Kumara Mastrorosa, Keisuke K Oshima, Allison N Rozanski, William T Harvey, Evan E Eichler, Identification and annotation of centromeric hypomethylated regions with Centromere Dip Region (CDR)-Finder, Bioinformatics, 2024;, btae733, https://doi.org/10.1093/bioinformatics/btae733*
* Francesco Kumara Mastrorosa, Keisuke K Oshima, Allison N Rozanski, William T Harvey, Evan E Eichler, Glennis A Logsdon, Identification and annotation of centromeric hypomethylated regions with CDR-Finder, Bioinformatics, Volume 40, Issue 12, December 2024, btae733, https://doi.org/10.1093/bioinformatics/btae733
3 changes: 3 additions & 0 deletions test/cdr/expected/CHM13_cdr_chr8_params.bed
Git LFS file not shown
14 changes: 14 additions & 0 deletions test/cdr/test_cdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,20 @@
0.5,
tuple(["--bp_edge", str(500_000)]),
),
(
"test/cdr/input/CHM13_intersect.bed",
"test/cdr/expected/CHM13_cdr.bed",
0.5,
0.5,
tuple(
[
"--bp_edge",
str(500_000),
"--override_chrom_params",
"test/config/chr8_params.json",
]
),
),
],
)
def test_cdr_finder(
Expand Down
6 changes: 6 additions & 0 deletions test/config/chr8_params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"chr8": {
"thr_height_perc_valley": 0.25,
"thr_prom_perc_valley": 0.25
}
}
29 changes: 29 additions & 0 deletions test/config/config_chr8_params.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
samples:
CHM13:
fasta: test/data/fa/CHM13_chr8_chr21.fa.gz
regions: test/data/bed/CHM13_cen_500kbp.bed
bam: test/data/bam/CHM13_chr8_chr21.bam
override_chrom_params: test/config/chr8_params.json
# Add plot titles
titles: test/data/bed/titles.tsv

# Size of the methylation windows to average over
window_size: 5000

# Size of ALR repeat stretches to include in search of CDR
alr_threshold: 100000

# Distance to merge adjacent CDRs. Can omit.
bp_merge: 1

# Threshold percent of the average methylation percentage needed as the minimal height of a valley from the median.
# ex. 0.1 allows valleys with a height from the median equal to 10% of the median.
height_perc_valley_threshold: 0.34

prom_perc_valley_threshold: 0.3

bp_alr_merge: 1000

bp_edge: 500000

extend_edges_std: null
15 changes: 12 additions & 3 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,12 @@ rule call_cdrs:
methyl_bed=(
rules.intersect_RM.output.intersect_bed
if config.get("restrict_alr")
else
rules.add_target_bed_coords_windows.output
else rules.add_target_bed_coords_windows.output
),
override_chrom_params=lambda wc: (
MANIFEST[wc.sample]["override_chrom_params"]
if MANIFEST[wc.sample].get("override_chrom_params", [])
else []
),
output:
cdrs=join(OUTPUT_DIR, "bed", "{sample}_CDR.bed"),
Expand All @@ -360,6 +364,11 @@ rule call_cdrs:
else ""
),
thr_height_perc_valley=HEIGHT_THRESHOLD,
override_chrom_params=lambda wc, input: (
f"--override_chrom_params {input.override_chrom_params}"
if input.override_chrom_params
else ""
),
conda:
"envs/python.yaml"
threads: 1
Expand All @@ -371,7 +380,7 @@ rule call_cdrs:
--bp_edge {params.bp_edge} \
--edge_height_heuristic {params.edge_height_heuristic} \
{params.bp_merge} {params.thr_prom_perc_valley} \
{params.extend_edges_std} > {output} 2> {log}
{params.extend_edges_std} {params.override_chrom_params} > {output} 2> {log}
"""


Expand Down
62 changes: 43 additions & 19 deletions workflow/scripts/cdr_finder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import sys
import math
import json
import argparse
import statistics
from collections import defaultdict
Expand Down Expand Up @@ -86,6 +87,12 @@ def main():
default="min",
help="Heuristic used to determine edge height of CDR.",
)
ap.add_argument(
"--override_chrom_params",
type=str,
default=None,
help="JSON file with params per chromosome name to override default settings. Each parameter name should match.",
)

args = ap.parse_args()
df = pl.read_csv(
Expand All @@ -94,11 +101,32 @@ def main():
has_header=False,
new_columns=["chrom", "st", "end", "avg"],
)
if args.override_chrom_params:
with open(args.override_chrom_params, "rt") as fh:
override_params = json.load(fh)
else:
override_params = {}

cdr_intervals: defaultdict[str, IntervalTree] = defaultdict(IntervalTree)
for chrom, df_chr_methyl in df.group_by(["chrom"]):
chrom: str = chrom[0]

# Get override params.
thr_prom_perc_valley = override_params.get(chrom, {}).get(
"thr_prom_perc_valley", args.thr_prom_perc_valley
)
thr_height_perc_valley = override_params.get(chrom, {}).get(
"thr_height_perc_valley", args.thr_height_perc_valley
)
bp_edge = override_params.get(chrom, {}).get("bp_edge", args.bp_edge)
edge_height_heuristic = override_params.get(chrom, {}).get(
"edge_height_heuristic", args.edge_height_heuristic
)
extend_edges_std = override_params.get(chrom, {}).get(
"extend_edges_std", args.extend_edges_std
)
bp_merge = override_params.get(chrom, {}).get("bp_merge", args.bp_merge)

# Group adjacent, contiguous intervals.
df_chr_methyl_adj_groups = (
df_chr_methyl.with_columns(brk=pl.col("end") == pl.col("st").shift(-1))
Expand All @@ -116,11 +144,9 @@ def main():
avg_methyl_mean = df_chr_methyl["avg"].mean()
avg_methyl_std = df_chr_methyl["avg"].std()
cdr_prom_thr = (
avg_methyl_median * args.thr_prom_perc_valley
if args.thr_prom_perc_valley
else None
avg_methyl_median * thr_prom_perc_valley if thr_prom_perc_valley else None
)
cdr_height_thr = avg_methyl_median * args.thr_height_perc_valley
cdr_height_thr = avg_methyl_median * thr_height_perc_valley
print(
f"Using CDR height threshold of {cdr_height_thr} and prominence threshold of {cdr_prom_thr} for {chrom}.",
file=sys.stderr,
Expand Down Expand Up @@ -155,8 +181,8 @@ def main():
ignore_intervals = grp_cdr_intervals.difference([interval])
df_cdr = get_interval(df_chr_methyl_adj_grp, interval)

interval_cdr_left = Interval(cdr_st - args.bp_edge, cdr_st)
interval_cdr_right = Interval(cdr_end, cdr_end + args.bp_edge)
interval_cdr_left = Interval(cdr_st - bp_edge, cdr_st)
interval_cdr_right = Interval(cdr_end, cdr_end + bp_edge)

# Get left and right side of CDR.
# Subtract intervals if overlapping bp edge region.
Expand Down Expand Up @@ -190,9 +216,9 @@ def main():
if cdr_left_median
else df_chr_methyl_adj_grp["avg"].median(),
]
if args.edge_height_heuristic == "min":
if edge_height_heuristic == "min":
cdr_edge_height = min(edge_heights)
elif args.edge_height_heuristic == "max":
elif edge_height_heuristic == "max":
cdr_edge_height = max(edge_heights)
else:
cdr_edge_height = statistics.mean(edge_heights)
Expand All @@ -209,10 +235,8 @@ def main():
file=sys.stderr,
)

if isinstance(args.extend_edges_std, int):
edge_thr = avg_methyl_mean + (
args.extend_edges_std * avg_methyl_std
)
if isinstance(extend_edges_std, int):
edge_thr = avg_methyl_mean + (extend_edges_std * avg_methyl_std)
try:
cdr_st = df_cdr_left.filter(
(pl.col("st") < cdr_st) & (pl.col("avg") >= edge_thr)
Expand All @@ -227,15 +251,15 @@ def main():
cdr_end = cdr_end

# Add merge distance bp.
if args.bp_merge:
cdr_st = cdr_st - args.bp_merge
cdr_end = cdr_end + args.bp_merge
if bp_merge:
cdr_st = cdr_st - bp_merge
cdr_end = cdr_end + bp_merge

cdr_intervals[chrom].add(Interval(cdr_st, cdr_end))

# Merge overlaps and output.
for chrom, cdrs in cdr_intervals.items():
if args.bp_merge:
if bp_merge:
starting_intervals = len(cdrs)
cdrs.merge_overlaps()
print(
Expand All @@ -245,9 +269,9 @@ def main():

for cdr in cdrs.iter():
cdr_st, cdr_end = cdr.begin, cdr.end
if args.bp_merge:
cdr_st += args.bp_merge
cdr_end -= args.bp_merge
if bp_merge:
cdr_st += bp_merge
cdr_end -= bp_merge

args.outfile.write(f"{chrom}\t{cdr_st}\t{cdr_end}\n")

Expand Down

0 comments on commit 942d656

Please sign in to comment.