diff --git a/CGATPipelines/PipelineRrbs.py b/CGATPipelines/PipelineRrbs.py new file mode 100644 index 00000000..2c7f3f64 --- /dev/null +++ b/CGATPipelines/PipelineRrbs.py @@ -0,0 +1,624 @@ +''' +PipelineRrbs.py - Utility functions for analysing rrbs sequence data +============================================================== + +:Author: Tom Smith +:Release: $Id$ +:Date: |today| +:Tags: Python + +Mapping reads is a common task in pipelines. Different pipelines +combine different sources of input (:term:`fastq` files, :term:`sra` files) +of different data (single end, paired end) with different mapping +algorithms (bowtie, tophat, stampy). This module provides utility +functions to abstract some of these variations. + +The pipeline does not know what kind of data it gets (a :term:`sra` archive +might contain single end or paired end data or both). + +A pipeline might get several input data (:term:`fastq` and :term:`sra` +formatted files at the same time). + +The module currently is able to deal with: + + * tophat mapping against genome + * bowtie mapping against transcriptome, genome and junctions + * bwa against genome + * stampy against genome + +It implements: + * .sra: paired-end and single-end + * .fastq: paired-end and single-end + * .csfasta: colour-space, single-end + +Code +---- + +''' + +import os +import shutil +import glob +import collections +import re +import gzip +import itertools +import CGAT.Pipeline as P +import logging as L +import CGAT.Experiment as E +import CGAT.IOTools as IOTools +import CGAT.GTF as GTF +import CGAT.Fastq as Fastq +import CGAT.IndexedFasta as IndexedFasta +import CGAT.Fasta as Fa +import pysam +import pandas as pd +from CGAT.Pipeline import cluster_runnable +import numpy as np +import pandas.rpy.common as com +from rpy2.robjects import r +import rpy2.robjects as robj +from rpy2.robjects.packages import importr + + +@cluster_runnable +def plotReadBias(infile, outfile): + '''take the m-bias outfile from bismark and plot read position biases''' + + def num(s): + try: + return int(s) + except ValueError: + return float(s) + + def dfFromTsvTable(lines, line, context): + row = 2 + rows = [] + row_values = lines[line + row].strip().split("\t") + header = [re.sub(" ", "_", re.sub("%", "perc", x)) + for x in row_values] + row += 1 + while lines[line + row].strip(): + print lines[line+row].strip() + row_values = lines[line + row].strip().split("\t") + row_values = map(num, row_values) + rows.append(row_values) + row += 1 + df = pd.DataFrame(rows, columns=header) + df['context'] = context + return(df) + + with open(infile, "r") as f: + lines = f.readlines() + line = 0 + while line < len(lines): + if lines[line].strip() == "CpG context": + df = dfFromTsvTable(lines, line, "CpG") + print df + elif lines[line].strip() == "CHG context": + df2 = dfFromTsvTable(lines, line, "CHG") + print df2 + elif lines[line].strip() == "CHH context": + df3 = dfFromTsvTable(lines, line, "CHH") + print df3 + line += 1 + final_df = pd.concat([df, df2, df3]) + final_df.index = range(len(final_df['context'])) + r_dataframe = com.convert_to_r_dataframe(final_df) + + plot1_out = (P.snip(outfile, ".read_position.tsv") + + "_read_position_methylation_bias.png") + plot2_out = re.sub("_methylation_bias", "_count", plot1_out) + + plotMbiasReadPositionSummary = r(""" + library(ggplot2) + library(reshape) + require(scales) + function(df){ + read_length = max(df$position) + p = ggplot(df, aes(x=position, y=perc_methylation, col=context)) + + geom_line() + geom_point(size=1) + + facet_grid(context ~ ., scales="free") + + scale_x_continuous(minor_breaks=seq(0,read_length), + breaks=seq(0,read_length,5)) + + scale_colour_discrete(guide = FALSE) + + xlab("Read position") + ylab("Methylation (percentage)") + ggsave('%(plot1_out)s', plot = p, width = 5, height = 5) + + p2 = ggplot(df, aes(x=position, + y=count_methylated+count_unmethylated, col=context)) + + geom_line() + geom_point(size=1) + + facet_grid(context ~ ., scales="free") + + scale_x_continuous(minor_breaks=seq(0,read_length), + breaks=seq(0,read_length,5)) + + scale_colour_discrete(guide = FALSE) + + scale_y_continuous(labels = comma) + + xlab("Read position") + ylab("Number of cytosines") + ggsave('%(plot2_out)s', plot = p2, width = 5, height = 5) + + write.table(df, file="%(outfile)s", row.names = F, sep="\t") + }""" % locals()) + + plotMbiasReadPositionSummary(r_dataframe) + + +@cluster_runnable +def pandasMerge(infile1, infile2, outfile, merge_type, left, right, + delim="\t", com="#"): + '''function to merge two tsv files using pandas + left and right are the columns to merge on''' + + def pandasRead(infile): + return pd.io.parsers.read_csv(infile, sep=delim, comment=com) + + df1 = pandasRead(infile1) + df2 = pandasRead(infile2) + merged = df1.merge(df2, how=merge_type, left_on=left, + right_on=right, sort=False) + merged.to_csv(outfile, sep="\t", index=False, na_rep="NA") + + +@cluster_runnable +def fasta2CpG(infile, outfile): + '''perform an insilico digest at MspI sites and return all CpGs + thorughout the genome, whether they are in a MspI fragment + and if so, what their read position is''' + # to do: paramterise (digestion site, read length, PE/SE) + + FA = Fa.Fasta(open(infile, "r")) + temp_contig = FA.FetchOne() + + outfile = open(outfile, "w") + outfile.write("contig\tposition\tstrand\tread_position\n") + + while len(temp_contig[1]) > 1: + contig, contig_seq = temp_contig + # find MspI sites + iter_pos = re.finditer("[cC][cC][gG][gG]", contig_seq) + pos_start = [x.start(0) for x in iter_pos] + # loop through pairs of MspI sites and add positions to dictionary + MspI_cpg_dict = {} + for n in range(0, len(pos_start)-1): + frag_length = pos_start[n+1] - pos_start[n] + if frag_length > 25 and frag_length < 300: + if frag_length >= 51: + # need to include additional base at the end of the read + # to check for final CG + read_f = contig_seq[pos_start[n]+1:pos_start[n]+53] + read_r = contig_seq[pos_start[n+1]-49:pos_start[n+1]+3] + else: + read_f = contig_seq[pos_start[n]+1:pos_start[n+1]+1] + read_r = contig_seq[pos_start[n]+3:pos_start[n+1]+3] + # find positions of CGs in in silico reads + f_cgs = re.finditer("[cC][gG]", read_f) + r_cgs = re.finditer("[cC][gG]", read_r) + for x in f_cgs: + # 1-based read position + read_pos = x.start(0) + 1 + # contig position 1-based + contig_position = pos_start[n] + 1 + read_pos + MspI_cpg_dict[contig_position] = read_pos + for x in r_cgs: + # 1-based read position + read_pos = len(read_r)-(x.start(0)+1) + # contig position 1-based + contig_position = pos_start[n+1] + 4 - read_pos + MspI_cpg_dict[contig_position] = read_pos + + # find location of ALL cpgs + all_cpgs = re.finditer("[cC][gG]", temp_contig[1]) + for cpg in all_cpgs: + # 2 Cs for each CpG (one on each strand) + c_pos, g_pos = (cpg.start(0) + 1, cpg.start(0) + 2) + read_pos = "NA" + # check if position of C is in the dictionary of MspI CpGs + if c_pos in MspI_cpg_dict: + read_pos = MspI_cpg_dict[c_pos] + outfile.write("%s\t%s\t%s\t%s\n" % (contig, c_pos, + "+", read_pos)) + read_pos = "NA" + if g_pos in MspI_cpg_dict: + read_pos = MspI_cpg_dict[g_pos] + outfile.write("%s\t%s\t%s\t%s\n" % (contig, g_pos, + "-", read_pos)) + temp_contig = FA.FetchOne() + outfile.close() + + +@cluster_runnable +def mergeAndDrop(cpgs, infiles, outfile): + # prepare dataframe with cpg locations + # do I need to index on location column? + all_cpgs = pd.io.parsers.read_csv(cpgs, sep='\t', comment='#') + all_cpgs['location'] = (all_cpgs['contig'] + ":" + + all_cpgs['position'].apply(str)) + # all_cpgs.drop(['contig', 'position'], axis=1, inplace=True) + + # for loop through files and merge each time with previous merged df + for infile in infiles: + sample_name = re.sub("\.fastq.*bismark", "", + P.snip(os.path.basename(infile), ".cov")) + + # prepare temporary dataframe with meth values + column_names = ["contig", "start", "stop", "meth_perc", + "count_meth", "count_unmeth"] + temp_meth_df = pd.io.parsers.read_csv(infile, sep='\t', + comment='#', header=None, + names=column_names) + temp_meth_df['location'] = (temp_meth_df['contig'] + ":" + + temp_meth_df['start'].apply(str)) + temp_meth_df.drop(['contig', 'start', 'stop', 'meth_perc'], + axis=1, inplace=True) + temp_meth_df.columns = [sample_name + "_meth", sample_name + "_unmeth", + 'location'] + + # merge into temp dataframe + merged = temp_meth_df.merge(all_cpgs, how="outer", + left_on='location', right_on='location', + sort=False) + all_cpgs = merged.copy(deep=True) + + merged.drop('location', axis=1, inplace=True) + merged.to_csv(outfile, sep="\t", index=False, na_rep="NA") + + +@cluster_runnable +def subsetToCovered(infile, outfile, cov_threshold=10): + '''take a flat file containing methylation calls for all cpgs + and output a flat file containing only sites above coverage threshold''' + + raw = pd.io.parsers.read_csv(infile, sep='\t', comment='#') + # only need the number of sample rows + samples = len(identifyExperimentalFactors(raw)[0]) + + def minCov(row, number_sample_rows): + column_range = range(0, number_sample_rows) + # take every other column (e.g meth count), add next column + # (e.g unmeth count) and find min for row + return min([np.nansum([row[i], row[i+1]]) + for i in column_range[::2]]) + + raw['min_cov'] = raw.apply(minCov, axis=1, args=(samples,)) + high_cov = raw[raw['min_cov'] >= cov_threshold] + high_cov['cpgi'].fillna('Non-CpGIsland', inplace=True) + # note: check setting NA to above works + high_cov.to_csv(outfile, sep="\t", index=False, na_rep="NA") + + +def identifyExperimentalFactors(df): + '''takes a data frame and returns the samples, treatment groups, + replicate numbers and tissues based on tissue-treatment-replicate + nomenclature''' + + columns = [] + for col in list(df.columns.values): + m = re.match("\S*-\S*-\S*", col) + if m is not None: + columns.append(col) + + samples = [re.sub("_\S*", "", x) for x in columns] + treatments = list(set([x.split("-")[1] for x in samples])) + tissues = list(set([x.split("-")[0] for x in samples])) + replicates = list(set([x.split("-")[2] for x in samples])) + + return samples, treatments, tissues, replicates + + +@cluster_runnable +def addTreatmentMean(infile, outfile): + '''take a flat file containing methylation calls and calculate + treatment means based on column names''' + + def getMethylationFrequency(row): + ''' this function assumes meth and unmeth columns are next to + one another - generalise''' + return row[0]/(row[0]+row[1]) + + df = pd.io.parsers.read_csv(infile, sep='\t', comment='#') + samples, treatments, tissues, replicates = identifyExperimentalFactors(df) + + for tissue in tissues: + for treatment in treatments: + treatment_columns = [] + for replicate in replicates: + m_col_name = "-".join([tissue, treatment, + replicate]) + "_meth" + um_col_name = "-".join([tissue, treatment, + replicate]) + "_unmeth" + temp_df = df[[m_col_name, um_col_name]] + meth_freq_col_name = tissue+"_"+treatment+"_"+replicate + df[meth_freq_col_name] = temp_df.apply(getMethylationFrequency, + axis=1) + treatment_columns.append(meth_freq_col_name) + temp_df = df[treatment_columns] + + df[tissue+"_"+treatment+"_mean"] = temp_df.apply(np.mean, axis=1) + df[tissue+"_"+treatment+"_var"] = temp_df.apply(np.var, axis=1) + + df.to_csv(outfile, sep="\t", index=False, na_rep="NA") + + +@cluster_runnable +def summaryPlots(infile, outfile): + '''take a flat file containing average methylation values and + produce various summary plots + + Plots: + 1. Read position vs. methylation level + 2. Distributions of average methylation, split by group and CpGI/Non-CpGI + 3. Methylation distributions by sample, split by CpGI/Non-CpGI + 4. Heatmap of Spearman's rho split by CpGI/Non-CpGI + 5. Correlation between mean value pairs for each treatment pair, + split by CpGI/Non-CpGI + 6. Read position vs. methylation difference for each treatment pair + + to do: + 1. Add all vs. all correlations plots + test! + ''' + + df = pd.io.parsers.read_csv(infile, sep='\t', comment='#') + samples, treatments, tissues, replicates = identifyExperimentalFactors(df) + r_dataframe = com.convert_to_r_dataframe(df) + + base = P.snip(outfile, ".log") + + gr = importr('grDevices') + + out = open(outfile, "w") + + plotFuncReadPositionMethylation = r(""" + library(ggplot2) + library(reshape) + function(df){ + read_length = max(df$read_position[is.finite(df$read_position)]) + df2 = df[, grepl("mean", names(df))] + df_agg = aggregate(df2, by=list(df$read_position),FUN=mean, na.rm=TRUE) + df_agg_m = melt(df_agg,id="Group.1") + p = ggplot(df_agg_m, aes(x=Group.1,y = value)) + + geom_line(aes(col=variable)) + + geom_point(aes(col=variable), size=1.5) + + ggtitle("Read Position vs. Methylation") + + scale_colour_discrete(name = "Sample") + + scale_x_continuous(minor_breaks=seq(0,read_length), + breaks=seq(0,read_length,5))+ + xlab("Read position") + + ylab("Methylation by read position") + + ggsave(paste0('%(base)s','-read_position_meth.png'), + plot = p, width = 5, height = 5) + }""" % locals()) + + plotFuncReadPositionMethylation(r_dataframe) + out.write("plot saved at: %(base)s-read_position_meth.png\n" % locals()) + + plotMethBySampleMean = r(""" + library(ggplot2) + library(reshape) + function(df){ + df2 = df[df$cpgi != "CpGIsland",] + df3 = df2[, grepl("mean", names(df))] + df4 = melt(df3) + + p=ggplot(df4, + aes(y=as.numeric(value), x=variable)) + + geom_violin(aes(col=variable)) + + scale_colour_discrete(name = "Sample", guide = FALSE) + + ylab("Fraction Methylated") + xlab("") + + scale_x_discrete(labels = gsub("_", "-", levels(df4$variable))) + + theme(legend.position="bottom", + axis.text.x = element_text(size=5), + axis.text.y = element_text(size=5)) + + ggtitle("Methylation by treatment group: Non-CpG Islands") + + ggsave(paste0('%(base)s','_methylation_by_sample_type_non_cpgi.png'), + plot = p, width = 5, height = 5) + + df5 = df[df$cpgi == "CpGIsland",] + df6 = df5[, grepl("mean", names(df))] + df7 = melt(df6) + + p2=ggplot(df7, + aes(y=as.numeric(value), x=variable)) + + geom_violin(aes(col=variable)) + + scale_colour_discrete(name = "Sample", guide = FALSE) + + ylab("Fraction Methylated") + xlab("") + + theme(legend.position="bottom", + axis.text.x = element_text(size=5), + axis.text.y = element_text(size=5)) + + ggtitle("Methylation by treatment group: CpG Islands") + + ggsave(paste0('%(base)s','_methylation_by_sample_type_cpgi.png'), + plot = p2, width = 5, height = 5) + }""" % locals()) + + plotMethBySampleMean(r_dataframe) + out.write("plot saved at: \ + %(base)s-methylation_by_sample_type_non_cpgi.png\n" % locals()) + out.write("plot saved at: \ + %(base)s-methylation_by_sample_type_cpgi.png\n" % locals()) + + meth_freq_columns = ["_".join((x, y, z)) + for x in tissues + for y in treatments + for z in replicates] + meth_freq_columns.append("cpgi") + + df_meth_freq = df[meth_freq_columns] + r_dataframe_meth_freq = com.convert_to_r_dataframe(df_meth_freq) + + # re-write with subfunctions in r function to reduce length of code + # each plot is repeated twice! + drop_cpgi = '!(colnames(df) %in% c("cpgi"))' + plotMethBySampleAndHeatmap = r(""" + library(ggplot2) + library(reshape) + library(amap) + function(df){ + df2 = df[, %(drop_cpgi)s] + var = apply(df2,1,var) + var[is.na(var)] <- 0 + mean = apply(df2,1,mean) + df3 = df[(var > 0 & mean >= 0.1 & mean <= 0.9),] + df4 = df3[df3$cpgi != "CpGIsland", %(drop_cpgi)s] + df4_m = melt(df4) + samples=data.frame(matrix(unlist(strsplit(as.character(df4_m$variable),"_")), + ncol=3,byrow=TRUE)) + colnames(samples) = c("tissue","treatment","replicate") + df4_m = cbind(samples, df4_m) + + p=ggplot(df4_m, + aes(x=as.numeric(value), group=variable)) + + geom_density(aes(col=paste0(tissue,"-",treatment), linetype=replicate)) + + scale_colour_discrete(name = "Treatment group") + + scale_linetype_discrete(name = "Replicate") + + xlab("Fraction Methylation") + + ggtitle("Methylation by sample: Non-CpG Islands") + + ggsave(paste0('%(base)s','_methylation_by_sample_non_cpgi.png'), + plot = p, width = 5, height = 5) + + df5 = as.matrix(sapply(df4, as.numeric)) + df6 = cor(df4, method="spearman") + df6_m = melt(df6) + p = ggplot(df6_m, aes(x=X1, y=X2, fill=value))+ + geom_tile() + xlab("") + ylab("") + + theme(axis.text.x = element_text(angle = 90)) + + scale_fill_continuous(name = "rho") + + ggtitle("non-CpG Islands") + + ggsave(paste0('%(base)s','_spearmans_heatmap_non_cpgi.png'), + plot = p, width = 5, height = 5) + + df7 = df3[df3$cpgi == "CpGIsland", %(drop_cpgi)s] + df7_m = melt(df7) + samples=data.frame(matrix(unlist(strsplit(as.character(df7_m$variable),"_")), + ncol=3,byrow=TRUE)) + colnames(samples) = c("tissue","treatment","replicate") + df7_m = cbind(samples, df7_m) + + p = ggplot(df7_m, + aes(x=as.numeric(value), group=variable)) + + geom_density(aes(col=paste0(tissue,"-",treatment), linetype=replicate)) + + scale_colour_discrete(name = "Treatment group") + + scale_linetype_discrete(name = "Replicate") + + xlab("Fraction Methylation") + + ggtitle("Methylation by sample: CpG Islands") + + ggsave(paste0('%(base)s','_methylation_by_sample_cpgi.png'), + plot = p, width = 5, height = 5) + + df8 = as.matrix(sapply(df7, as.numeric)) + df9 = cor(df8, method="spearman") + df9_m = melt(df9) + p2 = ggplot(df9_m, aes(x=X1, y=X2, fill=value))+ + theme(axis.text.x = element_text(angle = 90)) + + geom_tile() + xlab("") + ylab("") + + scale_fill_continuous(name = "rho") + + ggtitle("CpG Islands") + + ggsave(paste0('%(base)s','_spearmans_heatmap_cpgi.png'), + plot = p2, width = 5, height = 5) + + + library(pvclust) + # redefine distance function to use spearman's rank correlation + dist.pvclust <- function(x){ + res <- as.dist(1 - cor(x, method="spearman", use="pairwise.complete.obs")) + attr(res,"method") <- "correlation" + return(res) + x <- as.matrix(x) + P <- crossprod(x) + qq <- matrix(diag(P),ncol=ncol(P)) + Q <- sqrt(crossprod(qq)) + res <- as.dist(1 - P/Q) + attr(res,"method") <- "uncentered" + return(res)} + + result <- pvclust(as.matrix(df5), method.hclust="ward", nboot=100) + png(paste0('%(base)s','_hierarchical_clustering_non_cpgi.png')) + par(mar=c(0, 4, 4, 2)) + plot(result, xlab="", sub="", print.num=F, cex.pv=0.7, + col.pv=c(0,3,0), main="Non-CpG islands") + dev.off() + + result2 <- pvclust(as.matrix(df8), method.hclust="ward", nboot=100) + png(paste0('%(base)s','_hierarchical_clustering_cpgi.png')) + par(mar=c(0, 4, 4, 2)) + plot(result2, xlab="", sub="", print.num=F, cex.pv=0.7, + col.pv=c(0,3,0), main="CpG islands") + dev.off() + + }""" % locals()) + + plotMethBySampleAndHeatmap(r_dataframe_meth_freq) + out.write("plot saved at: \ + %(base)s-_spearmans_heatmap_non_cpgi.png\n" % locals()) + out.write("plot saved at: \ + %(base)s-_spearmans_heatmap_cpgi.png\n" % locals()) + + for tissue in tissues: + for treatment_pair in itertools.combinations(treatments, 2): + treatment1, treatment2 = map(str, treatment_pair) + tissue = str(tissue) + out.write("plotting tissue: %s, treatment: %s vs. %s\n" + % (tissue, treatment1, treatment2)) + + label_base = "-".join([tissue, treatment1, treatment2]) + + title = "Global Correlation: \n\ + %(tissue)s: %(treatment1)s vs. %(treatment2)s" % locals() + + # re-write this function to include r plotting function and then + # pass two dfs in turn split by cpgi so that only one python + # function is required + plotFuncGlobalCorrelation = r(""" + library(ggplot2) + function(df){ + df_r1rm = df[df$read_position!=1 | !(is.finite(df$read_position)),] + p = ggplot(df_r1rm, aes(x=%(tissue)s_%(treatment1)s_mean, + y=%(tissue)s_%(treatment2)s_mean)) + + geom_point(size=0.5, alpha = 0.2) + + xlab("%(treatment1)s") + ylab("%(treatment2)s") + + theme(axis.text.x = element_text(size=5), + axis.text.y = element_text(size=5)) + + ggtitle('%(title)s') + facet_grid(.~cpgi) + ggsave(paste0('%(base)s','-global_correlation', + '%(label_base)s','.png'), plot = p, width = 5, height = 5) + }""" % locals()) + + plotFuncGlobalCorrelation(r_dataframe) + out.write("plot saved at: \ + %(base)s-global_correlation_cpgi_%(label_base)s.png\n" % locals()) + + title = "Read Position vs. Difference in Methylation:\n\ + %(tissue)s: %(treatment1)s vs. %(treatment2)s" % locals() + plotFuncReadPositionDifference = r(""" + library(ggplot2) + function(df){ + + f <- function(x) { + r <- quantile(x, probs = c(0.05, 0.25, 0.5, 0.75, 0.95)) + names(r) <- c("ymin", "lower", "middle", "upper", "ymax") + r} + quantiles = quantile(df$%(tissue)s_%(treatment1)s_mean- + df$%(tissue)s_%(treatment2)s_mean, probs = c(0.01,0.99)) + + df$read_position = factor(df$read_position) + p = ggplot(df, aes(x=factor(read_position), + y = (%(tissue)s_%(treatment1)s_mean-\ + %(tissue)s_%(treatment2)s_mean))) + + stat_summary(fun.data=f, geom="boxplot") + ggtitle('%(title)s') + + theme(axis.text.x = element_text(size=5, angle=90)) + + xlab("Read position") + ylim(quantiles) + + ylab("Difference in methylation:\ + %(treatment1)s vs. %(treatment2)s") + ggsave(paste0('%(base)s','-read_position_meth_diff_', + '%(label_base)s','.png'), plot = p, width = 5, height = 5) + }""" % locals()) + # note:need to check whether final plot function works:pep8 changes + plotFuncReadPositionDifference(r_dataframe) + out.write("plot saved at: \ + %(base)s-read_position_meth_diff_%(label_base)s.png\n" % locals()) + + out.close() diff --git a/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline.rst b/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline.rst index 488fd8b6..ba35b3c5 100644 --- a/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline.rst +++ b/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline.rst @@ -5,12 +5,15 @@ RRBS pipeline Contents: .. toctree:: - :maxdepth: 2 + :maxdepth: 4 - pipeline/SummaryPlots.rst pipeline/QC.rst pipeline/Coverage.rst - + pipeline/summaryPlots.rst + pipeline/summaryPlots/ReadPositionsMethylationBias.rst + pipeline/summaryPlots/MethylationBySample.rst + pipeline/summaryPlots/Heatmaps.rst + .. errorlist:: diff --git a/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/ReadPositionsMethylationBias.rst b/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/ReadPositionsMethylationBias.rst new file mode 100644 index 00000000..b03e8ce2 --- /dev/null +++ b/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/ReadPositionsMethylationBias.rst @@ -0,0 +1,39 @@ +======== +Coverage +======== +======== +Summary Plots +======== + +Methylation frequencies across the read length +============================================== + +The following plots show the average methylation value for each read +position, split by cytosine context, with one plot per sample. The +purpose of these plots is to identify systematic bias in bisulphite +conversion, for example, 3' bias. Note: read positions are taken from +the methylation caller which may be instructed to ignore the first n +bases from each read, in which case the read positions will be shifted +along by n. + +.. report:: SummaryPlots.readPositionMethylationBias + :render: gallery-plot + + Plots can be downloaded using the links below each plot + +Difference in methylation frequency across the read length +========================================================== + +The following plot(s) presents the difference in methylation between a +pair of treatments groups. It is assumed that most CpGs will remain +unchanged between treatment pairs and the median difference will +therefore be 0 with bioligical and technical variation resulting in a +normal distribution around zero. The purpose of this plot is to +identify instances of unexpectdly high variance which may be due to +systematic technical variation, for example, sequencing errors. The +whiskers contain the 99% of the data. Outliers are not shown. + +.. report:: SummaryPlots.readPositionMethylationDiff + :render: gallery-plot + + Plots can be downloaded using the links below each plot diff --git a/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/SummaryPlots.rst b/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/SummaryPlots.rst new file mode 100644 index 00000000..09720bf9 --- /dev/null +++ b/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/SummaryPlots.rst @@ -0,0 +1,11 @@ +======== +Summary Plots +======== + +.. toctree:: + :maxdepth: 4 + + pipeline/summaryPlots/ReadPositionsMethylationBias.rst + pipeline/summaryPlots/MethylationBySample.rst + pipeline/summaryPlots/Heatmaps.rst + diff --git a/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/summaryPlots/Heatmaps.rst b/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/summaryPlots/Heatmaps.rst new file mode 100644 index 00000000..539cf95d --- /dev/null +++ b/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/summaryPlots/Heatmaps.rst @@ -0,0 +1,43 @@ +===================== +Correlations, heatmaps and clustering +===================== + +Global correlation +========================== +The following plots are correlations between treatment groups for all +CpGs, split by CpG island / Non-CpG island. + +.. report:: SummaryPlots.correlations + :render: gallery-plot + + High resolution plots can be downloaded using the links below each plot + +Spearman's correlation heatmaps +========================== + +The following plots are heatmaps of spearman's rho values for all +possible pairwise comparisons. The samples are ordered by treatment +group and are not clustered. For clustering see the plots in the next +section. Heatmaps are split by CpG island / Non-CpG island. + +.. report:: SummaryPlots.heatmaps + :render: gallery-plot + + High resolution plots can be downloaded using the links below each plot + + +Hierachical clustering +========================== + +The following plots are hierarchical clusterings of the samples based +on pairwise spearman's rank correlations. Clusterings are split by are +split by CpG island / Non-CpG island. Distance between samples is +represented by height on the y-axis. The clustering has been +bootstrapped to estimate how strongly the data support the +cluster. The numbers in green are the bootstrap probabilities. + +.. report:: SummaryPlots.clustering + :render: gallery-plot + + High resolution plots can be downloaded using the links below each plot + diff --git a/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/summaryPlots/MethylationBySample.rst b/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/summaryPlots/MethylationBySample.rst new file mode 100644 index 00000000..e540878d --- /dev/null +++ b/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/summaryPlots/MethylationBySample.rst @@ -0,0 +1,13 @@ + +Distribution of Methylation frequencies +========================== + +The following plots present the distribution of methylation values +split by sample and CpG island / Non-CpGI island. The first plots show +the distribution per sample. The later plots show the distribution for +the mean methylation values across treatment groups + +.. report:: SummaryPlots.methylationSample + :render: gallery-plot + + High resolution plots can be downloaded using the links below each plot diff --git a/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/summaryPlots/ReadPositionsMethylationBias.rst b/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/summaryPlots/ReadPositionsMethylationBias.rst new file mode 100644 index 00000000..f6f84c04 --- /dev/null +++ b/CGATPipelines/pipeline_docs/pipeline_rrbs/pipeline/summaryPlots/ReadPositionsMethylationBias.rst @@ -0,0 +1,32 @@ + +Methylation frequencies across the read length +============================================== + +The following plots show the average methylation value for each read +position, split by cytosine context, with one plot per sample. The +purpose of these plots is to identify systematic bias in bisulphite +conversion, for example, 3' bias. Note: read positions are taken from +the methylation caller which may be instructed to ignore the first n +bases from each read, in which case the read positions will be shifted +along by n. + +.. report:: SummaryPlots.readPositionMethylationBias + :render: gallery-plot + + Plots can be downloaded using the links below each plot + +Difference in methylation frequency across the read length +========================================================== + +The following plot(s) presents the difference in methylation between a +pair of treatments groups. It is assumed that most CpGs will remain +unchanged between treatment pairs and the median difference will +therefore be 0 with bioligical and technical variation resulting in a +normal distribution around zero. The purpose of this plot is to +identify instances of unexpectdly high variance which may be due to +systematic technical variation, for example, sequencing errors. + +.. report:: SummaryPlots.readPositionMethylationDiff + :render: gallery-plot + + Plots can be downloaded using the links below each plot diff --git a/CGATPipelines/pipeline_rrbs.py b/CGATPipelines/pipeline_rrbs.py index c24bb15e..6b35fcd6 100644 --- a/CGATPipelines/pipeline_rrbs.py +++ b/CGATPipelines/pipeline_rrbs.py @@ -32,9 +32,10 @@ ######## UPDATE THIS SECTION ############ -(See the readqc pipeline for details of the readqc functions (target ``readqc``).) +(See the readqc pipeline for details of the readqc functions +(target ``readqc``).) -The purpose of this pipeline is to pre-process reads (target ``full``). +The purpose of this pipeline is to pre-process reads (target ``full``). Implemented tasks are: @@ -48,7 +49,8 @@ Usage ===== -See :ref:`PipelineSettingUp` and :ref:`PipelineRunning` on general information how to use CGAT pipelines. +See :ref:`PipelineSettingUp` and :ref:`PipelineRunning` +on general information how to use CGAT pipelines. Configuration ------------- @@ -89,36 +91,30 @@ :term:`experiment`. The ``suffix`` determines the file type. The following suffixes/file types are possible: -sra - Short-Read Archive format. Reads will be extracted using the :file:`fastq-dump` tool. fastq.gz Single-end reads in fastq format. fastq.1.gz, fastq2.2.gz - Paired-end reads in fastq format. The two fastq files must be sorted by read-pair. - -.. note:: - - Quality scores need to be of the same scale for all input files. Thus it might be - difficult to mix different formats. + Paired-end reads in fastq format. + The two fastq files must be sorted by read-pair. Requirements ------------ -On top of the default CGAT setup, the pipeline requires the following software to be in the -path: +On top of the default CGAT setup, the pipeline requires the following +software to be in the path: -+--------------------+-------------------+------------------------------------------------+ -|*Program* |*Version* |*Purpose* | -+--------------------+-------------------+------------------------------------------------+ -|fastqc |>=0.9.0 |read quality control | -+--------------------+-------------------+------------------------------------------------+ -|sra-tools | |extracting reads from .sra files | -+--------------------+-------------------+------------------------------------------------+ -|picard |>=1.38 |bam/sam files. The .jar files need to be in your| -| | | CLASSPATH environment variable. | -+--------------------+-------------------+------------------------------------------------+ ++---------------+----------+------------------------------------------------+ +|*Program* |*Version* |*Purpose* | ++---------------+----------+------------------------------------------------+ +|fastqc |>=0.9.0 |read quality control | ++---------------+----------+------------------------------------------------+ +|sra-tools | |extracting reads from .sra files | ++---------------+----------+------------------------------------------------+ +|picard |>=1.38 |bam/sam files. The .jar files need to be in your| +| | | CLASSPATH environment variable. | ++---------------+----------+------------------------------------------------+ Pipeline output =============== @@ -129,21 +125,12 @@ Example ======= -Example data is available at -http://www.cgat.org/~andreas/sample_data/pipeline_readqc.tgz. To run -the example, simply unpack and untar:: - - wget http://www.cgat.org/~andreas/sample_data/pipeline_readqc.tgz - tar -xvzf pipeline_readqc.tgz - cd pipeline_readqc - python /pipeline_readqc.py make full - +Example data is available at ? +To run the example, simply unpack and untar:: TODO ==== -Code needs to be modularised, adapter sequences need to be moved to -/ifs/annotation. Code ==== @@ -254,7 +241,7 @@ def connect(): for suffix_name in SEQUENCESUFFIXES]) SEQUENCEFILES_REGEX = regex( - r".*/(\S+).(?Pfastq.1.gz|fastq.gz|fa.gz|sra|csfasta.gz|csfasta.F3.gz|export.txt.gz)") + r".*/(\S+).(?Pfastq.1.gz|fastq.gz)") ######################################################################### # summarise read 3' @@ -332,11 +319,9 @@ def mapReadsWithBismark(infile, outfile): print "job options: " + job_options outdir = "bismark.dir" bismark_options = PARAMS["bismark_options"] - to_cluster = True m = PipelineMapping.Bismark() statement = m.build((infile,), outfile) - P.run() ######################################################################### @@ -344,15 +329,6 @@ def mapReadsWithBismark(infile, outfile): ######################################################################### -# removed subdivide as it was causing issues downstream when I want to -# merge the .cov files with an input file (e.g mergeCoverage) -# should be able to remove CG,CHH & CHG files as only .cov file used downstream -'''@subdivide(mapReadsWithBismark, - regex("bismark.dir/(\S+).bam"), - [r"methylation.dir/CpG_context_\1.txt.gz", - r"methylation.dir/CHG_context_\1.txt.gz", - r"methylation.dir/CHH_context_\1.txt.gz", - r"methylation.dir/\1.bismark.cov"])''' @follows(mkdir("methylation.dir")) @transform(mapReadsWithBismark, regex("bismark.dir/(\S+).bam"), @@ -446,20 +422,20 @@ def makeCpgIslandsBed(outfile): out.close() -#@follows(makeCpgIslandsBed) -#@transform(sortFilterAndIndexBams, -# regex("bismark.dir/(\S+).sorted.bam"), -# add_inputs(makeCpgIslandsBed), -# r"coverage.dir/\1.cpg.coverage") -#def overlapCpgIslands(infiles, outfile): -# infile, bed = infiles +# @follows(makeCpgIslandsBed) +# @transform(sortFilterAndIndexBams, +# regex("bismark.dir/(\S+).sorted.bam"), +# add_inputs(makeCpgIslandsBed), +# r"coverage.dir/\1.cpg.coverage") +# def overlapCpgIslands(infiles, outfile): +# infile, bed = infiles # -# job_options = "-l mem_free=1G" -# statement = '''samtools view -b %(infile)s| -# coverageBed -hist -d -abam stdin -b %(bed)s -# > %(outfile)s''' % locals() -# print statement -# P.run() +# job_options = "-l mem_free=1G" +# statement = '''samtools view -b %(infile)s| +# coverageBed -hist -d -abam stdin -b %(bed)s +# > %(outfile)s''' % locals() +# print statement +# P.run() ######################################################################## # RRBS alternative CpGI coverage summary @@ -499,6 +475,7 @@ def make1basedCpgIslands(infile, outfiles): @transform(make1basedCpgIslands, regex("coverage.dir/(\S+).load"), r"coverage.dir/cpg_table.load") +# not currently implemented def joinAllCpGsAndCpGIslands(infile, outfile): tablename = P.toTable(outfile) @@ -507,16 +484,15 @@ def joinAllCpGsAndCpGIslands(infile, outfile): SELECT *"''' - - ######################################################################### # load bismark coverage and join tables in sql ######################################################################### -# this is just for testing BiSeq etc, delete in future + @transform(callMethylationStatus, suffix(".bismark.cov"), r".bismark.subset100.cov") +# this is just for testing BiSeq etc, delete in future def subsetCoverage(infile, outfile): statement = '''awk '($5+$6)>100' %(infile)s > %(outfile)s''' % locals() print statement @@ -534,29 +510,6 @@ def findCpGs(outfile): RRBS.fasta2CpG(genome_infile, outfile, submit=True, jobOptions=job_options) -# re-coded mergeAndDrop to deal with input without header -# so that makeCoverageHeader and addHeaderToCoverage can be removed - -#@follows(callMethylationStatus) -#@originate("methylation.dir/coverage_header.tsv") -#def makeCoverageHeader(outfile): -# f = open(outfile, "w") -# f.write("%s\t%s\t%s\t%s\t%s\t%s\n" % ("contig", "start", "stop", -# "meth_perc", "count_meth", -# "count_unmeth")) -# f.close() -# -# -#@follows(makeCoverageHeader) -#@transform(callMethylationStatus, -# regex("methylation.dir/(\S+).fastq(\S+)bismark.cov"), -# add_inputs(makeCoverageHeader), -# r"methylation.dir/\1.cov") -#def addHeaderToCoverage(infiles, outfile): -# infile, header = infiles -# statement = '''cat %(header)s %(infile)s > %(outfile)s''' % locals() -# P.run() - @follows(findCpGs) @merge([callMethylationStatus, findCpGs], @@ -585,6 +538,7 @@ def addCpGIs(infiles, outfile): right=['contig', 'position'], submit=True, jobOptions=job_options) + @transform(addCpGIs, suffix(".tsv"), ".load") @@ -605,16 +559,16 @@ def loadMergeCoverage(infile, outfile): ######################################################################### ######################################################################### ######################################################################### -# this is an entirely different approach to generating the coverage table by +# this is an entirely different approach to generating the coverage table # using sqlite3. Currently not correctly implemented (requires multiple # inner joins and union in sqlite3 command) -#@jobs_limit(1) -#@follows(makeCoverageHeader) -#@transform(callMethylationStatus, +# @jobs_limit(1) +# @follows(makeCoverageHeader) +# @transform(callMethylationStatus, # suffix(".cov"), # add_inputs(makeCoverageHeader), # ".cov.load") -#def loadAndIndexBismarkCoverage(infiles, outfile): +# def loadAndIndexBismarkCoverage(infiles, outfile): # infile, header = infiles # dbh = connect() # tablename = P.toTable(outfile) @@ -622,7 +576,7 @@ def loadMergeCoverage(infile, outfile): # index = "idx" + re.sub('_fastq_.+', "", tablename) # # -# statement = '''cat %(header)s %(infile)s | +# statement = '''cat %(header)s %(infile)s | # python %(scriptsdir)s/csv2db.py # --table %(tablename)s --retry --ignore-empty # > %(outfile)s; @@ -631,9 +585,9 @@ def loadMergeCoverage(infile, outfile): # P.run() # # -#@merge(loadAndIndexBismarkCoverage, +# @merge(loadAndIndexBismarkCoverage, # "methylation.dir/csvdb_join_bismark_coverage.sentinel") -#def joinBismarkCoverage(infiles, outfile): +# def joinBismarkCoverage(infiles, outfile): # print infiles # print outfile # @@ -650,7 +604,8 @@ def loadMergeCoverage(infile, outfile): # new_columns.append(letter + # ".count_meth AS %(column_track)s_meth" % locals()) # new_columns.append(letter + -# ".count_unmeth AS %(column_track)s_unmeth" % locals()) +# ".count_unmeth AS %(column_track)s_unmeth" % +# locals()) # if n == 0: # tables.append("%(table_track)s AS %(letter)s" % locals()) # else: @@ -663,7 +618,7 @@ def loadMergeCoverage(infile, outfile): # new_columns = ", ".join(new_columns) # tables = " ".join(tables) # -# statement = '''sqlite3 csvdb "CREATE TABLE merged_bismark_cov AS SELECT ''' +# statement = '''sqlite3 csvdb "CREATE TABLE merged_bismark_cov AS SELECT''' # statement += new_columns # statement += tables # statement += ''' " ''' @@ -676,7 +631,7 @@ def loadMergeCoverage(infile, outfile): ######################################################################### -# Summarise methylation +# Summarise methylation ######################################################################### # all these functions should be rewitten to take the output @@ -698,6 +653,7 @@ def summariseCoverage(infile, outfile): print statement P.run() + @follows(summariseCoverage) @merge(summariseCoverage, "methylation.dir/coverage.tsv") @@ -743,7 +699,7 @@ def summariseCpGOverlap(infiles, outfile): infile_list.append(x) print(len(infile_list)) infile_list = " ".join(infile_list) - coverage_range = [1,2,5,10,20,30] + coverage_range = [1, 2, 5, 10, 20, 30] coverage_range = " ".join(map(str, coverage_range)) statement = '''echo -e "CpGs\\toverlaps\\tthreshold" > %(outfile)s; @@ -824,13 +780,13 @@ def loadRemainingReads(infile, outfile): P.run() - ######################################################################## # RRBS Gene meta-profile ######################################################################## # this section makes plots of coverage across TSS # should other meta-profiles be included? + @follows(makeCpgIslandsBed) @transform(sortAndIndexBams, regex("bismark.dir/(\S+).sorted.bam"), @@ -843,7 +799,7 @@ def makeGeneProfiles(infiles, outfile): # ensures only large RAM nodes used job_options = "-l mem_free=24G" - statement = '''python /ifs/devel/toms/cgat/scripts/bam2geneprofile.py + statement = '''python %(scriptsdir)s/bam2geneprofile.py --bamfile=%(infile)s --gtffile=%(genes_gtf)s --method=tssprofile --reporter=gene -P %(outname)s @@ -860,6 +816,7 @@ def makeGeneProfiles(infiles, outfile): ######################################################################## # Produce summary plots of methylation between samples, CpGI vs. non-CpGI etc + @transform(addCpGIs, regex("methylation.dir/(\S+)_meth_cpgi.tsv"), r"plots.dir/\1_covered_meth_cpgi.tsv") @@ -904,10 +861,8 @@ def makeSummaryPlots(infile, outfile): "clusters.tsv") def runBiSeq(infiles, outfile): # use this when following callMethylationStatus(i.e full run) - #cov_infiles = filter(lambda x: 'Liver' in x, infiles) + # cov_infiles = filter(lambda x: 'Liver' in x, infiles) cov_infiles = infiles - - # implement properly if len(cov_infiles) >= 4: pass @@ -917,7 +872,8 @@ def runBiSeq(infiles, outfile): out.write("You don't have replicates! This wont work") out.close() return 0 - print "how did I get here given that I only have %s samples" % len(cov_infiles) + print ("how did I get here given that I only have %s samples" + % len(cov_infiles)) basenames = [os.path.basename(x) for x in cov_infiles] samples = [re.sub(r".fastq\S+", "", x) for x in basenames] @@ -925,14 +881,16 @@ def runBiSeq(infiles, outfile): c = '","'.join(cov_infiles) s = '","'.join(samples) g = '","'.join(groups) - print "basenames: %(basenames)s\ngroups: %(groups)s\nsamples: %(samples)s " % locals() + print ("basenames: %(basenames)s\ngroups: %(groups)s\nsamples: %(samples)s" + % locals()) print "c: %(c)s\ns: %(s)s\ng: %(g)s " % locals() # load BiSeq package r.library("BiSeq") r.library("ggplot2") grdevices = importr('grDevices') - rcode = ('raw = readBismark(files = c("%(c)s"),DataFrame(group = c("%(g)s"), row.names = c("%(s)s")))' % locals()) + rcode = ('raw = readBismark(files = c("%(c)s"),DataFrame(group = c("%(g)s"),\ + row.names = c("%(s)s")))' % locals()) r(rcode) @@ -946,15 +904,19 @@ def runBiSeq(infiles, outfile): r('print(colData(raw))') r('print(colData(raw)$group)') - r('rrbs.clust.unlim <- BiSeq::clusterSites(object = raw,groups = colData(raw)$group,perc.samples = 1,min.sites = 5,max.dist = 50,mc.cores=4)') + r('rrbs.clust.unlim <- BiSeq::clusterSites(\ + object = raw,groups = colData(raw)$group,perc.samples = 1,\ + min.sites = 5,max.dist = 50,mc.cores=4)') print "made it here" r('ind.cov <- totalReads(rrbs.clust.unlim) > 0') r('quant <- quantile(totalReads(rrbs.clust.unlim)[ind.cov], 0.9)') r('rrbs.clust.lim <- limitCov(rrbs.clust.unlim, maxCov = quant)') # emperically derive value for h (this is the bandwidth) r('predictedMeth <- predictMeth(object = rrbs.clust.lim,h=25,mc.cores=4)') - r('temp_df=data.frame(y=methLevel(predictedMeth)[order(rowData(predictedMeth)),1])') - r('temp_df["x"]=(methReads(rrbs.clust.lim)/totalReads(rrbs.clust.lim))[,1]') + r('temp_df=data.frame(y=methLevel(predictedMeth)\ + [order(rowData(predictedMeth)),1])') + r('temp_df["x"]=\ + (methReads(rrbs.clust.lim)/totalReads(rrbs.clust.lim))[,1]') r('temp_df["cov"] = totalReads(rrbs.clust.lim)[,1]') r('p=ggplot(temp_df,aes(x, y, size=cov))+geom_point(aes(colour=cov))') @@ -962,20 +924,24 @@ def runBiSeq(infiles, outfile): r('print(p)') grdevices.dev_off() - r('betaResults <- betaRegression(formula = ~group, link = "probit",object = predictedMeth, type = "BR",mc.cores=8)') + r('betaResults <- betaRegression(formula = ~group, link = "probit",\ + object = predictedMeth, type = "BR",mc.cores=8)') r('print(head(betaResults))') r('predictedMethNull <- predictedMeth') r('print((predictedMethNull))') r('print(head(methLevel(predictedMethNull)))') r('colData(predictedMethNull)$group.null <- rep(c(1,2), 3)') r('print((predictedMethNull))') - r('betaResultsNull <- betaRegression(formula = ~group.null, link = "probit",object = predictedMethNull, type = "BR",mc.cores=4)') + r('betaResultsNull <- betaRegression(formula = ~group.null,\ + link = "probit",object = predictedMethNull, type = "BR",mc.cores=4)') r('vario <- makeVariogram(betaResultsNull)') r('length_vario = length(vario$variogram[[1]][,2])') - r('estimated_sill = median(vario$variogram[[1]][-(1:(length_vario-50)),2])') + r('estimated_sill =\ + median(vario$variogram[[1]][-(1:(length_vario-50)),2])') r('vario.sm <- smoothVariogram(vario, sill = estimated_sill)') - grdevices.png(file="/ifs/projects/proj034/mapping_bismark_0.12.5/test_vario.png") + grdevices.png( + file="/ifs/projects/proj034/mapping_bismark_0.12.5/test_vario.png") r('plot(vario$variogram[[1]],ylim=c(0,5))') r('lines(vario.sm$variogram[,c("h", "v.sm")],col = "red", lwd = 1.5)') grdevices.dev_off() @@ -991,15 +957,18 @@ def runBiSeq(infiles, outfile): r('print(clusters.rej$clusters.reject)') r('clusters.trimmed <- trimClusters(clusters.rej,FDR.loc = 0.5)') r('print(clusters.trimmed)') - r('DMRs <- findDMRs(clusters.trimmed,max.dist = 100,diff.dir = TRUE)') + r('DMRs <- findDMRs(clusters.trimmed,max.dist = 100,diff.dir = TRUE)') r('print(DMRs)') # concatenate pvalue lists into one table to write out r('concat_vario = do.call(rbind, vario.sm$pValsList)') r('row_p = rowData(predictedMeth)') - r('df_ranges<- data.frame(seqnames=seqnames(row_p), starts=start(row_p), ends=end(row_p))') + r('df_ranges<- data.frame(seqnames=seqnames(row_p), starts=start(row_p),\ + ends=end(row_p))') r('ranges_pred_meth_df=cbind(df_ranges,methLevel(predictedMeth))') - r('write.table(concat_vario, "clusters_vario_table_test.tsv", sep="\t",quote = F,row.names = F)') - r('write.table(ranges_pred_meth_df, "clusters.tsv", sep="\t",quote = F,row.names = F)') + r('write.table(concat_vario, "clusters_vario_table_test.tsv",\ + sep="\t",quote = F,row.names = F)') + r('write.table(ranges_pred_meth_df, "clusters.tsv", sep="\t",\ + quote = F,row.names = F)') ######################################################################## #########################################################################