From 3812c78150d85ab68fabccf2e57b2143a0be2bae Mon Sep 17 00:00:00 2001 From: guoweilong Date: Mon, 12 Feb 2018 02:40:28 +0800 Subject: [PATCH] v0.1.0 --- cgmaptools | 2 +- src/CGmapFillIndex.py | 9 +++- src/CGmapFromBAM.c | 109 +++++++++++++++++++++++++++++++++++------- src/mCFragRegView.R | 6 ++- 4 files changed, 106 insertions(+), 20 deletions(-) diff --git a/cgmaptools b/cgmaptools index 0779c50..7329afa 100755 --- a/cgmaptools +++ b/cgmaptools @@ -44,7 +44,7 @@ import subprocess argv_len = len(sys.argv) def PrintVersion() : - print("Version: 0.0.9") + print("Version: 0.1.0") # if (argv_len) == 1 or sys.argv[1] in ["-h", "-H", "--help"]: diff --git a/src/CGmapFillIndex.py b/src/CGmapFillIndex.py index 08ce2f5..e7349b9 100755 --- a/src/CGmapFillIndex.py +++ b/src/CGmapFillIndex.py @@ -55,7 +55,7 @@ def CGmapFillIndex(index, CGmap_list, tag_list, min_cov=1, max_cov=200) : INDEX = sys.stdin # except IOError: - print ("\n[Error]:\n\t File cannot be open: %s" % index ) + sys.stderr.write ("\n[Error]:\n\t File cannot be open: %s" % index ) exit(-1) # CGmap_lst = CGmap_list.split(",") @@ -156,7 +156,14 @@ def main(): sys.stdout = open(options.outfile, 'w') # # + sys.stderr.write("The index file is : %s\n" % options.index) + sys.stderr.write("The CGmap list is : %s\n" % options.CGmap_list) + sys.stderr.write("The index list is : %s\n" % options.tag_list) + sys.stderr.write("The minimum coverage is : %s\n" % options.min_Cov) + sys.stderr.write("The maximum coverage is : %s\n" % options.max_Cov) + # CGmapFillIndex(options.index, options.CGmap_list, options.tag_list, int(options.min_Cov), int(options.max_Cov)) + # # # =========================================== diff --git a/src/CGmapFromBAM.c b/src/CGmapFromBAM.c index eebd93c..e4ece3e 100644 --- a/src/CGmapFromBAM.c +++ b/src/CGmapFromBAM.c @@ -343,6 +343,7 @@ dict_pair_t * dict_pair_new(char *key, void *val) { dict_pair_t *self; if (!(self = DICT_MALLOC(sizeof(dict_pair_t)))) return NULL; + //printf("malloc pair\n"); self->prev = NULL; self->next = NULL; self->key = key; @@ -354,6 +355,7 @@ dict_t * dict_new() { dict_t *self; if (!(self = DICT_MALLOC(sizeof(dict_t)))) return NULL; + //printf("malloc dict\n"); self->head = NULL; self->tail = NULL; self->free = NULL; @@ -365,17 +367,26 @@ void dict_destroy(dict_t *self) { dict_pair_t *curr = self->head; while (curr) { next = curr->next; - if (self->free) self->free(curr->key, curr->val); + if (self->free) { + self->free(curr->key, curr->val); + //printf("free 100\n"); + } DICT_FREE(curr); + //printf("free pair\n"); curr = next; } DICT_FREE(self); + //printf("free dict\n"); } // Add a key-value pair to dict. dict_pair_t * dict_set(dict_t *self, char *key, void *val) { dict_pair_t *pair = dict_get(self, key); if (pair) { if (self->free) self->free(pair->key, pair->val); + /*if (self->free) { + free(pair->key); + free(pair->val); + }*/ pair->val = val; } else { pair = dict_pair_new(key, val); @@ -586,10 +597,10 @@ static inline void context_calling(const char *chrom_seq, int chrom_seq_length, } // Get the clean qname of reads, by removing mate infor -// "READXXXXCCAC#1" => "READXXXXCCAC" +// "READXXXXCCAC#1" => "READXXXXCCAC" static int QnameClean (char * qname, char * qname_clean) { size_t qname_len = strlen(qname); - if (qname_len >= 3) { + if (qname_len >= 3) { char ch = qname[qname_len-2]; // printf("DEBUG | Char=%c\n", ch); if (ch == '.' || ch == '#' || ch == ':') { @@ -604,6 +615,44 @@ static int QnameClean (char * qname, char * qname_clean) { return 1; } +// Seems not works good for linux +// small space may not return to system in real time +void FreeDictEle(char * key, void* value) { + //free(key); + free(value); +} + +// Define a list of string point +// A pool of 10000 string for string sequence names +char * MEM[1000]; +char ** MEM_pointer; + +char ** InitStringList ( char ** p ) { + int i; + char ** q = p; + for(i=0; i<1000; i++) { + (*q) = DICT_MALLOC(50*sizeof(char)); + q++; + } + MEM_pointer = p; + return p; +} + +char * GetNextStringSpace (void) { + //printf("debug 0\n"); + //printf("%d\n", MEM_pointer-MEM); + if(MEM_pointer >= (MEM+999)) { + //printf("debug 1\n"); + MEM_pointer = MEM; + } else { + //printf("debug 2\n"); + MEM_pointer++; + //printf("debug 3\n"); + } + //printf("%d\n", MEM_pointer-MEM); + return (*MEM_pointer); +} + // callback for bam_plbuf_init() static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pileups, void *vinfo) { @@ -615,27 +664,35 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p int rev_counts[5] = {0}; int current_tid = pileups->b->core.tid; char *chrom_name = info->in->header->target_name[current_tid]; - char * aln_qname = NULL; + //char * aln_qname = NULL; if (current_tid != info->current_tid) { load_chromosome(current_tid, info); gzprintf(info->wiggle, "variableStep chrom=%s\n", chrom_name); } dict_t * Dict = dict_new(); + char * aln_qname ; // A pointer to dynamic space to store reads name + int nucleotide; + //Dict->free = FreeDictEle; // Point to function for free space of dict element //printf("DEBUG | pileup_func: before \n"); for (i = 0; i < n; i ++) { -// printf("DEBUG | == %d ==\n", i); + // printf("DEBUG | == %d ==\n", i); pl = pileups + i; aln = pl->b; if (RmOverlap) { - aln_qname = DICT_MALLOC(100*sizeof(char)); // Fixed Error here, should use dynamic space + //printf("DEBUG 0\n"); + //aln_qname = DICT_MALLOC(50*sizeof(char)); // Fixed Error here, should use dynamic space + aln_qname = GetNextStringSpace(); // Find the space for a string + //printf("GetSpace\n"); + //printf("%s\n", bam1_qname(aln)); QnameClean(bam1_qname(aln), aln_qname); - //printf("DEBUG | %s\n", aln_qname); + //printf("%s\n", aln_qname); + //printf("DEBUG 1\n"); if ( dict_get(Dict, aln_qname) == NULL ) { //printf("DEBUG | First: %s\n", aln_qname); - dict_set(Dict, aln_qname, ""); + dict_set(Dict, aln_qname, NULL); // Redudent region A(1) if (!pl->indel){ - int nucleotide = NT_BAM_TO_IDX[bam1_seqi(bam1_seq(aln), pl->qpos)]; + nucleotide = NT_BAM_TO_IDX[bam1_seqi(bam1_seq(aln), pl->qpos)]; if (bam1_strand(aln)){ // negative strand rev_counts[nucleotide] ++; } else { // positive strand @@ -645,11 +702,13 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p } //else { //printf("DEBUG | Second: %s\n", aln_qname); //} - DICT_FREE(aln_qname); // avoid using too much memory + //printf("DEBUG 2\n"); + //DICT_FREE(aln_qname); // avoid using too much memory } else { // Redudent region A(2) + //printf("DEBUG 3\n"); if (!pl->indel){ - int nucleotide = NT_BAM_TO_IDX[bam1_seqi(bam1_seq(aln), pl->qpos)]; + nucleotide = NT_BAM_TO_IDX[bam1_seqi(bam1_seq(aln), pl->qpos)]; if (bam1_strand(aln)){ // negative strand rev_counts[nucleotide] ++; } else { // positive strand @@ -658,14 +717,31 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p } } } -// printf("DEBUG | pileup_func: after \n"); + // + //printf("DEBUG | pileup_func: after \n"); + /* + dict_pair_t *curr = Dict->head; + dict_pair_t *next; + while (curr) { + next = curr->next; + //if (Dict->free) Dict->free(curr->key, curr->val); + //printf("%s\n", curr->key); + //DICT_FREE(curr->key); + curr->key = NULL; + DICT_FREE(curr->val); + DICT_FREE(curr); + curr = next; + } + DICT_FREE(Dict); + Dict = NULL;*/ + // dict_destroy(Dict); char *context; char subcontext[3] = {0}; char nuc; context_calling(info->current_chrom_seq, info->current_chrom_length, pos, &nuc, &context, subcontext); -// printf("DEBUG | pileup_func: after \n"); + // printf("DEBUG | pileup_func: after \n"); double meth_level = 0; int meth_level_is_available = 0; int meth_cytosines = 0; @@ -776,7 +852,7 @@ int call_methylation(char *sorted_bam_filename, // ================ Main ================= BEGIN -// Functions for call back to parameter +// Functions for call back to parameter char BamFileName[1000]; char GenomeFileName[1000] = ""; char OutputPrefix[1000]; @@ -843,14 +919,15 @@ int main(int argc, char **argv){ fprintf(stderr, "[Error] genome file is not specified.\n"); exit(1); } - + //printf("%s\t%s\t%s\n", BamFileName, GenomeFileName, OutputPrefix); char ATCGmap_filename[1000], CGmap_filename[1000], wiggle_filename[1000]; sprintf(ATCGmap_filename, "%s.ATCGmap.gz", OutputPrefix); sprintf(CGmap_filename, "%s.CGmap.gz", OutputPrefix); sprintf(wiggle_filename, "%s.wig.gz", OutputPrefix); + InitStringList(MEM); call_methylation(BamFileName, GenomeFileName, ATCGmap_filename, CGmap_filename, wiggle_filename); - + return 0; } // ================ Main ================= END diff --git a/src/mCFragRegView.R b/src/mCFragRegView.R index 6626fb8..44878d9 100755 --- a/src/mCFragRegView.R +++ b/src/mCFragRegView.R @@ -64,12 +64,14 @@ parser <- OptionParser(usage = "cgmaptools fragreg [options]", option_list=option_list, description = " (aka mCFragRegView) \ Description: Plot methylation dynamics of target and flanking region for multiple samples \ Contact: Zhu, Ping; pingzhu.work@gmail.com\ -Last update: 2017-09-16\ +Last update: 2018-02-12\ Example: \ FragRegView.R -i input -r 5 -o genebody.pdf \ -Input File Format: \ 1st line is the header.\ -Each row contains methylation measurements of a sample. \ + Each row contains methylation measurements of a sample. \ + The user may need to use shell script to generate following format \ + based on the results of \"cgmaptools mfg\". \ Example: \ Sample Up1 Up2 ... Region1 Region2 ... Down1 Down2 ...\ Sample1 0.1 0.1 ... 0.2 0.2 ... 0.3 0.3 ...\