forked from harrisonlab/pipeline_canker_cherry
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcount_nucl.pl
executable file
·161 lines (111 loc) · 4.47 KB
/
count_nucl.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/perl -w
use strict;
# a program to read in every four lines of a file count the number of A,T,G,C, & N's
# in the second line & use these to estimate genome coverage.
# Andrew Armitage 07/03/14
### define usage of the program ###
my ($USAGE) = "count_nucleotides.pl -i seq_data.fastq -i seq_data2.fastq(optional) -g estimated genome size (Mb)";
my ($size);
my ($input);
my ($input2);
my ($total_nuc_count) = 0;
my ($est_cov) = 0;
###################
# Collect inputs
###################
# If input values are not entered, or contain whitespace print error messages and exit.
unless(@ARGV) {
print $USAGE;
exit;
}
if($ARGV[3] =~ /^\s*$/) {
print "Entry values undefined, please adhere to: $USAGE\n";
exit;
}
# Collect input values from the stating commands. Unless multiple inputs are entered
# set $input2 to "empty".
# When collecting estimated genome size, convert this to bp rather than Mbp
$input = $ARGV[1];
if ($ARGV[2] =~ /^-g/) {
$size = ($ARGV[3]*1000000);
$input2 = "empty";
}
elsif ($ARGV[2] =~ /^-i/ && $ARGV[4] =~ /^-g/ ) {
$input2 = $ARGV[3];
$size = ($ARGV[5]*1000000);
}
else {
print "Entry values undefined (error in multiple entry values) please adhere to: $USAGE\n";
exit;
}
###############
print "The estimated genome size is: $size bp\n\n";
###############
# Send input files to subroutine to
# count nucleotides & collect results
###############
$total_nuc_count = (count_nucl ($input));
unless ($input2 eq "empty") {
$total_nuc_count = $total_nuc_count + (count_nucl ($input2));
}
################
################
# Estimate genome coverage
################
$est_cov = ($total_nuc_count/$size);
$est_cov = (sprintf "%.2f", $est_cov);
################
# Print results & exit
################
print "\n\nTotal results:\n";
print "\n There are a total of $total_nuc_count nucleotides in this file.\n";
print "\n This equates to an estimated genome coverage of $est_cov .\n";
exit;
################
####################################################
# Main program - Subroutine for counting nucleotides
####################################################
#
# Read each line from a fastq file and print it to an outfile. Every 4 lines ($linecount1),
# increase the count of number of fastq accessions read by 1 ($linecount2). Also count
# every line read and if the number of total number of lines read so far ($totalline)
# If the number of lines read equals the desired number ($critvalue) then close the current
# output file, open the second outfile and divert the remaining output to that file.
# Read each line from a fastq file and count every base on the second line. If this line
# is empty, add 1 to a count of blank lines ($blankline). Every 4 lines ($linecount1)
# reset this value. Also count every line read ($totalline).
####################################################
sub count_nucl {
my ($infile) = @_;
my ($linecount1) = 0;
my ($currentline) = "";
my ($nuc_count) = 0;
my ($totalline) = 0;
my ($blankline) = 0;
my ($line_length) = 0;
print "\nThe input file is: $infile\n";
unless ( open(SEQFILE, $infile) ) {
print "cannot open the file \"$infile\" \n\n";
exit;
}
while ($currentline = <SEQFILE>) {
$linecount1++;
if ($linecount1 == 2) {
$line_length = ($currentline =~ tr/ATGCN//);
$nuc_count = ($nuc_count + $line_length);
if ($line_length == 0) {
$blankline++;
}
}
elsif ($linecount1 == 4) {
$linecount1 = 0;
$totalline++;
}
}
close (SEQFILE);
print "\nResults for: $infile\n";
print " Within this file of $nuc_count bp there were $totalline fastq sequences\n";
print " of these $blankline lines were empty.\n\n";
return $nuc_count;
}
####################################################