forked from DaehwanKimLab/centrifuge
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcentrifuge-promote
executable file
·122 lines (104 loc) · 2.41 KB
/
centrifuge-promote
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env perl
use strict ;
use warnings ;
use File::Basename;
use Cwd;
use Cwd 'cwd' ;
use Cwd 'abs_path' ;
die "Usage: centrifuge-promote.pl centrifuge_index centrifuge_output level > output\n\n".
"Promote the taxonomy id to specified level in Centrifuge output.\n" if ( @ARGV == 0 ) ;
my $CWD = dirname( abs_path( $0 ) ) ;
# Go through the index to obtain the taxonomy tree
my %taxParent ;
my %taxIdToSeqId ;
my %taxLevel ;
my $centrifuge_index = $ARGV[0] ;
open FP1, "-|", "$CWD/centrifuge-inspect --taxonomy-tree $centrifuge_index" or die "can't open $!\n" ;
while ( <FP1> )
{
chomp ;
my @cols = split /\t\|\t/;
$taxParent{ $cols[0] } = $cols[1] ;
$taxLevel{ $cols[0] } = $cols[2] ;
}
close FP1 ;
open FP1, "-|", "$CWD/centrifuge-inspect --conversion-table $centrifuge_index" or die "can't open $!\n" ;
while ( <FP1> )
{
chomp ;
my @cols = split /\t/ ;
$taxIdToSeqId{ $cols[1] } = $cols[0] ;
}
close FP1 ;
# Go through the output of centrifuge
my $level = $ARGV[2] ;
sub PromoteTaxId
{
my $tid = $_[0] ;
return 0 if ( $tid <= 0 || !defined( $taxLevel{ $tid } ) ) ;
if ( $taxLevel{ $tid } eq $level )
{
return $tid ;
}
else
{
return 0 if ( $tid <= 1 ) ;
return PromoteTaxId( $taxParent{ $tid } ) ;
}
}
sub OutputPromotedLines
{
my @lines = @{ $_[0] } ;
return if ( scalar( @lines ) <= 0 ) ;
my @newLines ;
my $i ;
my $numMatches = 0 ;
my %showedUpTaxId ;
my $tab = sprintf( "\t" ) ;
for ( $i = 0 ; $i < scalar( @lines ) ; ++$i )
{
my @cols = split /\t+/, $lines[ $i ] ;
my $newTid = PromoteTaxId( $cols[2] ) ;
if ( $newTid <= 1 )
{
$newTid = $cols[2] ;
}
my $newLevel = $cols[1] ;
$newLevel = $taxLevel{ $newTid } if ( $newTid >= 1 && defined $taxLevel{ $newTid } ) ;
next if ( defined $showedUpTaxId{ $newTid } ) ;
$showedUpTaxId{ $newTid } = 1 ;
++$numMatches ;
$cols[2] = $newTid ;
$cols[1] = $newLevel ;
push @newLines, join( $tab, @cols ) ;
}
for ( $i = 0 ; $i < scalar( @newLines ) ; ++$i )
{
my @cols = split /\t+/, $newLines[$i] ;
$cols[-1] = $numMatches ;
print join( $tab, @cols ), "\n" ;
}
}
open FP1, $ARGV[1] ;
my $header = <FP1> ;
my $prevReadId = "" ;
my @lines ;
print $header ;
while ( <FP1> )
{
chomp ;
my @cols = split /\t/ ;
if ( $cols[0] eq $prevReadId )
{
push @lines, $_ ;
}
else
{
$prevReadId = $cols[0] ;
OutputPromotedLines( \@lines ) ;
undef @lines ;
push @lines, $_ ;
}
}
OutputPromotedLines( \@lines ) ;
close FP1 ;