-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbase2hex.pl
69 lines (58 loc) · 1.27 KB
/
base2hex.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/perl
#############################################
# Hexadecimal compression for DNA sequences #
#############################################
# Copyright 2017 Thomas Wolf, Hans-Knoell-Institute
# script found at http://www.biostars.org/p/8722
# posted by user "Benm"
# modified by Thomas Wolf
# compression with 7z is way slower, but also way smaller!
use strict;
use warnings;
use Bio::SeqIO;
# The only command-line argument is the sequence file to compress; the
# output file name is the input name with ".hexc" appended.
my $input_file_name = $ARGV[0];
# Fail early with a usage hint instead of continuing with an undefined name.
die "Usage: $0 <sequence_file>\n"
    unless defined $input_file_name && length $input_file_name;
my $output_file_name = $input_file_name . ".hexc";
# Not strictly hexadecimal: the 16 A/C/G/T pairs map to the hex digits 0-f,
# but pairs containing N need the extra symbols g-o.
# Lookup table: two-base pair => one-character code.
# Rows 1-4 hold the 16 A/C/G/T pairs (codes 0-f), grouped by first base.
# The remaining rows hold the pairs containing N: NN, then N followed by a
# base, then a base followed by N (codes g-o).
my %base_hex = qw(
    AA 0   AC 1   AG 2   AT 3
    CA 4   CC 5   CG 6   CT 7
    GA 8   GC 9   GG a   GT b
    TA c   TC d   TG e   TT f
    NN g
    NA h   NC i   NG j   NT k
    AN l   CN m   GN n   TN o
);
# Read the first record of the input file. Only this one record is encoded;
# any further records in the file are not processed.
my $record = Bio::SeqIO->new( -file => $input_file_name )->next_seq();
die "No sequence record found in '$input_file_name'\n"
    unless defined $record;

my $header   = $record->display_id();
my $sequence = $record->seq();
# Treat a record without sequence data as an empty sequence.
$sequence = defined $sequence ? uc($sequence) : '';

# Replace each pair of bases by its one-character code. A pair that has no
# entry in %base_hex (i.e. one containing a letter other than A, C, G, T or N)
# is kept verbatim; substituting the undefined hash value would silently
# delete it from the output. The codes are digits or lower-case letters and
# the sequence is upper case, so verbatim bases stay distinguishable.
$sequence =~ s/(\w{2})/exists $base_hex{$1} ? $base_hex{$1} : $1/eg;

# Wrap the encoded text at 60 characters per line and make sure the final
# line is newline-terminated too.
$sequence =~ s/(.{60})/$1\n/g;
$sequence .= "\n" if length($sequence) && $sequence !~ /\n\z/;

open( my $output, ">", $output_file_name )
    or die "Cannot open '$output_file_name' for writing: $!\n";
print $output ">$header\n";
print $output $sequence;
# Buffered write errors only surface at close, so check it as well.
close($output) or die "Cannot close '$output_file_name': $!\n";
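# decode (unfinished sketch, kept for reference)
# NOTE: it indexes sort(keys %base_hex) by the hex value of each symbol, which
# only lines up with the codes when the table holds just the 16 A/C/G/T pairs;
# with the N pairs present the sorted order shifts and oct() cannot parse g-o.
#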
# my @hex_base=sort(keys %base_hex);
# my $seq=$hex;
# $seq=~s/([^ACGT])/$hex_base[oct("0x".$1)]/eg;
# print "decode: $seq\n";