-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathaplib.py
302 lines (250 loc) · 9.9 KB
/
aplib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
#Kabopan - Readable Algorithms. Public Domain, 2007-2009
"""
aPLib, LZSS based lossless compression algorithm
Jorgen Ibsen U{http://www.ibsensoftware.com}
Original: http://code.google.com/p/kabopan/source/browse/trunk/kbp/comp/aplib.py
"""
import sys
import io
class BitsDecompress:
"""bit machine for variable-sized auto-reloading tag decompression"""
def __init__(self, data, tag_size, verbose=True):
self.__current_bit = 0 # The count of bits available to use in the tag
self.__tag = None # The tag is a bitstream dispersed through the file and read in chunks.
# This is the current chunk, shifted so the MSB is the next bit.
self.__tag_size = tag_size # Number of bytes per bitstream chunk, 1 by default
self.__in = data # The stream
self.out = bytearray()
self.max_offset = 0
self.max_match_length = 0
self.bits_count = 0
self.bytes_count = 0
self.verbose = verbose
def read_bit(self):
"""read next bit from the stream, reloads the tag if necessary"""
if self.__current_bit != 0:
# Move to the next bit
self.__current_bit -= 1
else:
# Select the MSB
self.__current_bit = (self.__tag_size * 8) - 1
# Read new data
self.__tag = self.read_byte()
self.bytes_count -= 1
for i in range(self.__tag_size - 1):
self.__tag += self.read_byte() << (8 * (i + 1))
# Then extract the bit in question
bit = (self.__tag >> ((self.__tag_size * 8) - 1)) & 0x01
# And shift it out of the tag
self.__tag <<= 1
self.bits_count += 1
return bit
def read_byte(self):
"""read next byte from the stream"""
result = self.__in.read(1)[0]
self.bytes_count += 1
return result
def read_fixed_number(self, num_bits, init=0):
"""reads a fixed bit-length number"""
result = init
for i in range(num_bits):
result = (result << 1) + self.read_bit()
return result
def read_variable_number(self):
"""return a variable bit-length number x, x >= 2
reads a bit until the next bit in the pair is not set"""
result = 1
result = (result << 1) + self.read_bit()
while self.read_bit():
result = (result << 1) + self.read_bit()
return result
def read_set_bits(self, max_bits, set_value=1):
"""read bits as long as their set or a maximum is reached"""
# Reads consecutive set bits from the bitstream, up to max_bits or until a zero is encountered.
# Returns the number of set bits read.
result = 0
while result < max_bits and self.read_bit() == set_value:
result += 1
return result
def back_copy(self, offset, length=1):
s = "offset %d, length %d:" % (offset, length)
for i in range(length):
b = self.out[-offset]
s += " %02x" % b
self.out.append(b)
self.print(s)
self.max_offset = max(self.max_offset, offset)
self.max_match_length = max(self.max_match_length, length)
return
def read_literal(self, value=None):
if value is None:
b = self.read_byte()
self.print("%02x" % b)
self.out.append(b)
else:
self.print("%02x" % value)
self.out.append(value)
return False
def print(self, *args, **kwargs):
if self.verbose:
print(*args, **kwargs)
class Decompress(BitsDecompress):
def __init__(self, data, verbose=True):
BitsDecompress.__init__(self, data, tag_size=1, verbose=verbose)
self.__pair = True # paired sequence
self.__last_offset = 0
self.__functions = [
self.__literal, # 0 = literal
self.__block, # 1 = block
self.__short_block, # 2 = short block
self.__single_byte] # 3 = single byte
return
def __literal(self):
self.print("Literal: ", end="")
self.read_literal()
self.__pair = True
return False
def __block(self):
b = self.read_variable_number() - 2
if b == 0 and self.__pair: # reuse the same offset
offset = self.__last_offset
length = self.read_variable_number() # 2-
self.print("Block with reused ", end="")
else:
if self.__pair:
b -= 1
offset = b * 256 + self.read_byte()
length = self.read_variable_number() # 2-
length += self.__length_delta(offset)
self.print("Block with encoded ", end="")
self.__last_offset = offset
self.back_copy(offset, length)
self.__pair = False
return False
@staticmethod
def __length_delta(offset):
if offset < 0x80 or 0x7D00 <= offset:
return 2
elif 0x500 <= offset:
return 1
return 0
def __short_block(self):
b = self.read_byte()
if b <= 1: # likely 0
self.print("Short block offset %d: EOF" % b)
return True
length = 2 + (b & 0x01) # 2-3
offset = b >> 1 # 1-127
self.print("Short block ", end="")
self.back_copy(offset, length)
self.__last_offset = offset
self.__pair = False
return False
def __single_byte(self):
offset = self.read_fixed_number(4) # 0-15
if offset:
self.print("Single byte ", end="")
self.back_copy(offset)
else:
self.print("Single byte zero: ", end="")
self.read_literal(0)
self.__pair = True
return False
def do(self):
"""returns decompressed buffer and consumed bytes counter"""
# First byte is a literal
self.print("Initial literal: ", end="")
self.read_literal()
while True:
# Read the gamma-coded (?) bitstream and then execute the relevant decoder based on what's found
if self.__functions[self.read_set_bits(3)]():
break
return self.out
if __name__ == "__main__":
input_file = open(sys.argv[1], "rb")
decompressor = Decompress(input_file)
decompressed = decompressor.do()
output_file = open(sys.argv[1] + ".out", "wb")
output_file.write(decompressed)
output_file.close()
input_file.close()
print("Max backref distance %d, max backref length %d" % (decompressor.max_offset, decompressor.max_match_length))
print("%d bits (= %d bytes) + %d bytes data" % (
decompressor.bits_count,
decompressor.bits_count / 8,
decompressor.bytes_count))
compressed_size = decompressor.bits_count / 8 + decompressor.bytes_count
decompressed_size = len(decompressed)
ratio = (decompressed_size - compressed_size) / decompressed_size
print("Total: %d bytes decompressed to %d bytes (%.2f%% compression)" % (compressed_size, decompressed_size, ratio * 100))
"""
The format is:
1st byte: literal
Remaining bytes: a bitstream interleaved with byte-sized data where appropriate.
The bitstream is read left to right. It encodes four different types of data, plus parameters where appropriate. As
soon as a byte of extra data is needed, it is stored in the following byte in the file (byte-aligned). The bitstream
then skips past the data as it continues.
The four different types of data are encoded as:
literal -> 0
block -> 10
short block -> 110
single byte -> 111
0: literal
==========
One byte of data is emitted.
1: block
========
An LZ block, referencing some data previously emitted. Offset and length are unlimited.
First parameter: encoded_offset, variable-length number (in bitstream)
Second parameter: length, variable-length number (in bitstream)
If encoded_offset == 2 and the last block was a literal or a single byte then
Emit length bytes from (last_offset) bytes ago
Else
last_offset = (encoded_offset - 3) << 8 + next_byte()
if (0 <= length <= 127) length += 2
else if (1280 <= length < 32000) length += 1
else if (length >= 32000) length += 2;
Emit length bytes from (last_offset) bytes ago
Note that you can have encoded_offset = 2 and last block is not the right type; this results in the decoded offset
being somewhat screwy - I'm not sure this ever happens. The Z80 decoder will have on offset of $ffnn at this point.
2: short block
==============
An LZ block, encoded in a single byte. The offset is in the range 1..127 (7 bits)
and the length in the range 2..3 (1 bit).
Next byte is (offset << 1) | (length - 2)
If offset = 0 then
This is the end of the file - terminate
Else
Emit length bytes from (offset) bytes ago
last_offset = offset
3: single byte
==============
An LZ block with a length of 1, encoded in four bits.
Parameter: offset, 4-bit number (in bitstream)
If offset = 0
Emit 0
Else
Emit 1 byte from (offset) bytes ago
Variable length numbers
=======================
These are numbers with value at least 2. They are encoded as:
1. The highest bit is implicit
2. The next bit is emitted
3. If there are more bits, a 1 is emitted and go to 2, else a zero is emitted
Decoding is the reverse. Here's some examples:
Example Binary Encoding
2 10 00
3 11 10
4 100 0100
7 111 1110
14 1110 111100
170 10101010 01110111011100
This results in a more compact representation for smaller numbers, plus (in theory) no limit to the size of the number
encoded.
Analysis
========
Literals (after the first byte) cost 10 bits per byte to encode.
LZ runs where there are occasional bytes which have changed are able to avoid encoding the offset more than once.
These occasional differences then have two different ways to encode the non-matching byte (a literal, or a 4-bit
encoded reference to a recent value (or zero).
"""