aplib.py

#Kabopan - Readable Algorithms. Public Domain, 2007-2009
"""
aPLib, LZSS based lossless compression algorithm
Jorgen Ibsen U{http://www.ibsensoftware.com}
Original: http://code.google.com/p/kabopan/source/browse/trunk/kbp/comp/aplib.py
"""

import sys
import io


class BitsDecompress:
    """bit machine for variable-sized auto-reloading tag decompression"""
    def __init__(self, data, tag_size, verbose=True):
        self.__current_bit = 0  # The count of bits available to use in the tag
        self.__tag = None  # The tag is a bitstream dispersed through the file and read in chunks.
                           # This is the current chunk, shifted so the MSB is the next bit.
        self.__tag_size = tag_size  # Number of bytes per bitstream chunk, 1 by default
        self.__in = data  # The stream
        self.out = bytearray()
        self.max_offset = 0
        self.max_match_length = 0
        self.bits_count = 0
        self.bytes_count = 0
        self.verbose = verbose

    def read_bit(self):
        """read next bit from the stream, reloads the tag if necessary"""
        
        if self.__current_bit != 0:
            # Move to the next bit
            self.__current_bit -= 1
        else:
            # Select the MSB
            self.__current_bit = (self.__tag_size * 8) - 1
            # Read new data
            self.__tag = self.read_byte()
            self.bytes_count -= 1
            for i in range(self.__tag_size - 1):
                self.__tag += self.read_byte() << (8 * (i + 1))

        # Then extract the bit in question
        bit = (self.__tag >> ((self.__tag_size * 8) - 1)) & 0x01
        # And shift it out of the tag
        self.__tag <<= 1
        self.bits_count += 1
        return bit

    def read_byte(self):
        """read next byte from the stream"""
        result = self.__in.read(1)[0]
        self.bytes_count += 1
        return result

    def read_fixed_number(self, num_bits, init=0):
        """reads a fixed bit-length number"""
        result = init
        for i in range(num_bits):
            result = (result << 1) + self.read_bit()
        return result

    def read_variable_number(self):
        """return a variable bit-length number x, x >= 2
        reads a bit until the next bit in the pair is not set"""
        result = 1
        result = (result << 1) + self.read_bit()
        while self.read_bit():
            result = (result << 1) + self.read_bit()
        return result

    def read_set_bits(self, max_bits, set_value=1):
        """read bits as long as their set or a maximum is reached"""
        # Reads consecutive set bits from the bitstream, up to max_bits or until a zero is encountered.
        # Returns the number of set bits read.
        result = 0
        while result < max_bits and self.read_bit() == set_value:
            result += 1
        return result

    def back_copy(self, offset, length=1):
        s = "offset %d, length %d:" % (offset, length)
        for i in range(length):
            b = self.out[-offset]
            s += " %02x" % b
            self.out.append(b)
        self.print(s)
        self.max_offset = max(self.max_offset, offset)
        self.max_match_length = max(self.max_match_length, length)
        return

    def read_literal(self, value=None):
        if value is None:
            b = self.read_byte()
            self.print("%02x" % b)
            self.out.append(b)
        else:
            self.print("%02x" % value)
            self.out.append(value)
        return False

    def print(self, *args, **kwargs):
        if self.verbose:
            print(*args, **kwargs)


class Decompress(BitsDecompress):
    def __init__(self, data, verbose=True):
        BitsDecompress.__init__(self, data, tag_size=1, verbose=verbose)
        self.__pair = True    # paired sequence
        self.__last_offset = 0
        self.__functions = [
            self.__literal,      # 0 = literal
            self.__block,        # 1 = block
            self.__short_block,  # 2 = short block
            self.__single_byte]  # 3 = single byte
        return

    def __literal(self):
        self.print("Literal: ", end="")
        self.read_literal()
        self.__pair = True
        return False

    def __block(self):
        b = self.read_variable_number() - 2
        if b == 0 and self.__pair:    # reuse the same offset
            offset = self.__last_offset
            length = self.read_variable_number()    # 2-
            self.print("Block with reused ", end="")
        else:
            if self.__pair:
                b -= 1
            offset = b * 256 + self.read_byte()
            length = self.read_variable_number()    # 2-
            length += self.__length_delta(offset)
            self.print("Block with encoded ", end="")
        self.__last_offset = offset
        self.back_copy(offset, length)
        self.__pair = False
        return False

    @staticmethod
    def __length_delta(offset):
        if offset < 0x80 or 0x7D00 <= offset:
            return 2
        elif 0x500 <= offset:
            return 1
        return 0

    def __short_block(self):
        b = self.read_byte()
        if b <= 1:    # likely 0
            self.print("Short block offset %d: EOF" % b)
            return True
        length = 2 + (b & 0x01)    # 2-3
        offset = b >> 1    # 1-127
        self.print("Short block ", end="")
        self.back_copy(offset, length)
        self.__last_offset = offset
        self.__pair = False
        return False

    def __single_byte(self):
        offset = self.read_fixed_number(4)  # 0-15
        if offset:
            self.print("Single byte ", end="")
            self.back_copy(offset)
        else:
            self.print("Single byte zero: ", end="")
            self.read_literal(0)
        self.__pair = True
        return False

    def do(self):
        """returns decompressed buffer and consumed bytes counter"""
        # First byte is a literal
        self.print("Initial literal: ", end="")
        self.read_literal()
        while True:
            # Read the gamma-coded (?) bitstream and then execute the relevant decoder based on what's found
            if self.__functions[self.read_set_bits(3)]():
                break
        return self.out

if __name__ == "__main__":
    input_file = open(sys.argv[1], "rb")
    decompressor = Decompress(input_file)
    decompressed = decompressor.do()
    output_file = open(sys.argv[1] + ".out", "wb")
    output_file.write(decompressed)
    output_file.close()
    input_file.close()
    print("Max backref distance %d, max backref length %d" % (decompressor.max_offset, decompressor.max_match_length))
    print("%d bits (= %d bytes) + %d bytes data" % (
        decompressor.bits_count,
        decompressor.bits_count / 8,
        decompressor.bytes_count))
    compressed_size = decompressor.bits_count / 8 + decompressor.bytes_count
    decompressed_size = len(decompressed)
    ratio = (decompressed_size - compressed_size) / decompressed_size
    print("Total: %d bytes decompressed to %d bytes (%.2f%% compression)" % (compressed_size, decompressed_size, ratio * 100))

"""

The format is:

1st byte: literal

Remaining bytes: a bitstream interleaved with byte-sized data where appropriate.

The bitstream is read left to right. It encodes four different types of data, plus parameters where appropriate. As
soon as a byte of extra data is needed, it is stored in the following byte in the file (byte-aligned). The bitstream
then skips past the data as it continues.

The four different types of data are encoded as:

literal     -> 0
block       -> 10
short block -> 110
single byte -> 111

0: literal
==========

One byte of data is emitted.

1: block
========

An LZ block, referencing some data previously emitted. Offset and length are unlimited.

First parameter: encoded_offset, variable-length number (in bitstream)
Second parameter: length, variable-length number (in bitstream)

If encoded_offset == 2 and the last block was a literal or a single byte then
    Emit length bytes from (last_offset) bytes ago
Else
    last_offset = (encoded_offset - 3) << 8 + next_byte()
    if (0 <= length <= 127) length += 2
    else if (1280 <= length < 32000) length += 1
    else if (length >= 32000) length += 2;
    Emit length bytes from (last_offset) bytes ago

Note that you can have encoded_offset = 2 and last block is not the right type; this results in the decoded offset
being somewhat screwy - I'm not sure this ever happens. The Z80 decoder will have on offset of $ffnn at this point.

2: short block
==============

An LZ block, encoded in a single byte. The offset is in the range 1..127 (7 bits)
and the length in the range 2..3 (1 bit).

Next byte is (offset << 1) | (length - 2)
If offset = 0 then
    This is the end of the file - terminate
Else
    Emit length bytes from (offset) bytes ago
    last_offset = offset

3: single byte
==============

An LZ block with a length of 1, encoded in four bits.

Parameter: offset, 4-bit number (in bitstream)

If offset = 0
    Emit 0
Else
    Emit 1 byte from (offset) bytes ago

Variable length numbers
=======================

These are numbers with value at least 2. They are encoded as:

1. The highest bit is implicit
2. The next bit is emitted
3. If there are more bits, a 1 is emitted and go to 2, else a zero is emitted

Decoding is the reverse. Here's some examples:

Example     Binary      Encoding
2           10          00
3           11          10
4           100         0100
7           111         1110
14          1110        111100
170         10101010    01110111011100

This results in a more compact representation for smaller numbers, plus (in theory) no limit to the size of the number
encoded.

Analysis
========

Literals (after the first byte) cost 10 bits per byte to encode.
LZ runs where there are occasional bytes which have changed are able to avoid encoding the offset more than once.
These occasional differences then have two different ways to encode the non-matching byte (a literal, or a 4-bit
encoded reference to a recent value (or zero).

"""