Skip to content

Commit

Permalink
Allow pre-compressed data to be written into a zipfile.
Browse files Browse the repository at this point in the history
Intended to be used when constructing zip files via a system that can
parallelize and cache all actions including compressing individual input
file contents so that you can distribute actions & avoid recomputation.
  • Loading branch information
gpshead committed Jan 10, 2024
1 parent f19b93f commit 9956ec0
Show file tree
Hide file tree
Showing 2 changed files with 187 additions and 41 deletions.
75 changes: 75 additions & 0 deletions Lib/test/test_zipfile/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,12 +645,27 @@ def test_add_file_after_2107(self):
zinfo = zipfp.getinfo(TESTFN)
self.assertEqual(zinfo.date_time, (2107, 12, 31, 23, 59, 59))

def test_write_zinfo(self):
    """Check that write() accepts a caller-supplied ZipInfo via zinfo=.

    The member is stored under the ZipInfo's name and must read back
    equal to self.data.
    """
    member = "test_write_zinfo"
    info = zipfile.ZipInfo(member)
    with zipfile.ZipFile(TESTFN2, "w") as archive:
        archive.write(TESTFN, zinfo=info)
    with zipfile.ZipFile(TESTFN2, "r") as reader:
        stored = reader.read(member)
        self.assertEqual(stored, self.data)


@requires_zlib()
class DeflateTestsWithSourceFile(AbstractTestsWithSourceFile,
unittest.TestCase):
compression = zipfile.ZIP_DEFLATED

@classmethod
def setUpClass(cls):
    """Precompute the fixtures used by the precompressed-write tests.

    Sets cls.z_data to cls.data run through a zlib compressor (level 2,
    wbits=-15) and cls.data_crc to the CRC-32 of cls.data.
    """
    super().setUpClass()
    # zlib is imported locally, as in the original, rather than at module
    # level.
    import zlib
    # Negative wbits selects a raw deflate stream (no zlib header/trailer).
    deflater = zlib.compressobj(2, zlib.DEFLATED, -15)
    pieces = [deflater.compress(cls.data), deflater.flush()]
    cls.z_data = b"".join(pieces)
    cls.data_crc = zlib.crc32(cls.data, 0)

def test_per_file_compression(self):
"""Check that files within a Zip archive can have different
compression options."""
Expand All @@ -662,6 +677,66 @@ def test_per_file_compression(self):
self.assertEqual(sinfo.compress_type, zipfile.ZIP_STORED)
self.assertEqual(dinfo.compress_type, zipfile.ZIP_DEFLATED)

def test_write_precompressed(self):
    """Check write(..., precompressed=True) with a source file on disk.

    TESTFN is overwritten with the already-deflated payload; reading the
    member back from the archive must yield the uncompressed self.data.
    """
    member = "test/write/precompressed"
    with open(TESTFN, "wb") as source:
        source.write(self.z_data)

    info = zipfile.ZipInfo(member)
    # CRC and file_size describe the *uncompressed* data and must be
    # filled in by the caller when precompressed=True.
    info.CRC = self.data_crc
    info.file_size = len(self.data)

    with zipfile.ZipFile(TESTFN2, "w") as archive:
        archive.write(TESTFN, compress_type=zipfile.ZIP_DEFLATED,
                      zinfo=info, precompressed=True)
    with zipfile.ZipFile(TESTFN2, "r") as reader:
        self.assertEqual(reader.read(member), self.data)

def test_write_precompressed_fileobj(self):
    """Check write(..., precompressed=True) with an in-memory file object.

    The deflated payload is supplied through an io.BytesIO instead of a
    path; the member must read back as the uncompressed self.data.
    """
    member = "test/write/precompressed/fileobj"
    info = zipfile.ZipInfo(member)
    # Caller-provided metadata for the uncompressed contents.
    info.CRC = self.data_crc
    info.file_size = len(self.data)

    payload = io.BytesIO(self.z_data)
    with zipfile.ZipFile(TESTFN2, "w") as archive:
        archive.write(payload, compress_type=zipfile.ZIP_DEFLATED,
                      zinfo=info, precompressed=True)
    with zipfile.ZipFile(TESTFN2, "r") as reader:
        extracted = reader.read(member)
        self.assertEqual(extracted, self.data)

def test_writestr_precompressed(self):
    """Check writestr(..., precompressed=True) with in-memory bytes.

    The ZipInfo carries the CRC-32 and size of the uncompressed data;
    the member must read back equal to self.data.
    """
    member = "writestr/precomp"
    info = zipfile.ZipInfo(member)
    info.CRC = self.data_crc
    info.file_size = len(self.data)

    with zipfile.ZipFile(TESTFN, "w") as archive:
        archive.writestr(info, self.z_data,
                         compress_type=zipfile.ZIP_DEFLATED,
                         precompressed=True)
    with zipfile.ZipFile(TESTFN, "r") as reader:
        self.assertEqual(reader.read(member), self.data)

def test_writestr_precompressed_crc_missing(self):
    """Check writestr(precompressed=True) fails when ZipInfo.CRC is unset.

    Only file_size is populated on the ZipInfo.  The expected exception
    is AssertionError when assertions are enabled and struct.error when
    they are not (__debug__ is false).
    """
    info = zipfile.ZipInfo("precompressed/crc/missing")
    info.file_size = len(self.data)  # CRC is deliberately left unset.

    expected_exc = AssertionError if __debug__ else struct.error
    with zipfile.ZipFile(TESTFN, "w") as archive, \
            self.assertRaises(expected_exc):
        archive.writestr(info, self.z_data,
                         compress_type=zipfile.ZIP_DEFLATED,
                         precompressed=True)

def test_writestr_precompressed_no_zinfo(self):
    """Check writestr(precompressed=True) rejects a plain archive name.

    Passing a str instead of a ZipInfo must raise ValueError.
    """
    with zipfile.ZipFile(TESTFN, "w") as archive, \
            self.assertRaises(ValueError):
        archive.writestr("writestr/no_zinfo", self.z_data,
                         compress_type=zipfile.ZIP_DEFLATED,
                         precompressed=True)

def test_write_precompressed_no_zinfo(self):
    """Check write(precompressed=True) without zinfo raises AssertionError.

    The test expects an AssertionError, so it is skipped when assertions
    are disabled (__debug__ is false).
    """
    if not __debug__:
        self.skipTest("__debug__ assertion test")

    with zipfile.ZipFile(TESTFN, "w") as archive, \
            self.assertRaises(AssertionError):
        archive.write(TESTFN2, "precomp/no_zinfo",
                      compress_type=zipfile.ZIP_DEFLATED,
                      precompressed=True)

@requires_bz2()
class Bzip2TestsWithSourceFile(AbstractTestsWithSourceFile,
unittest.TestCase):
Expand Down
153 changes: 112 additions & 41 deletions Lib/zipfile/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,8 @@ def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
self.external_attr = 0 # External file attributes
self.compress_size = 0 # Size of the compressed file
self.file_size = 0 # Size of the uncompressed file
# Other attributes are set by class ZipFile:
self.CRC = None # None indicates that it has not yet been filled in.
# Attributes set by class ZipFile:
# header_offset Byte offset to the file header
# CRC CRC-32 of the uncompressed file

Expand Down Expand Up @@ -1008,7 +1009,9 @@ def _update_crc(self, newdata):
self._running_crc = crc32(newdata, self._running_crc)
# Check the CRC if we're at the end of the file
if self._eof and self._running_crc != self._expected_crc:
raise BadZipFile("Bad CRC-32 for file %r" % self.name)
raise BadZipFile(
f"Bad CRC-32 for file {self.name!r} expected "
f"{hex(self._expected_crc)} != {hex(self._running_crc)}")

def read1(self, n):
"""Read up to n bytes with at most one read() system call."""
Expand Down Expand Up @@ -1184,15 +1187,24 @@ def tell(self):


class _ZipWriteFile(io.BufferedIOBase):
def __init__(self, zf, zinfo, zip64):
def __init__(self, zf, zinfo, zip64, *, precompressed=False):
self._zinfo = zinfo
self._zip64 = zip64
self._zipfile = zf
self._compressor = _get_compressor(zinfo.compress_type,
zinfo._compresslevel)
self._file_size = 0
if precompressed:
self._compute_crc = False # Precomputed in zinfo.
self._crc = zinfo.CRC
self._compute_file_size = False # Precomputed in zinfo.
self._file_size = zinfo.file_size
self._compressor = None
else:
self._compute_crc = True
self._crc = 0
self._compute_file_size = True
self._file_size = 0
self._compressor = _get_compressor(zinfo.compress_type,
zinfo._compresslevel)
self._compress_size = 0
self._crc = 0

@property
def _fileobj(self):
Expand All @@ -1211,12 +1223,14 @@ def write(self, data):
else:
data = memoryview(data)
nbytes = data.nbytes
self._file_size += nbytes
if self._compute_file_size:
self._file_size += nbytes

self._crc = crc32(data, self._crc)
if self._compute_crc:
self._crc = crc32(data, self._crc)
if self._compressor:
data = self._compressor.compress(data)
self._compress_size += len(data)
self._compress_size += len(data)
self._fileobj.write(data)
return nbytes

Expand All @@ -1230,11 +1244,11 @@ def close(self):
buf = self._compressor.flush()
self._compress_size += len(buf)
self._fileobj.write(buf)
self._zinfo.compress_size = self._compress_size
else:
self._zinfo.compress_size = self._file_size
self._zinfo.CRC = self._crc
self._zinfo.file_size = self._file_size
self._zinfo.compress_size = self._compress_size
if self._compute_crc:
self._zinfo.CRC = self._crc
if self._compute_file_size:
self._zinfo.file_size = self._file_size

if not self._zip64:
if self._file_size > ZIP64_LIMIT:
Expand Down Expand Up @@ -1564,7 +1578,8 @@ def read(self, name, pwd=None):
with self.open(name, "r", pwd) as fp:
return fp.read()

def open(self, name, mode="r", pwd=None, *, force_zip64=False):
def open(self, name, mode="r", pwd=None, *, force_zip64=False,
precompressed=False):
"""Return file-like object for 'name'.
name is a string for the file name within the ZIP file, or a ZipInfo
Expand All @@ -1579,6 +1594,10 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False):
2 GiB, pass force_zip64 to use the ZIP64 format, which can handle large
files. If the size is known in advance, it is best to pass a ZipInfo
instance for name, with zinfo.file_size set.
precompressed is for advanced users creating a zip structure out
of precompressed data; its use requires "name" to be a ZipInfo with
both .file_size and .CRC precomputed and populated.
"""
if mode not in {"r", "w"}:
raise ValueError('open() requires mode "r" or "w"')
Expand All @@ -1588,20 +1607,23 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False):
raise ValueError(
"Attempt to use ZIP archive that was already closed")

# Make sure we have an info object
if precompressed:
assert mode == "w", "precompressed is only for writing"
assert isinstance(name, ZipInfo), "precompressed needs ZipInfo"
assert name.CRC is not None, "precompressed requires ZipInfo.CRC"

if isinstance(name, ZipInfo):
# 'name' is already an info object
zinfo = name
elif mode == 'w':
zinfo = ZipInfo(name)
zinfo.compress_type = self.compression
zinfo._compresslevel = self.compresslevel
else:
# Get info object for name
else: # implies mode == "r"
zinfo = self.getinfo(name)

if mode == 'w':
return self._open_to_write(zinfo, force_zip64=force_zip64)
return self._open_to_write(zinfo, force_zip64=force_zip64,
precompressed=precompressed)

if self._writing:
raise ValueError("Can't read from the ZIP file while there "
Expand Down Expand Up @@ -1662,7 +1684,7 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False):
zef_file.close()
raise

def _open_to_write(self, zinfo, force_zip64=False):
def _open_to_write(self, zinfo, force_zip64=False, precompressed=False):
if force_zip64 and not self._allowZip64:
raise ValueError(
"force_zip64 is True, but allowZip64 was False when opening "
Expand All @@ -1673,9 +1695,10 @@ def _open_to_write(self, zinfo, force_zip64=False):
"another write handle open on it. "
"Close the first handle before opening another.")

# Size and CRC are overwritten with correct data after processing the file
# Size and CRC are set with correct data after processing the file.
zinfo.compress_size = 0
zinfo.CRC = 0
if not precompressed:
zinfo.CRC = 0

zinfo.flag_bits = 0x00
if zinfo.compress_type == ZIP_LZMA:
Expand All @@ -1702,7 +1725,7 @@ def _open_to_write(self, zinfo, force_zip64=False):
self.fp.write(zinfo.FileHeader(zip64))

self._writing = True
return _ZipWriteFile(self, zinfo, zip64)
return _ZipWriteFile(self, zinfo, zip64, precompressed=precompressed)

def extract(self, member, path=None, pwd=None):
"""Extract a member from the archive to the current working directory,
Expand Down Expand Up @@ -1818,9 +1841,24 @@ def _writecheck(self, zinfo):
" would require ZIP64 extensions")

def write(self, filename, arcname=None,
compress_type=None, compresslevel=None):
"""Put the bytes from filename into the archive under the name
arcname."""
compress_type=None, compresslevel=None,
*, zinfo=None, precompressed=None):
"""Put the bytes from a file into the archive as arcname.
filename may instead be a file-like object open for reading in binary
mode. If so it will be read from and closed. It must have a .name
attribute (as file objects do) in order to construct a ZipInfo unless
the zinfo parameter is provided.
A ZipInfo instance zinfo may be supplied instead of arcname. If
neither is supplied, filename without a drive letter or leading path
separators will be used as the name within the archive.
If *precompressed* is True, file data is assumed to already have been
compressed in the appropriate manner and *zinfo* MUST be supplied
with details about the original file including the .file_size and .CRC
attributes filled in. Advanced use only; no validation is performed!
"""
if not self.fp:
raise ValueError(
"Attempt to write to ZIP archive that was already closed")
Expand All @@ -1829,8 +1867,21 @@ def write(self, filename, arcname=None,
"Can't write to ZIP archive while an open writing handle exists"
)

zinfo = ZipInfo.from_file(filename, arcname,
strict_timestamps=self._strict_timestamps)
if arcname and zinfo:
raise ValueError("arcname and zinfo are mutually exclusive.")
if hasattr(filename, 'read'):
fileobj = filename
if not zinfo:
filename = fileobj.name # zinfo will be populated from this.
else:
fileobj = None

if precompressed:
assert zinfo is not None, "precompressed requires zinfo"
assert zinfo.CRC is not None
elif not zinfo:
zinfo = ZipInfo.from_file(filename, arcname,
strict_timestamps=self._strict_timestamps)

if zinfo.is_dir():
zinfo.compress_size = 0
Expand All @@ -1847,19 +1898,36 @@ def write(self, filename, arcname=None,
else:
zinfo._compresslevel = self.compresslevel

with open(filename, "rb") as src, self.open(zinfo, 'w') as dest:
shutil.copyfileobj(src, dest, 1024*8)
with (fileobj if fileobj else open(filename, "rb")) as src, \
self.open(zinfo, "w", precompressed=precompressed) as dest:
shutil.copyfileobj(src, dest)

def writestr(self, zinfo_or_arcname, data,
compress_type=None, compresslevel=None):
"""Write a file into the archive. The contents is 'data', which
may be either a 'str' or a 'bytes' instance; if it is a 'str',
it is encoded as UTF-8 first.
'zinfo_or_arcname' is either a ZipInfo instance or
the name of the file in the archive."""
compress_type=None, compresslevel=None, *,
precompressed=False):
"""Write a file into the archive from data in memory.
The contents is *data*, which may be either a 'str' or a 'bytes'
instance. If it is a 'str', it is encoded as UTF-8 first.
*zinfo_or_arcname* is either a ZipInfo instance or
the name of the file in the archive.
*compress_type* and *compresslevel* if supplied each override the
related setting from this ZipFile instance or within any supplied
ZipInfo.
If *precompressed* is True, the *data* is assumed to already have been
compressed in the appropriate manner and *zinfo_or_arcname* MUST be a
ZipInfo instance with details about the original file including the
.file_size and .CRC attributes filled in. Advanced use only; no
validation is performed!
"""
if isinstance(data, str):
data = data.encode("utf-8")
if not isinstance(zinfo_or_arcname, ZipInfo):
if precompressed:
raise ValueError("ZipInfo required when precompressed.")
zinfo = ZipInfo(filename=zinfo_or_arcname,
date_time=time.localtime(time.time())[:6])
zinfo.compress_type = self.compression
Expand All @@ -1886,10 +1954,13 @@ def writestr(self, zinfo_or_arcname, data,
if compresslevel is not None:
zinfo._compresslevel = compresslevel

zinfo.file_size = len(data) # Uncompressed size
if not precompressed:
zinfo.file_size = len(data) # Uncompressed size
else:
assert zinfo.CRC is not None
with self._lock:
with self.open(zinfo, mode='w') as dest:
dest.write(data)
with self.open(zinfo, mode="w", precompressed=precompressed) as dst:
dst.write(data)

def mkdir(self, zinfo_or_directory_name, mode=511):
"""Creates a directory inside the zip archive."""
Expand Down

0 comments on commit 9956ec0

Please sign in to comment.