Skip to content

Commit

Permalink
add conda_fmt module
Browse files Browse the repository at this point in the history
  • Loading branch information
dholth committed Jul 31, 2024
1 parent b6fec71 commit 43414cc
Show file tree
Hide file tree
Showing 7 changed files with 200 additions and 80 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

* Add `transmute_stream(...)` to create `.conda` from `(TarFile, TarInfo)`
iterators, allowing more creative data sources than just `.tar.bz2` inputs. (#90)
* Add `conda_fmt` module with `TarFile` interface for creating `.conda`
archives, also used by `transmute`. (#90)

## 0.10.0 (2024-06)

Expand Down
163 changes: 163 additions & 0 deletions conda_package_streaming/conda_fmt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
"""
Tools for creating ``.conda``-format archives.
Uses ``tempfile.SpooledTemporaryFile`` to buffer ``pkg-*.tar`` and
``info-*.tar``, then compress directly into an open `ZipFile` at the end.
`SpooledTemporaryFile` buffers the first 10MB of the package and its metadata in
memory, but writes out to disk for larger packages.
"""

from __future__ import annotations

import json
import shutil
import tarfile
import tempfile
import zipfile
from contextlib import contextmanager
from pathlib import Path
from typing import Callable, Iterator

import zstandard

# Compression level handed to the default ``zstandard.ZstdCompressor`` built by
# ``conda_builder``. Increase to reduce speed and increase compression (levels
# above 19 use much more memory).
ZSTD_COMPRESS_LEVEL = 19
# ``threads`` argument handed to the same default compressor. Increase to
# reduce compression and increase speed.
ZSTD_COMPRESS_THREADS = 1

# Stored in the archive's ``metadata.json`` under ``conda_pkg_format_version``.
CONDA_PACKAGE_FORMAT_VERSION = 2

# Uncompressed component size above which ``force_zip64`` is requested for its
# zip member. Sits 256 KiB (1 << 18) plus one byte below ``zipfile.ZIP64_LIMIT``
# to account for growth from "2 GB of /dev/urandom", so the member does not
# exceed ZIP64_LIMIT after compression.
CONDA_ZIP64_LIMIT = zipfile.ZIP64_LIMIT - (1 << 18) - 1


def anonymize(tarinfo: tarfile.TarInfo):
    """
    Strip ownership from ``tarinfo`` and hand the same object back.

    Intended as a filter, e.g. ``tarfile.add(..., filter=anonymize)``. Numeric
    uid/gid become 0 and the user/group names become empty strings. mtime and
    every other field are left untouched.
    """
    # Numeric owner and group ids.
    tarinfo.uid = 0
    tarinfo.gid = 0
    # Symbolic owner and group names.
    tarinfo.uname = ""
    tarinfo.gname = ""
    return tarinfo


class CondaTarFile(tarfile.TarFile):
    """
    ``TarFile`` subclass that splits members between two archives.

    Members whose name satisfies ``is_info(name)`` are written to a second
    ``info`` tar; everything else is written to this tar.

    Create this with ``conda_builder(...)`` which sets up the component
    archives, then wraps them into a ``.conda`` on exit.
    """

    # Secondary archive that receives the members selected by ``is_info``.
    info_tar: tarfile.TarFile
    # Predicate called with a member name; a true result selects ``info_tar``.
    is_info: Callable

    def __init__(
        self,
        *args,
        info_tar: tarfile.TarFile,
        is_info=lambda name: name.startswith("info/"),
        **kwargs,
    ):
        # Positional and remaining keyword arguments go to ``TarFile`` as-is.
        super().__init__(*args, **kwargs)
        self.info_tar = info_tar
        self.is_info = is_info

    def addfile(self, tarinfo, fileobj=None):
        """
        Add the TarInfo object ``tarinfo`` to one of the two archives.

        If ``fileobj`` is given, it should be a binary file, and
        ``tarinfo.size`` bytes are read from it and added to the archive. You
        can create TarInfo objects directly, or by using ``gettarinfo()``.

        When ``self.is_info(tarinfo.name)`` returns ``True`` the member goes to
        ``self.info_tar``; otherwise it is added to this archive.
        """
        if not self.is_info(tarinfo.name):
            return super().addfile(tarinfo, fileobj)
        return self.info_tar.addfile(tarinfo, fileobj=fileobj)


@contextmanager
def conda_builder(
    file_id,
    path,
    *,
    compressor: Callable[
        [], zstandard.ZstdCompressor
    ] = lambda: zstandard.ZstdCompressor(
        level=ZSTD_COMPRESS_LEVEL, threads=ZSTD_COMPRESS_THREADS
    ),
    is_info: Callable[[str], bool] = lambda filename: filename.startswith("info/"),
) -> Iterator[CondaTarFile]:
    """
    Produce a ``TarFile`` subclass used to build a ``.conda`` package. The
    subclass delegates ``addfile()`` to the ``info-`` component when ``is_info``
    returns True.

    When the context manager exits without an exception,
    ``{path}/{file_id}.conda`` is written with the component tar archives. If
    the ``with`` body raises, no output file is created.

    Args:
        file_id: output filename without extension
        path: destination directory for the ``.conda`` package
        compressor: A function that creates instances of
            ``zstandard.ZstdCompressor()``.
        is_info: A function that returns True for member names that belong in
            the ``info-`` component.

    Yields:
        ``CondaTarFile``

    Raises:
        FileExistsError: on exit, if the output file already exists (the
            archive is opened in exclusive-create mode).
    """
    output_path = Path(path, f"{file_id}.conda")

    # Keep up to 10MB of each component in memory, then spill to disk. The
    # default ``max_size=0`` never rolls over, which would hold arbitrarily
    # large packages entirely in memory.
    spool_max_size = 10 * 1024 * 1024

    with tempfile.SpooledTemporaryFile(
        max_size=spool_max_size
    ) as info_file, tempfile.SpooledTemporaryFile(
        max_size=spool_max_size
    ) as pkg_file:
        with tarfile.TarFile(fileobj=info_file, mode="w") as info_tar, CondaTarFile(
            fileobj=pkg_file, mode="w", info_tar=info_tar, is_info=is_info
        ) as pkg_tar:
            # If we wanted to compress these at a low setting to save temporary
            # space, we could insert a file object that counts bytes written in
            # front of a zstd (level between 1..3) compressor.
            yield pkg_tar

        # Leaving the block above closed both tars, so the end-of-archive
        # padding is written and tell() reports the final uncompressed sizes.
        info_size = info_file.tell()
        pkg_size = pkg_file.tell()

        info_file.seek(0)
        pkg_file.seek(0)

        with zipfile.ZipFile(
            output_path,
            "x",  # x to not append to existing
            # Members are already zstd-compressed; store them as-is.
            compression=zipfile.ZIP_STORED,
        ) as conda_file:
            # Use a maximum of one Zstd compressor, stream_writer at a time to
            # save memory.
            data_compress = compressor()

            pkg_metadata = {"conda_pkg_format_version": CONDA_PACKAGE_FORMAT_VERSION}
            conda_file.writestr("metadata.json", json.dumps(pkg_metadata))

            _write_zstd_member(
                conda_file, data_compress, f"pkg-{file_id}.tar.zst", pkg_file, pkg_size
            )
            _write_zstd_member(
                conda_file,
                data_compress,
                f"info-{file_id}.tar.zst",
                info_file,
                info_size,
            )


def _write_zstd_member(conda_file, data_compress, name, source, size):
    """Zstd-compress ``size`` bytes read from ``source`` into zip member ``name``."""
    with conda_file.open(
        name,
        "w",
        # The zip header format must be chosen before any data is written.
        force_zip64=(size > CONDA_ZIP64_LIMIT),
    ) as member, data_compress.stream_writer(
        member, size=size, closefd=False
    ) as stream:
        shutil.copyfileobj(source, stream)
86 changes: 11 additions & 75 deletions conda_package_streaming/transmute.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,24 @@
"""
Convert .tar.bz2 to .conda
Uses `tempfile.SpooledTemporaryFile` to buffer `pkg-*` `.tar` and `info-*`
`.tar`, then compress directly into an open `ZipFile` at the end.
Uses ``tempfile.SpooledTemporaryFile`` to buffer ``pkg-*.tar`` and
``info-*.tar``, then compress directly into an open `ZipFile` at the end.
`SpooledTemporaryFile` buffers the first 10MB of the package and its metadata in
memory, but writes out to disk for larger packages.
Conda packages created this way have `info-*` as the last element in the
`ZipFile`, instead of the first for `.conda` packages created with pre-2.0
`conda-package-handling`.
"""

from __future__ import annotations

import json
import os
import shutil
import tarfile
import tempfile
import zipfile
from pathlib import Path
from typing import Callable, Iterator

import zstandard

# streams everything in .tar.bz2 mode
from .package_streaming import CondaComponent, stream_conda_component

# increase to reduce speed and increase compression (levels above 19 use much
# more memory)
ZSTD_COMPRESS_LEVEL = 19
# increase to reduce compression and increase speed
ZSTD_COMPRESS_THREADS = 1

CONDA_PACKAGE_FORMAT_VERSION = 2

# Account for growth from "2 GB of /dev/urandom" to not exceed ZIP64_LIMIT after
# compression
CONDA_ZIP64_LIMIT = zipfile.ZIP64_LIMIT - (1 << 18) - 1
from .conda_fmt import conda_builder, ZSTD_COMPRESS_LEVEL, ZSTD_COMPRESS_THREADS


def transmute(
Expand Down Expand Up @@ -125,59 +106,14 @@ def transmute_stream(
:return: Path to transmuted package.
"""
output_path = Path(path, f"{file_id}.conda")
with tempfile.SpooledTemporaryFile() as info_file, tempfile.SpooledTemporaryFile() as pkg_file:
with tarfile.TarFile(fileobj=info_file, mode="w") as info_tar, tarfile.TarFile(
fileobj=pkg_file, mode="w"
) as pkg_tar:
# If we wanted to compress these at a low setting to save temporary
# space, we could insert a file object that counts bytes written in
# front of a zstd (level between 1..3) compressor.
for tar, member in package_stream:
tar_get = info_tar if is_info(member.name) else pkg_tar
if member.isfile():
tar_get.addfile(member, tar.extractfile(member))
else:
tar_get.addfile(member)

info_tar.close()
pkg_tar.close()

info_size = info_file.tell()
pkg_size = pkg_file.tell()

info_file.seek(0)
pkg_file.seek(0)

with zipfile.ZipFile(
output_path,
"x", # x to not append to existing
compresslevel=zipfile.ZIP_STORED,
) as conda_file:
# Use a maximum of one Zstd compressor, stream_writer at a time to save memory.
data_compress = compressor()

pkg_metadata = {"conda_pkg_format_version": CONDA_PACKAGE_FORMAT_VERSION}
conda_file.writestr("metadata.json", json.dumps(pkg_metadata))

with conda_file.open(
f"pkg-{file_id}.tar.zst",
"w",
force_zip64=(pkg_size > CONDA_ZIP64_LIMIT),
) as pkg_file_zip, data_compress.stream_writer(
pkg_file_zip, size=pkg_size, closefd=False
) as pkg_stream:
shutil.copyfileobj(pkg_file._file, pkg_stream)

with conda_file.open(
f"info-{file_id}.tar.zst",
"w",
force_zip64=(info_size > CONDA_ZIP64_LIMIT),
) as info_file_zip, data_compress.stream_writer(
info_file_zip,
size=info_size,
closefd=False,
) as info_stream:
shutil.copyfileobj(info_file._file, info_stream)
with conda_builder(
file_id, path, compressor=compressor, is_info=is_info
) as conda_tar:
for tar, member in package_stream:
if member.isfile():
conda_tar.addfile(member, tar.extractfile(member))
else:
conda_tar.addfile(member)

return output_path

Expand Down
7 changes: 7 additions & 0 deletions docs/conda_fmt.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
conda_fmt module
================

.. automodule:: conda_package_streaming.conda_fmt
:members:
:undoc-members:
:show-inheritance:
1 change: 1 addition & 0 deletions docs/modules.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,4 +78,5 @@ lazy_wheel
package_streaming
extract
transmute
conda_fmt
```
8 changes: 4 additions & 4 deletions tests/test_degraded.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,22 +27,22 @@ def test_degraded(tmpdir):

testtar = Path(tmpdir, "test.tar.bz2")
with tarfile.open(testtar, "w:bz2") as tar:
pass
tar.addfile(tarfile.TarInfo(name="jim"))

for (
tar,
member,
_,
) in conda_package_streaming.package_streaming.stream_conda_component(testtar):
pass

with pytest.raises(RuntimeError):
for (
tar,
member,
_,
) in conda_package_streaming.package_streaming.stream_conda_component(
testconda
):
pass
pass # pragma: no cover

with pytest.raises(RuntimeError):
conda_package_streaming.extract.extract(testconda, tmpdir)
Expand Down
13 changes: 12 additions & 1 deletion tests/test_transmute.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import zstandard
from conda_package_handling.validate import validate_converted_files_match_streaming

from conda_package_streaming.conda_fmt import anonymize
from conda_package_streaming.package_streaming import (
CondaComponent,
stream_conda_component,
Expand Down Expand Up @@ -148,7 +149,7 @@ def test_transmute_conditional_zip64(tmp_path, mocker):
LIMIT = 16384

for test_size, extra_expected in (LIMIT // 2, False), (LIMIT * 2, True):
mocker.patch("conda_package_streaming.transmute.CONDA_ZIP64_LIMIT", new=LIMIT)
mocker.patch("conda_package_streaming.conda_fmt.CONDA_ZIP64_LIMIT", new=LIMIT)
mocker.patch("zipfile.ZIP64_LIMIT", new=LIMIT)

tmp_tar = tmp_path / f"{test_size}.tar.bz2"
Expand Down Expand Up @@ -194,3 +195,13 @@ def test_transmute_stream(tmpdir, conda_paths):
stream_conda_component(package, component=CondaComponent.info),
),
)


def test_anonymize_helper():
    """``anonymize`` clears ownership in place and returns the same TarInfo."""
    ti = tarfile.TarInfo(name="info")
    ti.uid = ti.gid = 500
    ti.uname = ti.gname = "somebody"
    anon = anonymize(ti)
    # Comparing anon.name with ti.name would be vacuous when both names refer
    # to one object, so check identity and the literal name instead.
    assert anon is ti
    assert anon.name == "info"
    assert anon.uid == anon.gid == 0
    assert anon.uname == anon.gname == ""

0 comments on commit 43414cc

Please sign in to comment.