From bdede34abb5fd07ded761eb20320f00ca4b2cd86 Mon Sep 17 00:00:00 2001 From: Matt Seddon Date: Wed, 31 Jul 2024 10:53:04 +1000 Subject: [PATCH] add examples smoke tests --- .github/workflows/benchmarks.yml | 2 +- .github/workflows/examples.yml | 37 ++++++++ examples/computer_vision/iptc_exif_xmp_lib.py | 3 +- examples/computer_vision/openimage-detect.py | 2 +- examples/get_started/json-csv-reader.py | 18 +--- examples/get_started/torch-loader.py | 2 +- examples/multimodal/wds.py | 31 ++++--- noxfile.py | 16 ++++ pyproject.toml | 7 +- tests/examples/test_examples.py | 87 +++++++++++++++++++ 10 files changed, 175 insertions(+), 30 deletions(-) create mode 100644 .github/workflows/examples.yml create mode 100644 tests/examples/test_examples.py diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 564fb54fc..233dd85d6 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -11,7 +11,7 @@ env: FORCE_COLOR: "1" jobs: - build: + run: if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') }} runs-on: ubuntu-latest diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml new file mode 100644 index 000000000..3d0394ad9 --- /dev/null +++ b/.github/workflows/examples.yml @@ -0,0 +1,37 @@ +name: Examples + +on: + workflow_dispatch: + schedule: + - cron: '0 3 * * *' + push: # to remove + +env: + FORCE_COLOR: "1" + +jobs: + run: + runs-on: ${{ matrix.os }} + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest-16-cores, macos-latest, windows-latest-8-cores] + pyv: ['3.9', '3.12'] + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.pyv }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.pyv }} + cache: 'pip' + + - name: Upgrade nox and uv + run: | + python -m pip install --upgrade 'nox[uv]' + nox --version + uv --version + + - name: Run examples + run: nox -s 
examples -p ${{ matrix.pyv }} diff --git a/examples/computer_vision/iptc_exif_xmp_lib.py b/examples/computer_vision/iptc_exif_xmp_lib.py index cff5f2556..079dc742e 100644 --- a/examples/computer_vision/iptc_exif_xmp_lib.py +++ b/examples/computer_vision/iptc_exif_xmp_lib.py @@ -1,3 +1,4 @@ +# pip install defusedxml import json from PIL import ( @@ -63,7 +64,7 @@ def image_description(file): DataChain.from_storage(source, type="image") .settings(parallel=-1) .filter(C("file.name").glob("*.jpg")) - .limit(10000) + .limit(5000) .map( image_description, params=["file"], diff --git a/examples/computer_vision/openimage-detect.py b/examples/computer_vision/openimage-detect.py index 11ad42191..2b13157d0 100644 --- a/examples/computer_vision/openimage-detect.py +++ b/examples/computer_vision/openimage-detect.py @@ -48,7 +48,7 @@ def openimage_detect(args): yield fstream, bbox -source = "gs://datachain-demo/openimages-v6-test-jsonpairs" +source = "gs://datachain-demo/openimages-v6-test-jsonpairs/" ( DataChain.from_storage(source) diff --git a/examples/get_started/json-csv-reader.py b/examples/get_started/json-csv-reader.py index 607611270..feedbdb28 100644 --- a/examples/get_started/json-csv-reader.py +++ b/examples/get_started/json-csv-reader.py @@ -36,7 +36,7 @@ def main(): print("========================================================================") uri = "gs://datachain-demo/jsonl/object.jsonl" jsonl_ds = DataChain.from_json(uri, meta_type="jsonl", print_schema=True) - print(jsonl_ds.to_pandas()) + jsonl_ds.show() print() print("========================================================================") @@ -49,8 +49,7 @@ def main(): json_pairs_ds = DataChain.from_json( uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage" ) - print(json_pairs_ds.to_pandas()) - # print(list(json_pairs_ds.collect())[0]) + json_pairs_ds.show() uri = "gs://datachain-demo/coco2017/annotations_captions/" @@ -72,7 +71,7 @@ def main(): static_json_ds = DataChain.from_json( uri, 
jmespath="licenses", spec=LicenseFeature, nrows=3 ) - print(static_json_ds.to_pandas()) + static_json_ds.show() print() print("========================================================================") @@ -88,16 +87,7 @@ def main(): print("========================================================================") static_csv_ds = DataChain.from_csv(uri, output=ChatDialog, object_name="chat") static_csv_ds.print_schema() - print(static_csv_ds.to_pandas()) - - uri = "gs://datachain-demo/laion-aesthetics-csv" - print() - print("========================================================================") - print("dynamic CSV with header schema test parsing 3/3M objects") - print("========================================================================") - dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3) - dynamic_csv_ds.print_schema() - print(dynamic_csv_ds.to_pandas()) + static_csv_ds.show() if __name__ == "__main__": diff --git a/examples/get_started/torch-loader.py b/examples/get_started/torch-loader.py index fdbf7541b..2091ee986 100644 --- a/examples/get_started/torch-loader.py +++ b/examples/get_started/torch-loader.py @@ -64,7 +64,7 @@ def forward(self, x): optimizer = optim.Adam(model.parameters(), lr=0.001) # Train the model - num_epochs = 10 + num_epochs = 1 for epoch in range(num_epochs): for i, data in enumerate(train_loader): inputs, labels = data diff --git a/examples/multimodal/wds.py b/examples/multimodal/wds.py index 6078747a8..03cc43c13 100644 --- a/examples/multimodal/wds.py +++ b/examples/multimodal/wds.py @@ -1,5 +1,3 @@ -import pandas as pd - from datachain import C, DataChain from datachain.lib.webdataset import process_webdataset from datachain.lib.webdataset_laion import WDSLaion, process_laion_meta @@ -9,25 +7,36 @@ .filter(C("file.name").glob("00000000.tar")) .settings(cache=True) .gen(laion=process_webdataset(spec=WDSLaion), params="file") + .save() # materialize chain to avoid downloading data multiple times +) + +meta_pq 
= ( + DataChain.from_parquet("gs://datachain-demo/datacomp-small/metadata/0020f*.parquet") + .filter( + C("uid").in_(values[0] for values in wds.select("laion.json.uid").collect()) + ) + .map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str) + .save() ) meta_emd = ( - DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata") - .filter(C("file.name").glob("0020f*.npz")) + DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata/0020f*.npz") .gen(emd=process_laion_meta) + .filter( + C("emd.index").in_( + values[0] for values in meta_pq.select("source.index").collect() + ) + ) .map(stem=lambda file: file.get_file_stem(), params=["emd.file"], output=str) ) -meta_pq = DataChain.from_parquet( - "gs://datachain-demo/datacomp-small/metadata/0020f*.parquet" -).map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str) meta = meta_emd.merge( - meta_pq, on=["stem", "emd.index"], right_on=["stem", "source.index"] + meta_pq, + on=["stem", "emd.index"], + right_on=["stem", "source.index"], ) res = wds.merge(meta, on="laion.json.uid", right_on="uid") -df = res.limit(10).to_pandas() -with pd.option_context("display.max_columns", None): - print(df) +res.show(3) diff --git a/noxfile.py b/noxfile.py index 5c3650475..44eb701f6 100644 --- a/noxfile.py +++ b/noxfile.py @@ -74,3 +74,19 @@ def dev(session: nox.Session) -> None: python = os.path.join(venv_dir, "bin/python") session.run(python, "-m", "pip", "install", "-e", ".[dev]", external=True) + + +@nox.session(python=["3.9", "3.10", "3.11", "3.12", "pypy3.9", "pypy3.10"]) +def examples(session: nox.Session) -> None: + session.install(".[examples]") + try: + session.install("unstructured[all-docs]") + except: # noqa: S110, E722 + pass + session.run( + "pytest", + "-m", + "examples", + "-vvv", + *session.posargs, + ) diff --git a/pyproject.toml b/pyproject.toml index 1dac0892b..7f63f4c84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,6 +93,11 @@ dev = [ 
"types-PyYAML", "types-requests" ] +examples = [ + "datachain[tests]", + "defusedxml", + "accelerate", +] [project.urls] Documentation = "https://datachain.dvc.ai" @@ -110,7 +115,7 @@ namespaces = false [tool.setuptools_scm] [tool.pytest.ini_options] -addopts = "-rfEs -m 'not benchmark'" +addopts = "-rfEs -m 'not benchmark and not examples'" markers = [ "benchmark: benchmarks.", "e2e: End-to-end tests" diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py new file mode 100644 index 000000000..12c29170a --- /dev/null +++ b/tests/examples/test_examples.py @@ -0,0 +1,87 @@ +import glob +import os +import subprocess +import sys + +import pytest + +NO_EXAMPLES = "no examples found" + + +def can_import_unstructured(): + try: + import unstructured # noqa: F401 + + return True + except ImportError: + return False + + +get_started_examples = [ + filename + for filename in glob.glob("examples/get_started/**/*.py", recursive=True) + if "torch" not in filename or os.environ.get("RUNNER_OS") != "Linux" +] + +llm_and_nlp_examples = [ + filename + for filename in glob.glob("examples/llm_and_nlp/**/*.py", recursive=True) + # no anthropic token + if "claude" not in filename + and ("unstructured" not in filename or can_import_unstructured()) +] or [NO_EXAMPLES] + +multimodal_examples = [ + filename + for filename in glob.glob("examples/multimodal/**/*.py", recursive=True) + # no OpenAI token + # and hf download painfully slow + if "openai" not in filename and "hf" not in filename +] + +computer_vision_examples = [ + filename + for filename in glob.glob("examples/computer_vision/**/*.py", recursive=True) + # fashion product images tutorial out of scope + # and hf download painfully slow + if "image_desc" not in filename and "fashion_product_images" not in filename +] + + +def smoke_test(example: str): + if example == NO_EXAMPLES: + return + + completed_process = subprocess.run( # noqa: S603 + [sys.executable, example], + capture_output=True,
cwd=os.path.abspath(os.path.join(__file__, "..", "..", "..")), + check=True, + ) + + assert completed_process.stdout + assert completed_process.stderr + + +@pytest.mark.examples +@pytest.mark.parametrize("example", get_started_examples) +def test_get_started_examples(example): + smoke_test(example) + + +@pytest.mark.examples +@pytest.mark.parametrize("example", llm_and_nlp_examples) +def test_llm_and_nlp_examples(example): + smoke_test(example) + + +@pytest.mark.examples +@pytest.mark.parametrize("example", multimodal_examples) +def test_multimodal(example): + smoke_test(example) + + +@pytest.mark.examples +@pytest.mark.parametrize("example", computer_vision_examples) +def test_computer_vision_examples(example): + smoke_test(example)