From bdede34abb5fd07ded761eb20320f00ca4b2cd86 Mon Sep 17 00:00:00 2001
From: Matt Seddon <mattseddon@hotmail.com>
Date: Wed, 31 Jul 2024 10:53:04 +1000
Subject: [PATCH] add examples smoke tests

---
 .github/workflows/benchmarks.yml              |  2 +-
 .github/workflows/examples.yml                | 37 ++++++++
 examples/computer_vision/iptc_exif_xmp_lib.py |  3 +-
 examples/computer_vision/openimage-detect.py  |  2 +-
 examples/get_started/json-csv-reader.py       | 18 +---
 examples/get_started/torch-loader.py          |  2 +-
 examples/multimodal/wds.py                    | 31 ++++---
 noxfile.py                                    | 16 ++++
 pyproject.toml                                |  7 +-
 tests/examples/test_examples.py               | 87 +++++++++++++++++++
 10 files changed, 175 insertions(+), 30 deletions(-)
 create mode 100644 .github/workflows/examples.yml
 create mode 100644 tests/examples/test_examples.py

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 564fb54fc..233dd85d6 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -11,7 +11,7 @@ env:
   FORCE_COLOR: "1"
 
 jobs:
-  build:
+  run:
     if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') }}
     runs-on: ubuntu-latest
 
diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml
new file mode 100644
index 000000000..3d0394ad9
--- /dev/null
+++ b/.github/workflows/examples.yml
@@ -0,0 +1,37 @@
+name: Examples
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 3 * * *'
+  push:  # TODO: remove this trigger before merging (temporary)
+
+env:
+  FORCE_COLOR: "1"
+
+jobs:
+  run:
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest-16-cores, macos-latest, windows-latest-8-cores]
+        pyv: ['3.9', '3.12']
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.pyv }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.pyv }}
+          cache: 'pip'
+
+      - name: Upgrade nox and uv
+        run: |
+          python -m pip install --upgrade 'nox[uv]'
+          nox --version
+          uv --version
+
+      - name: Run examples
+        run: nox -s examples -p ${{ matrix.pyv }}
diff --git a/examples/computer_vision/iptc_exif_xmp_lib.py b/examples/computer_vision/iptc_exif_xmp_lib.py
index cff5f2556..079dc742e 100644
--- a/examples/computer_vision/iptc_exif_xmp_lib.py
+++ b/examples/computer_vision/iptc_exif_xmp_lib.py
@@ -1,3 +1,4 @@
+# pip install defusedxml
 import json
 
 from PIL import (
@@ -63,7 +64,7 @@ def image_description(file):
         DataChain.from_storage(source, type="image")
         .settings(parallel=-1)
         .filter(C("file.name").glob("*.jpg"))
-        .limit(10000)
+        .limit(5000)
         .map(
             image_description,
             params=["file"],
diff --git a/examples/computer_vision/openimage-detect.py b/examples/computer_vision/openimage-detect.py
index 11ad42191..2b13157d0 100644
--- a/examples/computer_vision/openimage-detect.py
+++ b/examples/computer_vision/openimage-detect.py
@@ -48,7 +48,7 @@ def openimage_detect(args):
         yield fstream, bbox
 
 
-source = "gs://datachain-demo/openimages-v6-test-jsonpairs"
+source = "gs://datachain-demo/openimages-v6-test-jsonpairs/"
 
 (
     DataChain.from_storage(source)
diff --git a/examples/get_started/json-csv-reader.py b/examples/get_started/json-csv-reader.py
index 607611270..feedbdb28 100644
--- a/examples/get_started/json-csv-reader.py
+++ b/examples/get_started/json-csv-reader.py
@@ -36,7 +36,7 @@ def main():
     print("========================================================================")
     uri = "gs://datachain-demo/jsonl/object.jsonl"
     jsonl_ds = DataChain.from_json(uri, meta_type="jsonl", print_schema=True)
-    print(jsonl_ds.to_pandas())
+    jsonl_ds.show()
 
     print()
     print("========================================================================")
@@ -49,8 +49,7 @@ def main():
     json_pairs_ds = DataChain.from_json(
         uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage"
     )
-    print(json_pairs_ds.to_pandas())
-    # print(list(json_pairs_ds.collect())[0])
+    json_pairs_ds.show()
 
     uri = "gs://datachain-demo/coco2017/annotations_captions/"
 
@@ -72,7 +71,7 @@ def main():
     static_json_ds = DataChain.from_json(
         uri, jmespath="licenses", spec=LicenseFeature, nrows=3
     )
-    print(static_json_ds.to_pandas())
+    static_json_ds.show()
 
     print()
     print("========================================================================")
@@ -88,16 +87,7 @@ def main():
     print("========================================================================")
     static_csv_ds = DataChain.from_csv(uri, output=ChatDialog, object_name="chat")
     static_csv_ds.print_schema()
-    print(static_csv_ds.to_pandas())
-
-    uri = "gs://datachain-demo/laion-aesthetics-csv"
-    print()
-    print("========================================================================")
-    print("dynamic CSV with header schema test parsing 3/3M objects")
-    print("========================================================================")
-    dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3)
-    dynamic_csv_ds.print_schema()
-    print(dynamic_csv_ds.to_pandas())
+    static_csv_ds.show()
 
 
 if __name__ == "__main__":
diff --git a/examples/get_started/torch-loader.py b/examples/get_started/torch-loader.py
index fdbf7541b..2091ee986 100644
--- a/examples/get_started/torch-loader.py
+++ b/examples/get_started/torch-loader.py
@@ -64,7 +64,7 @@ def forward(self, x):
     optimizer = optim.Adam(model.parameters(), lr=0.001)
 
     # Train the model
-    num_epochs = 10
+    num_epochs = 1
     for epoch in range(num_epochs):
         for i, data in enumerate(train_loader):
             inputs, labels = data
diff --git a/examples/multimodal/wds.py b/examples/multimodal/wds.py
index 6078747a8..03cc43c13 100644
--- a/examples/multimodal/wds.py
+++ b/examples/multimodal/wds.py
@@ -1,5 +1,3 @@
-import pandas as pd
-
 from datachain import C, DataChain
 from datachain.lib.webdataset import process_webdataset
 from datachain.lib.webdataset_laion import WDSLaion, process_laion_meta
@@ -9,25 +7,36 @@
     .filter(C("file.name").glob("00000000.tar"))
     .settings(cache=True)
     .gen(laion=process_webdataset(spec=WDSLaion), params="file")
+    .save()  # materialize chain to avoid downloading data multiple times
+)
+
+meta_pq = (
+    DataChain.from_parquet("gs://datachain-demo/datacomp-small/metadata/0020f*.parquet")
+    .filter(
+        C("uid").in_(values[0] for values in wds.select("laion.json.uid").collect())
+    )
+    .map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
+    .save()
 )
 
 meta_emd = (
-    DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata")
-    .filter(C("file.name").glob("0020f*.npz"))
+    DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata/0020f*.npz")
     .gen(emd=process_laion_meta)
+    .filter(
+        C("emd.index").in_(
+            values[0] for values in meta_pq.select("source.index").collect()
+        )
+    )
     .map(stem=lambda file: file.get_file_stem(), params=["emd.file"], output=str)
 )
 
-meta_pq = DataChain.from_parquet(
-    "gs://datachain-demo/datacomp-small/metadata/0020f*.parquet"
-).map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
 
 meta = meta_emd.merge(
-    meta_pq, on=["stem", "emd.index"], right_on=["stem", "source.index"]
+    meta_pq,
+    on=["stem", "emd.index"],
+    right_on=["stem", "source.index"],
 )
 
 res = wds.merge(meta, on="laion.json.uid", right_on="uid")
 
-df = res.limit(10).to_pandas()
-with pd.option_context("display.max_columns", None):
-    print(df)
+res.show(3)
diff --git a/noxfile.py b/noxfile.py
index 5c3650475..44eb701f6 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -74,3 +74,19 @@ def dev(session: nox.Session) -> None:
 
     python = os.path.join(venv_dir, "bin/python")
     session.run(python, "-m", "pip", "install", "-e", ".[dev]", external=True)
+
+
+@nox.session(python=["3.9", "3.10", "3.11", "3.12", "pypy3.9", "pypy3.10"])
+def examples(session: nox.Session) -> None:
+    """Run the example smoke tests (pytest marker ``examples``).
+
+    ``unstructured[all-docs]`` is installed on a best-effort basis: the tests
+    filter out the examples that need it when it cannot be imported.
+    """
+    session.install(".[examples]")
+    try:
+        session.install("unstructured[all-docs]")
+    except Exception:  # not bare: Ctrl-C / SystemExit must still abort
+        session.log("unstructured[all-docs] failed to install; continuing")
+    # extra command-line arguments (session.posargs) are forwarded to pytest
+    session.run("pytest", "-m", "examples", "-vvv", *session.posargs)
diff --git a/pyproject.toml b/pyproject.toml
index 1dac0892b..7f63f4c84 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -93,6 +93,11 @@ dev = [
   "types-PyYAML",
   "types-requests"
 ]
+examples = [
+  "datachain[tests]",
+  "defusedxml",
+  "accelerate",
+]
 
 [project.urls]
 Documentation = "https://datachain.dvc.ai"
@@ -110,7 +115,7 @@ namespaces = false
 [tool.setuptools_scm]
 
 [tool.pytest.ini_options]
-addopts = "-rfEs -m 'not benchmark'"
+addopts = "-rfEs -m 'not benchmark and not examples'"
 markers = [
   "benchmark: benchmarks.",
   "e2e: End-to-end tests"
diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py
new file mode 100644
index 000000000..12c29170a
--- /dev/null
+++ b/tests/examples/test_examples.py
@@ -0,0 +1,87 @@
+import glob
+import os
+import subprocess
+import sys
+
+import pytest
+
+NO_EXAMPLES = "no examples found"
+
+
+def can_import_unstructured():
+    try:
+        import unstructured  # noqa: F401
+
+        return True
+    except ImportError:
+        return False
+
+
+get_started_examples = [
+    filename
+    for filename in glob.glob("examples/get_started/**/*.py", recursive=True)
+    if "torch" not in filename or os.environ.get("RUNNER_OS") != "Linux"
+]
+
+llm_and_nlp_examples = [
+    filename
+    for filename in glob.glob("examples/llm_and_nlp/**/*.py", recursive=True)
+    # no anthropic token
+    if "claude" not in filename
+    and ("unstructured" not in filename or can_import_unstructured())
+] or [NO_EXAMPLES]
+
+multimodal_examples = [
+    filename
+    for filename in glob.glob("examples/multimodal/**/*.py", recursive=True)
+    # no OpenAI token
+    # and hf download painfully slow
+    if "openai" not in filename and "hf" not in filename
+]
+
+computer_vision_examples = [
+    filename
+    for filename in glob.glob("examples/computer_vision/**/*.py", recursive=True)
+    # hf download painfully slow (image_desc examples)
+    # and fashion product images tutorial out of scope
+    if "image_desc" not in filename and "fashion_product_images" not in filename
+]
+
+
+def smoke_test(example: str):
+    """Run `example` from the repo root; fail with its stderr if it errors."""
+    if example == NO_EXAMPLES:
+        return
+    completed_process = subprocess.run(  # noqa: S603
+        [sys.executable, example],
+        capture_output=True,
+        cwd=os.path.abspath(os.path.join(__file__, "..", "..", "..")),
+        errors="replace",  # text mode, so the failure message is readable
+        check=False,  # exit status is asserted below, with stderr attached
+    )
+    assert completed_process.returncode == 0, completed_process.stderr
+    assert completed_process.stdout and completed_process.stderr
+
+
+@pytest.mark.examples
+@pytest.mark.parametrize("example", get_started_examples)
+def test_get_started_examples(example):
+    smoke_test(example)
+
+
+@pytest.mark.examples
+@pytest.mark.parametrize("example", llm_and_nlp_examples)
+def test_llm_and_nlp_examples(example):
+    smoke_test(example)
+
+
+@pytest.mark.examples
+@pytest.mark.parametrize("example", multimodal_examples)
+def test_multimodal(example):
+    smoke_test(example)
+
+
+@pytest.mark.examples
+@pytest.mark.parametrize("example", computer_vision_examples)
+def test_computer_vision_examples(example):
+    smoke_test(example)