Skip to content

Handle non-capturing groups in regex transforms (partial-train/*.parquet). #2376

Handle non-capturing groups in regex transforms (partial-train/*.parquet).

Handle non-capturing groups in regex transforms (partial-train/*.parquet). #2376

Workflow file for this run

# Continuous integration for the mlcroissant Python package.
name: 'Continuous Integration - mlcroissant (CI)'

# Triggers: pushes to main, pull requests targeting main, and merge-queue
# (merge_group) events.
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  merge_group:

jobs:
# Runs the unit tests and the linters (PyLint, PyFlakes, flake8) once per
# Python version listed in the matrix.
python-test:
  name: Format / Unit Tests / Python ${{ matrix.python-version }}
  strategy:
    # Let the other matrix entries finish even when one of them fails.
    fail-fast: false
    matrix:
      # Tests currently fail for 3.8 and 3.9.
      # Versions stay quoted so that 3.10 is not parsed as the float 3.1.
      python-version: ['3.10', '3.11']
  runs-on: ubuntu-latest
  defaults:
    run:
      working-directory: ./python/mlcroissant
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install library
      # Refresh the package index first: a stale index on the hosted runner
      # can make `apt-get install` fail with 404s. The extras spec is quoted
      # so the shell never treats `[dev]` as a glob character class.
      run: |
        sudo apt-get update
        sudo apt-get install -y libgraphviz-dev git-lfs
        pip install '.[dev]'
    - name: PyTest
      run: make pytest
    # Pylint is not compatible with Apache Beam in Python 3.11:
    # https://github.com/pylint-dev/pylint/blob/02616372282fd84862636d58071e6f3c62b53559/pyproject.toml#L38
    - name: Install Pylint separately.
      run: pip install pylint
    - name: PyLint
      run: make pylint
    # Pyflakes can see unused imports
    - name: PyFlakes
      run: make pyflakes
    - name: Docstrings are defined
      run: make flake8
# Executes every notebook under recipes/ (except the explicitly skipped one)
# with nbconvert, once per Python version listed in the matrix.
notebook-test:
  name: Notebook Tests / Python ${{ matrix.python-version }}
  strategy:
    # Let the other matrix entries finish even when one of them fails.
    fail-fast: false
    matrix:
      # Tests currently fail for 3.8 and 3.9.
      # Versions stay quoted so that 3.10 is not parsed as the float 3.1.
      python-version: ['3.10', '3.11']
  runs-on: ubuntu-latest
  defaults:
    run:
      working-directory: ./python/mlcroissant
  env:
    # Repository the pull request comes from (may be a fork). The
    # pull_request payload only exists on pull_request events, so fall back
    # to the current repository on push and merge_group events instead of
    # ending up with an empty value.
    GITHUB_REPOSITORY: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install library
      # Refresh the package index first: a stale index on the hosted runner
      # can make `apt-get install` fail with 404s.
      run: |
        sudo apt-get update
        sudo apt-get install -y libgraphviz-dev
    - name: Install notebook tooling
      run: pip install ipython ipykernel nbconvert
    # Notebooks are in the recipes/ folder.
    - name: Run notebook
      # NOTE(review): the first line re-assigns GITHUB_REPOSITORY inside the
      # script, presumably because GITHUB_* default variables cannot be
      # overridden through `env:` alone - confirm before simplifying.
      run: |
        GITHUB_REPOSITORY="${{ env.GITHUB_REPOSITORY }}"
        ipython kernel install --user --name croissant-notebook
        for notebook in recipes/*ipynb
        do
          if [ "$notebook" = "recipes/flores200_datapipes.ipynb" ]
          then
            echo "Skipping notebook=${notebook}"
          else
            jupyter nbconvert \
              --ExecutePreprocessor.timeout=600 \
              --ExecutePreprocessor.kernel_name=croissant-notebook \
              --to notebook \
              --execute "$notebook"
          fi
        done
# Checks formatting (black) and import ordering (isort) of the Python sources
# under ./python/mlcroissant.
python-format:
  name: Python format
  runs-on: ubuntu-latest
  steps:
    # Same action versions as the other jobs of this workflow; the v2
    # releases used previously are deprecated.
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        python-version: '3.11'
    - uses: psf/black@stable
      with:
        options: '--check --line-length 88 --preview'
        src: './python/mlcroissant'
        # Quoted so the version is passed as a string.
        version: '23.11.0'
    # NOTE(review): `@master` is a moving ref; consider pinning to a release
    # tag or commit SHA for reproducible runs.
    - uses: isort/isort-action@master
      with:
        sort-paths: './python/mlcroissant'
# Static type checking with pytype (`make pytype`) on Python 3.11.
pytype-test:
  name: PyType
  runs-on: ubuntu-latest
  defaults:
    run:
      working-directory: ./python/mlcroissant
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'
    - name: Install dependencies except mlcroissant itself
      # Literal block scalar (`|`) so each backslash-newline reaches the shell
      # as a real line continuation regardless of how the lines are indented.
      # A folded scalar (`>`) only keeps the newlines of more-indented lines.
      # `etils[epath]` is quoted so the brackets are never treated as a glob.
      run: |
        pip install \
          absl-py \
          apache-beam \
          'etils[epath]' \
          GitPython \
          jsonpath_rw \
          mypy \
          networkx \
          pandas \
          pyarrow \
          pytest \
          pytype \
          rdflib \
          requests \
          torchdata \
          tqdm
    - name: PyType
      run: make pytype
# Static type checking with mypy (`make mypy`) on Python 3.11.
mypy-test:
  name: MyPy
  runs-on: ubuntu-latest
  defaults:
    run:
      working-directory: ./python/mlcroissant
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'
    - name: Install dependencies except mlcroissant itself
      # Literal block scalar (`|`) so each backslash-newline reaches the shell
      # as a real line continuation regardless of how the lines are indented.
      # A folded scalar (`>`) only keeps the newlines of more-indented lines.
      # `etils[epath]` is quoted so the brackets are never treated as a glob.
      run: |
        pip install \
          absl-py \
          apache-beam \
          'etils[epath]' \
          GitPython \
          jsonpath_rw \
          mypy \
          networkx \
          pandas \
          pyarrow \
          pytest \
          pytype \
          rdflib \
          requests \
          torchdata \
          tqdm
    - name: MyPy
      run: make mypy
# Runs the mlcroissant validation script on every *.json file found under
# ../../datasets/ (relative to the working directory), excluding paths that
# contain a data/ directory.
validation-test:
  name: Validation / JSON-LD Tests / Python 3.11
  runs-on: ubuntu-latest
  defaults:
    run:
      working-directory: ./python/mlcroissant
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'
    - name: Install library
      # Quoted so the shell never treats `[git]` as a glob character class.
      run: pip install '.[git]'
    - name: Validate JSON-LD files
      run: |
        # Fail on the first validation error and on a failing `find`.
        set -euo pipefail
        # NUL-delimited paths keep file names with whitespace or glob
        # characters intact (the previous unquoted expansion split them).
        find ../../datasets/ -type f -name "*.json" ! -path '*/data/*' -print0 |
          while IFS= read -r -d '' file
          do
            echo "Validating ${file}..."
            # stdin is redirected so the script cannot consume find's output.
            python mlcroissant/scripts/validate.py --jsonld "${file}" < /dev/null
          done
# Loads records from the Titanic and PASS example datasets, once per dataset
# directory version listed in the matrix.
generation-test:
  name: Generation / JSON-LD Tests / Python 3.11
  runs-on: ubuntu-latest
  defaults:
    run:
      working-directory: ./python/mlcroissant
  strategy:
    matrix:
      # Directory names under ../../datasets/; quoted so that 1.0 stays the
      # string '1.0' instead of becoming a float.
      version: ['0.8', '1.0']
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'
    - name: Install library
      run: pip install .
    - name: Generate JSON-LD files - Titanic
      # Folded scalar: the lines below are joined into one command line.
      run: >-
        mlcroissant load
        --jsonld ../../datasets/${{ matrix.version }}/titanic/metadata.json
        --record_set passengers
    - name: Generate JSON-LD files - PASS
      # The extras spec is quoted so the shell never treats `[image]` as a
      # glob character class. The default shell stops at the first failing
      # line, so load.py still runs only when the install succeeded.
      run: |
        pip install '.[image]'
        python mlcroissant/scripts/load.py \
          --jsonld ../../datasets/${{ matrix.version }}/pass-mini/metadata.json \
          --record_set images