diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000..34ceb01
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,62 @@
+---
+# Upload the repository content to the web server over WebDAV, using Rclone.
+name: Publish content
+
+on:
+  # Pull requests exercise the setup steps only; the upload step is skipped.
+  pull_request: ~
+  push:
+    branches: [ main ]
+
+  # Allow job to be triggered manually.
+  workflow_dispatch:
+
+# Cancel in-progress jobs when pushing to the same branch.
+concurrency:
+  cancel-in-progress: true
+  group: ${{ github.workflow }}-${{ github.ref }}
+
+jobs:
+
+  publish-webserver:
+
+    runs-on: "ubuntu-latest"
+
+    name: Upload content to web server
+    steps:
+
+      - name: Acquire sources
+        uses: actions/checkout@v4
+
+      # Refresh the package index first, so the install does not fail on a
+      # stale index baked into the runner image.
+      - name: Install Rclone
+        run: |
+          sudo apt-get update
+          sudo apt-get install --yes --no-install-recommends --no-install-suggests rclone
+
+      # Define the WebDAV remote. No credentials are written to the file; the
+      # upload step sets RCLONE_WEBDAV_USER and RCLONE_WEBDAV_PASS instead.
+      - name: Configure Rclone
+        run: |
+          mkdir -p ~/.config/rclone
+          cat << EOF > ~/.config/rclone/rclone.conf
+
+          [datasets-webdav]
+          type = webdav
+          url = https://cdn.crate.io/downloads/datasets/
+          vendor = other
+
+          EOF
+
+      # Publish only for pushes to `main` and manual runs. `rclone sync` makes
+      # the destination match the source, so an unmerged pull request must not
+      # be able to overwrite or delete published files.
+      - name: Upload content using WebDAV
+        if: github.event_name != 'pull_request'
+        env:
+          RCLONE_WEBDAV_USER: webdav
+          RCLONE_WEBDAV_PASS: ${{ secrets.RCLONE_WEBDAV_PASS }}
+        run: |
+          rclone sync "$(pwd)" datasets-webdav:/cratedb-datasets \
+            --copy-links --delete-excluded --exclude="/.git**"
diff --git a/README.md b/README.md
index f8d85b3..3533c7d 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,31 @@
 The datasets are used for learning and experimenting, and to support
 blog posts and tech talks about CrateDB.
 
+## Usage
+
+How to acquire and use datasets provided by this repository.
+
+### Acquisition
+
+The content of this repository is published to an HTTP folder
+on the web server. Please consume all resources from there,
+because it is discouraged to use GitHub as a CDN.
+
+https://cdn.crate.io/downloads/datasets/cratedb-datasets/
+
+### Python API
+
+You can acquire datasets fluently in Python code by using
+CrateDB Toolkit's [Dataset API].
+```python +from cratedb_toolkit.datasets import load_dataset +load_dataset("tutorial/weather-basic") +``` +Some of the datasets already include a default SQL DDL schema definition file, +so provisioning them as a CrateDB database table is easier than needing to +discover and type the correct `CREATE TABLE ...` statement manually. + + ## What's inside ### Embedded datasets @@ -55,4 +80,5 @@ type, make sure it is listed there. +[Dataset API]: https://cratedb-toolkit.readthedocs.io/datasets.html [Git LFS]: https://git-lfs.com/