[MAINTENANCE] Nightly cleanup of stray bigquery schemas (#10815)
tyler-hoffman authored Jan 2, 2025
1 parent 3c34d40 commit c1eb4a2
Showing 4 changed files with 104 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/README.md
@@ -6,6 +6,8 @@
  - Responsible for keeping PRs up-to-date with `develop` (only works if "auto-merge" is turned on)
* [CodeSee Architecture Diagrams](codesee-arch-diagram.yml)
  - Generates a visualization of proposed changes to the codebase using https://www.codesee.io/
* [DataSource Cleanup](data_source_cleanup.yml)
  - Responsible for cleaning up stray schemas left behind by tests
* [StaleBot](stale.yml)
  - Responsible for marking PRs and issues as `stale`
* [PEP-273 Compatibility](test-pep273-compatability.yml)
52 changes: 52 additions & 0 deletions .github/workflows/data_source_cleanup.yml
@@ -0,0 +1,52 @@
# Script(s) to clean up any loose data left behind by test runs.
# Stray schemas primarily accumulate when CI is stopped while tests
# are still running against BigQuery, etc.

name: Nightly Data Source Cleanup

on:
  workflow_dispatch:
  schedule:
    - cron: "0 6 * * *"
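    # The cron expression above fires once a day at 06:00 UTC.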

jobs:
  cleanup-big-query:
    runs-on: ubuntu-latest
    env:
      # google
      GE_TEST_GCP_CREDENTIALS: ${{ secrets.GE_TEST_GCP_CREDENTIALS }}
      GE_TEST_GCP_PROJECT: ${{ secrets.GE_TEST_GCP_PROJECT }}
      GE_TEST_BIGQUERY_DATASET: ${{ secrets.GE_TEST_BIGQUERY_DATASET }}
      GOOGLE_APPLICATION_CREDENTIALS: "gcp-credentials.json"
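      # GOOGLE_APPLICATION_CREDENTIALS names the file written by the
      # "Create GCP credentials JSON file" step below.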
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
          cache: "pip"
          cache-dependency-path: |
            requirements-types.txt
            reqs/requirements-dev-contrib.txt
      - name: Install dependencies
        run: pip install -r requirements-types.txt -r reqs/requirements-dev-contrib.txt -e .
      - name: Create GCP credentials JSON file
        run: |
          echo "$GE_TEST_GCP_CREDENTIALS" > gcp-credentials.json
      - name: Install and set up Google Cloud SDK
        run: |
          # This approach is recommended by the Google documentation for CI/CD:
          # https://cloud.google.com/sdk/docs/install#other_installation_options
          curl -sS https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-370.0.0-linux-x86_64.tar.gz > ./google-cloud-sdk-370.0.0-linux-x86_64.tar.gz && tar -xf ./google-cloud-sdk-370.0.0-linux-x86_64.tar.gz
          ./google-cloud-sdk/install.sh --usage-reporting=False --path-update=True --quiet --install-python=False
          # Create a new named configuration
          ./google-cloud-sdk/bin/gcloud config configurations create ge-oss-ci-cd-configurations
          # Set the account config using the project and service account info
          ./google-cloud-sdk/bin/gcloud config set account account-for-azure-tests --project=$GE_TEST_GCP_PROJECT --access-token-file=$GOOGLE_APPLICATION_CREDENTIALS
          # Pass the configured Cloud SDK authentication to gsutil
          ./google-cloud-sdk/bin/gcloud config set pass_credentials_to_gsutil True
          # Authorize access to Google Cloud with the service account
          ./google-cloud-sdk/bin/gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS
      - name: Cleanup
        run: |
          python ./scripts/cleanup/cleanup_big_query.py
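Because the workflow also declares a `workflow_dispatch` trigger, the cleanup can be kicked off on demand from the Actions tab (or, assuming the GitHub CLI is available, with `gh workflow run data_source_cleanup.yml`) rather than waiting for the nightly schedule.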
Empty file added scripts/cleanup/__init__.py
50 changes: 50 additions & 0 deletions scripts/cleanup/cleanup_big_query.py
@@ -0,0 +1,50 @@
import logging

from great_expectations.compatibility.pydantic import BaseSettings
from great_expectations.compatibility.sqlalchemy import TextClause, create_engine

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class BigQueryConnectionConfig(BaseSettings):
    """Environment variables for the BigQuery connection.

    These are injected via CI; when running locally, you may use your own
    credentials. GOOGLE_APPLICATION_CREDENTIALS must be kept secret.
    """

    GE_TEST_GCP_PROJECT: str
    GE_TEST_BIGQUERY_DATASET: str
    GOOGLE_APPLICATION_CREDENTIALS: str

    @property
    def connection_string(self) -> str:
        return f"bigquery://{self.GE_TEST_GCP_PROJECT}/{self.GE_TEST_BIGQUERY_DATASET}?credentials_path={self.GOOGLE_APPLICATION_CREDENTIALS}"


SCHEMA_FORMAT = "^test_[a-z]{10}$"
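# For illustration (not part of the original change): the pattern matches exactly
# ten lowercase letters after the "test_" prefix, e.g.
#   >>> import re
#   >>> bool(re.match(SCHEMA_FORMAT, "test_abcdefghij"))
#   True
#   >>> bool(re.match(SCHEMA_FORMAT, "test_ABC"))
#   False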


def cleanup_big_query(config: BigQueryConnectionConfig) -> None:
    engine = create_engine(url=config.connection_string)
    with engine.connect() as conn, conn.begin():
        results = conn.execute(
            TextClause(
                """
                SELECT 'DROP SCHEMA ' || schema_name || ' CASCADE;'
                FROM INFORMATION_SCHEMA.SCHEMATA
                WHERE REGEXP_CONTAINS(schema_name, :schema_format)
                AND creation_time < TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2 HOUR);
                """
            ),
            {"schema_format": SCHEMA_FORMAT},
        ).fetchall()
        to_run = "\n".join([row[0] for row in results])
        if to_run:  # skip execution when no stray schemas were found
            conn.execute(TextClause(to_run))  # SQLAlchemy 2.x requires an executable object, not a raw string
        logger.info(f"Cleaned up {len(results)} BigQuery schema(s)")
    engine.dispose()
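# For illustration (hypothetical schema names), the generated multi-statement
# script executed above looks like:
#   DROP SCHEMA test_abcdefghij CASCADE;
#   DROP SCHEMA test_klmnopqrst CASCADE;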


if __name__ == "__main__":
    config = BigQueryConnectionConfig()  # type: ignore[call-arg] # pydantic populates from env vars
    cleanup_big_query(config)
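To exercise the script locally against a sandbox, a minimal sketch (assuming it is run from the repository root; the project, dataset, and credentials values below are hypothetical placeholders, not part of the change):

import os

# Hypothetical sandbox values; pydantic's BaseSettings reads them from the environment.
os.environ["GE_TEST_GCP_PROJECT"] = "my-sandbox-project"
os.environ["GE_TEST_BIGQUERY_DATASET"] = "my_test_dataset"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "gcp-credentials.json"

from scripts.cleanup.cleanup_big_query import BigQueryConnectionConfig, cleanup_big_query

config = BigQueryConnectionConfig()  # populated from the env vars above
cleanup_big_query(config)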
