[MAINTENANCE] Nightly cleanup of stray bigquery schemas (#10815)
tyler-hoffman authored Jan 2, 2025
1 parent 3c34d40 commit c1eb4a2
Showing 4 changed files with 104 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/README.md
@@ -6,6 +6,8 @@
  - Responsible for keeping PRs up-to-date with `develop` (only works if "auto-merge" is turned on)
* [CodeSee Architecture Diagrams](codesee-arch-diagram.yml)
  - Generates a visualization of proposed changes to the codebase using https://www.codesee.io/
* [DataSource Cleanup](data_source_cleanup.yml)
  - Responsible for cleaning up stray schemas left behind by tests
* [StaleBot](stale.yml)
  - Responsible for marking PRs and issues as `stale`
* [PEP-273 Compatibility](test-pep273-compatability.yml)
52 changes: 52 additions & 0 deletions .github/workflows/data_source_cleanup.yml
@@ -0,0 +1,52 @@
# Script(s) to clean up any loose data left behind by test runs.
# Stray schemas primarily accumulate when CI is stopped while tests
# are still running against BigQuery, etc.

name: Nightly Data Source Cleanup

on:
  workflow_dispatch:
  schedule:
    - cron: "0 6 * * *"
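    # The cron expression above fires once a day at 06:00 UTC.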

jobs:
  cleanup-big-query:
    runs-on: ubuntu-latest
    env:
      # google
      GE_TEST_GCP_CREDENTIALS: ${{ secrets.GE_TEST_GCP_CREDENTIALS }}
      GE_TEST_GCP_PROJECT: ${{ secrets.GE_TEST_GCP_PROJECT }}
      GE_TEST_BIGQUERY_DATASET: ${{ secrets.GE_TEST_BIGQUERY_DATASET }}
      GOOGLE_APPLICATION_CREDENTIALS: "gcp-credentials.json"
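      # GOOGLE_APPLICATION_CREDENTIALS names the file written by the
      # "Create GCP credentials JSON file" step below.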
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
          cache: "pip"
          cache-dependency-path: |
            requirements-types.txt
            reqs/requirements-dev-contrib.txt
      - name: Install dependencies
        run: pip install -r requirements-types.txt -r reqs/requirements-dev-contrib.txt -e .
      - name: Create GCP credentials JSON file
        run: |
          echo "$GE_TEST_GCP_CREDENTIALS" > gcp-credentials.json
      - name: Install and set up Google Cloud SDK
        run: |
          # This approach is recommended by the Google documentation for CI/CD:
          # https://cloud.google.com/sdk/docs/install#other_installation_options
          curl -sS https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-370.0.0-linux-x86_64.tar.gz > ./google-cloud-sdk-370.0.0-linux-x86_64.tar.gz && tar -xf ./google-cloud-sdk-370.0.0-linux-x86_64.tar.gz
          ./google-cloud-sdk/install.sh --usage-reporting=False --path-update=True --quiet --install-python=False
          # Create a new named configuration
          ./google-cloud-sdk/bin/gcloud config configurations create ge-oss-ci-cd-configurations
          # Set the account config using the project and service account info
          ./google-cloud-sdk/bin/gcloud config set account account-for-azure-tests --project=$GE_TEST_GCP_PROJECT --access-token-file=$GOOGLE_APPLICATION_CREDENTIALS
          # Pass the configured Cloud SDK authentication to gsutil
          ./google-cloud-sdk/bin/gcloud config set pass_credentials_to_gsutil True
          # Authorize access to Google Cloud with the service account
          ./google-cloud-sdk/bin/gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS
      - name: Cleanup
        run: |
          python ./scripts/cleanup/cleanup_big_query.py
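Because the workflow also declares a `workflow_dispatch` trigger, the cleanup can be kicked off on demand from the Actions tab (or, assuming the GitHub CLI is available, with `gh workflow run data_source_cleanup.yml`) rather than waiting for the nightly schedule.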
Empty file added scripts/cleanup/__init__.py
50 changes: 50 additions & 0 deletions scripts/cleanup/cleanup_big_query.py
@@ -0,0 +1,50 @@
import logging

from great_expectations.compatibility.pydantic import BaseSettings
from great_expectations.compatibility.sqlalchemy import TextClause, create_engine

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class BigQueryConnectionConfig(BaseSettings):
    """Environment variables for the BigQuery connection.

    These are injected via CI; when running locally, you may use your own
    credentials. GOOGLE_APPLICATION_CREDENTIALS must be kept secret.
    """

    GE_TEST_GCP_PROJECT: str
    GE_TEST_BIGQUERY_DATASET: str
    GOOGLE_APPLICATION_CREDENTIALS: str

    @property
    def connection_string(self) -> str:
        return f"bigquery://{self.GE_TEST_GCP_PROJECT}/{self.GE_TEST_BIGQUERY_DATASET}?credentials_path={self.GOOGLE_APPLICATION_CREDENTIALS}"


SCHEMA_FORMAT = "^test_[a-z]{10}$"
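# For illustration (not part of the original change): the pattern matches exactly
# ten lowercase letters after the "test_" prefix, e.g.
#   >>> import re
#   >>> bool(re.match(SCHEMA_FORMAT, "test_abcdefghij"))
#   True
#   >>> bool(re.match(SCHEMA_FORMAT, "test_ABC"))
#   False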


def cleanup_big_query(config: BigQueryConnectionConfig) -> None:
    engine = create_engine(url=config.connection_string)
    with engine.connect() as conn, conn.begin():
        results = conn.execute(
            TextClause(
                """
                SELECT 'DROP SCHEMA ' || schema_name || ' CASCADE;'
                FROM INFORMATION_SCHEMA.SCHEMATA
                WHERE REGEXP_CONTAINS(schema_name, :schema_format)
                AND creation_time < TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2 HOUR);
                """
            ),
            {"schema_format": SCHEMA_FORMAT},
        ).fetchall()
        to_run = "\n".join([row[0] for row in results])
        if to_run:  # skip execution when no stray schemas were found
            conn.execute(TextClause(to_run))  # SQLAlchemy 2.x requires an executable object, not a raw string
        logger.info(f"Cleaned up {len(results)} BigQuery schema(s)")
    engine.dispose()
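# For illustration (hypothetical schema names), the generated multi-statement
# script executed above looks like:
#   DROP SCHEMA test_abcdefghij CASCADE;
#   DROP SCHEMA test_klmnopqrst CASCADE;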


if __name__ == "__main__":
    config = BigQueryConnectionConfig()  # type: ignore[call-arg] # pydantic populates from env vars
    cleanup_big_query(config)
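To exercise the script locally against a sandbox, a minimal sketch (assuming it is run from the repository root; the project, dataset, and credentials values below are hypothetical placeholders, not part of the change):

import os

# Hypothetical sandbox values; pydantic's BaseSettings reads them from the environment.
os.environ["GE_TEST_GCP_PROJECT"] = "my-sandbox-project"
os.environ["GE_TEST_BIGQUERY_DATASET"] = "my_test_dataset"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "gcp-credentials.json"

from scripts.cleanup.cleanup_big_query import BigQueryConnectionConfig, cleanup_big_query

config = BigQueryConnectionConfig()  # populated from the env vars above
cleanup_big_query(config)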
