-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[MAINTENANCE] Nightly cleanup of stray bigquery schemas (#10815)
- Loading branch information
1 parent
3c34d40
commit c1eb4a2
Showing
4 changed files
with
104 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# Nightly housekeeping for data sources used by the test suite.
# Test runs can leave data behind (BigQuery, etc.) when CI is stopped while
# tests are still running; this workflow removes those leftovers.

name: Nightly Data Source Cleanup

on:
  # Every day at 06:00 (GitHub evaluates cron schedules in UTC).
  schedule:
    - cron: '0 6 * * *'
  # Also allow a manual run from the Actions tab.
  workflow_dispatch:

jobs:
  cleanup-big-query:
    runs-on: ubuntu-latest
    env:
      # Path of the key file written by the "Create JSON file GCP" step.
      GOOGLE_APPLICATION_CREDENTIALS: 'gcp-credentials.json'
      GE_TEST_GCP_CREDENTIALS: ${{ secrets.GE_TEST_GCP_CREDENTIALS }}
      GE_TEST_GCP_PROJECT: ${{ secrets.GE_TEST_GCP_PROJECT }}
      GE_TEST_BIGQUERY_DATASET: ${{ secrets.GE_TEST_BIGQUERY_DATASET }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip'
          cache-dependency-path: |
            requirements-types.txt
            reqs/requirements-dev-contrib.txt

      - name: Install dependencies
        run: pip install -r requirements-types.txt -r reqs/requirements-dev-contrib.txt -e .

      # Materialize the service-account key from the secret so that
      # GOOGLE_APPLICATION_CREDENTIALS points at a real file.
      - name: Create JSON file GCP
        run: echo "$GE_TEST_GCP_CREDENTIALS" > gcp-credentials.json

      - name: Install and setup Google Cloud SDK
        run: |
          # this is recommended by the Google documentation for CI/CD https://cloud.google.com/sdk/docs/install#other_installation_options
          curl -sS https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-370.0.0-linux-x86_64.tar.gz > ./google-cloud-sdk-370.0.0-linux-x86_64.tar.gz && tar -xf ./google-cloud-sdk-370.0.0-linux-x86_64.tar.gz
          ./google-cloud-sdk/install.sh --usage-reporting=False --path-update=True --quiet --install-python=False
          # creating new named configuration
          ./google-cloud-sdk/bin/gcloud config configurations create ge-oss-ci-cd-configurations
          # setting account config using project and service account info
          ./google-cloud-sdk/bin/gcloud config set account account-for-azure-tests --project=$GE_TEST_GCP_PROJECT --access-token-file=$GOOGLE_APPLICATION_CREDENTIALS
          # Pass the configured Cloud SDK authentication to gsutil.
          ./google-cloud-sdk/bin/gcloud config set pass_credentials_to_gsutil True
          # Authorize access to Google Cloud with a service account
          ./google-cloud-sdk/bin/gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS

      - name: Cleanup
        run: python ./scripts/cleanup/cleanup_big_query.py
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import logging | ||
|
||
from great_expectations.compatibility.pydantic import BaseSettings | ||
from great_expectations.compatibility.sqlalchemy import TextClause, create_engine | ||
|
||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Lower this logger's threshold to INFO; the cleanup summary in this file is
# logged at that level.
logger.setLevel(logging.INFO)
|
||
|
||
class BigQueryConnectionConfig(BaseSettings):
    """Settings for the BigQuery test connection, read from environment variables.

    CI injects these values; for a local run you may supply your own
    credentials instead. GOOGLE_APPLICATION_CREDENTIALS must be kept secret.
    """

    GE_TEST_GCP_PROJECT: str
    GE_TEST_BIGQUERY_DATASET: str
    GOOGLE_APPLICATION_CREDENTIALS: str

    @property
    def connection_string(self) -> str:
        """Return the ``bigquery://`` URL for the configured project and dataset."""
        location = f"{self.GE_TEST_GCP_PROJECT}/{self.GE_TEST_BIGQUERY_DATASET}"
        credentials_path = self.GOOGLE_APPLICATION_CREDENTIALS
        return f"bigquery://{location}?credentials_path={credentials_path}"
|
||
|
||
# Regex selecting the schema names this script will drop: the literal prefix
# "test_" followed by exactly ten lowercase ASCII letters, and nothing else.
SCHEMA_FORMAT = r"^test_[a-z]{10}$"
|
||
|
||
def cleanup_big_query(config: BigQueryConnectionConfig) -> None:
    """Drop stray test schemas from the configured BigQuery project.

    Queries INFORMATION_SCHEMA.SCHEMATA for schemas whose name matches
    ``SCHEMA_FORMAT`` and whose creation time is more than two hours in the
    past, then executes one ``DROP SCHEMA ... CASCADE`` statement per match.

    Args:
        config: Connection settings used to build the engine URL.
    """
    engine = create_engine(url=config.connection_string)
    try:
        with engine.connect() as conn, conn.begin():
            # Let BigQuery build the DROP statements itself, one per row.
            results = conn.execute(
                TextClause(
                    """
                    SELECT 'DROP SCHEMA ' || schema_name || ' CASCADE;'
                    FROM INFORMATION_SCHEMA.SCHEMATA
                    WHERE REGEXP_CONTAINS(schema_name, :schema_format)
                    AND creation_time < TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2 HOUR);
                    """
                ),
                {"schema_format": SCHEMA_FORMAT},
            ).fetchall()
            if results:
                # Wrap the script in TextClause, as the query above does:
                # SQLAlchemy 2.x refuses to execute a plain str.
                to_run = TextClause("\n".join(row[0] for row in results))
                conn.execute(to_run)
                logger.info(f"Cleaned up {len(results)} BigQuery schema(s)")
            else:
                # Nothing matched; sending an empty statement would fail.
                logger.info("No BigQuery schemas to clean up")
    finally:
        # Release pooled connections even if a query above raised.
        engine.dispose()
|
||
|
||
if __name__ == "__main__":
    # Without a configured handler, INFO records are discarded by logging's
    # last-resort handler (threshold WARNING), so the cleanup summary would
    # never reach the CI log.
    logging.basicConfig(level=logging.INFO)
    config = BigQueryConnectionConfig()  # type: ignore[call-arg]  # pydantic populates from env vars
    cleanup_big_query(config)