diff --git a/.github/workflows/pr_tests_spark.yml b/.github/workflows/pr_tests_spark.yml
index ab181cb3..f1a871c3 100644
--- a/.github/workflows/pr_tests_spark.yml
+++ b/.github/workflows/pr_tests_spark.yml
@@ -1,22 +1,22 @@
-name: Spark S3 Integration Tests
+name: pr_tests_spark
 
 on:
-  # pull_request:
-  #   branches:
-  #     - main
+  pull_request:
+    branches:
+      - main
   push:
     branches:
       - feature/**
       - dev
       - staging
       - template-spark-tests
-      # - spark_prep
+      - spark_prep
 
 concurrency: dbt_integration_tests
 
 env:
   DBT_PROFILES_DIR: ./ci
-  SPARK_MASTER_HOST: spark-master
+  SPARK_MASTER_HOST: localhost
   SPARK_USER: spark
   SPARK_SCHEMA: default
   AWS_REGION: eu-west-1
@@ -68,13 +68,6 @@ jobs:
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           aws-region: eu-west-1
 
-      - name: Verify AWS Credentials
-        run: |
-          echo "Verifying AWS Credentials..."
-          aws sts get-caller-identity
-          aws s3 ls s3://dbt-spark-iceberg/
-          aws glue get-databases --region eu-west-1
-
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v1
 
@@ -83,138 +76,9 @@ jobs:
       - name: Install Docker Compose
         run: |
           sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
           sudo chmod +x /usr/local/bin/docker-compose
 
-      - name: Create Dockerfile
-        run: |
-          cat << EOF > Dockerfile
-          FROM openjdk:11-jdk-slim
-
-          ENV SPARK_VERSION=3.5.1
-          ENV HADOOP_VERSION=3.3.4
-          ENV ICEBERG_VERSION=1.3.1
-          ENV SPARK_HOME=/spark
-
-          RUN apt-get update && apt-get install -y curl wget procps rsync ssh iputils-ping net-tools jq
-
-          RUN wget --tries=5 --retry-connrefused --waitretry=1 --timeout=20 https://downloads.apache.org/spark/spark-\${SPARK_VERSION}/spark-\${SPARK_VERSION}-bin-hadoop3.tgz && \
-              tar -xvzf spark-\${SPARK_VERSION}-bin-hadoop3.tgz && \
-              mv spark-\${SPARK_VERSION}-bin-hadoop3 \${SPARK_HOME} && \
-              rm spark-\${SPARK_VERSION}-bin-hadoop3.tgz
-
-          ENV PATH=\$PATH:\${SPARK_HOME}/bin:\${SPARK_HOME}/sbin
-
-          RUN mkdir -p /spark/spark-warehouse && \
-              chown -R root:root /spark/spark-warehouse && \
-              chmod -R 777 /spark/spark-warehouse
-
-          # Install AWS Glue libraries and Iceberg
-          RUN curl -L -o /spark/jars/aws-java-sdk-bundle-1.11.1026.jar https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar && \
-              curl -L -o /spark/jars/hadoop-aws-3.3.4.jar https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
-              curl -L -o /spark/jars/iceberg-spark-runtime-3.5_2.12-\${ICEBERG_VERSION}.jar https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/\${ICEBERG_VERSION}/iceberg-spark-runtime-3.5_2.12-\${ICEBERG_VERSION}.jar && \
-              curl -L -o /spark/jars/iceberg-aws-bundle-\${ICEBERG_VERSION}.jar https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/\${ICEBERG_VERSION}/iceberg-aws-bundle-\${ICEBERG_VERSION}.jar
-
-          WORKDIR \${SPARK_HOME}
-
-          CMD ["bash"]
-          EOF
-
-      - name: Create spark-defaults.conf
-        run: |
-          cat << EOF > spark-defaults.conf
-          spark.master spark://spark-master:7077
-          spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
-          spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
-          spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog
-          spark.sql.catalog.spark_catalog.type hive
-          spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
-          spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
-          spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
-          spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
-          spark.sql.defaultCatalog glue
-          spark.sql.catalog.glue.database dbt-spark-iceberg
-          spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
-          spark.hadoop.fs.s3a.access.key ${AWS_ACCESS_KEY_ID}
-          spark.hadoop.fs.s3a.secret.key ${AWS_SECRET_ACCESS_KEY}
-          spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com
-          spark.hadoop.fs.s3a.path.style.access true
-          spark.hadoop.fs.s3a.region eu-west-1
-          spark.hadoop.fs.s3a.aws.region eu-west-1
-          spark.hadoop.com.amazonaws.services.s3.enableV4 true
-          spark.hadoop.fs.s3a.aws.credentials.provider org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider
-          spark.hadoop.hive.metastore.client.factory.class com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory
-          spark.driver.extraClassPath /spark/jars/*
-          spark.executor.extraClassPath /spark/jars/*
-          EOF
-
-      - name: Create docker-compose.yml
-        run: |
-          cat << EOF > docker-compose.yml
-          version: '3'
-          networks:
-            spark-network:
-              driver: bridge
-          services:
-            spark-master:
-              build: .
-              command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
-              hostname: spark-master
-              ports:
-                - '8080:8080'
-                - '7077:7077'
-              environment:
-                - SPARK_LOCAL_IP=spark-master
-                - SPARK_MASTER_HOST=spark-master
-                - SPARK_MASTER_PORT=7077
-                - SPARK_MASTER_OPTS="-Dspark.driver.memory=2g"
-                - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
-                - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
-                - AWS_REGION=eu-west-1
-                - AWS_DEFAULT_REGION=eu-west-1
-              volumes:
-                - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
-              networks:
-                - spark-network
-            spark-worker:
-              build: .
-              command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
-              depends_on:
-                - spark-master
-              environment:
-                - SPARK_WORKER_CORES=2
-                - SPARK_WORKER_MEMORY=4G
-                - SPARK_EXECUTOR_MEMORY=3G
-                - SPARK_LOCAL_IP=spark-worker
-                - SPARK_MASTER=spark://spark-master:7077
-                - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
-                - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
-                - AWS_REGION=eu-west-1
-                - AWS_DEFAULT_REGION=eu-west-1
-              volumes:
-                - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
-              networks:
-                - spark-network
-            thrift-server:
-              build: .
- command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --driver-memory 2g --executor-memory 3g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 --conf spark.sql.hive.thriftServer.async=true --conf spark.sql.hive.thriftServer.workerQueue.size=2000 --conf spark.sql.hive.thriftServer.maxWorkerThreads=100 --conf spark.sql.hive.thriftServer.minWorkerThreads=50 --jars /spark/jars/iceberg-spark-runtime-3.5_2.12-1.3.1.jar,/spark/jars/iceberg-aws-bundle-1.3.1.jar --driver-class-path /spark/jars/* --executor-class-path /spark/jars/* && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"] - ports: - - '10000:10000' - depends_on: - - spark-master - - spark-worker - environment: - - SPARK_LOCAL_IP=thrift-server - - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} - - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} - - AWS_REGION=eu-west-1 - - AWS_DEFAULT_REGION=eu-west-1 - volumes: - - ./spark-defaults.conf:/spark/conf/spark-defaults.conf - networks: - - spark-network - EOF - - name: Build and start Spark cluster run: | - docker-compose build --no-cache + docker-compose build docker-compose up -d echo "Waiting for Spark services to start..." sleep 90 @@ -225,7 +89,7 @@ jobs: - name: Check Docker network run: | docker network ls - docker network inspect integration_tests_spark-network + # docker network inspect spark-network - name: Print Docker logs run: | @@ -236,27 +100,11 @@ jobs: echo "Docker logs for thrift-server:" docker-compose logs --tail=1000 thrift-server - - name: Check Spark cluster status - run: | - docker-compose exec -T spark-master bash -c "jps && ps aux | grep spark && netstat -tuln" - docker-compose exec -T spark-worker bash -c "jps && ps aux | grep spark && netstat -tuln" - docker-compose exec -T thrift-server bash -c "jps && ps aux | grep spark && netstat -tuln" - - - name: Check Spark Master UI - run: | - echo "Checking Spark Master UI..." - docker-compose exec -T spark-master bash -c "curl -s http://spark-master:8080/json/ | jq '.workers'" - - name: Verify Spark configuration run: | echo "Verifying Spark configuration..." docker-compose exec -T spark-master bash -c "cat /spark/conf/spark-defaults.conf" - - name: List Spark jars - run: | - echo "Listing Spark jars..." - docker-compose exec -T spark-master bash -c "ls -l /spark/jars | grep iceberg" - - name: Wait for Thrift Server run: | echo "Waiting for Thrift Server to be fully operational..." 
@@ -270,29 +118,15 @@ jobs:
       - name: Test ThriftServer connection with Beeline
         run: |
-          docker-compose exec -T thrift-server bash -c '
-          /spark/bin/beeline -u "jdbc:hive2://localhost:10000" -e "SHOW DATABASES;" --hiveconf hive.cli.errors.ignore=true
-          '
+          docker-compose exec -T thrift-server bash -c '/spark/bin/beeline -u "jdbc:hive2://localhost:10000" -e "SHOW DATABASES;"'
 
-      - name: Verify AWS Credentials in Spark
+      - name: "Pre-test: Drop ci schemas"
         run: |
-          docker-compose exec -T spark-master bash -c '
-          spark-shell --master spark://spark-master:7077 << EOF
-          import org.apache.spark.sql.SparkSession
-          val spark = SparkSession.builder().getOrCreate()
-          try {
-            val df = spark.read.format("s3a").load("s3a://dbt-spark-iceberg/")
-            println("Successfully read from S3")
-            df.show()
-          } catch {
-            case e: Exception =>
-              println("Failed to read from S3")
-              e.printStackTrace()
-          }
-          spark.stop()
-          EOF
-          '
+          dbt run-operation post_ci_cleanup --target spark
+
+      - name: Run tests
+        run: ./.scripts/integration_tests.sh -d spark
 
-      - name: 'Pre-test: Drop ci schemas'
+      - name: "Post-test: Drop ci schemas"
         run: |
-          dbt run-operation post_ci_cleanup --target spark
\ No newline at end of file
+          dbt run-operation post_ci_cleanup --target spark
diff --git a/.github/workflows/pr_tests_spark_simple.yml b/.github/workflows/pr_tests_spark_simple.yml
deleted file mode 100644
index cf565b56..00000000
--- a/.github/workflows/pr_tests_spark_simple.yml
+++ /dev/null
@@ -1,144 +0,0 @@
-name: Spark Deployment
-
-on:
-  pull_request:
-    branches: [main]
-  push:
-    branches:
-      - 'feature/**'
-      - 'dev'
-      - 'staging'
-      - 'template-spark-tests'
-      # - 'spark_prep'
-
-env:
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-  AWS_REGION: eu-west-1
-  AWS_DEFAULT_REGION: eu-west-1
-
-jobs:
-  deploy-spark:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v2
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
-
-      - name: Install Docker Compose
-        run: |
-          sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
-          sudo chmod +x /usr/local/bin/docker-compose
-          docker-compose --version
-
-      - name: Create Dockerfile
-        run: |
-          cat << EOF > Dockerfile
-          FROM openjdk:11-jdk-slim
-
-          ENV SPARK_VERSION=3.5.1
-          ENV HADOOP_VERSION=3.3.4
-          ENV SPARK_HOME=/spark
-
-          RUN apt-get update && apt-get install -y curl wget procps rsync ssh iputils-ping net-tools
-
-          RUN wget --tries=5 --retry-connrefused --waitretry=1 --timeout=20 https://downloads.apache.org/spark/spark-\${SPARK_VERSION}/spark-\${SPARK_VERSION}-bin-hadoop3.tgz && \
-              tar -xvzf spark-\${SPARK_VERSION}-bin-hadoop3.tgz && \
-              mv spark-\${SPARK_VERSION}-bin-hadoop3 \${SPARK_HOME} && \
-              rm spark-\${SPARK_VERSION}-bin-hadoop3.tgz
-
-          ENV PATH=\$PATH:\${SPARK_HOME}/bin:\${SPARK_HOME}/sbin
-
-          WORKDIR \${SPARK_HOME}
-
-          CMD ["bash"]
-          EOF
-
-      - name: Create docker-compose.yml
-        run: |
-          cat << EOF > docker-compose.yml
-          version: '3'
-
-          services:
-            spark-master:
-              build: .
-              command: bin/spark-class org.apache.spark.deploy.master.Master
-              ports:
-                - "8080:8080"
-                - "7077:7077"
-              environment:
-                - SPARK_MODE=master
-                - SPARK_MASTER_HOST=localhost
-                - SPARK_MASTER_PORT=7077
-                - SPARK_MASTER_WEBUI_PORT=8080
-
-            spark-worker:
-              build: .
-              command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
-              depends_on:
-                - spark-master
-              environment:
-                - SPARK_MODE=worker
-                - SPARK_WORKER_CORES=2
-                - SPARK_WORKER_MEMORY=2g
-                - SPARK_WORKER_PORT=8081
-                - SPARK_WORKER_WEBUI_PORT=8081
-                - SPARK_MASTER=spark://spark-master:7077
-          EOF
-
-      - name: Build and start Spark cluster
-        run: |
-          docker-compose build --no-cache
-          docker-compose up -d
-
-      - name: Wait for services to start
-        run: |
-          echo "Waiting for Spark services to start..."
-          sleep 60
-
-      - name: Check Spark master status
-        run: |
-          docker-compose exec -T spark-master bash -c "jps"
-          docker-compose exec -T spark-master bash -c "ps aux | grep spark"
-          docker-compose exec -T spark-master bash -c "netstat -tuln"
-          docker-compose exec -T spark-master bash -c "ls -l \$SPARK_HOME"
-          docker-compose exec -T spark-master bash -c "cat \$SPARK_HOME/conf/spark-env.sh || echo 'spark-env.sh not found'"
-
-      - name: Check Spark worker status
-        run: |
-          docker-compose exec -T spark-worker bash -c "jps"
-          docker-compose exec -T spark-worker bash -c "ps aux | grep spark"
-          docker-compose exec -T spark-worker bash -c "netstat -tuln"
-          docker-compose exec -T spark-worker bash -c "ls -l \$SPARK_HOME"
-          docker-compose exec -T spark-worker bash -c "cat \$SPARK_HOME/conf/spark-env.sh || echo 'spark-env.sh not found'"
-
-      - name: Check network connectivity
-        run: |
-          docker-compose exec -T spark-worker ping -c 4 spark-master
-
-      - name: Check Spark logs
-        run: |
-          docker-compose exec -T spark-master bash -c "ls -l \$SPARK_HOME/logs || echo 'No logs found'"
-          docker-compose exec -T spark-master bash -c "cat \$SPARK_HOME/logs/* || echo 'No logs to display'"
-          docker-compose exec -T spark-worker bash -c "ls -l \$SPARK_HOME/logs || echo 'No logs found'"
-          docker-compose exec -T spark-worker bash -c "cat \$SPARK_HOME/logs/* || echo 'No logs to display'"
-
-      - name: Run test Spark job
-        run: |
-          echo "Running Spark Pi example job..."
-          docker-compose exec -T spark-master bin/spark-submit --master spark://spark-master:7077 --class org.apache.spark.examples.SparkPi examples/jars/spark-examples_2.12-3.5.1.jar 10
-
-      - name: 'Pre-test: Drop ci schemas'
-        run: dbt run-operation post_ci_cleanup --target spark
-
-      - name: Run tests
-        run: ./.scripts/integration_tests.sh -d spark
-
-      - name: 'Post-test: Drop ci schemas'
-        run: dbt run-operation post_ci_cleanup --target spark
-
-      - name: Cleanup
-        if: always()
-        run: docker-compose down
\ No newline at end of file
diff --git a/.github/workflows/pr_tests_spark_with_files.yml b/.github/workflows/pr_tests_spark_with_files.yml
deleted file mode 100644
index e6007d64..00000000
--- a/.github/workflows/pr_tests_spark_with_files.yml
+++ /dev/null
@@ -1,132 +0,0 @@
-name: Spark S3 Integration Tests dedicated files
-
-on:
-  pull_request:
-    branches:
-      - main
-  push:
-    branches:
-      - feature/**
-      - dev
-      - staging
-      - template-spark-tests
-      - spark_prep
-
-concurrency: dbt_integration_tests
-
-env:
-  DBT_PROFILES_DIR: ./ci
-  SPARK_MASTER_HOST: localhost
-  SPARK_USER: spark
-  SPARK_SCHEMA: default
-  AWS_REGION: eu-west-1
-  AWS_DEFAULT_REGION: eu-west-1
-
-jobs:
-  spark_s3_integration_tests:
-    name: Spark S3 Integration Tests
-    runs-on: ubuntu-latest
-    defaults:
-      run:
-        working-directory: ./integration_tests
-    strategy:
-      matrix:
-        dbt_version:
-          - 1.*
-        warehouse:
-          - spark
-    steps:
-      - name: Check out
-        uses: actions/checkout@v3
-
-      - name: Set SCHEMA_SUFFIX env
-        run: >-
-          echo "SCHEMA_SUFFIX=$(echo ${DBT_VERSION%.*} | tr . _)" >> $GITHUB_ENV
-        env:
-          DBT_VERSION: '${{ matrix.dbt_version }}'
-
-      - name: Set DEFAULT_TARGET env
-        run: |
-          echo "DEFAULT_TARGET=${{ matrix.warehouse }}" >> $GITHUB_ENV
-
-      - name: Python setup
-        uses: actions/setup-python@v4
-        with:
-          python-version: 3.8.x
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip wheel setuptools
-          pip install -Iv "dbt-spark[PyHive]==${{ matrix.dbt_version }}" --upgrade
-          pip install boto3 awscli
-          dbt deps
-
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: eu-west-1
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
-
-      - name: Install Docker Compose
-        run: |
-          sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
-          sudo chmod +x /usr/local/bin/docker-compose
-
-      - name: Build and start Spark cluster
-        run: |
-          docker-compose build
-          docker-compose up -d
-          echo "Waiting for Spark services to start..."
-          sleep 90
-
-      - name: Check running containers
-        run: docker ps
-
-      - name: Check Docker network
-        run: |
-          docker network ls
-          # docker network inspect spark-network
-
-      - name: Print Docker logs
-        run: |
-          echo "Docker logs for spark-master:"
-          docker-compose logs --tail=1000 spark-master
-          echo "Docker logs for spark-worker:"
-          docker-compose logs --tail=1000 spark-worker
-          echo "Docker logs for thrift-server:"
-          docker-compose logs --tail=1000 thrift-server
-
-      - name: Verify Spark configuration
-        run: |
-          echo "Verifying Spark configuration..."
-          docker-compose exec -T spark-master bash -c "cat /spark/conf/spark-defaults.conf"
-
-      - name: Wait for Thrift Server
-        run: |
-          echo "Waiting for Thrift Server to be fully operational..."
-          sleep 60
-
-      - name: Check ThriftServer Process
-        run: docker-compose exec -T thrift-server bash -c "ps aux | grep ThriftServer"
-
-      - name: Check Latest ThriftServer Log
-        run: docker-compose exec -T thrift-server bash -c "tail -n 50 /spark/logs/\$(ls -t /spark/logs/ | grep thriftserver | head -n1)"
-
-      - name: Test ThriftServer connection with Beeline
-        run: |
-          docker-compose exec -T thrift-server bash -c '/spark/bin/beeline -u "jdbc:hive2://localhost:10000" -e "SHOW DATABASES;"'
-
-      - name: "Pre-test: Drop ci schemas"
-        run: |
-          dbt run-operation post_ci_cleanup --target spark
-
-      - name: Run tests
-        run: ./.scripts/integration_tests.sh -d spark
-
-      - name: "Post-test: Drop ci schemas"
-        run: |
-          dbt run-operation post_ci_cleanup --target spark