# Update pr_tests_spark.yml #25
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow display name.
name: Spark S3 Integration Tests

# Triggers: pull requests targeting main, and pushes to the listed
# development branches.
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - feature/**
      - dev
      - staging
      - template-spark-tests
      - spark_prep

# All runs share one concurrency group, so only one run of this group is
# in progress at a time.
concurrency: dbt_integration_tests
# Workflow-level environment, visible to every step.
# NOTE(review): the SPARK_* values are presumably read by the dbt profile
# located via DBT_PROFILES_DIR - confirm against ./ci.
env:
  DBT_PROFILES_DIR: ./ci
  SPARK_MASTER_HOST: spark-master
  SPARK_USER: spark
  SPARK_SCHEMA: default
  AWS_REGION: eu-west-1
  AWS_DEFAULT_REGION: eu-west-1
jobs:
  # Builds a three-container Spark cluster (master, worker, Thrift server)
  # with docker-compose on the runner, points it at S3/Glue via
  # spark-defaults.conf, runs a series of diagnostics, then runs the dbt
  # integration tests against the Thrift server and tears the cluster down.
  spark_s3_integration_tests:
    name: Spark S3 Integration Tests
    runs-on: ubuntu-latest
    defaults:
      run:
        # Every `run` step below executes from this directory, so the
        # generated Dockerfile / conf / compose files are written here.
        working-directory: ./integration_tests
    strategy:
      matrix:
        dbt_version:
          - '1.*'
        warehouse:
          - spark
    steps:
      - name: Check out
        uses: actions/checkout@v3

      # ${DBT_VERSION%.*} strips the shortest trailing ".<anything>", and
      # `tr . _` replaces remaining dots, e.g. "1.*" -> "1".
      - name: Set SCHEMA_SUFFIX env
        run: >-
          echo "SCHEMA_SUFFIX=$(echo ${DBT_VERSION%.*} | tr . _)" >> $GITHUB_ENV
        env:
          DBT_VERSION: '${{ matrix.dbt_version }}'

      - name: Set DEFAULT_TARGET env
        run: |
          echo "DEFAULT_TARGET=${{ matrix.warehouse }}" >> $GITHUB_ENV

      - name: Python setup
        uses: actions/setup-python@v4
        with:
          python-version: '3.8.x'

      - name: Install dependencies
        run: |
          pip install --upgrade pip wheel setuptools
          pip install -Iv "dbt-spark[PyHive]==${{ matrix.dbt_version }}" --upgrade
          pip install boto3 awscli
          dbt deps

      # NOTE(review): later steps expand ${AWS_ACCESS_KEY_ID} and
      # ${AWS_SECRET_ACCESS_KEY} from the shell environment; presumably this
      # action exports them - confirm.
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: eu-west-1

      - name: Verify AWS Credentials
        run: |
          echo "Verifying AWS Credentials..."
          aws sts get-caller-identity
          aws s3 ls s3://dbt-spark-iceberg/
          aws glue get-databases --region eu-west-1

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1

      - name: Install Docker Compose
        run: |
          sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
          sudo chmod +x /usr/local/bin/docker-compose

      # The heredoc delimiter is unquoted, so the runner's shell expands
      # variables while writing the file; Dockerfile-level variables are
      # therefore escaped as \${...}. The terminating EOF must stay at the
      # block's base indentation.
      #
      # Spark is fetched from archive.apache.org because downloads.apache.org
      # only carries current releases. curl uses -f so an HTTP error fails
      # the build instead of saving an error page as a jar.
      # NOTE(review): the iceberg-spark-runtime-3.5_2.12 artifact is published
      # from Iceberg 1.4.0 onward, so ICEBERG_VERSION was raised from 1.3.1 -
      # confirm the exact version wanted.
      - name: Create Dockerfile
        run: |
          cat << EOF > Dockerfile
          FROM openjdk:11-jdk-slim
          ENV SPARK_VERSION=3.5.1
          ENV HADOOP_VERSION=3.3.4
          ENV ICEBERG_VERSION=1.4.3
          ENV SPARK_HOME=/spark
          RUN apt-get update && apt-get install -y curl wget procps rsync ssh iputils-ping net-tools jq
          RUN wget --tries=5 --retry-connrefused --waitretry=1 --timeout=20 https://archive.apache.org/dist/spark/spark-\${SPARK_VERSION}/spark-\${SPARK_VERSION}-bin-hadoop3.tgz && \
              tar -xvzf spark-\${SPARK_VERSION}-bin-hadoop3.tgz && \
              mv spark-\${SPARK_VERSION}-bin-hadoop3 \${SPARK_HOME} && \
              rm spark-\${SPARK_VERSION}-bin-hadoop3.tgz
          ENV PATH=\$PATH:\${SPARK_HOME}/bin:\${SPARK_HOME}/sbin
          RUN mkdir -p /spark/spark-warehouse && \
              chown -R root:root /spark/spark-warehouse && \
              chmod -R 777 /spark/spark-warehouse
          # Install AWS Glue libraries and Iceberg
          RUN curl -fL -o /spark/jars/aws-java-sdk-bundle-1.11.1026.jar https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar && \
              curl -fL -o /spark/jars/hadoop-aws-3.3.4.jar https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
              curl -fL -o /spark/jars/iceberg-spark-runtime-3.5_2.12-\${ICEBERG_VERSION}.jar https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/\${ICEBERG_VERSION}/iceberg-spark-runtime-3.5_2.12-\${ICEBERG_VERSION}.jar
          WORKDIR \${SPARK_HOME}
          CMD ["bash"]
          EOF

      # ${AWS_ACCESS_KEY_ID} / ${AWS_SECRET_ACCESS_KEY} are intentionally NOT
      # escaped: the shell substitutes their values into the file as it is
      # written.
      - name: Create spark-defaults.conf
        run: |
          cat << EOF > spark-defaults.conf
          spark.master spark://spark-master:7077
          spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
          spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
          spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
          spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
          spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
          spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
          spark.sql.defaultCatalog glue
          spark.sql.catalog.glue.database dbt-spark-iceberg
          spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
          spark.hadoop.fs.s3a.access.key ${AWS_ACCESS_KEY_ID}
          spark.hadoop.fs.s3a.secret.key ${AWS_SECRET_ACCESS_KEY}
          spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com
          spark.hadoop.fs.s3a.path.style.access true
          spark.hadoop.fs.s3a.region eu-west-1
          spark.hadoop.fs.s3a.aws.region eu-west-1
          spark.hadoop.com.amazonaws.services.s3.enableV4 true
          spark.hadoop.fs.s3a.aws.credentials.provider org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider
          spark.hadoop.hive.metastore.client.factory.class com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory
          EOF

      # Three services built from the same Dockerfile. The worker and Thrift
      # server delay their start with `sleep` so the master is up first.
      # As above, the AWS variables are expanded by the shell at write time.
      - name: Create docker-compose.yml
        run: |
          cat << EOF > docker-compose.yml
          version: '3'
          networks:
            spark-network:
              driver: bridge
          services:
            spark-master:
              build: .
              command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
              hostname: spark-master
              ports:
                - '8080:8080'
                - '7077:7077'
              environment:
                - SPARK_LOCAL_IP=spark-master
                - SPARK_MASTER_HOST=spark-master
                - SPARK_MASTER_PORT=7077
                - SPARK_MASTER_OPTS="-Dspark.driver.memory=2g"
                - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
                - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
                - AWS_REGION=eu-west-1
                - AWS_DEFAULT_REGION=eu-west-1
              volumes:
                - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
              networks:
                - spark-network
            spark-worker:
              build: .
              command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
              depends_on:
                - spark-master
              environment:
                - SPARK_WORKER_CORES=2
                - SPARK_WORKER_MEMORY=4G
                - SPARK_EXECUTOR_MEMORY=3G
                - SPARK_LOCAL_IP=spark-worker
                - SPARK_MASTER=spark://spark-master:7077
                - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
                - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
                - AWS_REGION=eu-west-1
                - AWS_DEFAULT_REGION=eu-west-1
              volumes:
                - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
              networks:
                - spark-network
            thrift-server:
              build: .
              command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --driver-memory 2g --executor-memory 3g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 --conf spark.sql.hive.thriftServer.async=true --conf spark.sql.hive.thriftServer.workerQueue.size=2000 --conf spark.sql.hive.thriftServer.maxWorkerThreads=100 --conf spark.sql.hive.thriftServer.minWorkerThreads=50 && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
              ports:
                - '10000:10000'
              depends_on:
                - spark-master
                - spark-worker
              environment:
                - SPARK_LOCAL_IP=thrift-server
                - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
                - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
                - AWS_REGION=eu-west-1
                - AWS_DEFAULT_REGION=eu-west-1
              volumes:
                - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
              networks:
                - spark-network
          EOF

      # Fixed 90 s wait after `up -d`; there is no readiness polling.
      - name: Build and start Spark cluster
        run: |
          docker-compose build --no-cache
          docker-compose up -d
          echo "Waiting for Spark services to start..."
          sleep 90

      # ---- Diagnostics: print cluster state to the job log ----
      - name: Check running containers
        run: docker ps

      - name: Check Docker network
        run: |
          docker network ls
          docker network inspect integration_tests_spark-network

      - name: Print Docker logs
        run: |
          echo "Docker logs for spark-master:"
          docker-compose logs --tail=1000 spark-master
          echo "Docker logs for spark-worker:"
          docker-compose logs --tail=1000 spark-worker
          echo "Docker logs for thrift-server:"
          docker-compose logs --tail=1000 thrift-server

      - name: Check Spark cluster status
        run: |
          docker-compose exec -T spark-master bash -c "jps && ps aux | grep spark && netstat -tuln"
          docker-compose exec -T spark-worker bash -c "jps && ps aux | grep spark && netstat -tuln"
          docker-compose exec -T thrift-server bash -c "jps && ps aux | grep spark && netstat -tuln"

      - name: Check Spark Master UI
        run: |
          echo "Checking Spark Master UI..."
          docker-compose exec -T spark-master bash -c "curl -s http://spark-master:8080/json/ | jq '.workers'"

      - name: Verify Spark configuration
        run: |
          echo "Verifying Spark configuration..."
          docker-compose exec -T spark-master bash -c "cat /spark/conf/spark-defaults.conf"

      - name: List Spark jars
        run: |
          echo "Listing Spark jars..."
          docker-compose exec -T spark-master bash -c "ls -l /spark/jars | grep iceberg"

      - name: Wait for Thrift Server
        run: |
          echo "Waiting for Thrift Server to be fully operational..."
          sleep 60

      - name: Check ThriftServer Process
        run: docker-compose exec -T thrift-server bash -c "ps aux | grep ThriftServer"

      # \$(...) is escaped so the command substitution runs inside the
      # container rather than on the runner.
      - name: Check Latest ThriftServer Log
        run: docker-compose exec -T thrift-server bash -c "tail -n 50 /spark/logs/\$(ls -t /spark/logs/ | grep thriftserver | head -n1)"

      - name: Test ThriftServer connection with Beeline
        run: |
          docker-compose exec -T thrift-server bash -c '
          /spark/bin/beeline -u "jdbc:hive2://localhost:10000" -e "SHOW DATABASES;"
          '

      # Feeds a Scala snippet to spark-shell through a heredoc nested inside
      # the single-quoted `bash -c` script. The snippet catches any exception
      # and prints it. EOF and the closing quote must stay at the block's
      # base indentation.
      - name: Verify AWS Credentials in Spark
        run: |
          docker-compose exec -T spark-master bash -c '
          spark-shell --master spark://spark-master:7077 << EOF
          import org.apache.spark.sql.SparkSession
          val spark = SparkSession.builder().getOrCreate()
          try {
            val df = spark.read.format("s3a").load("s3a://dbt-spark-iceberg/")
            println("Successfully read from S3")
            df.show()
          } catch {
            case e: Exception =>
              println("Failed to read from S3")
              e.printStackTrace()
          }
          spark.stop()
          EOF
          '

      # ---- dbt integration tests ----
      - name: 'Pre-test: Drop ci schemas'
        run: |
          dbt run-operation post_ci_cleanup --target spark

      - name: Run tests
        run: |
          echo "Running DBT tests..."
          ./.scripts/integration_tests.sh -d spark
          echo "DBT tests completed."

      - name: 'Post-test: Drop ci schemas'
        run: |
          dbt run-operation post_ci_cleanup --target spark

      # Runs even when an earlier step failed so containers are torn down.
      - name: Cleanup Spark cluster
        if: always()
        run: |
          docker-compose down