Fix/return limits #190

Closed
17 changes: 17 additions & 0 deletions .github/workflows/README.md
@@ -0,0 +1,17 @@
## PR TESTS

PR tests are created for two main reasons:
1. to make sure we keep the integrity of the packages -> integration testing
2. to check that the code under development does what it is designed to do -> unit testing

### Integration tests:
All integration-test-related code lives under the integration_tests folder within pr_tests. These tests are designed to run for each PR.

### Unit tests:
Unit-test-related code lives under the unit_tests folder within pr_tests/models. These tests are designed to run only while a particular feature is being developed and, optionally, when the PR is opened against main. They are grouped by the macro under test and may contain multiple tests. Testing is usually done by comparing the expected_<macro_name>_macro SQL output against the test_<macro_name>_macro SQL output: both models are built and then compared using a dbt_utils.equality check.

To schedule the tests once they are ready, add them to the corresponding script under the .scripts folder, then reference that script in pr_tests.yml as a GitHub workflow.
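A minimal sketch of such a scheduled entry, mirroring the dbt commands this PR uses in integration_tests/.scripts/unit_tests.sh (`<macro_name>` is a placeholder, and `$db` is the warehouse target the script loops over):

```bash
# Hypothetical scheduled unit-test entry in .scripts/unit_tests.sh;
# substitute <macro_name> for the macro under test.
echo "Snowplow-utils unit tests: Run test_<macro_name>_macro"
dbt run --select +test_<macro_name>_macro expected_<macro_name>_macro --target $db --full-refresh || exit 1
dbt test --select +test_<macro_name>_macro --store-failures --target $db || exit 1
```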




32 changes: 22 additions & 10 deletions .github/workflows/pr_tests.yml
@@ -3,7 +3,7 @@ name: pr_tests
on:
pull_request:

concurrency: dbt_integration_tests
concurrency: dbt_integration_and_unit_tests

env:
# Set profiles.yml directory
@@ -56,7 +56,7 @@ jobs:
runs-on: ubuntu-latest
defaults:
run:
# Run tests from integration_tests sub dir
# Run tests from the pr_tests sub dir, a dbt repository that references the latest snowplow-utils package
working-directory: ./integration_tests
strategy:
fail-fast: false
@@ -83,6 +83,15 @@
steps:
- name: Check out
uses: actions/checkout@v3
- name: Get branch name
id: vars
# This needs to be surfaced for the 'Run Unit tests' step below, which may run on specific branches only
run: |
if [ "${{ github.base_ref }}" = "main" ]; then
echo "BRANCH='release'" >> $GITHUB_ENV
else
echo "BRANCH='${{ github.head_ref }}'" >> $GITHUB_ENV
fi
- name: Configure Docker credentials
uses: docker/login-action@v2
with:
@@ -97,10 +106,8 @@
- name: Set warehouse variables
id: set_warehouse
run: |
WAREHOUSE_PLATFORM=$(echo ${{ matrix.warehouse }} | cut -d'_' -f1)
WAREHOUSE_SPECIFIC=$(echo ${{ matrix.warehouse }} | cut -s -d'_' -f2)
echo "WAREHOUSE_PLATFORM=${WAREHOUSE_PLATFORM}" >> $GITHUB_ENV
echo "WAREHOUSE_SPECIFIC=${WAREHOUSE_SPECIFIC}" >> $GITHUB_ENV
echo "WAREHOUSE_PLATFORM=$(echo ${{ matrix.warehouse }} | cut -d'_' -f1)" >> $GITHUB_ENV
echo "WAREHOUSE_SPECIFIC=$(echo ${{ matrix.warehouse }} | cut -s -d'_' -f2)" >> $GITHUB_ENV
echo "warehouse_platform=${WAREHOUSE_PLATFORM}" >> $GITHUB_OUTPUT
echo "warehouse_specific=${WAREHOUSE_SPECIFIC}" >> $GITHUB_OUTPUT
# Remove '*' and replace '.' with '_' in DBT_VERSION & set as SCHEMA_SUFFIX.
@@ -147,7 +154,6 @@ jobs:
sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
sudo chmod +x /usr/local/bin/docker-compose


- name: Build and start Spark cluster
working-directory: .github/workflows/spark_deployment
run: |
@@ -159,10 +165,16 @@
- name: "Pre-test: Drop ci schemas"
run: |
dbt run-operation post_ci_cleanup --target ${{matrix.warehouse}}

- name: Run tests

- name: Make unit_tests.sh executable
run: chmod +x ./.scripts/unit_tests.sh

- name: Run Unit tests
run: ./.scripts/unit_tests.sh -d ${{matrix.warehouse}} -b ${{ env.BRANCH }}

- name: Run integration tests
run: ./.scripts/integration_tests.sh -d ${{matrix.warehouse}}

- name: "Post-test: Drop ci schemas"
run: |
dbt run-operation post_ci_cleanup --target ${{matrix.warehouse}}
39 changes: 39 additions & 0 deletions integration_tests/.scripts/unit_tests.sh
@@ -0,0 +1,39 @@
#!/bin/bash

# Expected input:
# -d (database) target database for dbt. Set to 'all' to test all supported databases.
# -b (branch) PR branch name, taken from the PR itself; when the PR is opened against main it automatically takes the value 'release'
while getopts "b:d:" opt; do
case $opt in
b) BRANCH="$OPTARG" ;;
d) DATABASE=$OPTARG ;;
*) echo "Invalid option: -$OPTARG" >&2 ;;
esac
done


declare -a SUPPORTED_DATABASES=("bigquery" "databricks" "postgres" "redshift" "snowflake" "spark_iceberg")

# set to lower case
DATABASE="$(echo $DATABASE | tr '[:upper:]' '[:lower:]')"
BRANCH="$(echo $BRANCH | tr '[:upper:]' '[:lower:]')"

if [[ $DATABASE == "all" ]]; then
DATABASES=( "${SUPPORTED_DATABASES[@]}" )
else
DATABASES=("$DATABASE")
fi

for db in "${DATABASES[@]}"; do

# To test this macro we need a model to reference and a timestamp column for the macro to take the min and max of.
# We need to make sure the correct result is returned even when the table is empty, respecting whether the caller wants a low or a high fixed date in that case.
# The models involved all live in the models folder.

if [[ $BRANCH == "release" || $BRANCH == "fix/return_limits" ]]; then
echo "Snowplow-utils unit tests: Run test_return_limits_from_model_macro"
eval "dbt run --select +test_return_limits_from_model_macro expected_return_limits_from_model_macro --target $db --full-refresh" || exit 1;
eval "dbt test --select +test_return_limits_from_model_macro --store-failures --target $db" || exit 1;
fi

done
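A hedged usage sketch of this script (invoked from the integration_tests directory, as in the workflow above; flag values are examples):

```bash
# Run the release-gated unit tests against a single warehouse
./.scripts/unit_tests.sh -d postgres -b release

# Run this PR's unit tests against every supported warehouse
./.scripts/unit_tests.sh -d all -b fix/return_limits
```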
15 changes: 2 additions & 13 deletions integration_tests/dbt_project.yml
@@ -19,7 +19,6 @@ clean-targets:
- "dbt_modules"
- "dbt_packages"


# Completely or partially remove models from the manifest during run start.
on-run-start:
- "{{ snowplow_utils.snowplow_delete_from_manifest(var('models_to_remove',[]), ref('snowplow_incremental_manifest_actual')) }}"
@@ -98,12 +97,8 @@ models:
+enabled: "{{ target.type in ['bigquery'] | as_bool() }}"
data_get_string_agg_grp:
+materialized: table

incremental_hooks:
+materialized: "{{ 'table' if target.type in ['spark'] else 'view' }}"



base:
+bind: false
+materialized: table
@@ -118,7 +113,8 @@ models:
+enabled: "{{ target.type == 'snowflake' | as_bool() }}"
spark:
+enabled: "{{ target.type == 'spark' | as_bool() }}"

unit_tests:
+materialized: "{{ 'table' if target.type in ['spark'] else 'view' }}"

tests:
snowplow_utils_integration_tests:
@@ -134,32 +130,27 @@ seeds:
+quote_columns: false
snowplow_utils_integration_tests:
+schema: "snplw_utils_int_tests"

incremental_hooks:

data_get_incremental_manifest_status:
+column_types:
last_success: timestamp
data_get_incremental_manifest_status_expected:
+column_types:
min_last_success: timestamp
max_last_success: timestamp

data_get_run_limits:
+column_types:
min_last_success: timestamp
max_last_success: timestamp
start_date: date
lower_limit: timestamp
upper_limit: timestamp

data_update_incremental_manifest_table:
+column_types:
is_in_manifest: boolean
last_success: timestamp
collector_tstamp: timestamp
expected_last_success: timestamp

materializations:
data_incremental:
+column_types:
@@ -200,7 +191,6 @@ seeds:
id: integer
id2: integer
start_tstamp: timestamp

utils:
data_return_limits_from_models:
start_tstamp: timestamp
@@ -210,7 +200,6 @@
end_tstamp: timestamp
data_get_sde_or_context:
+enabled: "{{ target.type in ['postgres', 'redshift'] | as_bool() }}"

base:
source:
contexts_com_snowplowanalytics_session_identifier_1_0_0:
@@ -0,0 +1,2 @@
SELECT cast(NULL as {{ dbt.type_timestamp() }}) AS tstamp_col

@@ -0,0 +1,3 @@
SELECT cast(NULL as {{ dbt.type_timestamp() }}) AS tstamp_col
UNION ALL
SELECT cast(NULL as {{ dbt.type_timestamp() }}) AS tstamp_col
@@ -0,0 +1,4 @@
SELECT cast('2024-12-01 00:00:00' as {{ dbt.type_timestamp() }}) AS tstamp_col
UNION ALL
SELECT cast('2024-12-02 00:00:00' as {{ dbt.type_timestamp() }}) AS tstamp_col

@@ -0,0 +1,24 @@
{#
Copyright (c) 2021-present Snowplow Analytics Ltd. All rights reserved.
This program is licensed to you under the Snowplow Personal and Academic License Version 1.0,
and you may not use this file except in compliance with the Snowplow Personal and Academic License Version 1.0.
You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 at https://docs.snowplow.io/personal-and-academic-license-1.0/
#}

SELECT 'dummy_model_empty' AS test_case, cast('9999-01-01 00:00:00.000' as {{ dbt.type_timestamp() }}) as lower_limit, cast('9999-01-02 00:00:00.000' as {{ dbt.type_timestamp() }}) AS upper_limit
UNION ALL
SELECT 'dummy_model_only_nulls', cast('9999-01-01 00:00:00.000' as {{ dbt.type_timestamp() }}) , cast('9999-01-02 00:00:00.000' as {{ dbt.type_timestamp() }})
UNION ALL
SELECT 'dummy_model_standard', cast('2024-12-01 00:00:00.000' as {{ dbt.type_timestamp() }}) , cast('2024-12-02 00:00:00.000' as {{ dbt.type_timestamp() }})
UNION ALL
SELECT 'dummy_model_empty_with_lower_output_true', cast('1970-01-01 00:00:00.000' as {{ dbt.type_timestamp() }}), cast('1970-01-02 00:00:00.000' as {{ dbt.type_timestamp() }})
UNION ALL
SELECT 'dummy_model_empty_with_lower_output_false', cast('9999-01-01 00:00:00.000' as {{ dbt.type_timestamp() }}), cast('9999-01-02 00:00:00.000' as {{ dbt.type_timestamp() }})
UNION ALL
SELECT 'dummy_model_only_nulls_with_lower_output_true', cast('1970-01-01 00:00:00.000' as {{ dbt.type_timestamp() }}), cast('1970-01-02 00:00:00.000' as {{ dbt.type_timestamp() }})
UNION ALL
SELECT 'dummy_model_only_nulls_with_lower_output_false', cast('9999-01-01 00:00:00.000' as {{ dbt.type_timestamp() }}), cast('9999-01-02 00:00:00.000' as {{ dbt.type_timestamp() }})
UNION ALL
SELECT 'dummy_model_standard_with_lower_output_true', cast('2024-12-01 00:00:00.000' as {{ dbt.type_timestamp() }}), cast('2024-12-02 00:00:00.000' as {{ dbt.type_timestamp() }})
UNION ALL
SELECT 'dummy_model_standard_with_lower_output_false', cast('2024-12-01 00:00:00.000' as {{ dbt.type_timestamp() }}), cast('2024-12-02 00:00:00.000' as {{ dbt.type_timestamp() }})
@@ -0,0 +1,7 @@
version: 2

models:
- name: test_return_limits_from_model_macro
tests:
- dbt_utils.equality:
compare_model: ref('expected_return_limits_from_model_macro')
@@ -0,0 +1,70 @@
{#
Copyright (c) 2021-present Snowplow Analytics Ltd. All rights reserved.
This program is licensed to you under the Snowplow Personal and Academic License Version 1.0,
and you may not use this file except in compliance with the Snowplow Personal and Academic License Version 1.0.
You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 at https://docs.snowplow.io/personal-and-academic-license-1.0/
#}

{%- set lower_limit_1, upper_limit_1 = snowplow_utils.return_limits_from_model(ref('dummy_model_empty'), 'tstamp_col', 'tstamp_col') %}
{%- set lower_limit_2, upper_limit_2 = snowplow_utils.return_limits_from_model(ref('dummy_model_only_nulls'), 'tstamp_col', 'tstamp_col') %}
{%- set lower_limit_3, upper_limit_3 = snowplow_utils.return_limits_from_model(ref('dummy_model_standard'), 'tstamp_col', 'tstamp_col') %}
{%- set lower_limit_4, upper_limit_4 = snowplow_utils.return_limits_from_model(ref('dummy_model_empty'), 'tstamp_col', 'tstamp_col', lower_output=True) %}
{%- set lower_limit_5, upper_limit_5 = snowplow_utils.return_limits_from_model(ref('dummy_model_empty'), 'tstamp_col', 'tstamp_col', lower_output=False) %}
{%- set lower_limit_6, upper_limit_6 = snowplow_utils.return_limits_from_model(ref('dummy_model_only_nulls'), 'tstamp_col', 'tstamp_col', lower_output=True) %}
{%- set lower_limit_7, upper_limit_7 = snowplow_utils.return_limits_from_model(ref('dummy_model_only_nulls'), 'tstamp_col', 'tstamp_col', lower_output=False) %}
{%- set lower_limit_8, upper_limit_8 = snowplow_utils.return_limits_from_model(ref('dummy_model_standard'), 'tstamp_col', 'tstamp_col', lower_output=True) %}
{%- set lower_limit_9, upper_limit_9 = snowplow_utils.return_limits_from_model(ref('dummy_model_standard'), 'tstamp_col', 'tstamp_col', lower_output=False) %}

WITH input_1 AS (
SELECT {{ lower_limit_1 }} as lower_limit, {{ upper_limit_1 }} AS upper_limit
)

, input_2 AS (
SELECT {{ lower_limit_2 }} as lower_limit, {{ upper_limit_2 }} AS upper_limit
)

, input_3 AS (
SELECT {{ lower_limit_3 }} as lower_limit, {{ upper_limit_3 }} AS upper_limit
)

, input_4 AS (
SELECT {{ lower_limit_4 }} as lower_limit, {{ upper_limit_4 }} AS upper_limit
)

, input_5 AS (
SELECT {{ lower_limit_5 }} as lower_limit, {{ upper_limit_5 }} AS upper_limit
)

, input_6 AS (
SELECT {{ lower_limit_6 }} as lower_limit, {{ upper_limit_6 }} AS upper_limit
)

, input_7 AS (
SELECT {{ lower_limit_7 }} as lower_limit, {{ upper_limit_7 }} AS upper_limit
)

, input_8 AS (
SELECT {{ lower_limit_8 }} as lower_limit, {{ upper_limit_8 }} AS upper_limit
)

, input_9 AS (
SELECT {{ lower_limit_9 }} as lower_limit, {{ upper_limit_9 }} AS upper_limit
)

SELECT 'dummy_model_empty' AS test_case, lower_limit, upper_limit FROM input_1
UNION ALL
SELECT 'dummy_model_only_nulls' AS test_case, lower_limit, upper_limit FROM input_2
UNION ALL
SELECT 'dummy_model_standard' AS test_case, lower_limit, upper_limit FROM input_3
UNION ALL
SELECT 'dummy_model_empty_with_lower_output_true' AS test_case, lower_limit, upper_limit FROM input_4
UNION ALL
SELECT 'dummy_model_empty_with_lower_output_false' AS test_case, lower_limit, upper_limit FROM input_5
UNION ALL
SELECT 'dummy_model_only_nulls_with_lower_output_true' AS test_case, lower_limit, upper_limit FROM input_6
UNION ALL
SELECT 'dummy_model_only_nulls_with_lower_output_false' AS test_case, lower_limit, upper_limit FROM input_7
UNION ALL
SELECT 'dummy_model_standard_with_lower_output_true' AS test_case, lower_limit, upper_limit FROM input_8
UNION ALL
SELECT 'dummy_model_standard_with_lower_output_false' AS test_case, lower_limit, upper_limit FROM input_9
1 change: 0 additions & 1 deletion integration_tests/models/utils/cross_db/cross_db.yml
@@ -21,4 +21,3 @@ models:
tests:
- dbt_utils.equality:
compare_model: ref('expected_indexed_unnest')
exclude_columns: ['source_index']
@@ -4,6 +4,8 @@ This program is licensed to you under the Snowplow Personal and Academic License
and you may not use this file except in compliance with the Snowplow Personal and Academic License Version 1.0.
You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 at https://docs.snowplow.io/personal-and-academic-license-1.0/
#}

with prep as (
select
'string_def_colon_false_false' as test_type,
'a' as element,
@@ -106,3 +108,8 @@
'int_def_colon_false_true' as test_type,
'1' as element,
4 as source_index

)

select test_type, element
from prep
@@ -10,5 +10,10 @@ with data as (
select * from {{ ref('data_indexed_unnest')}}
)

, expected as (

{{ snowplow_utils.unnest('test_type', 'test_array', 'element', 'data', with_index=true) }}

)

select test_type, element from expected
4 changes: 2 additions & 2 deletions macros/utils/return_limits_from_model.sql
@@ -36,8 +36,8 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0
{% do exceptions.warn("Snowplow Warning: No data in "~this~" for date range from variables, please modify your run variables to include data if this is not expected.") %}
{{ snowplow_utils.log_message("Snowplow Warning: *************") }}
{% if lower_output %}
{% set lower_limit = snowplow_utils.cast_to_tstamp('0000-01-01 00:00:00') %}
{% set upper_limit = snowplow_utils.cast_to_tstamp('0000-01-02 00:00:00') %}
{% set lower_limit = snowplow_utils.cast_to_tstamp('1970-01-01 00:00:00') %}
{% set upper_limit = snowplow_utils.cast_to_tstamp('1970-01-02 00:00:00') %}
{%- else -%}
{# Default behaviour for incrementalization. This allows for bigquery to still run the same way the other warehouses do, but also ensures no data is processed #}
{% set lower_limit = snowplow_utils.cast_to_tstamp('9999-01-01 00:00:00') %}
22 changes: 22 additions & 0 deletions unit_tests/unit_tests.sh
@@ -0,0 +1,22 @@
#!/bin/bash

# Expected input:
# -d (database) target database for dbt. Set to 'all' to test all supported databases.

while getopts 'd:' opt
do
case $opt in
d) DATABASE=$OPTARG ;;
esac
done

declare -a SUPPORTED_DATABASES=("bigquery" "databricks" "postgres" "redshift" "snowflake" "spark_iceberg")

# set to lower case
DATABASE="$(echo $DATABASE | tr '[:upper:]' '[:lower:]')"

if [[ $DATABASE == "all" ]]; then
DATABASES=( "${SUPPORTED_DATABASES[@]}" )
else
DATABASES=("$DATABASE")
fi
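A hedged usage sketch for this standalone script, which as shown parses only the target database flag (the test invocations themselves live in integration_tests/.scripts/unit_tests.sh):

```bash
# Hypothetical invocation; 'all' expands to every supported warehouse
./unit_tests/unit_tests.sh -d all
```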