diff --git a/terraform/gcp/pipeline/default/README.md b/terraform/gcp/pipeline/default/README.md index b466426..661ca7f 100644 --- a/terraform/gcp/pipeline/default/README.md +++ b/terraform/gcp/pipeline/default/README.md @@ -10,7 +10,7 @@ | Name | Version | |------|---------| -| [google](#provider\_google) | ~> 3.90.1 | +| [google](#provider\_google) | 3.90.1 | ## Modules @@ -21,12 +21,16 @@ | [bigquery\_loader](#module\_bigquery\_loader) | snowplow-devops/bigquery-loader-pubsub-ce/google | 0.1.0 | | [collector\_lb](#module\_collector\_lb) | snowplow-devops/lb/google | 0.1.0 | | [collector\_pubsub](#module\_collector\_pubsub) | snowplow-devops/collector-pubsub-ce/google | 0.2.2 | +| [databricks\_loader](#module\_databricks\_loader) | ../../../../../terraform-google-databricks-loader-pubsub-ce | n/a | | [enrich\_pubsub](#module\_enrich\_pubsub) | snowplow-devops/enrich-pubsub-ce/google | 0.1.2 | | [enriched\_topic](#module\_enriched\_topic) | snowplow-devops/pubsub-topic/google | 0.1.0 | | [postgres\_db](#module\_postgres\_db) | snowplow-devops/cloud-sql/google | 0.1.1 | | [postgres\_loader\_bad](#module\_postgres\_loader\_bad) | snowplow-devops/postgres-loader-pubsub-ce/google | 0.2.1 | | [postgres\_loader\_enriched](#module\_postgres\_loader\_enriched) | snowplow-devops/postgres-loader-pubsub-ce/google | 0.2.1 | | [raw\_topic](#module\_raw\_topic) | snowplow-devops/pubsub-topic/google | 0.1.0 | +| [snowflake\_loader](#module\_snowflake\_loader) | ../../../../../terraform-google-snowflake-loader-pubsub-ce | n/a | +| [transformed\_topic](#module\_transformed\_topic) | snowplow-devops/pubsub-topic/google | 0.1.0 | +| [transformer\_pubsub\_enriched](#module\_transformer\_pubsub\_enriched) | ../../../../../terraform-google-transformer-pubsub-ce | n/a | ## Resources @@ -34,32 +38,46 @@ |------|------| | [google_bigquery_dataset.bigquery_db](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/bigquery_dataset) | resource | | [google_storage_bucket.bq_loader_dead_letter_bucket](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket) | resource | +| [google_storage_bucket.transformer_bucket](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket) | resource | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [iglu\_server\_dns\_name](#input\_iglu\_server\_dns\_name) | The DNS name of your Iglu Server | `string` | n/a | yes | -| [iglu\_super\_api\_key](#input\_iglu\_super\_api\_key) | A UUIDv4 string to use as the master API key for Iglu Server management | `string` | n/a | yes | -| [network](#input\_network) | The name of the network to deploy within | `string` | n/a | yes | -| [postgres\_db\_name](#input\_postgres\_db\_name) | The name of the database to connect to | `string` | n/a | yes | -| [postgres\_db\_password](#input\_postgres\_db\_password) | The password to use to connect to the database | `string` | n/a | yes | -| [postgres\_db\_username](#input\_postgres\_db\_username) | The username to use to connect to the database | `string` | n/a | yes | -| [prefix](#input\_prefix) | Will be prefixed to all resource names. 
Use to easily identify the resources created | `string` | n/a | yes | -| [project\_id](#input\_project\_id) | The project ID in which the stack is being deployed | `string` | n/a | yes | -| [region](#input\_region) | The name of the region to deploy within | `string` | n/a | yes | -| [ssh\_ip\_allowlist](#input\_ssh\_ip\_allowlist) | The list of CIDR ranges to allow SSH traffic from | `list(any)` | n/a | yes | -| [subnetwork](#input\_subnetwork) | The name of the sub-network to deploy within | `string` | n/a | yes | | [bigquery\_db\_enabled](#input\_bigquery\_db\_enabled) | Whether to enable loading into a BigQuery Dataset | `bool` | `false` | no | | [bigquery\_loader\_dead\_letter\_bucket\_deploy](#input\_bigquery\_loader\_dead\_letter\_bucket\_deploy) | Whether this module should create a new bucket with the specified name - if the bucket already exists set this to false | `bool` | `true` | no | | [bigquery\_loader\_dead\_letter\_bucket\_name](#input\_bigquery\_loader\_dead\_letter\_bucket\_name) | The name of the GCS bucket to use for dead-letter output of loader | `string` | `""` | no | +| [iglu\_server\_dns\_name](#input\_iglu\_server\_dns\_name) | The DNS name of your Iglu Server | `string` | n/a | yes | +| [iglu\_super\_api\_key](#input\_iglu\_super\_api\_key) | A UUIDv4 string to use as the master API key for Iglu Server management | `string` | n/a | yes | | [labels](#input\_labels) | The labels to append to the resources in this module | `map(string)` | `{}` | no | +| [network](#input\_network) | The name of the network to deploy within | `string` | n/a | yes | +| [pipeline\_db](#input\_pipeline\_db) | Database used by pipeline | `string` | n/a | yes | | [postgres\_db\_authorized\_networks](#input\_postgres\_db\_authorized\_networks) | The list of CIDR ranges to allow access to the Pipeline Database over |
<pre>list(object({<br>    name = string<br>    value = string<br>  }))</pre> | `[]` | no |
 | [postgres\_db\_enabled](#input\_postgres\_db\_enabled) | Whether to enable loading into a Postgres Database | `bool` | `false` | no |
+| [postgres\_db\_name](#input\_postgres\_db\_name) | The name of the database to connect to | `string` | `""` | no |
+| [postgres\_db\_password](#input\_postgres\_db\_password) | The password to use to connect to the database | `string` | `""` | no |
 | [postgres\_db\_tier](#input\_postgres\_db\_tier) | The instance type to assign to the deployed Cloud SQL instance | `string` | `"db-g1-small"` | no |
+| [postgres\_db\_username](#input\_postgres\_db\_username) | The username to use to connect to the database | `string` | `""` | no |
+| [prefix](#input\_prefix) | Will be prefixed to all resource names. Use to easily identify the resources created | `string` | n/a | yes |
+| [project\_id](#input\_project\_id) | The project ID in which the stack is being deployed | `string` | n/a | yes |
+| [region](#input\_region) | The name of the region to deploy within | `string` | n/a | yes |
+| [snowflake\_account](#input\_snowflake\_account) | Snowflake account to use | `string` | `""` | no |
+| [snowflake\_callback\_iam](#input\_snowflake\_callback\_iam) | Snowflake callback IAM from STORAGE INTEGRATION | `string` | `""` | no |
+| [snowflake\_database](#input\_snowflake\_database) | Snowflake database name | `string` | `""` | no |
+| [snowflake\_loader\_password](#input\_snowflake\_loader\_password) | The password to use for the loader user | `string` | `""` | no |
+| [snowflake\_loader\_role](#input\_snowflake\_loader\_role) | Snowflake role for loading snowplow data | `string` | `""` | no |
+| [snowflake\_loader\_user](#input\_snowflake\_loader\_user) | The Snowflake user used by Snowflake Loader | `string` | `""` | no |
+| [snowflake\_region](#input\_snowflake\_region) | Region of Snowflake account | `string` | `""` | no |
+| [snowflake\_schema](#input\_snowflake\_schema) | Snowflake schema name | `string` | `""` | no |
+| [snowflake\_transformed\_stage\_name](#input\_snowflake\_transformed\_stage\_name) | Name of transformed stage | `string` | `""` | no |
+| [snowflake\_warehouse](#input\_snowflake\_warehouse) | Snowflake warehouse name | `string` | `""` | no |
 | [ssh\_ip\_allowlist](#input\_ssh\_ip\_allowlist) | The list of CIDR ranges to allow SSH traffic from | `list(any)` | n/a | yes |
 | [ssh\_key\_pairs](#input\_ssh\_key\_pairs) | The list of SSH key-pairs to add to the servers | <pre>list(object({<br>    user_name = string<br>    public_key = string<br>  }))</pre> | `[]` | no |
 | [ssl\_information](#input\_ssl\_information) | The ID of a Google Managed certificate to bind to the load balancer | <pre>object({<br>    enabled = bool<br>    certificate_id = string<br>  })</pre> | <pre>{<br>  "certificate_id": "",<br>  "enabled": false<br>}</pre> | no |
+| [subnetwork](#input\_subnetwork) | The name of the sub-network to deploy within | `string` | n/a | yes |
 | [telemetry\_enabled](#input\_telemetry\_enabled) | Whether or not to send telemetry information back to Snowplow Analytics Ltd | `bool` | `true` | no |
+| [transformer\_bucket\_name](#input\_transformer\_bucket\_name) | Transformer bucket name, prefixed with the prefix value | `string` | `"qs-transformed"` | no |
+| [transformer\_window\_period\_min](#input\_transformer\_window\_period\_min) | Frequency in minutes at which to emit the transforming finished message (e.g. 5,10,15,20,30,60) | `number` | `5` | no |
 | [user\_provided\_id](#input\_user\_provided\_id) | An optional unique identifier to identify the telemetry events emitted by this stack | `string` | `""` | no |
 
 ## Outputs
@@ -71,4 +89,6 @@
 | [bq\_loader\_dead\_letter\_bucket\_name](#output\_bq\_loader\_dead\_letter\_bucket\_name) | The name of the GCS bucket for dead letter events emitted from the BigQuery loader |
 | [collector\_ip\_address](#output\_collector\_ip\_address) | The IP address for the Pipeline Collector |
 | [postgres\_db\_ip\_address](#output\_postgres\_db\_ip\_address) | The IP address of the database where your data is being streamed |
-| [postgres\_db\_port](#output\_postgres\_db\_port) | The port of the database where your data is being streamed |
\ No newline at end of file
+| [postgres\_db\_port](#output\_postgres\_db\_port) | The port of the database where your data is being streamed |
+| [transformer\_bucket\_name](#output\_transformer\_bucket\_name) | The name of the GCS bucket for transformed events emitted from the Transformer |
+| [transformer\_topic\_name](#output\_transformer\_topic\_name) | The name of the topic for transformed rows emitted from the Transformer |
diff --git a/terraform/gcp/pipeline/default/bigquery.terraform.tfvars b/terraform/gcp/pipeline/default/bigquery.terraform.tfvars
index 979eec7..1d38391 100644
--- a/terraform/gcp/pipeline/default/bigquery.terraform.tfvars
+++ b/terraform/gcp/pipeline/default/bigquery.terraform.tfvars
@@ -35,7 +35,7 @@ iglu_server_dns_name = "http://CHANGE-TO-MY-IGLU-IP"
 iglu_super_api_key = "00000000-0000-0000-0000-000000000000"
 
 # --- Snowplow BigQuery Loader
-bigquery_db_enabled = true
+pipeline_db = "bigquery"
 # To use an existing bucket set this to false
 bigquery_loader_dead_letter_bucket_deploy = true
 # Must be globally unique so will need to be updated before applying
diff --git a/terraform/gcp/pipeline/default/databricks.terraform.tfvars b/terraform/gcp/pipeline/default/databricks.terraform.tfvars
new file mode 100644
index 0000000..309705a
--- /dev/null
+++ b/terraform/gcp/pipeline/default/databricks.terraform.tfvars
@@ -0,0 +1,61 @@
+# Will be prefixed to all resource names
+# Use this to easily identify the resources created and provide entropy for subsequent environments
+prefix = "sp"
+
+# The project to deploy the infrastructure into
+project_id = "PROJECT_ID_TO_DEPLOY_INTO"
+
+# Where to deploy the infrastructure
+region = "REGION_TO_DEPLOY_INTO"
+
+# --- Default Network
+# Update to the network you would like to deploy into
+#
+# Note: If you opt to use your own network then you will need to define a subnetwork to deploy into as well
+network = "default"
+subnetwork = ""
+
+# --- SSH
+# Update this to your IP Address
+ssh_ip_allowlist = ["999.999.999.999/32"]
+# Generate a new SSH key locally with `ssh-keygen`
+# ssh-keygen -t rsa -b 4096
+ssh_key_pairs = [
+  {
+    user_name = "snowplow"
+    public_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQA0jSi9//bRsHW4M6czodTs6smCXsxZ0gijzth0aBmycE= snowplow@Snowplows-MacBook-Pro.local"
+  }
+]
+
+# --- Iglu Server Configuration
+# Iglu Server DNS output from the Iglu Server stack
+iglu_server_dns_name = "http://CHANGE-TO-MY-IGLU-IP"
+# Used for API actions on the Iglu Server
+# Change this to the same UUID from when you created the Iglu Server
+iglu_super_api_key = "00000000-0000-0000-0000-000000000000"
+
+# --- Snowplow Databricks Loader
+pipeline_db = "databricks"
+deltalake_catalog = "DB_CATALOG"
+deltalake_schema = "DB_SCHEMA"
+deltalake_host = "DB_HOST"
+deltalake_port = "DB_PORT"
+deltalake_http_path = "DB_HTTP_PATH"
+deltalake_auth_token = "DB_AUTH_TOKEN"
+transformer_window_period_min = 10
+transformer_bucket_name = "transformer-bucket"
+databricks_callback_iam = "DB_CALLBACK_IAM"
+
+# See for more information: https://registry.terraform.io/modules/snowplow-devops/collector-pubsub-ce/google/latest#telemetry
+# Telemetry principles: https://docs.snowplowanalytics.com/docs/open-source-quick-start/what-is-the-quick-start-for-open-source/telemetry-principles/
+user_provided_id = ""
+telemetry_enabled = true
+
+# --- SSL Configuration (optional)
+ssl_information = {
+  certificate_id = ""
+  enabled        = false
+}
+
+# --- Extra Labels to append to created resources (optional)
+labels = {}
diff --git a/terraform/gcp/pipeline/default/main.tf b/terraform/gcp/pipeline/default/main.tf
index 9035efc..621c782 100644
--- a/terraform/gcp/pipeline/default/main.tf
+++ b/terraform/gcp/pipeline/default/main.tf
@@ -8,6 +8,42 @@ locals {
       vendor_prefixes = []
     }
   ]
+
+  bigquery_enabled = (
+    var.pipeline_db == "bigquery"
+    # the dead-letter bucket may already exist, so only the bucket name is required
+    && var.bigquery_loader_dead_letter_bucket_name != ""
+  )
+
+  snowflake_enabled = (
+    var.pipeline_db == "snowflake"
+    && var.snowflake_account != ""
+    && var.snowflake_region != ""
+    && var.snowflake_loader_user != ""
+    && var.snowflake_loader_password != ""
+    && var.snowflake_database != ""
+    && var.snowflake_schema != ""
+    && var.snowflake_loader_role != ""
+    && var.snowflake_warehouse != ""
+    && var.snowflake_transformed_stage_name != ""
+  )
+
+  databricks_enabled = (
+    var.pipeline_db == "databricks"
+    && var.deltalake_catalog != ""
+    && var.deltalake_schema != ""
+    && var.deltalake_host != ""
+    && var.deltalake_port != ""
+    && var.deltalake_http_path != ""
+    && var.deltalake_auth_token != ""
+  )
+
+  postgres_enabled = (
+    var.pipeline_db == "postgres"
+    && var.postgres_db_name != ""
+    && var.postgres_db_username != ""
+    && var.postgres_db_password != ""
+  )
 }
 
 provider "google" {
@@ -43,6 +79,15 @@
   labels = var.labels
 }
 
+module "transformed_topic" {
+  source = "snowplow-devops/pubsub-topic/google"
+  version = "0.1.0"
+
+  name = "${var.prefix}-transformed-topic"
+
+  labels = var.labels
+}
+
 # 2. Deploy Collector stack
 module "collector_pubsub" {
   source = "snowplow-devops/collector-pubsub-ce/google"
@@ -113,7 +158,7 @@ module "postgres_db" {
   source = "snowplow-devops/cloud-sql/google"
   version = "0.1.1"
 
-  count = var.postgres_db_enabled ? 1 : 0
+  count = local.postgres_enabled ? 1 : 0
 
   name = "${var.prefix}-postgres-db"
@@ -133,7 +178,7 @@ module "postgres_loader_enriched" {
   source = "snowplow-devops/postgres-loader-pubsub-ce/google"
   version = "0.2.1"
 
-  count = var.postgres_db_enabled ? 1 : 0
+  count = local.postgres_enabled ? 
1 : 0 name = "${var.prefix}-pg-loader-enriched-server" @@ -168,7 +213,7 @@ module "postgres_loader_bad" { source = "snowplow-devops/postgres-loader-pubsub-ce/google" version = "0.2.1" - count = var.postgres_db_enabled ? 1 : 0 + count = local.postgres_enabled ? 1 : 0 name = "${var.prefix}-pg-loader-bad-server" @@ -204,7 +249,7 @@ module "bad_rows_topic" { source = "snowplow-devops/pubsub-topic/google" version = "0.1.0" - count = var.bigquery_db_enabled ? 1 : 0 + count = local.bigquery_enabled ? 1 : 0 name = "${var.prefix}-bq-bad-rows-topic" @@ -212,7 +257,7 @@ module "bad_rows_topic" { } resource "google_bigquery_dataset" "bigquery_db" { - count = var.bigquery_db_enabled ? 1 : 0 + count = local.bigquery_enabled ? 1 : 0 dataset_id = replace("${var.prefix}_pipeline_db", "-", "_") location = var.region @@ -221,7 +266,7 @@ resource "google_bigquery_dataset" "bigquery_db" { } resource "google_storage_bucket" "bq_loader_dead_letter_bucket" { - count = var.bigquery_db_enabled && var.bigquery_loader_dead_letter_bucket_deploy ? 1 : 0 + count = local.bigquery_enabled && var.bigquery_loader_dead_letter_bucket_deploy ? 1 : 0 name = var.bigquery_loader_dead_letter_bucket_name location = var.region @@ -231,19 +276,19 @@ resource "google_storage_bucket" "bq_loader_dead_letter_bucket" { } locals { - bq_loader_dead_letter_bucket_name = coalesce( + bq_loader_dead_letter_bucket_name = local.bigquery_enabled ? coalesce( join("", google_storage_bucket.bq_loader_dead_letter_bucket.*.name), var.bigquery_loader_dead_letter_bucket_name, - ) + ) : "" } module "bigquery_loader" { source = "snowplow-devops/bigquery-loader-pubsub-ce/google" version = "0.1.0" - count = var.bigquery_db_enabled ? 1 : 0 + count = local.bigquery_enabled ? 1 : 0 - name = "${var.prefix}-bq-loader-server" + name = "${var.prefix}-bq-loader" network = var.network subnetwork = var.subnetwork @@ -266,3 +311,111 @@ module "bigquery_loader" { labels = var.labels } + +# 6. Deploy Transformer and Snowflake/Databricks loader +resource "google_storage_bucket" "transformer_bucket" { + count = (local.snowflake_enabled || local.databricks_enabled) ? 1 : 0 + + name = "${var.prefix}-${var.transformer_bucket_name}" + location = var.region + force_destroy = true + + labels = var.labels +} + +module "transformer_pubsub_enriched" { + # source = "snowplow-devops/transformer-pubsub-ce/google" + # version = "0.1.0" + source = "../../../../../terraform-google-transformer-pubsub-ce" + + count = (local.snowflake_enabled || local.databricks_enabled) ? 1 : 0 + + network = var.network + subnetwork = var.subnetwork + region = var.region + project_id = var.project_id + + input_topic_name = module.enriched_topic.name + message_queue_topic_name = module.transformed_topic.name + + name = "${var.prefix}-transformer" + ssh_key_pairs = var.ssh_key_pairs + ssh_ip_allowlist = var.ssh_ip_allowlist + transformation_type = "widerow" + widerow_file_format = local.snowflake_enabled ? "json" : "parquet" + custom_iglu_resolvers = local.custom_iglu_resolvers + telemetry_enabled = var.telemetry_enabled + user_provided_id = var.user_provided_id + transformer_output = google_storage_bucket.transformer_bucket[0].name + + labels = var.labels +} + +module "snowflake_loader" { + # source = "snowplow-devops/snowflake-loader-google-ce/gcp" + # version = "0.1.0" + source = "../../../../../terraform-google-snowflake-loader-pubsub-ce" + + count = local.snowflake_enabled ? 
1 : 0
+
+  network = var.network
+  subnetwork = var.subnetwork
+  region = var.region
+  project_id = var.project_id
+
+  name = "${var.prefix}-snowflake"
+  ssh_key_pairs = var.ssh_key_pairs
+  input_topic_name = module.transformed_topic.name
+  ssh_ip_allowlist = var.ssh_ip_allowlist
+  snowflake_region = var.snowflake_region
+  snowflake_account = var.snowflake_account
+  snowflake_loader_user = var.snowflake_loader_user
+  snowflake_password = var.snowflake_loader_password
+  snowflake_database = var.snowflake_database
+  snowflake_schema = var.snowflake_schema
+  snowflake_loader_role = var.snowflake_loader_role
+  snowflake_warehouse = var.snowflake_warehouse
+  snowflake_transformed_stage_name = var.snowflake_transformed_stage_name
+  snowflake_folder_monitoring_stage_url = ""
+  snowflake_callback_iam = var.snowflake_callback_iam
+  telemetry_enabled = var.telemetry_enabled
+  user_provided_id = var.user_provided_id
+  custom_iglu_resolvers = local.custom_iglu_resolvers
+
+  transformer_output = google_storage_bucket.transformer_bucket[0].name
+
+  labels = var.labels
+}
+
+module "databricks_loader" {
+  # source = "snowplow-devops/databricks-loader-google-ce/gcp"
+  # version = "0.1.0"
+  source = "../../../../../terraform-google-databricks-loader-pubsub-ce"
+
+  count = local.databricks_enabled ? 1 : 0
+
+  network = var.network
+  subnetwork = var.subnetwork
+  region = var.region
+  project_id = var.project_id
+
+  name = "${var.prefix}-databricks"
+  ssh_key_pairs = var.ssh_key_pairs
+  input_topic_name = module.transformed_topic.name
+  ssh_ip_allowlist = var.ssh_ip_allowlist
+  deltalake_catalog = var.deltalake_catalog
+  deltalake_schema = var.deltalake_schema
+  deltalake_host = var.deltalake_host
+  deltalake_port = var.deltalake_port
+  deltalake_http_path = var.deltalake_http_path
+  deltalake_auth_token = var.deltalake_auth_token
+  databricks_callback_iam = var.databricks_callback_iam
+  databricks_folder_monitoring_stage_url = ""
+  telemetry_enabled = var.telemetry_enabled
+  user_provided_id = var.user_provided_id
+  custom_iglu_resolvers = local.custom_iglu_resolvers
+
+  transformer_output = google_storage_bucket.transformer_bucket[0].name
+
+  labels = var.labels
+}
diff --git a/terraform/gcp/pipeline/default/outputs.tf b/terraform/gcp/pipeline/default/outputs.tf
index 034f7a9..3fd253b 100644
--- a/terraform/gcp/pipeline/default/outputs.tf
+++ b/terraform/gcp/pipeline/default/outputs.tf
@@ -27,3 +27,13 @@ output "bq_loader_bad_rows_topic_name" {
   description = "The name of the topic for bad rows emitted from the BigQuery loader"
   value = join("", module.bad_rows_topic.*.name)
 }
+
+output "transformer_topic_name" {
+  description = "The name of the topic for transformed rows emitted from the Transformer"
+  value = join("", module.transformed_topic.*.name)
+}
+
+output "transformer_bucket_name" {
+  description = "The name of the GCS bucket for transformed events emitted from the Transformer"
+  value = join("", google_storage_bucket.transformer_bucket.*.name)
+}
diff --git a/terraform/gcp/pipeline/default/postgres.terraform.tfvars b/terraform/gcp/pipeline/default/postgres.terraform.tfvars
index 7ca15d7..24c6ea7 100644
--- a/terraform/gcp/pipeline/default/postgres.terraform.tfvars
+++ b/terraform/gcp/pipeline/default/postgres.terraform.tfvars
@@ -35,7 +35,7 @@ iglu_server_dns_name = "http://CHANGE-TO-MY-IGLU-IP"
 iglu_super_api_key = "00000000-0000-0000-0000-000000000000"
 
 # --- Snowplow Postgres Loader
-postgres_db_enabled = true
+pipeline_db = "postgres"
 postgres_db_name = "snowplow"
 postgres_db_username = 
"snowplow" diff --git a/terraform/gcp/pipeline/default/snowflake.terraform.tfvars b/terraform/gcp/pipeline/default/snowflake.terraform.tfvars new file mode 100644 index 0000000..08ed1bf --- /dev/null +++ b/terraform/gcp/pipeline/default/snowflake.terraform.tfvars @@ -0,0 +1,65 @@ +# Will be prefixed to all resource names +# Use this to easily identify the resources created and provide entropy for subsequent environments +prefix = "sp" + +# The project to deploy the infrastructure into +project_id = "PROJECT_ID_TO_DEPLOY_INTO" + +# Where to deploy the infrastructure +region = "REGION_TO_DEPLOY_INTO" + +# --- Default Network +# Update to the network you would like to deploy into +# +# Note: If you opt to use your own network then you will need to define a subnetwork to deploy into as well +network = "default" +subnetwork = "" + +# --- SSH +# Update this to your IP Address +ssh_ip_allowlist = ["999.999.999.999/32"] +# Generate a new SSH key locally with `ssh-keygen` +# ssh-keygen -t rsa -b 4096 +ssh_key_pairs = [ + { + user_name = "snowplow" + public_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQA0jSi9//bRsHW4M6czodTs6smCXsxZ0gijzth0aBmycE= snowplow@Snowplows-MacBook-Pro.local" + } +] + +# --- Iglu Server Configuration +# Iglu Server DNS output from the Iglu Server stack +iglu_server_dns_name = "http://CHANGE-TO-MY-IGLU-IP" +# Used for API actions on the Iglu Server +# Change this to the same UUID from when you created the Iglu Server +iglu_super_api_key = "00000000-0000-0000-0000-000000000000" + +# --- Snowplow Snowflake Loader +pipeline_db = "snowflake" +snowflake_account = "sf_account" +snowflake_region = "us-west-2" +# Change and keep this secret! +snowflake_loader_password = "Hell0W0rld!2" +snowflake_database = "SF_DB_NAME" +snowflake_loader_role = "SF_LOADER_ROLE" +snowflake_loader_user = "SF_LOADER_USER" +snowflake_schema = "ATOMIC" +snowflake_transformed_stage_name = "SF_TRANSFORMED_STAGE" +snowflake_warehouse = "SF_WAREHOUSE" +transformer_window_period_min = 10 +transformer_bucket_name = "transformer-bucket" +snowflake_callback_iam = "SF_CALLBACK_IAM" + +# See for more information: https://registry.terraform.io/modules/snowplow-devops/collector-pubsub-ce/google/latest#telemetry +# Telemetry principles: https://docs.snowplowanalytics.com/docs/open-source-quick-start/what-is-the-quick-start-for-open-source/telemetry-principles/ +user_provided_id = "" +telemetry_enabled = true + +# --- SSL Configuration (optional) +ssl_information = { + certificate_id = "" + enabled = false +} + +# --- Extra Labels to append to created resources (optional) +labels = {} diff --git a/terraform/gcp/pipeline/default/variables.tf b/terraform/gcp/pipeline/default/variables.tf index 57924f9..25e9294 100644 --- a/terraform/gcp/pipeline/default/variables.tf +++ b/terraform/gcp/pipeline/default/variables.tf @@ -48,6 +48,16 @@ variable "iglu_super_api_key" { sensitive = true } +variable "pipeline_db" { + type = string + description = "Database used by pipeline" + + validation { + condition = can(regex("^(postgres|bigquery|snowflake|databricks)$", var.pipeline_db)) + error_message = "Must be postgres or bigquery or snowflake or databricks." 
+  }
+}
+
 variable "postgres_db_enabled" {
   description = "Whether to enable loading into a Postgres Database"
   default = false
@@ -57,17 +67,20 @@ variable "postgres_db_enabled" {
 variable "postgres_db_name" {
   description = "The name of the database to connect to"
   type = string
+  default = ""
 }
 
 variable "postgres_db_username" {
   description = "The username to use to connect to the database"
   type = string
+  default = ""
 }
 
 variable "postgres_db_password" {
   description = "The password to use to connect to the database"
   type = string
   sensitive = true
+  default = ""
 }
 
 variable "postgres_db_authorized_networks" {
@@ -132,3 +145,119 @@ variable "labels" {
   default = {}
   type = map(string)
 }
+
+variable "snowflake_account" {
+  description = "Snowflake account to use"
+  type = string
+  default = ""
+}
+
+variable "snowflake_region" {
+  description = "Region of Snowflake account"
+  type = string
+  default = ""
+}
+
+variable "snowflake_loader_password" {
+  description = "The password to use for the loader user"
+  type = string
+  sensitive = true
+  default = ""
+}
+
+variable "snowflake_loader_user" {
+  description = "The Snowflake user used by Snowflake Loader"
+  type = string
+  default = ""
+}
+
+variable "snowflake_database" {
+  description = "Snowflake database name"
+  type = string
+  default = ""
+}
+
+variable "snowflake_schema" {
+  description = "Snowflake schema name"
+  type = string
+  default = ""
+}
+
+variable "snowflake_loader_role" {
+  description = "Snowflake role for loading snowplow data"
+  type = string
+  default = ""
+}
+
+variable "snowflake_warehouse" {
+  description = "Snowflake warehouse name"
+  type = string
+  default = ""
+}
+
+variable "snowflake_transformed_stage_name" {
+  description = "Name of transformed stage"
+  type = string
+  default = ""
+}
+
+variable "snowflake_callback_iam" {
+  description = "Snowflake callback IAM from STORAGE INTEGRATION"
+  type = string
+  default = ""
+}
+
+variable "transformer_window_period_min" {
+  description = "Frequency in minutes at which to emit the transforming finished message (e.g. 5,10,15,20,30,60)"
+  type = number
+  default = 5
+}
+
+variable "transformer_bucket_name" {
+  description = "Transformer bucket name, prefixed with the prefix value"
+  type = string
+  default = "qs-transformed"
+}
+
+variable "deltalake_catalog" {
+  description = "Databricks deltalake catalog"
+  type = string
+  default = "hive_metastore"
+}
+
+variable "deltalake_schema" {
+  description = "Databricks deltalake schema"
+  type = string
+  default = ""
+}
+
+variable "deltalake_host" {
+  description = "Databricks deltalake host"
+  type = string
+  default = ""
+}
+
+variable "deltalake_port" {
+  description = "Databricks deltalake port"
+  type = string
+  default = ""
+}
+
+variable "deltalake_http_path" {
+  description = "Databricks deltalake http path"
+  type = string
+  default = ""
+}
+
+variable "deltalake_auth_token" {
+  description = "Databricks deltalake auth token"
+  type = string
+  default = ""
+  sensitive = true
+}
+
+variable "databricks_callback_iam" {
+  description = "Databricks callback IAM to allow access to GCS bucket"
+  type = string
+  default = ""
+}
diff --git a/terraform/gcp/pipeline/secure/README.md b/terraform/gcp/pipeline/secure/README.md
index b466426..a805994 100644
--- a/terraform/gcp/pipeline/secure/README.md
+++ b/terraform/gcp/pipeline/secure/README.md
@@ -21,12 +21,16 @@
 | [bigquery\_loader](#module\_bigquery\_loader) | snowplow-devops/bigquery-loader-pubsub-ce/google | 0.1.0 |
 | [collector\_lb](#module\_collector\_lb) | snowplow-devops/lb/google | 0.1.0 |
 | 
[collector\_pubsub](#module\_collector\_pubsub) | snowplow-devops/collector-pubsub-ce/google | 0.2.2 | +| [databricks\_loader](#module\_databricks\_loader) | ../../../../../terraform-google-databricks-loader-pubsub-ce | n/a | | [enrich\_pubsub](#module\_enrich\_pubsub) | snowplow-devops/enrich-pubsub-ce/google | 0.1.2 | | [enriched\_topic](#module\_enriched\_topic) | snowplow-devops/pubsub-topic/google | 0.1.0 | | [postgres\_db](#module\_postgres\_db) | snowplow-devops/cloud-sql/google | 0.1.1 | | [postgres\_loader\_bad](#module\_postgres\_loader\_bad) | snowplow-devops/postgres-loader-pubsub-ce/google | 0.2.1 | | [postgres\_loader\_enriched](#module\_postgres\_loader\_enriched) | snowplow-devops/postgres-loader-pubsub-ce/google | 0.2.1 | | [raw\_topic](#module\_raw\_topic) | snowplow-devops/pubsub-topic/google | 0.1.0 | +| [snowflake\_loader](#module\_snowflake\_loader) | ../../../../../terraform-google-snowflake-loader-pubsub-ce | n/a | +| [transformed\_topic](#module\_transformed\_topic) | snowplow-devops/pubsub-topic/google | 0.1.0 | +| [transformer\_pubsub\_enriched](#module\_transformer\_pubsub\_enriched) | ../../../../../terraform-google-transformer-pubsub-ce | n/a | ## Resources @@ -34,32 +38,52 @@ |------|------| | [google_bigquery_dataset.bigquery_db](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/bigquery_dataset) | resource | | [google_storage_bucket.bq_loader_dead_letter_bucket](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket) | resource | +| [google_storage_bucket.transformer_bucket](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket) | resource | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [bigquery\_db\_enabled](#input\_bigquery\_db\_enabled) | Whether to enable loading into a BigQuery Dataset | `bool` | `false` | no | +| [bigquery\_loader\_dead\_letter\_bucket\_deploy](#input\_bigquery\_loader\_dead\_letter\_bucket\_deploy) | Whether this module should create a new bucket with the specified name - if the bucket already exists set this to false | `bool` | `true` | no | +| [bigquery\_loader\_dead\_letter\_bucket\_name](#input\_bigquery\_loader\_dead\_letter\_bucket\_name) | The name of the GCS bucket to use for dead-letter output of loader | `string` | `""` | no | +| [databricks\_callback\_iam](#input\_databricks\_callback\_iam) | Databricks callback IAM to allow access to GCS bucket | `string` | n/a | yes | +| [deltalake\_auth\_token](#input\_deltalake\_auth\_token) | Databricks deltalake auth token | `string` | `""` | no | +| [deltalake\_catalog](#input\_deltalake\_catalog) | Databricks deltalake catalog | `string` | `"hive_metastore"` | no | +| [deltalake\_host](#input\_deltalake\_host) | Databricks deltalake host | `string` | `""` | no | +| [deltalake\_http\_path](#input\_deltalake\_http\_path) | Databricks deltalake http path | `string` | `""` | no | +| [deltalake\_port](#input\_deltalake\_port) | Databricks deltalake port | `string` | `""` | no | +| [deltalake\_schema](#input\_deltalake\_schema) | Databricks deltalake schema | `string` | `""` | no | | [iglu\_server\_dns\_name](#input\_iglu\_server\_dns\_name) | The DNS name of your Iglu Server | `string` | n/a | yes | | [iglu\_super\_api\_key](#input\_iglu\_super\_api\_key) | A UUIDv4 string to use as the master API key for Iglu Server management | `string` | n/a | yes | +| [labels](#input\_labels) | The labels to append to the 
resources in this module | `map(string)` | `{}` | no | | [network](#input\_network) | The name of the network to deploy within | `string` | n/a | yes | +| [postgres\_db\_authorized\_networks](#input\_postgres\_db\_authorized\_networks) | The list of CIDR ranges to allow access to the Pipeline Database over |
"certificate_id": "",
"enabled": false
}
list(object({| `[]` | no | +| [postgres\_db\_enabled](#input\_postgres\_db\_enabled) | Whether to enable loading into a Postgres Database | `bool` | `false` | no | | [postgres\_db\_name](#input\_postgres\_db\_name) | The name of the database to connect to | `string` | n/a | yes | | [postgres\_db\_password](#input\_postgres\_db\_password) | The password to use to connect to the database | `string` | n/a | yes | +| [postgres\_db\_tier](#input\_postgres\_db\_tier) | The instance type to assign to the deployed Cloud SQL instance | `string` | `"db-g1-small"` | no | | [postgres\_db\_username](#input\_postgres\_db\_username) | The username to use to connect to the database | `string` | n/a | yes | | [prefix](#input\_prefix) | Will be prefixed to all resource names. Use to easily identify the resources created | `string` | n/a | yes | | [project\_id](#input\_project\_id) | The project ID in which the stack is being deployed | `string` | n/a | yes | | [region](#input\_region) | The name of the region to deploy within | `string` | n/a | yes | +| [snowflake\_account](#input\_snowflake\_account) | Snowflake account to use | `string` | `""` | no | +| [snowflake\_callback\_iam](#input\_snowflake\_callback\_iam) | Snowflake callback IAM from STORAGE INTEGRATION | `string` | `""` | no | +| [snowflake\_database](#input\_snowflake\_database) | Snowflake database name | `string` | `""` | no | +| [snowflake\_loader\_password](#input\_snowflake\_loader\_password) | The password to use for the loader user | `string` | `""` | no | +| [snowflake\_loader\_role](#input\_snowflake\_loader\_role) | Snowflake role for loading snowplow data | `string` | `""` | no | +| [snowflake\_loader\_user](#input\_snowflake\_loader\_user) | The Snowflake user used by Snowflake Loader | `string` | `""` | no | +| [snowflake\_region](#input\_snowflake\_region) | Region of Snowflake account | `string` | `""` | no | +| [snowflake\_schema](#input\_snowflake\_schema) | Snowflake schema name | `string` | `""` | no | +| [snowflake\_transformed\_stage\_name](#input\_snowflake\_transformed\_stage\_name) | Name of transformed stage | `string` | `""` | no | +| [snowflake\_warehouse](#input\_snowflake\_warehouse) | Snowflake warehouse name | `string` | `""` | no | | [ssh\_ip\_allowlist](#input\_ssh\_ip\_allowlist) | The list of CIDR ranges to allow SSH traffic from | `list(any)` | n/a | yes | -| [subnetwork](#input\_subnetwork) | The name of the sub-network to deploy within | `string` | n/a | yes | -| [bigquery\_db\_enabled](#input\_bigquery\_db\_enabled) | Whether to enable loading into a BigQuery Dataset | `bool` | `false` | no | -| [bigquery\_loader\_dead\_letter\_bucket\_deploy](#input\_bigquery\_loader\_dead\_letter\_bucket\_deploy) | Whether this module should create a new bucket with the specified name - if the bucket already exists set this to false | `bool` | `true` | no | -| [bigquery\_loader\_dead\_letter\_bucket\_name](#input\_bigquery\_loader\_dead\_letter\_bucket\_name) | The name of the GCS bucket to use for dead-letter output of loader | `string` | `""` | no | -| [labels](#input\_labels) | The labels to append to the resources in this module | `map(string)` | `{}` | no | -| [postgres\_db\_authorized\_networks](#input\_postgres\_db\_authorized\_networks) | The list of CIDR ranges to allow access to the Pipeline Database over |
name = string
value = string
}))
list(object({| `[]` | no | -| [postgres\_db\_enabled](#input\_postgres\_db\_enabled) | Whether to enable loading into a Postgres Database | `bool` | `false` | no | -| [postgres\_db\_tier](#input\_postgres\_db\_tier) | The instance type to assign to the deployed Cloud SQL instance | `string` | `"db-g1-small"` | no | | [ssh\_key\_pairs](#input\_ssh\_key\_pairs) | The list of SSH key-pairs to add to the servers |
name = string
value = string
}))
list(object({| `[]` | no | | [ssl\_information](#input\_ssl\_information) | The ID of an Google Managed certificate to bind to the load balancer |
user_name = string
public_key = string
}))
object({|
enabled = bool
certificate_id = string
})
{| no | +| [subnetwork](#input\_subnetwork) | The name of the sub-network to deploy within | `string` | n/a | yes | | [telemetry\_enabled](#input\_telemetry\_enabled) | Whether or not to send telemetry information back to Snowplow Analytics Ltd | `bool` | `true` | no | +| [transformer\_bucket\_name](#input\_transformer\_bucket\_name) | Transformer bucket name, prefixed with the prefix value | `string` | `"qs-transformed"` | no | +| [transformer\_window\_period\_min](#input\_transformer\_window\_period\_min) | Frequency to emit transforming finished message - 5,10,15,20,30,60 etc minutes | `number` | `5` | no | | [user\_provided\_id](#input\_user\_provided\_id) | An optional unique identifier to identify the telemetry events emitted by this stack | `string` | `""` | no | ## Outputs @@ -71,4 +95,4 @@ | [bq\_loader\_dead\_letter\_bucket\_name](#output\_bq\_loader\_dead\_letter\_bucket\_name) | The name of the GCS bucket for dead letter events emitted from the BigQuery loader | | [collector\_ip\_address](#output\_collector\_ip\_address) | The IP address for the Pipeline Collector | | [postgres\_db\_ip\_address](#output\_postgres\_db\_ip\_address) | The IP address of the database where your data is being streamed | -| [postgres\_db\_port](#output\_postgres\_db\_port) | The port of the database where your data is being streamed | \ No newline at end of file +| [postgres\_db\_port](#output\_postgres\_db\_port) | The port of the database where your data is being streamed | diff --git a/terraform/gcp/pipeline/secure/bigquery.terraform.tfvars b/terraform/gcp/pipeline/secure/bigquery.terraform.tfvars index 7331a03..0545e26 100644 --- a/terraform/gcp/pipeline/secure/bigquery.terraform.tfvars +++ b/terraform/gcp/pipeline/secure/bigquery.terraform.tfvars @@ -34,7 +34,7 @@ iglu_server_dns_name = "http://CHANGE-TO-MY-IGLU-IP" iglu_super_api_key = "00000000-0000-0000-0000-000000000000" # --- Snowplow BigQuery Loader -bigquery_db_enabled = true +pipeline_db = "bigquery" # To use an existing bucket set this to false bigquery_loader_dead_letter_bucket_deploy = true # Must be globally unique so will need to be updated before applying diff --git a/terraform/gcp/pipeline/secure/databricks.terraform.tfvars b/terraform/gcp/pipeline/secure/databricks.terraform.tfvars new file mode 100644 index 0000000..9bac0a5 --- /dev/null +++ b/terraform/gcp/pipeline/secure/databricks.terraform.tfvars @@ -0,0 +1,60 @@ +# Will be prefixed to all resource names +# Use this to easily identify the resources created and provide entropy for subsequent environments +prefix = "sp" + +# The project to deploy the infrastructure into +project_id = "PROJECT_ID_TO_DEPLOY_INTO" + +# Where to deploy the infrastructure +region = "REGION_TO_DEPLOY_INTO" + +# --- Network +# NOTE: The network & sub-network configured must be configured with a Cloud NAT to allow the deployed Compute Engine instances to +# connect to the internet to download the required assets +network = "YOUR_NETWORK_HERE" +subnetwork = "YOUR_SUB_NETWORK_HERE" + +# --- SSH +# Update this to your IP Address +ssh_ip_allowlist = ["999.999.999.999/32"] +# Generate a new SSH key locally with `ssh-keygen` +# ssh-keygen -t rsa -b 4096 +ssh_key_pairs = [ + { + user_name = "snowplow" + public_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQA0jSi9//bRsHW4M6czodTs6smCXsxZ0gijzth0aBmycE= snowplow@Snowplows-MacBook-Pro.local" + } +] + +# --- Iglu Server Configuration +# Iglu Server DNS output from the Iglu Server stack +iglu_server_dns_name = "http://CHANGE-TO-MY-IGLU-IP" +# Used for API actions on the 
+# Change this to the same UUID from when you created the Iglu Server
+iglu_super_api_key = "00000000-0000-0000-0000-000000000000"
+
+# --- Snowplow Databricks Loader
+pipeline_db = "databricks"
+deltalake_catalog = "DB_CATALOG"
+deltalake_schema = "DB_SCHEMA"
+deltalake_host = "DB_HOST"
+deltalake_port = "DB_PORT"
+deltalake_http_path = "DB_HTTP_PATH"
+deltalake_auth_token = "DB_AUTH_TOKEN"
+transformer_window_period_min = 10
+transformer_bucket_name = "transformer-bucket"
+databricks_callback_iam = "DB_CALLBACK_IAM"
+
+# See for more information: https://registry.terraform.io/modules/snowplow-devops/collector-pubsub-ce/google/latest#telemetry
+# Telemetry principles: https://docs.snowplowanalytics.com/docs/open-source-quick-start/what-is-the-quick-start-for-open-source/telemetry-principles/
+user_provided_id = ""
+telemetry_enabled = true
+
+# --- SSL Configuration (optional)
+ssl_information = {
+  certificate_id = ""
+  enabled        = false
+}
+
+# --- Extra Labels to append to created resources (optional)
+labels = {}
diff --git a/terraform/gcp/pipeline/secure/main.tf b/terraform/gcp/pipeline/secure/main.tf
index fc0084b..0e126a8 100644
--- a/terraform/gcp/pipeline/secure/main.tf
+++ b/terraform/gcp/pipeline/secure/main.tf
@@ -8,6 +8,42 @@ locals {
       vendor_prefixes = []
     }
   ]
+
+  bigquery_enabled = (
+    var.pipeline_db == "bigquery"
+    # the dead-letter bucket may already exist, so only the bucket name is required
+    && var.bigquery_loader_dead_letter_bucket_name != ""
+  )
+
+  snowflake_enabled = (
+    var.pipeline_db == "snowflake"
+    && var.snowflake_account != ""
+    && var.snowflake_region != ""
+    && var.snowflake_loader_user != ""
+    && var.snowflake_loader_password != ""
+    && var.snowflake_database != ""
+    && var.snowflake_schema != ""
+    && var.snowflake_loader_role != ""
+    && var.snowflake_warehouse != ""
+    && var.snowflake_transformed_stage_name != ""
+  )
+
+  databricks_enabled = (
+    var.pipeline_db == "databricks"
+    && var.deltalake_catalog != ""
+    && var.deltalake_schema != ""
+    && var.deltalake_host != ""
+    && var.deltalake_port != ""
+    && var.deltalake_http_path != ""
+    && var.deltalake_auth_token != ""
+  )
+
+  postgres_enabled = (
+    var.pipeline_db == "postgres"
+    && var.postgres_db_name != ""
+    && var.postgres_db_username != ""
+    && var.postgres_db_password != ""
+  )
 }
 
 provider "google" {
@@ -43,6 +79,15 @@
   labels = var.labels
 }
 
+module "transformed_topic" {
+  source = "snowplow-devops/pubsub-topic/google"
+  version = "0.1.0"
+
+  name = "${var.prefix}-transformed-topic"
+
+  labels = var.labels
+}
+
 # 2. Deploy Collector stack
 module "collector_pubsub" {
   source = "snowplow-devops/collector-pubsub-ce/google"
@@ -117,7 +162,7 @@ module "postgres_db" {
   source = "snowplow-devops/cloud-sql/google"
   version = "0.1.1"
 
-  count = var.postgres_db_enabled ? 1 : 0
+  count = local.postgres_enabled ? 1 : 0
 
   name = "${var.prefix}-postgres-db"
@@ -137,7 +182,7 @@ module "postgres_loader_enriched" {
   source = "snowplow-devops/postgres-loader-pubsub-ce/google"
   version = "0.2.1"
 
-  count = var.postgres_db_enabled ? 1 : 0
+  count = local.postgres_enabled ? 1 : 0
 
   name = "${var.prefix}-pg-loader-enriched-server"
@@ -172,7 +217,7 @@ module "postgres_loader_bad" {
   source = "snowplow-devops/postgres-loader-pubsub-ce/google"
   version = "0.2.1"
 
-  count = var.postgres_db_enabled ? 
1 : 0 name = "${var.prefix}-pg-loader-bad-server" @@ -208,7 +253,7 @@ module "bad_rows_topic" { source = "snowplow-devops/pubsub-topic/google" version = "0.1.0" - count = var.bigquery_db_enabled ? 1 : 0 + count = local.bigquery_enabled ? 1 : 0 name = "${var.prefix}-bq-bad-rows-topic" @@ -216,7 +261,7 @@ module "bad_rows_topic" { } resource "google_bigquery_dataset" "bigquery_db" { - count = var.bigquery_db_enabled ? 1 : 0 + count = local.bigquery_enabled ? 1 : 0 dataset_id = replace("${var.prefix}_snowplow_db", "-", "_") location = var.region @@ -225,7 +270,7 @@ resource "google_bigquery_dataset" "bigquery_db" { } resource "google_storage_bucket" "bq_loader_dead_letter_bucket" { - count = var.bigquery_db_enabled && var.bigquery_loader_dead_letter_bucket_deploy ? 1 : 0 + count = local.bigquery_enabled && var.bigquery_loader_dead_letter_bucket_deploy ? 1 : 0 name = var.bigquery_loader_dead_letter_bucket_name location = var.region @@ -245,7 +290,7 @@ module "bigquery_loader" { source = "snowplow-devops/bigquery-loader-pubsub-ce/google" version = "0.1.0" - count = var.bigquery_db_enabled ? 1 : 0 + count = local.bigquery_enabled ? 1 : 0 name = "${var.prefix}-bq-loader-server" @@ -270,3 +315,111 @@ module "bigquery_loader" { labels = var.labels } + +# 6. Deploy Transformer and Snowflake/Databricks loader +resource "google_storage_bucket" "transformer_bucket" { + count = (local.snowflake_enabled || local.databricks_enabled) ? 1 : 0 + + name = "${var.prefix}-${var.transformer_bucket_name}" + location = var.region + force_destroy = true + + labels = var.labels +} + +module "transformer_pubsub_enriched" { + # source = "snowplow-devops/transformer-pubsub-ce/google" + # version = "0.1.0" + source = "../../../../../terraform-google-transformer-pubsub-ce" + + count = (local.snowflake_enabled || local.databricks_enabled) ? 1 : 0 + + network = var.network + subnetwork = var.subnetwork + region = var.region + project_id = var.project_id + + input_topic_name = module.enriched_topic.name + message_queue_topic_name = module.transformed_topic.name + + name = "${var.prefix}-transformer" + ssh_key_pairs = var.ssh_key_pairs + ssh_ip_allowlist = var.ssh_ip_allowlist + transformation_type = "widerow" + widerow_file_format = local.snowflake_enabled ? "json" : "parquet" + custom_iglu_resolvers = local.custom_iglu_resolvers + telemetry_enabled = var.telemetry_enabled + user_provided_id = var.user_provided_id + transformer_output = google_storage_bucket.transformer_bucket[0].name + + labels = var.labels +} + +module "snowflake_loader" { + # source = "snowplow-devops/snowflake-loader-google-ce/gcp" + # version = "0.1.0" + source = "../../../../../terraform-google-snowflake-loader-pubsub-ce" + + count = local.snowflake_enabled ? 
1 : 0
+
+  network = var.network
+  subnetwork = var.subnetwork
+  region = var.region
+  project_id = var.project_id
+
+  name = "${var.prefix}-snowflake"
+  ssh_key_pairs = var.ssh_key_pairs
+  input_topic_name = module.transformed_topic.name
+  ssh_ip_allowlist = var.ssh_ip_allowlist
+  snowflake_region = var.snowflake_region
+  snowflake_account = var.snowflake_account
+  snowflake_loader_user = var.snowflake_loader_user
+  snowflake_password = var.snowflake_loader_password
+  snowflake_database = var.snowflake_database
+  snowflake_schema = var.snowflake_schema
+  snowflake_loader_role = var.snowflake_loader_role
+  snowflake_warehouse = var.snowflake_warehouse
+  snowflake_transformed_stage_name = var.snowflake_transformed_stage_name
+  snowflake_folder_monitoring_stage_url = ""
+  snowflake_callback_iam = var.snowflake_callback_iam
+  telemetry_enabled = var.telemetry_enabled
+  user_provided_id = var.user_provided_id
+  custom_iglu_resolvers = local.custom_iglu_resolvers
+
+  transformer_output = google_storage_bucket.transformer_bucket[0].name
+
+  labels = var.labels
+}
+
+module "databricks_loader" {
+  # source = "snowplow-devops/databricks-loader-google-ce/gcp"
+  # version = "0.1.0"
+  source = "../../../../../terraform-google-databricks-loader-pubsub-ce"
+
+  count = local.databricks_enabled ? 1 : 0
+
+  network = var.network
+  subnetwork = var.subnetwork
+  region = var.region
+  project_id = var.project_id
+
+  name = "${var.prefix}-databricks"
+  ssh_key_pairs = var.ssh_key_pairs
+  input_topic_name = module.transformed_topic.name
+  ssh_ip_allowlist = var.ssh_ip_allowlist
+  deltalake_catalog = var.deltalake_catalog
+  deltalake_schema = var.deltalake_schema
+  deltalake_host = var.deltalake_host
+  deltalake_port = var.deltalake_port
+  deltalake_http_path = var.deltalake_http_path
+  deltalake_auth_token = var.deltalake_auth_token
+  databricks_callback_iam = var.databricks_callback_iam
+  databricks_folder_monitoring_stage_url = ""
+  telemetry_enabled = var.telemetry_enabled
+  user_provided_id = var.user_provided_id
+  custom_iglu_resolvers = local.custom_iglu_resolvers
+
+  transformer_output = google_storage_bucket.transformer_bucket[0].name
+
+  labels = var.labels
+}
diff --git a/terraform/gcp/pipeline/secure/postgres.terraform.tfvars b/terraform/gcp/pipeline/secure/postgres.terraform.tfvars
index fc9a4a8..a571df8 100644
--- a/terraform/gcp/pipeline/secure/postgres.terraform.tfvars
+++ b/terraform/gcp/pipeline/secure/postgres.terraform.tfvars
@@ -34,7 +34,7 @@ iglu_server_dns_name = "http://CHANGE-TO-MY-IGLU-IP"
 iglu_super_api_key = "00000000-0000-0000-0000-000000000000"
 
 # --- Snowplow Postgres Loader
-postgres_db_enabled = true
+pipeline_db = "postgres"
 postgres_db_name = "snowplow"
 postgres_db_username = "snowplow"
diff --git a/terraform/gcp/pipeline/secure/snowflake.terraform.tfvars b/terraform/gcp/pipeline/secure/snowflake.terraform.tfvars
new file mode 100644
index 0000000..60ffde9
--- /dev/null
+++ b/terraform/gcp/pipeline/secure/snowflake.terraform.tfvars
@@ -0,0 +1,62 @@
+# Will be prefixed to all resource names
+# Use this to easily identify the resources created and provide entropy for subsequent environments
+prefix = "sp"
+
+# The project to deploy the infrastructure into
+project_id = "PROJECT_ID_TO_DEPLOY_INTO"
+
+# Where to deploy the infrastructure
+region = "REGION_TO_DEPLOY_INTO"
+
+# --- Network
+# NOTE: The network & sub-network configured must be configured with a Cloud NAT to allow the deployed Compute Engine instances to
+# connect to the internet to download the required assets
= "YOUR_NETWORK_HERE" +subnetwork = "YOUR_SUB_NETWORK_HERE" + +# --- SSH +# Update this to your IP Address +ssh_ip_allowlist = ["999.999.999.999/32"] +# Generate a new SSH key locally with `ssh-keygen` +# ssh-keygen -t rsa -b 4096 +ssh_key_pairs = [ + { + user_name = "snowplow" + public_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQA0jSi9//bRsHW4M6czodTs6smCXsxZ0gijzth0aBmycE= snowplow@Snowplows-MacBook-Pro.local" + } +] + +# --- Iglu Server Configuration +# Iglu Server DNS output from the Iglu Server stack +iglu_server_dns_name = "http://CHANGE-TO-MY-IGLU-IP" +# Used for API actions on the Iglu Server +# Change this to the same UUID from when you created the Iglu Server +iglu_super_api_key = "00000000-0000-0000-0000-000000000000" + +# --- Snowplow Snowflake Loader +pipeline_db = "snowflake" +snowflake_account = "sf_account" +snowflake_region = "us-west-2" +# Change and keep this secret! +snowflake_loader_password = "Hell0W0rld!2" +snowflake_database = "SF_DB_NAME" +snowflake_loader_role = "SF_LOADER_ROLE" +snowflake_loader_user = "SF_LOADER_USER" +snowflake_schema = "ATOMIC" +snowflake_transformed_stage_name = "SF_TRANSFORMED_STAGE" +snowflake_warehouse = "SF_WAREHOUSE" +transformer_window_period_min = 10 + +# See for more information: https://registry.terraform.io/modules/snowplow-devops/collector-pubsub-ce/google/latest#telemetry +# Telemetry principles: https://docs.snowplowanalytics.com/docs/open-source-quick-start/what-is-the-quick-start-for-open-source/telemetry-principles/ +user_provided_id = "" +telemetry_enabled = true + +# --- SSL Configuration (optional) +ssl_information = { + certificate_id = "" + enabled = false +} + +# --- Extra Labels to append to created resources (optional) +labels = {} diff --git a/terraform/gcp/pipeline/secure/variables.tf b/terraform/gcp/pipeline/secure/variables.tf index 57924f9..474031d 100644 --- a/terraform/gcp/pipeline/secure/variables.tf +++ b/terraform/gcp/pipeline/secure/variables.tf @@ -132,3 +132,118 @@ variable "labels" { default = {} type = map(string) } + +variable "snowflake_account" { + description = "Snowflake account to use" + type = string + default = "" +} + +variable "snowflake_region" { + description = "Region of Snowflake account" + type = string + default = "" +} + +variable "snowflake_loader_password" { + description = "The password to use for the loader user" + type = string + sensitive = true + default = "" +} + +variable "snowflake_loader_user" { + description = "The Snowflake user used by Snowflake Loader" + type = string + default = "" +} + +variable "snowflake_database" { + description = "Snowflake database name" + type = string + default = "" +} + +variable "snowflake_schema" { + description = "Snowflake schema name" + type = string + default = "" +} + +variable "snowflake_loader_role" { + description = "Snowflake role for loading snowplow data" + type = string + default = "" +} + +variable "snowflake_warehouse" { + description = "Snowflake warehouse name" + type = string + default = "" +} + +variable "snowflake_transformed_stage_name" { + description = "Name of transformed stage" + type = string + default = "" +} + +variable "snowflake_callback_iam" { + description = "Snowflake callback IAM from STORAGE INTEGRATION" + type = string + default = "" +} + +variable "transformer_window_period_min" { + description = "Frequency to emit transforming finished message - 5,10,15,20,30,60 etc minutes" + type = number + default = 5 +} + +variable "transformer_bucket_name" { + description = "Transformer bucket name, prefixed with the prefix 
value" + type = string + default = "qs-transformed" +} + +variable "deltalake_catalog" { + description = "Databricks deltalake catalog" + type = string + default = "hive_metastore" +} + +variable "deltalake_schema" { + description = "Databricks deltalake schema" + type = string + default = "" +} + +variable "deltalake_host" { + description = "Databricks deltalake host" + type = string + default = "" +} + +variable "deltalake_port" { + description = "Databricks deltalake port" + type = string + default = "" +} + +variable "deltalake_http_path" { + description = "Databricks deltalake http path" + type = string + default = "" +} + +variable "deltalake_auth_token" { + description = "Databricks deltalake auth token" + type = string + default = "" + sensitive = true +} + +variable "databricks_callback_iam" { + description = "Databricks callback IAM to allow access to GCS bucket" + type = string +}
"certificate_id": "",
"enabled": false
}
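For reference, each warehouse target is selected by applying its matching tfvars file. The exact invocation depends on how the quick-start drives Terraform, but a typical run would be:

terraform init
terraform apply -var-file=databricks.terraform.tfvars

With pipeline_db = "databricks" set there, only local.databricks_enabled evaluates to true in main.tf, so the transformer and Databricks loader modules get count = 1 (the transformed topic is always created), while the Postgres, BigQuery and Snowflake loader resources stay at count = 0.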