From f3b0e3b770b515a7b7403118c6581b3b1924b13d Mon Sep 17 00:00:00 2001
From: jbeemster
Date: Fri, 29 Jul 2022 17:14:26 +0200
Subject: [PATCH] [POC] RabbitMQ Snowplow Pipeline

---
 charts/snowplow-pipeline-rabbitmq/Chart.lock  |   6 +
 charts/snowplow-pipeline-rabbitmq/Chart.yaml  |  24 ++++
 charts/snowplow-pipeline-rabbitmq/README.md   | 109 ++++++++++++++++++
 .../configs/collector.hocon                   |  20 ++++
 .../configs/enrich.hocon                      |  36 ++++++
 .../configs/enrichments.json                  |   8 ++
 .../configs/iglu.json                         |  28 +++++
 .../templates/NOTES.txt                       |  19 +++
 charts/snowplow-pipeline-rabbitmq/values.yaml |  56 +++++++++
 9 files changed, 306 insertions(+)
 create mode 100644 charts/snowplow-pipeline-rabbitmq/Chart.lock
 create mode 100644 charts/snowplow-pipeline-rabbitmq/Chart.yaml
 create mode 100644 charts/snowplow-pipeline-rabbitmq/README.md
 create mode 100644 charts/snowplow-pipeline-rabbitmq/configs/collector.hocon
 create mode 100644 charts/snowplow-pipeline-rabbitmq/configs/enrich.hocon
 create mode 100644 charts/snowplow-pipeline-rabbitmq/configs/enrichments.json
 create mode 100644 charts/snowplow-pipeline-rabbitmq/configs/iglu.json
 create mode 100644 charts/snowplow-pipeline-rabbitmq/templates/NOTES.txt
 create mode 100644 charts/snowplow-pipeline-rabbitmq/values.yaml

diff --git a/charts/snowplow-pipeline-rabbitmq/Chart.lock b/charts/snowplow-pipeline-rabbitmq/Chart.lock
new file mode 100644
index 0000000..81d361d
--- /dev/null
+++ b/charts/snowplow-pipeline-rabbitmq/Chart.lock
@@ -0,0 +1,6 @@
+dependencies:
+- name: service-deployment
+  repository: https://snowplow-devops.github.io/helm-charts
+  version: 0.1.3
+digest: sha256:35026d3c11ba320ee596d29b3a24e4764c3dc43e975b4305b7fd643bafd9bb41
+generated: "2022-07-29T10:22:47.587589+02:00"
diff --git a/charts/snowplow-pipeline-rabbitmq/Chart.yaml b/charts/snowplow-pipeline-rabbitmq/Chart.yaml
new file mode 100644
index 0000000..832c902
--- /dev/null
+++ b/charts/snowplow-pipeline-rabbitmq/Chart.yaml
@@ -0,0 +1,24 @@
+apiVersion: v2
+name: snowplow-pipeline-rabbitmq
+description: A Helm Chart to deploy a RabbitMQ powered Snowplow Pipeline
+version: 0.0.1
+icon: https://raw.githubusercontent.com/snowplow-devops/helm-charts/master/docs/logo/snowplow.png
+home: https://github.com/snowplow-devops/helm-charts
+sources:
+  - https://github.com/snowplow-devops/helm-charts
+  - https://github.com/snowplow/snowplow
+maintainers:
+  - name: jbeemster
+    url: https://github.com/jbeemster
+    email: jbeemster@users.noreply.github.com
+keywords:
+  - snowplow
+dependencies:
+  - name: service-deployment
+    version: 0.1.3
+    repository: "https://snowplow-devops.github.io/helm-charts"
+    alias: collector
+  - name: service-deployment
+    version: 0.1.3
+    repository: "https://snowplow-devops.github.io/helm-charts"
+    alias: enrich
diff --git a/charts/snowplow-pipeline-rabbitmq/README.md b/charts/snowplow-pipeline-rabbitmq/README.md
new file mode 100644
index 0000000..c89071b
--- /dev/null
+++ b/charts/snowplow-pipeline-rabbitmq/README.md
@@ -0,0 +1,109 @@
+# snowplow-pipeline-rabbitmq
+
+## DISCLAIMER
+
+This Chart and guide are completely experimental and not officially supported for Production workloads. They are an attempt at building a lightweight, Kubernetes-first pipeline, as opposed to our traditional cloud-first approach.
+
+## Introduction
+
+This chart helps to deploy a Snowplow Pipeline entirely contained within a Kubernetes cluster. This is an _example_ approach and should not be used in production without due diligence on all of the settings - but for PoC purposes it should be perfect!
+
+To get a basic working pipeline you will need to deploy:
+
+1. RabbitMQ cluster: will be used as our queue layer between applications
+2. Snowplow Collector: provides the entrypoint for events to be collected
+3. Snowplow Enrich: application which consumes raw Collector events and turns them into good/bad events
+
+After Enrich you will have good and bad data in separate RabbitMQ queues, which can then be consumed by your tool of choice.
+
+## 1. Setting up the prerequisites
+
+To begin with you must have access to a Kubernetes cluster - if you are using macOS / Windows the easiest way to get one is with [Docker Desktop](https://docs.docker.com/desktop/kubernetes/), which now has a built-in Kubernetes cluster.
+
+You will also need the [Helm tool available](https://helm.sh/docs/intro/install/) on your command line. At the time of writing this guide, the version in use was:
+
+```bash
+> helm version
+version.BuildInfo{Version:"v3.9.0", GitCommit:"7ceeda6c585217a19a1131663d8cd1f7d641b2a7", GitTreeState:"clean", GoVersion:"go1.18.2"}
+```
+
+Now that you have everything prepared you can start installing a pipeline!
+
+## 2. Setting up RabbitMQ
+
+For this we are going to leverage the excellent Bitnami Chart, which is readily available, and use it as our test bed - there are hundreds of options to tune for a production-grade cluster, but for our purposes the defaults will work just fine!
+
+```bash
+helm repo add bitnami https://charts.bitnami.com/bitnami
+helm repo update
+
+# Install RabbitMQ
+helm install rmq1 \
+  --set auth.username=admin,auth.password=secretpassword,auth.erlangCookie=secretcookie \
+  bitnami/rabbitmq
+```
+
+_Note_: You will need the username and password for later steps, so do take note of them!
+
+This step should print out a good deal of information which you should save for later - specifically, the command to port-forward the management portal so we can inspect what's happening, and the endpoint within the cluster so we can point our applications at it.
+
+```bash
+# Endpoint + port for sending and pulling data
+RabbitMQ can be accessed within the cluster on port 5672 at rmq1-rabbitmq.default.svc.cluster.local
+# Command to execute to expose the management portal
+kubectl port-forward --namespace default svc/rmq1-rabbitmq 15672:15672
+```
+
+## 3. Let's deploy a pipeline
+
+If you are following the guide exactly then you should see two preconfigured config files inside `/configs` - one for the Collector and one for Enrich. We have already taken the liberty of base64 encoding these and adding them to the `values.yaml` - if you want to change any settings you should update your values accordingly.
+
+These configs point at `rmq1-rabbitmq.default.svc.cluster.local:5672` and use the following queue names:
+
+- "raw-queue"
+- "bad-1-queue"
+- "enriched-queue"
+
+```bash
+helm repo add snowplow-devops https://snowplow-devops.github.io/helm-charts
+helm repo update
+
+# Deploy the pipeline
+helm install pipeline snowplow-devops/snowplow-pipeline-rabbitmq
+```
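+
+Once you have sent some test events through the Collector (see the port-forwarding example in the next section), you can sanity-check the enriched output without writing a consumer. As a minimal sketch - assuming the Bitnami defaults from step 2 (management plugin enabled, `admin` / `secretpassword`) and that the management portal port-forward is running - you can peek at the `enriched-queue` via the RabbitMQ management HTTP API:
+
+```bash
+# Reuse the management portal port-forward from step 2:
+#   kubectl port-forward --namespace default svc/rmq1-rabbitmq 15672:15672
+
+# Peek at one message on the enriched queue (%2F is the default "/" vhost)
+curl -s -u admin:secretpassword \
+  -H 'Content-Type: application/json' \
+  -d '{"count": 1, "ackmode": "ack_requeue_true", "encoding": "auto"}' \
+  http://localhost:15672/api/queues/%2F/enriched-queue/get
+```
+
+_Note_: `ack_requeue_true` puts the message back on the queue, so peeking does not consume your test events.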
+
+### How to: expose the Collector to the internet?
+
+By default the Collector service is not exposed outside of the Kubernetes cluster - we only use a NodePort - binding a Load Balancer / external routing is left to the implementor. For local testing, port-forwarding the service is the recommended way to interact with the Collector.
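+
+For example - a minimal local smoke test, assuming the default `values.yaml` in this chart (service name `collector`, port `8080`) and an install into the `default` namespace:
+
+```bash
+# Forward the Collector service to your local machine
+kubectl port-forward --namespace default svc/collector 8080:8080
+
+# In a second terminal: check the Collector is healthy
+curl http://localhost:8080/health
+
+# Send a simple test page-view event
+curl "http://localhost:8080/i?e=pv"
+```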
+
+However, if you are running an EKS (AWS) or GKE (GCP) cluster, we do expose bindings that can map onto an existing AWS Target Group or GCP Network Endpoint Group (NEG) to allow you to route traffic from the internet into the exposed service. You will need to set `global.cloud` to `aws` or `gcp` and follow the instructions below for your specific cloud.
+
+_Note_: If you have feedback on this, please open an issue on this repository with suggestions for how it could be simplified!
+
+#### GKE (GCP) NetworkEndpointGroup
+
+To manage the Collector load balancer externally from the GKE cluster you can bind the deployment onto dynamically assigned Network Endpoint Groups (NEGs).
+
+1. Set the NEG name: `collector.service.gcp.networkEndpointGroupName: `
+   - Will default to the Chart release name
+2. This will create Zonal NEGs in your account automatically (do not proceed until the NEGs appear - check your deployment events if this doesn't happen!)
+3. Create a Load Balancer as usual and map the created NEGs into your backend service (follow the `Create Load Balancer` flow in the GCP Console)
+
+*Note*: The HealthCheck you create should map to the same port you used for the Collector deployment.
+
+#### EKS (AWS) TargetGroup
+
+To manage the Collector load balancer externally from the Kubernetes cluster you can bind the deployment to an existing TargetGroup ARN. It's important that the TargetGroup exists ahead of time and that it uses the same port as in your `values.yaml`.
+
+*Note*: Before this will work you will need to install the `aws-load-balancer-controller-crds` and `aws-load-balancer-controller` charts into your EKS cluster.
+
+You will need to fill in this field (see the example below):
+
+- `collector.service.aws.targetGroupARN: ""`
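+
+As a rough sketch - assuming an EKS cluster with the load balancer controller charts installed, and substituting your own TargetGroup details for the placeholders - the install might look like:
+
+```bash
+# Placeholder ARN - replace <region>, <account-id>, <name> and <id> with your own
+helm install pipeline snowplow-devops/snowplow-pipeline-rabbitmq \
+  --set global.cloud=aws \
+  --set "collector.service.aws.targetGroupARN=arn:aws:elasticloadbalancing:<region>:<account-id>:targetgroup/<name>/<id>"
+```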
diff --git a/charts/snowplow-pipeline-rabbitmq/configs/collector.hocon b/charts/snowplow-pipeline-rabbitmq/configs/collector.hocon
new file mode 100644
index 0000000..1b61b48
--- /dev/null
+++ b/charts/snowplow-pipeline-rabbitmq/configs/collector.hocon
@@ -0,0 +1,20 @@
+collector {
+  interface: "0.0.0.0"
+  port: 8080
+
+  streams {
+    good: "not used"
+    bad: "not used"
+
+    sink {
+      hostName: "rmq1-rabbitmq.default.svc.cluster.local"
+      portNumber: 5672
+      userName: "admin"
+      password: "secretpassword"
+      virtualHost: "/"
+      exchangeName: "raw-queue"
+      routingKey: "raw-queue"
+      queueName: "raw-queue"
+    }
+  }
+}
diff --git a/charts/snowplow-pipeline-rabbitmq/configs/enrich.hocon b/charts/snowplow-pipeline-rabbitmq/configs/enrich.hocon
new file mode 100644
index 0000000..bbbee78
--- /dev/null
+++ b/charts/snowplow-pipeline-rabbitmq/configs/enrich.hocon
@@ -0,0 +1,36 @@
+{
+  "input": {
+    "hostName": "rmq1-rabbitmq.default.svc.cluster.local"
+    "portNumber": 5672
+    "userName": "admin"
+    "password": "secretpassword"
+    "virtualHost": "/"
+    "exchangeName": "raw-queue"
+    "routingKey": "raw-queue"
+    "queueName": "raw-queue"
+  }
+
+  "output": {
+    "good": {
+      "hostName": "rmq1-rabbitmq.default.svc.cluster.local"
+      "portNumber": 5672
+      "userName": "admin"
+      "password": "secretpassword"
+      "virtualHost": "/"
+      "exchangeName": "enriched-queue"
+      "routingKey": "enriched-queue"
+      "queueName": "enriched-queue"
+    }
+
+    "bad": {
+      "hostName": "rmq1-rabbitmq.default.svc.cluster.local"
+      "portNumber": 5672
+      "userName": "admin"
+      "password": "secretpassword"
+      "virtualHost": "/"
+      "exchangeName": "bad-1-queue"
+      "routingKey": "bad-1-queue"
+      "queueName": "bad-1-queue"
+    }
+  }
+}
diff --git a/charts/snowplow-pipeline-rabbitmq/configs/enrichments.json b/charts/snowplow-pipeline-rabbitmq/configs/enrichments.json
new file mode 100644
index 0000000..67a8ff4
--- /dev/null
+++ b/charts/snowplow-pipeline-rabbitmq/configs/enrichments.json
@@ -0,0 +1,8 @@
+{
+  "schema": "iglu:com.snowplowanalytics.snowplow.enrichments/yauaa_enrichment_config/jsonschema/1-0-0",
+  "data": {
+    "enabled": true,
+    "vendor": "com.snowplowanalytics.snowplow.enrichments",
+    "name": "yauaa_enrichment_config"
+  }
+}
diff --git a/charts/snowplow-pipeline-rabbitmq/configs/iglu.json b/charts/snowplow-pipeline-rabbitmq/configs/iglu.json
new file mode 100644
index 0000000..b9bf079
--- /dev/null
+++ b/charts/snowplow-pipeline-rabbitmq/configs/iglu.json
@@ -0,0 +1,28 @@
+{
+  "schema": "iglu:com.snowplowanalytics.iglu/resolver-config/jsonschema/1-0-1",
+  "data": {
+    "cacheSize": 500,
+    "repositories": [
+      {
+        "name": "Iglu Central",
+        "priority": 0,
+        "vendorPrefixes": [ "com.snowplowanalytics" ],
+        "connection": {
+          "http": {
+            "uri": "http://iglucentral.com"
+          }
+        }
+      },
+      {
+        "name": "Iglu Central - GCP Mirror",
+        "priority": 1,
+        "vendorPrefixes": [ "com.snowplowanalytics" ],
+        "connection": {
+          "http": {
+            "uri": "http://mirror01.iglucentral.com"
+          }
+        }
+      }
+    ]
+  }
+}
diff --git a/charts/snowplow-pipeline-rabbitmq/templates/NOTES.txt b/charts/snowplow-pipeline-rabbitmq/templates/NOTES.txt
new file mode 100644
index 0000000..db23a92
--- /dev/null
+++ b/charts/snowplow-pipeline-rabbitmq/templates/NOTES.txt
@@ -0,0 +1,19 @@
+--------------------------------------------------------------------------------
+The pipeline is now deploying - please be patient as images download and launch!
+--------------------------------------------------------------------------------
+
+The Collector can be accessed via port {{ .Values.collector.service.port }} on the following DNS name from within your cluster:
+
+  {{ .Values.collector.fullnameOverride }}.{{ .Release.Namespace }}.svc.cluster.local
+
+To connect to the Collector from outside the cluster, execute the following command:
+
+  kubectl port-forward --namespace {{ .Release.Namespace }} svc/{{ .Values.collector.fullnameOverride }} {{ .Values.collector.service.port }}:{{ .Values.collector.service.port }}
+
+To send a simple test event to the Collector:
+
+  http://localhost:{{ .Values.collector.service.port }}/i?e=pv
+
+Or to check that it is healthy:
+
+  http://localhost:{{ .Values.collector.service.port }}/health
diff --git a/charts/snowplow-pipeline-rabbitmq/values.yaml b/charts/snowplow-pipeline-rabbitmq/values.yaml
new file mode 100644
index 0000000..424d79f
--- /dev/null
+++ b/charts/snowplow-pipeline-rabbitmq/values.yaml
@@ -0,0 +1,56 @@
+collector:
+  fullnameOverride: "collector"
+  image:
+    repository: "snowplow/scala-stream-collector-rabbitmq"
+    tag: "2.7.1-rc4-distroless"
+  config:
+    args:
+      - "--config"
+      - "/snowplow/config/collector.hocon"
+    env:
+      JDK_JAVA_OPTIONS: "-Dorg.slf4j.simpleLogger.defaultLogLevel=info"
+    configMaps:
+      - name: "collector"
+        key: "collector.hocon"
+        contentsB64: "Y29sbGVjdG9yIHsKICBpbnRlcmZhY2U6ICIwLjAuMC4wIgogIHBvcnQ6IDgwODAKCiAgc3RyZWFtcyB7CiAgICBnb29kOiAibm90IHVzZWQiCiAgICBiYWQ6ICJub3QgdXNlZCIKCiAgICBzaW5rIHsKICAgICAgaG9zdE5hbWU6ICJybXExLXJhYmJpdG1xLmRlZmF1bHQuc3ZjLmNsdXN0ZXIubG9jYWwiCiAgICAgIHBvcnROdW1iZXI6IDU2NzIKICAgICAgdXNlck5hbWU6ICJhZG1pbiIKICAgICAgcGFzc3dvcmQ6ICJzZWNyZXRwYXNzd29yZCIKICAgICAgdmlydHVhbEhvc3Q6ICIvIgogICAgICBleGNoYW5nZU5hbWU6ICJyYXctcXVldWUiCiAgICAgIHJvdXRpbmdLZXk6ICJyYXctcXVldWUiCiAgICAgIHF1ZXVlTmFtZTogInJhdy1xdWV1ZSIKICAgIH0KICB9Cn0K"
+        mountPath: "/snowplow/config"
+  readinessProbe:
+    httpGet:
+      path: "/health"
+  service:
+    deploy: true
+    port: 8080
+
+enrich:
+  fullnameOverride: "enrich"
+  image:
+    repository: "snowplow/snowplow-enrich-rabbitmq"
+    tag: "latest"
+  config:
+    args:
+      - "--config"
+      - "/snowplow/config/enrich.hocon"
+      - "--iglu-config"
+      - "/snowplow/config_iglu/iglu.json"
+      - "--enrichments"
+      - "/snowplow/enrichments/"
+    env:
+      JDK_JAVA_OPTIONS: "-Dorg.slf4j.simpleLogger.defaultLogLevel=info"
+    configMaps:
+      - name: "enrich"
+        key: "enrich.hocon"
+        contentsB64: "ewogICJpbnB1dCI6IHsKICAgICJob3N0TmFtZSI6ICJybXExLXJhYmJpdG1xLmRlZmF1bHQuc3ZjLmNsdXN0ZXIubG9jYWwiCiAgICAicG9ydE51bWJlciI6IDU2NzIKICAgICJ1c2VyTmFtZSI6ICJhZG1pbiIKICAgICJwYXNzd29yZCI6ICJzZWNyZXRwYXNzd29yZCIKICAgICJ2aXJ0dWFsSG9zdCI6ICIvIgogICAgImV4Y2hhbmdlTmFtZSI6ICJyYXctcXVldWUiCiAgICAicm91dGluZ0tleSI6ICJyYXctcXVldWUiCiAgICAicXVldWVOYW1lIjogInJhdy1xdWV1ZSIKICB9CgogICJvdXRwdXQiOiB7CiAgICAiZ29vZCI6IHsKICAgICAgImhvc3ROYW1lIjogInJtcTEtcmFiYml0bXEuZGVmYXVsdC5zdmMuY2x1c3Rlci5sb2NhbCIKICAgICAgInBvcnROdW1iZXIiOiA1NjcyCiAgICAgICJ1c2VyTmFtZSI6ICJhZG1pbiIKICAgICAgInBhc3N3b3JkIjogInNlY3JldHBhc3N3b3JkIgogICAgICAidmlydHVhbEhvc3QiOiAiLyIKICAgICAgImV4Y2hhbmdlTmFtZSI6ICJlbnJpY2hlZC1xdWV1ZSIKICAgICAgInJvdXRpbmdLZXkiOiAiZW5yaWNoZWQtcXVldWUiCiAgICAgICJxdWV1ZU5hbWUiOiAiZW5yaWNoZWQtcXVldWUiCiAgICB9CgogICAgImJhZCI6IHsKICAgICAgImhvc3ROYW1lIjogInJtcTEtcmFiYml0bXEuZGVmYXVsdC5zdmMuY2x1c3Rlci5sb2NhbCIKICAgICAgInBvcnROdW1iZXIiOiA1NjcyCiAgICAgICJ1c2VyTmFtZSI6ICJhZG1pbiIKICAgICAgInBhc3N3b3JkIjogInNlY3JldHBhc3N3b3JkIgogICAgICAidmlydHVhbEhvc3QiOiAiLyIKICAgICAgImV4Y2hhbmdlTmFtZSI6ICJiYWQtMS1xdWV1ZSIKICAgICAgInJvdXRpbmdLZXkiOiAiYmFkLTEtcXVldWUiCiAgICAgICJxdWV1ZU5hbWUiOiAiYmFkLTEtcXVldWUiCiAgICB9CiAgfQp9Cg=="
+        mountPath: "/snowplow/config"
+      - name: "iglu"
+        key: "iglu.json"
+        contentsB64: "ewogICJzY2hlbWEiOiAiaWdsdTpjb20uc25vd3Bsb3dhbmFseXRpY3MuaWdsdS9yZXNvbHZlci1jb25maWcvanNvbnNjaGVtYS8xLTAtMSIsCiAgImRhdGEiOiB7CiAgICAiY2FjaGVTaXplIjogNTAwLAogICAgInJlcG9zaXRvcmllcyI6IFsKICAgICAgewogICAgICAgICJuYW1lIjogIklnbHUgQ2VudHJhbCIsCiAgICAgICAgInByaW9yaXR5IjogMCwKICAgICAgICAidmVuZG9yUHJlZml4ZXMiOiBbICJjb20uc25vd3Bsb3dhbmFseXRpY3MiIF0sCiAgICAgICAgImNvbm5lY3Rpb24iOiB7CiAgICAgICAgICAiaHR0cCI6IHsKICAgICAgICAgICAgInVyaSI6ICJodHRwOi8vaWdsdWNlbnRyYWwuY29tIgogICAgICAgICAgfQogICAgICAgIH0KICAgICAgfSwKICAgICAgewogICAgICAgICJuYW1lIjogIklnbHUgQ2VudHJhbCAtIEdDUCBNaXJyb3IiLAogICAgICAgICJwcmlvcml0eSI6IDEsCiAgICAgICAgInZlbmRvclByZWZpeGVzIjogWyAiY29tLnNub3dwbG93YW5hbHl0aWNzIiBdLAogICAgICAgICJjb25uZWN0aW9uIjogewogICAgICAgICAgImh0dHAiOiB7CiAgICAgICAgICAgICJ1cmkiOiAiaHR0cDovL21pcnJvcjAxLmlnbHVjZW50cmFsLmNvbSIKICAgICAgICAgIH0KICAgICAgICB9CiAgICAgIH0KICAgIF0KICB9Cn0K"
+        mountPath: "/snowplow/config_iglu"
+      - name: "yauaa"
+        key: "yauaa.json"
+        contentsB64: "ewogICAgInNjaGVtYSI6ICJpZ2x1OmNvbS5zbm93cGxvd2FuYWx5dGljcy5zbm93cGxvdy5lbnJpY2htZW50cy95YXVhYV9lbnJpY2htZW50X2NvbmZpZy9qc29uc2NoZW1hLzEtMC0wIiwKICAgICJkYXRhIjogewogICAgICAgICJlbmFibGVkIjogdHJ1ZSwKICAgICAgICAidmVuZG9yIjogImNvbS5zbm93cGxvd2FuYWx5dGljcy5zbm93cGxvdy5lbnJpY2htZW50cyIsCiAgICAgICAgIm5hbWUiOiAieWF1YWFfZW5yaWNobWVudF9jb25maWciCiAgICB9Cn0K"
+        mountPath: "/snowplow/enrichments"
+  readinessProbe:
+    httpGet:
+      path: ""
+  service:
+    deploy: false