Update go-discover to support ECS discovery #13782

Merged: 18 commits, Jan 12, 2023
Changes from 2 commits
2 changes: 1 addition & 1 deletion go.mod
@@ -32,7 +32,7 @@ require (
github.com/hashicorp/go-checkpoint v0.5.0
github.com/hashicorp/go-cleanhttp v0.5.1
github.com/hashicorp/go-connlimit v0.3.0
github.com/hashicorp/go-discover v0.0.0-20220411141802-20db45f7f0f9
github.com/hashicorp/go-discover v0.0.0-20220714221025-1c234a67149a
github.com/hashicorp/go-hclog v0.14.1
github.com/hashicorp/go-memdb v1.3.2
github.com/hashicorp/go-multierror v1.1.1
2 changes: 2 additions & 0 deletions go.sum
@@ -309,6 +309,8 @@ github.com/hashicorp/go-connlimit v0.3.0 h1:oAojHGjFxUTTTA8c5XXnDqWJ2HLuWbDiBPTp
github.com/hashicorp/go-connlimit v0.3.0/go.mod h1:OUj9FGL1tPIhl/2RCfzYHrIiWj+VVPGNyVPnUX8AqS0=
github.com/hashicorp/go-discover v0.0.0-20220411141802-20db45f7f0f9 h1:2GsEkBZf1q4LKZjtd4cO+V0xd85xGCMolX3ebC2+xd4=
github.com/hashicorp/go-discover v0.0.0-20220411141802-20db45f7f0f9/go.mod h1:1xfdKvc3pe5WKxfUUHHOGaKMk7NLGhHY1jkyhKo6098=
github.com/hashicorp/go-discover v0.0.0-20220714221025-1c234a67149a h1:xeDSq/xo0CfnSZnPUkNH/00Qy8Q8ySJW0Ij2u/pH680=
github.com/hashicorp/go-discover v0.0.0-20220714221025-1c234a67149a/go.mod h1:1xfdKvc3pe5WKxfUUHHOGaKMk7NLGhHY1jkyhKo6098=
github.com/hashicorp/go-hclog v0.0.0-20180709165350-ff2cf002a8dd/go.mod h1:9bjs9uLqI8l75knNv3lV1kA55veR+WUPSiKIWcQHudI=
github.com/hashicorp/go-hclog v0.9.1/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ=
github.com/hashicorp/go-hclog v0.9.2/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ=
8 changes: 8 additions & 0 deletions test/load-ecs/.gitignore
@@ -0,0 +1,8 @@
.idea
*.iml
tls/*
!tls/.gitkeep
.terraform*
*.tfstate*
*.auto.tfvars
*.auto.pkrvars.hcl
88 changes: 88 additions & 0 deletions test/load-ecs/GNUmakefile
@@ -0,0 +1,88 @@
SHELL = bash

TF_VAR_aws_region ?= "us-east-1"
AWS_ACCOUNT_ID=$(shell aws sts get-caller-identity | jq -r .Account)
CLUSTER_NAME ?= consul-example

# Refresh certificates, build and deploy images
.PHONY: all
all: certs images

# Build CA & Certs for Consul
.PHONY: certs
certs: clean-certs
	cd tls && consul tls ca create && \
	for I in "-server" "-client" "-cli"; do \
		consul tls cert create $$I -dc dc1; \
	done; \
	cd ..

# Clean out local CA & Cert
.PHONY: clean-certs
clean-certs:
	rm -rf tls && mkdir tls && touch tls/.gitkeep

# Make server container
.PHONY: server
server:
	docker build --platform linux/amd64 --tag consul-server:local --target consul-server ./containers/consul

# Make agent container
.PHONY: agent
agent:
	docker build --platform linux/amd64 --tag consul-agent:local --target consul-agent ./containers/consul

# Make datadog container
.PHONY: datadog
datadog:
	docker build --platform linux/amd64 --tag datadog-agent:local ./containers/datadog

# Make k6 container
.PHONY: k6
k6:
	docker build --platform linux/amd64 --tag k6:local ./containers/k6

# Build all the test images
.PHONY: build
build: server agent datadog k6

# Push the test images to ECR
.PHONY: push
push:
	@test "$(TF_VAR_aws_region)" || (echo "Failed to find TF_VAR_aws_region in the environment" && exit 1)
	@test "$(AWS_ACCOUNT_ID)" || (echo "Failed to find AWS_ACCOUNT_ID from the caller identity" && exit 1)
	@test "$(CLUSTER_NAME)" || (echo "Failed to find CLUSTER_NAME in the environment" && exit 1)
	@aws ecr get-login-password --region $(TF_VAR_aws_region) | docker login --username AWS --password-stdin "$(AWS_ACCOUNT_ID).dkr.ecr.$(TF_VAR_aws_region).amazonaws.com"
	@docker tag "consul-server:local" "$(AWS_ACCOUNT_ID).dkr.ecr.$(TF_VAR_aws_region).amazonaws.com/$(CLUSTER_NAME)/consul:server"
	@docker tag "consul-agent:local" "$(AWS_ACCOUNT_ID).dkr.ecr.$(TF_VAR_aws_region).amazonaws.com/$(CLUSTER_NAME)/consul:agent"
	@docker tag "datadog-agent:local" "$(AWS_ACCOUNT_ID).dkr.ecr.$(TF_VAR_aws_region).amazonaws.com/$(CLUSTER_NAME)/consul:datadog"
	@docker tag "k6:local" "$(AWS_ACCOUNT_ID).dkr.ecr.$(TF_VAR_aws_region).amazonaws.com/$(CLUSTER_NAME)/consul:k6"
	docker push "$(AWS_ACCOUNT_ID).dkr.ecr.$(TF_VAR_aws_region).amazonaws.com/$(CLUSTER_NAME)/consul:server"
	docker push "$(AWS_ACCOUNT_ID).dkr.ecr.$(TF_VAR_aws_region).amazonaws.com/$(CLUSTER_NAME)/consul:agent"
	docker push "$(AWS_ACCOUNT_ID).dkr.ecr.$(TF_VAR_aws_region).amazonaws.com/$(CLUSTER_NAME)/consul:datadog"
	docker push "$(AWS_ACCOUNT_ID).dkr.ecr.$(TF_VAR_aws_region).amazonaws.com/$(CLUSTER_NAME)/consul:k6"


# Pre-build some infra so that the ECR repos exist before docker build/tag/push
.PHONY: repos
repos:
	terraform apply -auto-approve

# Build and push images to the ECR repos
.PHONY: images
images: build push

# Deploy containers and lambdas that depend on ECR images
.PHONY: infra
infra:
	terraform apply -var 'deploy_consul_ecs=true' -var 'deploy_efs_cluster=true' -auto-approve

# Execute the k6 load test, ideally after the infra stage has completed and the containers have had time to start
.PHONY: test-ecs
test-ecs:
	terraform apply -var 'deploy_consul_ecs=true' -var 'deploy_efs_cluster=true' -var 'run_k6=true' -auto-approve

# Clean the environment
.PHONY: clean
clean:
	terraform destroy -auto-approve
139 changes: 139 additions & 0 deletions test/load-ecs/README.md
@@ -0,0 +1,139 @@
# Load Test Consul on ECS

Refer to `getting started` in the adjacent [load test for EC2 README.md](https://github.com/hashicorp/consul/blob/main/test/load/README.md)
to set up your environment for access to AWS. You can use access keys, but the examples below use AWS profiles.

The main difference to keep in mind is that this load test sources Consul from your local repository. Before you use
the makefile in this folder, ensure you have run `make dist` _or a similar command_ that results in the `consul:local`
Docker image being built and present in your local Docker image store.

> Ensure you have the consul:local container image built
> ```bash
> docker image ls | awk '{print $1":"$2}' | grep consul:local
> # consul:local
> ```

Then use `make` to:
- Create Consul TLS Certificates.
- Deploy the base AWS infrastructure (network, IAM, ECR, ECS cluster, etc.).
- Build the consul/datadog/k6 images (based on your local `consul:local` image) and push them to ECR.
- Deploy ECS Services with Task Definitions corresponding to the ECR images.
- Initiate a K6 Lambda Load Test.
- Collect information from CloudWatch Logs & Dashboards as well as Datadog's Consul Dashboard.
- Clean up the environment.

## This repo has the following folder structure

| Path | Description |
|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| containers | Container assets for the development load-test stack |
| modules | Terraform modules used to simplify parts of the deployment. |
| tls          | An intentionally empty directory used for generating a set of local Consul CA certificates. It is also the directory from which the AWS SecretsManager source strings for the certs are read. |
| templates    | Templates used for the Terraform deployment. |

> EFS can be disabled in the Terraform options. This is not recommended, but it allows comparisons to be made with and without the underlying EFS if desired.

## Commence the Tests

Your AWS environment will need to be authorized against the organization you wish to test within.

> __Warning__: This will incur AWS charges to the organization you are authorized against!

You will also need to set environment variables to supply a Consul encryption key.
The AWS region, Datadog API key, and k6 API key are optional. A sketch of exporting these for a manual run follows the list below.

- TF_VAR_consul_encryption_token=[generate with `consul keygen`]
- TF_VAR_aws_region=[the region you intend to test in, default is us-east-1]
- TF_VAR_datadog_apikey=[generate in the datadog ui, optional]
- TF_VAR_k6_apikey=[generate in the k6 ui, optional]
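
For example, a minimal sketch of exporting these variables for a manual run (the values shown are placeholders):

```bash
# Required: gossip encryption key
export TF_VAR_consul_encryption_token="$(consul keygen)"
# Optional overrides
export TF_VAR_aws_region="us-east-2"              # defaults to us-east-1
export TF_VAR_datadog_apikey="<datadog-api-key>"  # from the Datadog UI
export TF_VAR_k6_apikey="<k6-api-key>"            # from the k6 UI
```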

Using environment variables here ensures we don't persist keys that could be compromised when we run this manually,
and it allows workflows to set up an environment when running each step of the makefile. Consider the following commands
for `make images` and `make infra`.
```text
AWS_PROFILE=test-3 TF_VAR_aws_region=us-east-2 make images

AWS_PROFILE=test-3 \
TF_VAR_consul_encryption_token=consul-encryption-token \
TF_VAR_datadog_apikey=DD_API_KEY \
TF_VAR_k6_apikey=k6_API_KEY \
make infra
```

Additionally, you will need a `*.auto.tfvars` file created similarly to the example provided. Note: three subnets are
__required__ in order to provide three consul server/agent pairs.

```hcl
# Filename: ./variables.auto.tfvars
aws_region = "us-east-2"
vpc_name = "consul-test-vpc"
vpc_az = ["us-east-2a", "us-east-2b", "us-east-2c"]
vpc_cidr = "10.0.0.0/16"
public_subnet_cidrs = ["10.0.1.0/24", "10.0.3.0/24", "10.0.5.0/24"]
private_subnet_cidrs = ["10.0.2.0/24", "10.0.4.0/24", "10.0.6.0/24"]

# deploy_efs_cluster = false

admin_cidrs = ["203.0.113.12/32"]
```

```bash
cd test/load-ecs/
cp variables.auto.tfvars.example variables.auto.tfvars
terraform init
make certs
AWS_PROFILE=test-3 make repos
AWS_PROFILE=test-3 TF_VAR_aws_region=us-east-2 make images
AWS_PROFILE=test-3 TF_VAR_aws_region=us-east-2 TF_VAR_consul_encryption_token=12345= TF_VAR_datadog_apikey=DDABC123 TF_VAR_k6_apikey=k6987ZYX make infra
AWS_PROFILE=test-3 TF_VAR_aws_region=us-east-2 TF_VAR_consul_encryption_token=12345= TF_VAR_datadog_apikey=DDABC123 TF_VAR_k6_apikey=k6987ZYX make test-ecs
```

When the test is invoked in the `make test-ecs` step above, you can view the test output in the CloudWatch Logs
associated with the k6 load test Lambda function; the run takes roughly 10 minutes. There is also a CloudWatch
dashboard created for viewing the ECS and EFS metrics.

The final output in CloudWatch will look something like the following (unfortunately it is not available as Terraform
output); a CLI sketch for tailing these logs follows the sample:
```text
scenarios: (100.00%) 1 scenario, 25 max VUs, 10m30s max duration (incl. graceful stop):
...
default [ 15% ] 25 VUs 01m28.9s/10m0s
running (01m29.9s), 25/25 VUs, 2859 complete and 0 interrupted iterations
...
default [ 61% ] 25 VUs 06m07.9s/10m0s
time="2022-07-21T13:38:46Z" level=warning msg="Request Failed" error="Put \"https://internal-consul-examplelb-1234567890.us-east-2.elb.amazonaws.com:8500/v1/agent/service/register\": request timeout"
...
running (10m01.4s), 00/25 VUs, 11041 complete and 0 interrupted iterations
default ✓ [ 100% ] 25 VUs 10m0s
↳ 99% — ✓ 11017 / ✗ 24
checks.......................: 99.89% ✓ 22058 ✗ 24
✓ http_req_duration..........: avg=679.57ms min=29.36ms med=112.67ms max=1m0s p(90)=1.44s p(95)=1.92s
{ expected_response:true }...: avg=615.03ms min=29.36ms med=112.46ms max=55.78s p(90)=1.43s p(95)=1.9s

```
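
To follow this output from a terminal instead of the console, you can tail the Lambda's log group with the AWS CLI. The log group name below is a placeholder and depends on how the function is named in this stack's Terraform:

```bash
# Placeholder log group name -- look up the real one in the Lambda console or with
# `aws logs describe-log-groups`.
aws logs tail "/aws/lambda/<k6-loadtest-function-name>" --follow --region us-east-2
```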

The test takes about 10 minutes to run. Don't forget to clean up the environment :)

```bash
AWS_PROFILE=test-3 make clean
```

## Considerations

- This is not a production-ready stack. While it implements gossip encryption and TLS for agent communication, many
  aspects were left out of scope in order to keep this a testable proof of concept in ECS.

- The ALB is internal by default. If you set it to external and define an ingress admin IP range, you can connect to the
  Consul servers on port 18500 and the clients on 8500 (see the sketch at the end of this section). You will not be able
  to run the Lambda load test that way, but this can be useful for debugging. Traffic over the public interface can get expensive.

- Each Consul server and agent pair shares an availability zone (i.e. three zones, one server + one client per zone). Since
  there is only one k6 Lambda, which can land in any of the three subnets, additional network load is placed on one
  particular availability zone relative to the others.

- Fargate does not allow setting the DNS server of a container task, and resolv.conf is bind-mounted read-only. Therefore,
  no DNS queries are made to Consul in the scope of this test. The EFS volumes are mounted via Route 53 DNS names, so VPC
  DNS is enabled.

- This load test is not automated and has not been exercised in a CI workflow, but it could be, following the same method
  as the [load-test.yml workflow](https://github.com/hashicorp/consul/blob/main/.github/workflows/load-test.yml).
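
If you do expose the ALB externally for debugging (see the ALB note above), a quick way to confirm connectivity is to query the Consul HTTP API through it. The hostname below is a placeholder for the ALB DNS name shown in the AWS console or Terraform state, and `-k` is needed because the test CA is self-signed:

```bash
# Port 8500 reaches the clients behind the ALB; 18500 reaches the servers.
curl -k "https://<alb-dns-name>:8500/v1/status/leader"
curl -k "https://<alb-dns-name>:18500/v1/status/leader"
```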
11 changes: 11 additions & 0 deletions test/load-ecs/containers/consul/Dockerfile
@@ -0,0 +1,11 @@
ARG VERSION=local
ARG IMAGE=consul
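# Agent stage: the stock Consul image plus a wrapper entrypoint and the auto-encrypt client config.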
FROM $IMAGE:$VERSION as consul-agent
COPY config/entrypoint.sh /usr/local/bin/
COPY config/auto-encrypt-client.hcl /consul/config/auto-encrypt.hcl
ENTRYPOINT ["entrypoint.sh"]

FROM consul-agent as consul-server
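# Server stage: layer server TLS, the auto-encrypt CA config, and telemetry settings on top of the agent image.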
COPY config/tls.hcl /consul/config/tls.hcl
COPY config/auto-encrypt-ca.hcl /consul/config/auto-encrypt.hcl
COPY config/telemetry.hcl /consul/config/telemetry.hcl
4 changes: 4 additions & 0 deletions test/load-ecs/containers/consul/config/auto-encrypt-ca.hcl
@@ -0,0 +1,4 @@
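# Server side of auto-encrypt: allow the servers to issue TLS certificates to clients, with these DNS SANs.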
auto_encrypt {
  allow_tls = true
  dns_san = ["localhost", "client.dc1.consul"]
}
3 changes: 3 additions & 0 deletions test/load-ecs/containers/consul/config/auto-encrypt-client.hcl
@@ -0,0 +1,3 @@
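# Client side of auto-encrypt: have the agent request its TLS certificate from the servers.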
auto_encrypt {
  tls = true
}
18 changes: 18 additions & 0 deletions test/load-ecs/containers/consul/config/entrypoint.sh
@@ -0,0 +1,18 @@
#!/usr/bin/env sh
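# Write TLS material passed in via environment variables (e.g. from AWS SecretsManager) into the
# agent's config directory, then hand off to the stock Consul image entrypoint.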
CONSUL_CONFIG_DIR=${CONSUL_CONFIG_DIR:-/consul/config}

if [ -d "$CONSUL_CONFIG_DIR" ]; then
  if [ -n "$CONSUL_CA" ]; then
    echo "${CONSUL_CA}" > "$CONSUL_CONFIG_DIR/consul-agent-ca.pem"
  fi
  if [ -n "$CONSUL_CERT" ]; then
    echo "${CONSUL_CERT}" > "$CONSUL_CONFIG_DIR/consul-agent-0.pem"
  fi
  if [ -n "$CONSUL_KEY" ]; then
    echo "${CONSUL_KEY}" > "$CONSUL_CONFIG_DIR/consul-agent-0-key.pem"
  fi
  chown -R consul:consul "$CONSUL_CONFIG_DIR"
  chmod -R go-rwx "$CONSUL_CONFIG_DIR"
fi

docker-entrypoint.sh "$@"
3 changes: 3 additions & 0 deletions test/load-ecs/containers/consul/config/telemetry.hcl
@@ -0,0 +1,3 @@
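# Send Consul runtime metrics to the local Datadog dogstatsd listener.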
telemetry {
  dogstatsd_addr = "127.0.0.1:8125"
}
12 changes: 12 additions & 0 deletions test/load-ecs/containers/consul/config/tls.hcl
@@ -0,0 +1,12 @@
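# TLS for the server agents, using the CA and certificate material written by entrypoint.sh.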
tls {
  defaults {
    ca_file = "/consul/config/consul-agent-ca.pem"
    cert_file = "/consul/config/consul-agent-0.pem"
    key_file = "/consul/config/consul-agent-0-key.pem"
    verify_incoming = true
    verify_outgoing = true
  }
  internal_rpc {
    verify_server_hostname = true
  }
}
6 changes: 6 additions & 0 deletions test/load-ecs/containers/datadog/Dockerfile
@@ -0,0 +1,6 @@
ARG VERSION=7
ARG IMAGE=public.ecr.aws/datadog/agent
FROM $IMAGE:$VERSION

COPY datadog.yaml /etc/datadog-agent/datadog.yaml
COPY conf.yaml /etc/datadog-agent/conf.d/consul.d/conf.yaml
8 changes: 8 additions & 0 deletions test/load-ecs/containers/datadog/conf.yaml
@@ -0,0 +1,8 @@
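# Datadog Consul integration: query the local agent's HTTP API for leader, latency, and catalog checks.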
init_config:

instances:
  - url: http://localhost:8500
    self_leader_check: true
    network_latency_checks: true
    catalog_checks: true

37 changes: 37 additions & 0 deletions test/load-ecs/containers/datadog/datadog.yaml
@@ -0,0 +1,37 @@
#########################
## Basic Configuration ##
#########################

## @param api_key - string - required
## The Datadog API key to associate your Agent's data with your organization.
## Create a new API key here: https://app.datadoghq.com/account/settings
#
tags:
api_key:
logs_enabled: true
dogstatsd_mapper_profiles:
  - name: consul
    prefix: "consul."
    mappings:
      - match: 'consul\.http\.([a-zA-Z]+)\.(.*)'
        match_type: "regex"
        name: "consul.http.request"
        tags:
          http_method: "$1"
          path: "$2"
      - match: 'consul\.raft\.replication\.appendEntries\.logs\.([0-9a-f-]+)'
        match_type: "regex"
        name: "consul.raft.replication.appendEntries.logs"
        tags:
          consul_node_id: "$1"
      - match: 'consul\.raft\.replication\.appendEntries\.rpc\.([0-9a-f-]+)'
        match_type: "regex"
        name: "consul.raft.replication.appendEntries.rpc"
        tags:
          consul_node_id: "$1"
      - match: 'consul\.raft\.replication\.heartbeat\.([0-9a-f-]+)'
        match_type: "regex"
        name: "consul.raft.replication.heartbeat"
        tags:
          consul_node_id: "$1"

18 changes: 18 additions & 0 deletions test/load-ecs/containers/k6/Dockerfile
@@ -0,0 +1,18 @@
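# Stage 1: download the k6 release binary.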
FROM amazonlinux:2 as k6bin
ARG K6_VERSION=v0.39.0

RUN yum update -y \
&& yum install -y curl tar gzip zip

RUN curl -O -L "https://github.com/grafana/k6/releases/download/${K6_VERSION}/k6-${K6_VERSION}-linux-amd64.tar.gz" \
&& tar -xvf k6-${K6_VERSION}-linux-amd64.tar.gz \
&& mv /k6-${K6_VERSION}-linux-amd64/k6 /usr/local/bin/


FROM public.ecr.aws/lambda/nodejs:14
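# Stage 2: AWS Lambda Node.js runtime with the k6 binary and the load test assets.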

COPY --from=k6bin /usr/local/bin/k6 /usr/local/bin/
COPY index.js loadtest.js service.json ${LAMBDA_TASK_ROOT}/

WORKDIR /var/task
CMD ["index.handler"]
5 changes: 5 additions & 0 deletions test/load-ecs/containers/k6/index.js
@@ -0,0 +1,5 @@
const { execSync } = require('child_process')

exports.handler = async (event) => {
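  // Run the bundled k6 scenario synchronously; output streams to the function's CloudWatch logs.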
execSync('k6 run loadtest.js', { encoding: 'utf8', stdio: 'inherit' })
}