Skip to content

Commit

Permalink
A0-3499: Archive logs in case of e2e failure (#1492)
Browse files Browse the repository at this point in the history
# Description

* Enable RUST_LOG=debug in e2e.
* dump all logs from docker-compose and archive them in a tarball, but
only for failed e2e
* Upload this tarball to GH artifacts

Considerations:

Log growth is around 200 MB per node per 1 hour so given that some
nightly tests run 1 hour and there are max 10 nodes, we have 2GB per
test, which is affordable as GH runners have a quota of 14 GB, and our
self-hosted runners have at least that much. Also, compressed logs
should be around 100 MBs max, so for the case of GH artifacts quota, we
should be good. For public repos, we don’t pay anything for this quota
anyway.

There's some trickery when calling e2e client docker, as there's no way
to enforce timeout on step level in GH composite actions, so I needed to
do a workaround to apply custom timeout mechanism when calling e2e
client in docker.

## Type of change

Please delete options that are not relevant.

- New feature (non-breaking change which adds functionality)

## Testing

* success case - no logs uploaded
https://github.com/Cardinal-Cryptography/aleph-node/actions/runs/6889159800/job/18740144103
* a genuine timeouts in one e2e tests
https://github.com/Cardinal-Cryptography/aleph-node/actions/runs/6904198011
.See that at the bottom there's tgz attached from failed e2e case
  • Loading branch information
Marcin-Radecki authored Nov 20, 2023
1 parent fdcc4f8 commit 94b811e
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 51 deletions.
30 changes: 26 additions & 4 deletions .github/actions/run-e2e-test/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,6 @@ runs:
shell: bash
run: sleep 60

- name: Display bootnode logs
shell: bash
run: docker logs Node0 --follow &

- name: Download artifact with the test suite image
if: inputs.test-case != ''
uses: actions/download-artifact@v3
Expand All @@ -84,6 +80,7 @@ runs:
run: docker load -i aleph-e2e-client.tar

- name: Run single e2e test
id: run-single-e2e-test
if: inputs.test-case != ''
shell: bash
run: |
Expand Down Expand Up @@ -127,6 +124,31 @@ runs:
contracts/scripts/clean.sh
fi
- name: Get log tarball file name
if: ${{ failure() }}
id: get-log-tarball-file-name
shell: bash
run: |
test_case_escaped=$(echo ${{ inputs.test-case }} | sed 's/::/-/g')
echo "name=${test_case_escaped}" >> $GITHUB_OUTPUT
- name: Archive logs from failed e2e test
if: ${{ failure() }}
shell: bash
run: |
./.github/scripts/run_consensus.sh -n '${{ inputs.node-count }}' \
--archive-logs \
"aleph-node-${{ steps.get-log-tarball-file-name.outputs.name }}-e2e-failure.tgz"
- name: Upload logs from failed e2e test
if: ${{ failure() }}
uses: actions/upload-artifact@v3
with:
path: aleph-node-${{ steps.get-log-tarball-file-name.outputs.name }}-e2e-failure.tgz
name: aleph-node-${{ steps.get-log-tarball-file-name.outputs.name }}-e2e-failure.tgz
if-no-files-found: error
retention-days: 7

- name: Run finalization e2e test
if: inputs.follow-up-finalization-check == 'true'
shell: bash
Expand Down
49 changes: 44 additions & 5 deletions .github/scripts/run_consensus.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ MIN_VALIDATOR_COUNT=${MIN_VALIDATOR_COUNT:-4}
DOCKER_COMPOSE=${DOCKER_COMPOSE:-docker/docker-compose.yml}
OVERRIDE_DOCKER_COMPOSE=${OVERRIDE_DOCKER_COMPOSE:-""}
NODE_IMAGE=${NODE_IMAGE:-"aleph-node:latest"}
LOGS_OUTPUT_FILE=${LOGS_OUTPUT_FILE:=""}

# ------------------------ argument parsing and usage -----------------------

Expand All @@ -29,6 +30,8 @@ Usage
$0
[-n|--node-count NODE_COUNT]
number of nodes to run
[-a|--archive-logs LOGS_OUTPUT_FILE]
archive logs from all nodes in a tarball file; when this is given, no nodes are run
EOF
exit 0
}
Expand All @@ -42,6 +45,10 @@ while [[ $# -gt 0 ]]; do
NODE_COUNT="$2"
shift 2
;;
-a|--archive-logs)
LOGS_OUTPUT_FILE="$2"
shift 2
;;
*)
echo "Unrecognized argument $1!"
usage
Expand Down Expand Up @@ -92,6 +99,18 @@ function generate_bootnode_peer_id() {
-c "aleph-node key inspect-node-key --file /data/${bootnode_account}/p2p_secret")
}

function get_compose_file_list() {
set +u
local docker_compose_file="${1}"
local override_file="${2}"
return_list=("-f" $(realpath "${docker_compose_file}"))
if [[ -n "${override_file}" ]]; then
return_list+=("-f" $(realpath "${override_file}"))
fi
echo ${return_list[@]}
set -u
}

function run_containers() {
local authorities_count="$1"
local docker_compose_file="$2"
Expand All @@ -103,11 +122,26 @@ function run_containers() {
for index in $(seq 0 "${authorities_count}"); do
containers+=("Node${index}")
done
if [[ -z ${override_file} ]]; then
docker-compose -f "${docker_compose_file}" up -d "${containers[@]}"
else
docker-compose -f "${docker_compose_file}" -f "${override_file}" up -d "${containers[@]}"
fi
docker-compose $(get_compose_file_list "${docker_compose_file}" "${override_file}") up -d "${containers[@]}"
}

function archive_logs() {
local tarball_output=$(realpath "${1}")
local node_count="${2}"
local docker_compose_file="${3}"
local override_file="${4}"

local compose_file_list=$(get_compose_file_list "${docker_compose_file}" "${override_file}")

echo "Archiving all logs from ${node_count} nodes to a file ${tarball_output}..."
pushd $(mktemp -d) > /dev/null
for index in $(seq 0 "${node_count}"); do
echo "Archiving "Node${index}" logs..."
docker-compose ${compose_file_list} logs --no-color --no-log-prefix "Node${index}" > "Node${index}.log"
done
tar -czf "${tarball_output}" Node*
popd > /dev/null
echo "Done"
}

# --------------------------------- main script --------------------------------------------
Expand All @@ -119,6 +153,11 @@ script_dir=$(dirname "${script_path}")
aleph_node_root_dir=$(realpath "${script_dir}/../..")
pushd "${aleph_node_root_dir}" > /dev/null

if [[ -n "${LOGS_OUTPUT_FILE}" ]]; then
archive_logs "${LOGS_OUTPUT_FILE}" "${NODE_COUNT}" "${DOCKER_COMPOSE}" "${OVERRIDE_DOCKER_COMPOSE}"
exit 0
fi

if docker inspect ${NODE_IMAGE} > /dev/null; then
echo "aleph-node image tag ${NODE_IMAGE} found locally"
else
Expand Down
22 changes: 18 additions & 4 deletions .github/scripts/run_e2e_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,21 @@ fi
if [[ -n "${OUT_LATENCY:-}" ]]; then
ARGS+=(-e OUT_LATENCY)
fi

docker run -v "$(pwd)/contracts:/contracts" -v "$(pwd)/docker/data:/data" "${ARGS[@]}" aleph-e2e-client:latest

exit $?
timeout_duration="15m"
echo "Running test, logs will be shown when tests finishes or after ${timeout_duration} timeout."
# a hack to set global timeout on a e2e testcase run
# we can't do that on GH yaml level due to https://github.com/actions/runner/issues/1979
docker_service=$(docker run -v "$(pwd)/contracts:/contracts" -v "$(pwd)/docker/data:/data" -d "${ARGS[@]}" \
aleph-e2e-client:latest)
set +e
timeout_output=$(timeout "${timeout_duration}" docker wait "${docker_service}")
docker_exit_code=$?
# timeout returns 124 exit code if command times out
# otherwise, docker wait finishes and it prints docker service exit code on stdout
if [[ "${docker_exit_code}" != 124 ]]; then
docker_exit_code="${timeout_output}"
fi
echo "Test exited with exit code ${docker_exit_code}"
echo "Logs from test:"
docker logs "${docker_service}"
exit "${docker_exit_code}"
Loading

0 comments on commit 94b811e

Please sign in to comment.