From 4d1453f9aead1dbe5300032323a4c34aaee2cca6 Mon Sep 17 00:00:00 2001 From: Jacob Callahan Date: Mon, 13 Jan 2025 15:17:49 -0500 Subject: [PATCH] Make AnsibleTower checkout/execute actions more resilient We've been seeing some issues with service interruptions in AAP under high load. While the jobs do complete successfully, awxkit bails when encountering the connection issue. With this change, we simply enter a retry loop when monitoring job status. --- broker/providers/ansible_tower.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/broker/providers/ansible_tower.py b/broker/providers/ansible_tower.py index 1251ff3..8b72315 100644 --- a/broker/providers/ansible_tower.py +++ b/broker/providers/ansible_tower.py @@ -8,6 +8,7 @@ import click from dynaconf import Validator from logzero import logger +from requests.exceptions import ConnectionError from broker import exceptions from broker.helpers import eval_filter, find_origin, yaml @@ -41,6 +42,19 @@ def convert_pseudonamespaces(attr_dict): return out_dict +def resilient_job_wait(job, timeout=None): + """Wait for a job to complete. 
Retry on errors.""" + timeout = timeout or settings.ANSIBLETOWER.workflow_timeout + completed = False + while not completed: + try: + job.wait_until_completed(timeout=timeout) + completed = True + except ConnectionError as err: + logger.error(f"Error occurred while waiting for job: {err}") + logger.info("Retrying job wait...") + + class JobExecutionError(exceptions.ProviderError): """Raised when a job execution fails.""" @@ -605,7 +619,7 @@ def execute(self, **kwargs): # noqa: PLR0912,PLR0915 - Possible TODO refactor job_ui_url = url_parser.urljoin(self.url, f"/#/{subject}s/{job_number}") helpers.emit(api_url=job_api_url, ui_url=job_ui_url) logger.info(f"Waiting for job: \nAPI: {job_api_url}\nUI: {job_ui_url}") - job.wait_until_completed(timeout=settings.ANSIBLETOWER.workflow_timeout) + resilient_job_wait(job) if job.status != "successful": message_data = { f"{subject.capitalize()} Status": job.status,