Skip to content

Commit

Permalink
[CLOUDOPS-548] Allow retry on openstack HttpException
Browse files Browse the repository at this point in the history
  • Loading branch information
ricolin committed Nov 1, 2024
1 parent 6a68711 commit a8b9fa0
Show file tree
Hide file tree
Showing 7 changed files with 353 additions and 18 deletions.
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,4 @@ parse
tooz # Apache-2.0
sherlock>=0.4.1 # MIT
kubernetes # Apache-2.0
# email
# smtplib
tenacity
78 changes: 78 additions & 0 deletions staffeln/common/openstack.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,33 @@
from openstack import exceptions, proxy
from oslo_log import log
from staffeln.common import auth
from staffeln import conf
from staffeln.i18n import _

import tenacity

CONF = conf.CONF
LOG = log.getLogger(__name__)

class RetryHTTPError(tenacity.retry_if_exception):
    """Tenacity retry condition matching openstack ``HttpException``.

    An exception triggers a retry when it is an instance of
    ``exceptions.HttpException`` and its ``status_code`` is anything other
    than 404. Any other exception is not matched by this condition.
    """

    def __init__(self):
        def is_http_error(exception):
            # Only HttpException instances are candidates for a retry.
            if not isinstance(exception, exceptions.HttpException):
                return False
            # 404 is excluded on purpose: "not found" could be an expected
            # status, so it is handed straight back to the caller.
            if exception.status_code == 404:
                return False
            LOG.debug(f"Getting HttpException {exception} (status "
                      f"code: {exception.status_code}), "
                      "retry till timeout...")
            return True

        super().__init__(predicate=is_http_error)


class OpenstackSDK:
def __init__(self):
Expand All @@ -23,6 +46,11 @@ def set_project(self, project):
self.conn = self.conn_list[project_id]

# user
@tenacity.retry(
retry=RetryHTTPError(),
wait=tenacity.wait_exponential(max=30),
reraise=True,
stop=tenacity.stop_after_delay(CONF.conductor.retry_timeout))
def get_user_id(self):
user_name = self.conn.config.auth["username"]
if "user_domain_id" in self.conn.config.auth:
Expand All @@ -35,15 +63,30 @@ def get_user_id(self):
user = self.conn.get_user(name_or_id=user_name)
return user.id

@tenacity.retry(
    retry=RetryHTTPError(),
    wait=tenacity.wait_exponential(max=30),
    reraise=True,
    stop=tenacity.stop_after_delay(CONF.conductor.retry_timeout),
)
def get_role_assignments(self, project_id, user_id=None):
    """Return the role assignments of a project.

    The lookup is always filtered by ``project_id``; when ``user_id`` is
    truthy it is additionally filtered by that user. The call is wrapped
    in a tenacity retry using ``RetryHTTPError`` as the retry condition.

    NOTE(review): the decorator arguments are evaluated when this
    definition is executed, so ``CONF.conductor.retry_timeout`` is read at
    that moment -- confirm the configuration is loaded by then.
    """
    query = {"project": project_id}
    if user_id:
        query.update(user=user_id)
    return self.conn.list_role_assignments(filters=query)

@tenacity.retry(
    retry=RetryHTTPError(),
    wait=tenacity.wait_exponential(max=30),
    reraise=True,
    stop=tenacity.stop_after_delay(CONF.conductor.retry_timeout),
)
def get_user(self, user_id):
    """Look up a user through the current connection.

    ``user_id`` is forwarded as the ``name_or_id`` argument of
    ``self.conn.get_user`` and that call's result is returned unchanged.
    The call is wrapped in a tenacity retry using ``RetryHTTPError`` as
    the retry condition.
    """
    user = self.conn.get_user(name_or_id=user_id)
    return user

@tenacity.retry(
retry=RetryHTTPError(),
wait=tenacity.wait_exponential(max=30),
reraise=True,
stop=tenacity.stop_after_delay(CONF.conductor.retry_timeout))
def get_project_member_emails(self, project_id):
members = self.get_role_assignments(project_id)
emails = []
Expand All @@ -60,9 +103,19 @@ def get_project_member_emails(self, project_id):
emails.append(user.email)
return emails

@tenacity.retry(
    retry=RetryHTTPError(),
    wait=tenacity.wait_exponential(max=30),
    reraise=True,
    stop=tenacity.stop_after_delay(CONF.conductor.retry_timeout),
)
def get_projects(self):
    """Return whatever ``self.conn.list_projects()`` yields.

    The call is wrapped in a tenacity retry using ``RetryHTTPError`` as
    the retry condition.
    """
    projects = self.conn.list_projects()
    return projects

@tenacity.retry(
retry=RetryHTTPError(),
wait=tenacity.wait_exponential(max=30),
reraise=True,
stop=tenacity.stop_after_delay(CONF.conductor.retry_timeout))
def get_servers(self, project_id=None, all_projects=True, details=True):
if project_id is not None:
return self.conn.compute.servers(
Expand All @@ -71,9 +124,19 @@ def get_servers(self, project_id=None, all_projects=True, details=True):
else:
return self.conn.compute.servers(details=details, all_projects=all_projects)

@tenacity.retry(
    retry=RetryHTTPError(),
    wait=tenacity.wait_exponential(max=30),
    reraise=True,
    stop=tenacity.stop_after_delay(CONF.conductor.retry_timeout),
)
def get_volume(self, uuid, project_id):
    """Fetch a volume by its id through the current connection.

    Only ``uuid`` is used for the lookup; ``project_id`` is accepted but
    not read by this method. The call is wrapped in a tenacity retry using
    ``RetryHTTPError`` as the retry condition.
    """
    volume = self.conn.get_volume_by_id(uuid)
    return volume

@tenacity.retry(
retry=RetryHTTPError(),
wait=tenacity.wait_exponential(max=30),
reraise=True,
stop=tenacity.stop_after_delay(CONF.conductor.retry_timeout))
def get_backup(self, uuid, project_id=None):
# return conn.block_storage.get_backup(
# project_id=project_id, backup_id=uuid,
Expand Down Expand Up @@ -104,6 +167,11 @@ def create_backup(
incremental=incremental,
)

@tenacity.retry(
retry=RetryHTTPError(),
wait=tenacity.wait_exponential(max=30),
reraise=True,
stop=tenacity.stop_after_delay(CONF.conductor.retry_timeout))
def delete_backup(self, uuid, project_id=None, force=False):
# Note(Alex): v3 is not supporting force delete?
# conn.block_storage.delete_backup(
Expand All @@ -116,11 +184,21 @@ def delete_backup(self, uuid, project_id=None, force=False):
except exceptions.ResourceNotFound:
return None

@tenacity.retry(
    retry=RetryHTTPError(),
    wait=tenacity.wait_exponential(max=30),
    reraise=True,
    stop=tenacity.stop_after_delay(CONF.conductor.retry_timeout),
)
def get_backup_quota(self, project_id):
    """Return the ``backups`` attribute of the project's volume quotas.

    The quotas are obtained from this class's ``_get_volume_quotas``
    helper rather than from ``conn.get_volume_quotas``. The call is
    wrapped in a tenacity retry using ``RetryHTTPError`` as the retry
    condition.
    """
    return self._get_volume_quotas(project_id).backups

@tenacity.retry(
retry=RetryHTTPError(),
wait=tenacity.wait_exponential(max=30),
reraise=True,
stop=tenacity.stop_after_delay(CONF.conductor.retry_timeout))
def get_backup_gigabytes_quota(self, project_id):
# quota = conn.get_volume_quotas(project_id)
quota = self._get_volume_quotas(project_id)
Expand Down
22 changes: 9 additions & 13 deletions staffeln/conductor/backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,12 +432,8 @@ def check_instance_volumes(self):
try:
servers = self.openstacksdk.get_servers(project_id=project.id)
except OpenstackHttpException as ex:
LOG.warn(
_(
"Failed to list servers in project %s. %s"
% (project.id, str(ex))
)
)
LOG.warn(f"Failed to list servers in project {project.id}. "
f"{str(ex)} (status code: {ex.status_code}).")
continue
for server in servers:
if not self.filter_by_server_metadata(server.metadata):
Expand Down Expand Up @@ -501,8 +497,10 @@ def collect_instance_retention_map(self):

try:
servers = self.openstacksdk.get_servers(all_projects=True)
except OpenstackHttpException:
LOG.warn(_("Failed to list servers for all projects."))
except OpenstackHttpException as ex:
servers = []
LOG.warn(f"Failed to list servers for all projects. "
f"{str(ex)} (status code: {ex.status_code}).")

for server in servers:
if CONF.conductor.retention_metadata_key in server.metadata:
Expand Down Expand Up @@ -647,18 +645,16 @@ def process_pre_failed_backup(self, task):

def process_failed_backup(self, task):
# 1. notify via email
reason = _("The status of backup for the volume %s is error." % task.volume_id)
reason = f"The status of backup for the volume {task.volume_id} is error."
LOG.warn(reason)
# 2. delete backup generator
try:
self.openstacksdk.delete_backup(uuid=task.backup_id)
self.create_failed_backup_obj(task)
except OpenstackHttpException as ex:
LOG.warn(
_(
"Failed to delete volume backup %s. %s. Need to delete manually."
% (task.backup_id, str(ex))
)
f"Failed to delete volume backup {task.backup_id}. {ex} "
f"(status code: {ex.status_code}). Need to delete manually."
)
task.reason = reason
task.backup_status = constants.BACKUP_FAILED
Expand Down
6 changes: 6 additions & 0 deletions staffeln/conf/conductor.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@
min=0,
help=_("Number of incremental backups between full backups."),
),
cfg.IntOpt(
"retry_timeout",
default=300,
min=1,
help=_("The timeout for retry, the unit is one second."),
),
]

rotation_opts = [
Expand Down
Empty file.
Loading

0 comments on commit a8b9fa0

Please sign in to comment.