diff --git a/lisa/sut_orchestrator/baremetal/cluster/cluster.py b/lisa/sut_orchestrator/baremetal/cluster/cluster.py index c8fd607e7c..ecfe2cbe9d 100644 --- a/lisa/sut_orchestrator/baremetal/cluster/cluster.py +++ b/lisa/sut_orchestrator/baremetal/cluster/cluster.py @@ -8,7 +8,7 @@ from lisa.util import InitializableMixin, subclasses from lisa.util.logger import get_logger -from ..schema import ClusterSchema +from ..schema import ClientCapabilities, ClientSchema, ClusterSchema class Cluster(subclasses.BaseClassWithRunbookMixin, InitializableMixin): @@ -35,3 +35,9 @@ def get_serial_console(self) -> Type[features.SerialConsole]: def get_start_stop(self) -> Type[features.StartStop]: raise NotImplementedError() + + def get_client_capabilities(self, client: ClientSchema) -> ClientCapabilities: + raise NotImplementedError() + + def cleanup(self) -> None: + raise NotImplementedError() diff --git a/lisa/sut_orchestrator/baremetal/cluster/idrac.py b/lisa/sut_orchestrator/baremetal/cluster/idrac.py index 10c794ee34..c3a64f79fc 100644 --- a/lisa/sut_orchestrator/baremetal/cluster/idrac.py +++ b/lisa/sut_orchestrator/baremetal/cluster/idrac.py @@ -1,21 +1,23 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +import base64 import time import xml.etree.ElementTree as ETree -from typing import Any, Type +from pathlib import Path +from typing import Any, Optional, Type import redfish # type: ignore from assertpy import assert_that from lisa import features, schema from lisa.environment import Environment -from lisa.util import LisaException +from lisa.util import LisaException, check_till_timeout from lisa.util.logger import get_logger from lisa.util.perf_timer import create_timer from ..platform_ import BareMetalPlatform -from ..schema import ClusterSchema, IdracSchema +from ..schema import ClientCapabilities, ClientSchema, ClusterSchema, IdracSchema from .cluster import Cluster @@ -38,27 +40,52 @@ def _stop( "baremetal orchestrator does not support hibernate stop" ) self._login() - if self.cluster.get_power_state() == "Off": - self._log.debug("System is already off.") - return self.cluster.reset("GracefulShutdown") self._logout() def _start(self, wait: bool = True) -> None: self._login() - if self.cluster.get_power_state() == "On": - self._log.debug("System is already powered on.") - return self.cluster.reset("On") self._logout() def _restart(self, wait: bool = True) -> None: self._login() - self.cluster.reset("ForceRestart") + self.cluster.reset("ForceRestart", force_run=True) self._logout() +class IdracSerialConsole(features.SerialConsole): + def _login(self) -> None: + platform: BareMetalPlatform = self._platform # type: ignore + self.cluster: Idrac = platform.cluster # type: ignore + self.cluster.login() + + def _logout(self) -> None: + platform: BareMetalPlatform = self._platform # type: ignore + self.cluster = platform.cluster # type: ignore + self.cluster.logout() + + def _get_console_log(self, saved_path: Optional[Path]) -> bytes: + self._login() + if saved_path: + screenshot_file_name: str = "serial_console" + decoded_data = base64.b64decode(self.cluster.get_server_screen_shot()) + screenshot_raw_name = saved_path / f"{screenshot_file_name}.png" + with open(screenshot_raw_name, "wb") as img_file: + img_file.write(decoded_data) + console_log = self.cluster.get_serial_console_log().encode("utf-8") + self._logout() + return console_log + + class Idrac(Cluster): + state_dict = { + "GracefulShutdown": "Off", + "ForceRestart": "On", + "On": "On", + "ForceOff": "Off", + } + def __init__(self, runbook: ClusterSchema) -> None: super().__init__(runbook) self.idrac_runbook: IdracSchema = self.runbook @@ -68,6 +95,7 @@ def __init__(self, runbook: ClusterSchema) -> None: ).is_equal_to(1) self.client = self.idrac_runbook.client[0] + self._enable_serial_console() @classmethod def type_name(cls) -> str: @@ -80,26 +108,80 @@ def type_schema(cls) -> Type[schema.TypedSchema]: def get_start_stop(self) -> Type[features.StartStop]: return IdracStartStop + def get_serial_console(self) -> Type[features.SerialConsole]: + return IdracSerialConsole + def deploy(self, environment: Environment) -> Any: self.login() self._eject_virtual_media() - self._change_boot_order_once("VCD-DVD") assert self.client.iso_http_url, "iso_http_url is required for idrac client" - if self.get_power_state() == "Off": - self._log.debug("System is already off.") - else: - self.reset("GracefulShutdown") + self._change_boot_order_once("VCD-DVD") + self.reset("ForceOff") self._insert_virtual_media(self.client.iso_http_url) - self.reset("On") + self.reset("On", force_run=True) + self.logout() + + def cleanup(self) -> None: + self.login() + self._clear_serial_console_log() self.logout() - def reset(self, operation: str) -> None: + def get_client_capabilities(self, client: ClientSchema) -> ClientCapabilities: + if client.capabilities: + return client.capabilities + self.login() + response = self.redfish_instance.get( + "/redfish/v1/Systems/System.Embedded.1/", + ) + cluster_capabilities = ClientCapabilities() + cluster_capabilities.core_count = int( + response.dict["ProcessorSummary"]["LogicalProcessorCount"] + ) + cluster_capabilities.free_memory_mb = ( + int(response.dict["MemorySummary"]["TotalSystemMemoryGiB"]) * 1024 + ) + self.logout() + return cluster_capabilities + + def get_serial_console_log(self) -> str: + response = self.redfish_instance.post( + "/redfish/v1/Managers/iDRAC.Embedded.1/SerialInterfaces" + "/Serial.1/Actions/Oem/DellSerialInterface.SerialDataExport", + body={}, + ) + check_till_timeout( + lambda: int(response.status) == 200, + timeout_message="wait for response status 200", + ) + return str(response.text) + + def get_server_screen_shot(self, file_type: str = "ServerScreenShot") -> str: + response = self.redfish_instance.post( + "/redfish/v1/Dell/Managers/iDRAC.Embedded.1/DellLCService/Actions/" + "DellLCService.ExportServerScreenShot", + body={"FileType": file_type}, + ) + self._wait_for_completion(response) + return str(response.dict["ServerScreenShotFile"]) + + def reset(self, operation: str, force_run: bool = False) -> None: + if operation in self.state_dict.keys(): + expected_state = self.state_dict[operation] + if not force_run and self.get_power_state() == expected_state: + self._log.debug(f"System is already in {expected_state} state.") + return + body = {"ResetType": operation} response = self.redfish_instance.post( "/redfish/v1/Systems/System.Embedded.1/Actions/ComputerSystem.Reset", body=body, ) self._wait_for_completion(response) + if operation in self.state_dict.keys(): + check_till_timeout( + lambda: self.get_power_state() == expected_state, + timeout_message=(f"wait for client into '{expected_state}' state"), + ) self._log.debug(f"{operation} initiated successfully.") def get_power_state(self) -> str: @@ -184,3 +266,35 @@ def _change_boot_order_once(self, boot_from: str) -> None: self._log.debug("Waiting for boot order override task to complete...") self._wait_for_completion(response) self._log.debug(f"Updating boot source to {boot_from} completed") + + def _enable_serial_console(self) -> None: + self.login() + response = self.redfish_instance.get( + "/redfish/v1/Managers/iDRAC.Embedded.1/Attributes" + ) + if response.dict["Attributes"]["SerialCapture.1.Enable"] == "Disabled": + response = self.redfish_instance.patch( + "/redfish/v1/Managers/iDRAC.Embedded.1/Attributes", + body={"Attributes": {"SerialCapture.1.Enable": "Enabled"}}, + ) + response = self.redfish_instance.get( + "/redfish/v1/Managers/iDRAC.Embedded.1/Attributes" + ) + if response.dict["Attributes"]["SerialCapture.1.Enable"] == "Enabled": + self._log.debug("Serial console enabled successfully.") + else: + raise LisaException("Failed to enable serial console.") + self.logout() + + def _clear_serial_console_log(self) -> None: + response = self.redfish_instance.get( + "/redfish/v1/Managers/iDRAC.Embedded.1/Attributes" + ) + if response.dict["Attributes"]["SerialCapture.1.Enable"] == "Disabled": + self._log.debug("Serial console is already disabled. No need to clear log.") + response = self.redfish_instance.post( + "/redfish/v1/Managers/iDRAC.Embedded.1/SerialInterfaces" + "/Serial.1/Actions/Oem/DellSerialInterface.SerialDataClear", + body={}, + ) + self._wait_for_completion(response) diff --git a/lisa/sut_orchestrator/baremetal/cluster/rackmanager.py b/lisa/sut_orchestrator/baremetal/cluster/rackmanager.py index 7915d4bc61..0889e19777 100644 --- a/lisa/sut_orchestrator/baremetal/cluster/rackmanager.py +++ b/lisa/sut_orchestrator/baremetal/cluster/rackmanager.py @@ -1,6 +1,5 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. - from typing import Any, Type from lisa import features, schema @@ -9,7 +8,7 @@ from lisa.util.logger import get_logger from ..platform_ import BareMetalPlatform -from ..schema import RackManagerSchema +from ..schema import ClientCapabilities, ClientSchema, RackManagerSchema from .cluster import Cluster @@ -54,19 +53,34 @@ def type_schema(cls) -> Type[schema.TypedSchema]: def get_start_stop(self) -> Type[features.StartStop]: return RackManagerStartStop + def connect_to_rack_manager(self) -> None: + assert self.rm_runbook.connection, "connection is required for rackmanager" + self.rm_runbook.connection.name = "rackmanager" + self.rm_node = quick_connect( + self.rm_runbook.connection, logger_name="rackmanager" + ) + def deploy(self, environment: Environment) -> Any: self.reset("off") self.reset("on") def reset(self, operation: str) -> None: - assert self.rm_runbook.connection, "connection is required for rackmanager" - self.rm_runbook.connection.name = "rackmanager" - rm_node = quick_connect(self.rm_runbook.connection, logger_name="rackmanager") + self.connect_to_rack_manager() assert self.rm_runbook.client, "client is required for rackmanager" for client in self.rm_runbook.client: assert ( client.management_port ), "management_port is required for rackmanager client" - rm_node.execute(f"set system {operation} -i {client.management_port}") - + self.rm_node.execute(f"set system {operation} -i {client.management_port}") self._log.debug(f"client has been {operation} successfully") + + def get_client_capabilities(self, client: ClientSchema) -> ClientCapabilities: + if client.capabilities: + return client.capabilities + cluster_capabilities = ClientCapabilities() + cluster_capabilities.core_count = 0 + cluster_capabilities.free_memory_mb = 0 + return cluster_capabilities + + def cleanup(self) -> None: + pass diff --git a/lisa/sut_orchestrator/baremetal/features.py b/lisa/sut_orchestrator/baremetal/features.py index f676b4c4ba..f157c1f66a 100644 --- a/lisa/sut_orchestrator/baremetal/features.py +++ b/lisa/sut_orchestrator/baremetal/features.py @@ -25,6 +25,7 @@ def _initialize(self, *args: Any, **kwargs: Any) -> None: *args, **kwargs, ) + self._inner.initialize() class StartStop(ClusterFeature): diff --git a/lisa/sut_orchestrator/baremetal/platform_.py b/lisa/sut_orchestrator/baremetal/platform_.py index 6a50b53845..325a99cba2 100644 --- a/lisa/sut_orchestrator/baremetal/platform_.py +++ b/lisa/sut_orchestrator/baremetal/platform_.py @@ -20,7 +20,7 @@ from .ip_getter import IpGetterChecker from .key_loader import KeyLoader from .readychecker import ReadyChecker -from .schema import BareMetalPlatformSchema, BuildSchema +from .schema import BareMetalPlatformSchema, BuildSchema, ClientCapabilities from .source import Source @@ -52,18 +52,20 @@ def _initialize(self, *args: Any, **kwargs: Any) -> None: self.key_loader_factory = Factory[KeyLoader](KeyLoader) self.source_factory = Factory[Source](Source) self.build_factory = Factory[Build](Build) - - def _prepare_environment(self, environment: Environment, log: Logger) -> bool: - return self._configure_node_capabilities(environment, log) - - def _deploy_environment(self, environment: Environment, log: Logger) -> None: # currently only support one cluster assert self._baremetal_runbook.cluster, "no cluster is specified in the runbook" - cluster_instance = self._baremetal_runbook.cluster[0] + self._cluster_runbook = self._baremetal_runbook.cluster[0] + self.cluster = self.cluster_factory.create_by_runbook(self._cluster_runbook) - self.cluster = self.cluster_factory.create_by_runbook(cluster_instance) + def _prepare_environment(self, environment: Environment, log: Logger) -> bool: assert self.cluster.runbook.client, "no client is specified in the runbook" + client_capabilities = self.cluster.get_client_capabilities( + self.cluster.runbook.client[0] + ) + return self._configure_node_capabilities(environment, log, client_capabilities) + + def _deploy_environment(self, environment: Environment, log: Logger) -> None: # copy build (shared, check if it's copied) if self._baremetal_runbook.source: if not self.local_artifacts_path: @@ -83,9 +85,9 @@ def _deploy_environment(self, environment: Environment, log: Logger) -> None: ready_checker: Optional[ReadyChecker] = None # ready checker cleanup - if cluster_instance.ready_checker: + if self._cluster_runbook.ready_checker: ready_checker = self.ready_checker_factory.create_by_runbook( - cluster_instance.ready_checker + self._cluster_runbook.ready_checker ) ready_checker.clean_up() @@ -143,9 +145,9 @@ def _deploy_environment(self, environment: Environment, log: Logger) -> None: # deploy cluster self.cluster.deploy(environment) - if cluster_instance.ready_checker: + if self._cluster_runbook.ready_checker: ready_checker = self.ready_checker_factory.create_by_runbook( - cluster_instance.ready_checker + self._cluster_runbook.ready_checker ) for index, node in enumerate(environment.nodes.list()): @@ -156,9 +158,9 @@ def _deploy_environment(self, environment: Environment, log: Logger) -> None: ready_checker.is_ready(node) # get ip address - if cluster_instance.ip_getter: + if self._cluster_runbook.ip_getter: ip_getter = self.ip_getter_factory.create_by_runbook( - cluster_instance.ip_getter + self._cluster_runbook.ip_getter ) node_context.connection.address = ip_getter.get_ip() @@ -185,12 +187,15 @@ def copy(self, build_schema: BuildSchema, sources_path: List[Path]) -> None: self._log.debug("no copied source path specified, skip copy") def _configure_node_capabilities( - self, environment: Environment, log: Logger + self, + environment: Environment, + log: Logger, + cluster_capabilities: ClientCapabilities, ) -> bool: if not environment.runbook.nodes_requirement: return True - nodes_capabilities = self._create_node_capabilities() + nodes_capabilities = self._create_node_capabilities(cluster_capabilities) nodes_requirement = [] for node_space in environment.runbook.nodes_requirement: @@ -203,11 +208,16 @@ def _configure_node_capabilities( environment.runbook.nodes_requirement = nodes_requirement return True - def _create_node_capabilities(self) -> schema.NodeSpace: + def _create_node_capabilities( + self, cluster_capabilities: ClientCapabilities + ) -> schema.NodeSpace: node_capabilities = schema.NodeSpace() node_capabilities.name = "baremetal" node_capabilities.node_count = 1 - node_capabilities.core_count = search_space.IntRange(min=1, max=1) + node_capabilities.core_count = search_space.IntRange( + min=1, max=cluster_capabilities.core_count + ) + node_capabilities.memory_mb = cluster_capabilities.free_memory_mb node_capabilities.disk = schema.DiskOptionSettings( data_disk_count=search_space.IntRange(min=0), data_disk_size=search_space.IntRange(min=1), @@ -231,3 +241,6 @@ def _create_node_capabilities(self) -> schema.NodeSpace: ) return node_capabilities + + def _cleanup(self) -> None: + self.cluster.cleanup() diff --git a/lisa/sut_orchestrator/baremetal/schema.py b/lisa/sut_orchestrator/baremetal/schema.py index 5840fcc85c..401090f6f2 100644 --- a/lisa/sut_orchestrator/baremetal/schema.py +++ b/lisa/sut_orchestrator/baremetal/schema.py @@ -10,12 +10,20 @@ from lisa.util import field_metadata +@dataclass_json() +@dataclass +class ClientCapabilities: + core_count: int = field(default=-1) + free_memory_mb: int = field(default=-1) + + @dataclass_json() @dataclass class ClientSchema: connection: Optional[schema.RemoteNode] = field( default=None, metadata=field_metadata(required=True) ) + capabilities: Optional[ClientCapabilities] = None @dataclass_json()