diff --git a/reconcile/cli.py b/reconcile/cli.py
index f3e88b5100..7c4be90362 100644
--- a/reconcile/cli.py
+++ b/reconcile/cli.py
@@ -2394,11 +2394,14 @@ def ocm_addons_upgrade_tests_trigger(ctx):
 
 
 @integration.command(short_help="Manage Machine Pools in OCM.")
+@gitlab_project_id
 @click.pass_context
-def ocm_machine_pools(ctx):
+def ocm_machine_pools(ctx, gitlab_project_id):
     import reconcile.ocm_machine_pools
 
-    run_integration(reconcile.ocm_machine_pools, ctx.obj)
+    run_integration(
+        reconcile.ocm_machine_pools, ctx.obj, gitlab_project_id=gitlab_project_id
+    )
 
 
 @integration.command(short_help="Manage Upgrade Policy schedules in OCM organizations.")
diff --git a/reconcile/ocm_machine_pools.py b/reconcile/ocm_machine_pools.py
index dfdb05a1a9..6b499e3705 100644
--- a/reconcile/ocm_machine_pools.py
+++ b/reconcile/ocm_machine_pools.py
@@ -16,7 +16,7 @@
     root_validator,
 )
 
-from reconcile import queries
+from reconcile import mr_client_gateway, queries
 from reconcile.gql_definitions.common.clusters import (
     ClusterMachinePoolV1,
     ClusterSpecAutoScaleV1,
@@ -25,6 +25,7 @@
 from reconcile.typed_queries.clusters import get_clusters
 from reconcile.utils.differ import diff_mappings
 from reconcile.utils.disabled_integrations import integration_is_enabled
+from reconcile.utils.mr.cluster_machine_pool_updates import ClustersMachinePoolUpdates
 from reconcile.utils.ocm import (
     DEFAULT_OCM_MACHINE_POOL_ID,
     OCM,
@@ -134,18 +135,32 @@ def has_diff(self, pool: ClusterMachinePoolV1) -> bool:
     def invalid_diff(self, pool: ClusterMachinePoolV1) -> Optional[str]:
         pass
 
+    def get_pool_spec_to_update(
+        self, pool: ClusterMachinePoolV1
+    ) -> Optional[ClusterMachinePoolV1]:
+        """
+        For situations where OCM holds the source of truth for parts of the spec,
+        this method detects a drift in the app-interface spec and returns a
+        new pool spec to be updated in app-interface.
+
+        If no drift is detected, None is returned.
+        """
+        return None
+
     @abstractmethod
     def deletable(self) -> bool:
         pass
 
-    def _has_diff_autoscale(self, pool):
+    def _has_diff_autoscale(self, pool: ClusterMachinePoolV1) -> bool:
         match (self.autoscaling, pool.autoscale):
             case (None, None):
                 return False
             case (None, _) | (_, None):
                 return True
-            case _:
+            case _ if self.autoscaling and pool.autoscale:
                 return self.autoscaling.has_diff(pool.autoscale)
+            case _:
+                return False
 
 
 class MachinePool(AbstractPool):
@@ -232,7 +247,7 @@ class NodePool(AbstractPool):
     subnet: Optional[str]
 
     def delete(self, ocm: OCM) -> None:
-        ocm.delete_node_pool(self.cluster, self.dict(by_alias=True))
+        ocm.delete_node_pool(self.cluster, self.id)
 
     def create(self, ocm: OCM) -> None:
         spec = self.dict(by_alias=True)
@@ -262,7 +277,11 @@ def has_diff(self, pool: ClusterMachinePoolV1) -> bool:
             or self.taints != pool.taints
             or self.labels != pool.labels
             or self.aws_node_pool.instance_type != pool.instance_type
-            or self.subnet != pool.subnet
+            # if the nodepool in app-interface does not define a subnet explicitly,
+            # we don't consider it a diff. we don't manage it, but `get_pool_spec_to_update`
+            # will highlight it as a spec drift that is going to be fixed with an MR
+            # towards app-interface
+            or (pool.subnet is not None and self.subnet != pool.subnet)
             or self._has_diff_autoscale(pool)
         )
 
@@ -273,6 +292,27 @@ def invalid_diff(self, pool: ClusterMachinePoolV1) -> Optional[str]:
             return "subnet"
         return None
 
+    def get_pool_spec_to_update(
+        self, pool: ClusterMachinePoolV1
+    ) -> Optional[ClusterMachinePoolV1]:
+        """
+        If app-interface does not define a subnet explicitly, or if its
+        subnet differs from the one in OCM, we consider the spec in app-interface outdated.
+
+        In such a case this method returns a clone of the current pool definition
+        in app-interface with the subnet updated to the one in OCM. If no update
+        is required, None is returned.
+
+        How can this happen?
+        * a cluster was created without specified subnets and the ROSA CLI picked the subnet
+          assignment for the nodepools
+        * nodepools have been recreated on the OCM side without app-interface involvement
+          and the subnet changed
+        """
+        if pool.subnet is None or self.subnet != pool.subnet:
+            return pool.copy(update={"subnet": self.subnet})
+        return None
+
     def deletable(self) -> bool:
         return True
 
@@ -327,6 +367,7 @@ def act(self, dry_run: bool, ocm: OCM) -> None:
 
 
 class DesiredMachinePool(BaseModel):
+    cluster_path: str
     cluster_name: str
     cluster_type: ClusterType
     pools: list[ClusterMachinePoolV1]
@@ -354,6 +395,7 @@ def fetch_current_state(
     return {
         c.name: fetch_current_state_for_cluster(c, ocm_map.get(c.name))
         for c in clusters
+        if c.spec and c.spec.q_id
     }
 
 
@@ -373,7 +415,7 @@ def _classify_cluster_type(cluster: ClusterV1) -> ClusterType:
     raise ValueError(f"unknown cluster type for cluster {cluster.name}")
 
 
-def fetch_current_state_for_cluster(cluster, ocm):
+def fetch_current_state_for_cluster(cluster: ClusterV1, ocm: OCM) -> list[AbstractPool]:
     cluster_type = _classify_cluster_type(cluster)
     if cluster_type == ClusterType.ROSA_HCP:
         return [
@@ -423,12 +465,13 @@ def create_desired_state_from_gql(
 ) -> dict[str, DesiredMachinePool]:
     return {
         cluster.name: DesiredMachinePool(
+            cluster_path=cluster.path,
             cluster_name=cluster.name,
             cluster_type=_classify_cluster_type(cluster),
             pools=cluster.machine_pools,
         )
         for cluster in clusters
-        if cluster.machine_pools is not None
+        if cluster.machine_pools is not None and cluster.spec and cluster.spec.q_id
     }
 
 
@@ -504,7 +547,34 @@ def calculate_diff(
     return diffs, errors
 
 
-def act(dry_run: bool, diffs: Iterable[PoolHandler], ocm_map: OCMMap) -> None:
+def _drift_updates(
+    current_pools: list[AbstractPool],
+    desired: DesiredMachinePool,
+) -> list[ClusterMachinePoolV1]:
+    current_machine_pools = {pool.id: pool for pool in current_pools}
+    return [
+        spec
+        for pool in desired.pools
+        if (current_pool := current_machine_pools.get(pool.q_id))
+        and (spec := current_pool.get_pool_spec_to_update(pool)) is not None
+    ]
+
+
+def calculate_spec_drift(
+    current_state: Mapping[str, list[AbstractPool]],
+    desired_state: Mapping[str, DesiredMachinePool],
+) -> dict[str, list[ClusterMachinePoolV1]]:
+    """
+    Finds spec drifts between OCM and app-interface and returns them grouped by cluster path.
+    """
+    return {
+        desired_state[k].cluster_path: updates
+        for k in current_state.keys() & desired_state.keys()
+        if (updates := _drift_updates(current_state[k], desired_state[k]))
+    }
+
+
+def update_ocm(dry_run: bool, diffs: Iterable[PoolHandler], ocm_map: OCMMap) -> None:
     for diff in diffs:
         logging.info([diff.action, diff.pool.cluster, diff.pool.id])
         if not dry_run:
@@ -512,11 +582,35 @@ def act(dry_run: bool, diffs: Iterable[PoolHandler], ocm_map: OCMMap) -> None:
             diff.act(dry_run, ocm)
 
 
+def update_app_interface(
+    dry_run: bool,
+    gitlab_project_id: Optional[str],
+    diffs: dict[str, list[ClusterMachinePoolV1]],
+) -> None:
+    if not diffs:
+        return
+
+    mr = ClustersMachinePoolUpdates(
+        machine_pool_updates={
+            cluster_path: [pool.dict(by_alias=True) for pool in pool_updates]
+            for cluster_path, pool_updates in diffs.items()
+        }
+    )
+    if not dry_run:
+        with mr_client_gateway.init(gitlab_project_id=gitlab_project_id) as mr_cli:
+            mr.submit(cli=mr_cli)
+
+
 def _cluster_is_compatible(cluster: ClusterV1) -> bool:
-    return cluster.ocm is not None and cluster.machine_pools is not None
+    return (
+        cluster.ocm is not None
+        and cluster.machine_pools is not None
+        and cluster.spec is not None
+        and cluster.spec.q_id is not None
+    )
 
 
-def run(dry_run: bool):
+def run(dry_run: bool, gitlab_project_id: Optional[str] = None):
     clusters = get_clusters()
 
     filtered_clusters = [
@@ -540,9 +634,14 @@ def run(dry_run: bool):
 
     current_state = fetch_current_state(ocm_map, filtered_clusters)
     desired_state = create_desired_state_from_gql(filtered_clusters)
+
+    # handle diff towards OCM
     diffs, errors = calculate_diff(current_state, desired_state)
+    update_ocm(dry_run, diffs, ocm_map)
 
-    act(dry_run, diffs, ocm_map)
+    # handle diffs towards app-interface
+    spec_drift = calculate_spec_drift(current_state, desired_state)
+    update_app_interface(dry_run, gitlab_project_id, spec_drift)
 
     if errors:
         for err in errors:
diff --git a/reconcile/test/test_ocm_machine_pools.py b/reconcile/test/test_ocm_machine_pools.py
index a447003147..b5bcef97ae 100644
--- a/reconcile/test/test_ocm_machine_pools.py
+++ b/reconcile/test/test_ocm_machine_pools.py
@@ -168,6 +168,7 @@ def test_calculate_diff_create():
     }
     desired = {
         "cluster1": DesiredMachinePool(
+            cluster_path="path",
             cluster_name="cluster1",
             cluster_type=ClusterType.OSD,
             pools=[
@@ -193,6 +194,7 @@ def test_calculate_diff_noop(current_with_pool):
     desired = {
         "cluster1": DesiredMachinePool(
+            cluster_path="path",
             cluster_name="cluster1",
             cluster_type=ClusterType.OSD,
             pools=[
@@ -216,6 +218,7 @@ def test_calculate_diff_update(current_with_pool):
     desired = {
         "cluster1": DesiredMachinePool(
+            cluster_path="path",
             cluster_name="cluster1",
             cluster_type=ClusterType.OSD,
             pools=[
@@ -267,6 +270,7 @@ def current_with_2_pools() -> Mapping[str, list[AbstractPool]]:
 def test_calculate_diff_delete(current_with_2_pools):
     desired = {
         "cluster1": DesiredMachinePool(
+            cluster_path="path",
             cluster_name="cluster1",
             cluster_type=ClusterType.OSD,
             pools=[
@@ -292,6 +296,7 @@ def test_calculate_diff_delete_all_fail_validation(current_with_pool):
     desired = {
         "cluster1": DesiredMachinePool(
+            cluster_path="path",
             cluster_name="cluster1",
             cluster_type=ClusterType.OSD,
             pools=[],
@@ -340,6 +345,13 @@ def test_pool_node_pool_invalid_diff_subnet(node_pool, cluster_machine_pool):
     assert node_pool.invalid_diff(cluster_machine_pool)
 
 
+def test_pool_node_pool_valid_diff_subnet(node_pool, cluster_machine_pool):
+    cluster_machine_pool.subnet = None
+    node_pool.subnet = "foo"
+    cluster_machine_pool.replicas = 2
+    assert not node_pool.has_diff(cluster_machine_pool)
+
+
 def test_pool_node_pool_invalid_diff_instance_type(node_pool, cluster_machine_pool):
     cluster_machine_pool.instance_type = "foo"
     assert node_pool.invalid_diff(cluster_machine_pool)
@@ -463,9 +475,11 @@ def builder(machine_pools: list[dict]) -> ClusterV1:
         return gql_class_factory(
             ClusterV1,
             {
+                "path": "path",
                 "name": "ocm-cluster",
                 "auth": [],
                 "spec": {
+                    "id": "id",
                     "product": "osd",
                 },
                 "ocm": {
@@ -489,9 +503,11 @@ def builder(machine_pools: list[dict]) -> ClusterV1:
         return gql_class_factory(
             ClusterV1,
             {
+                "path": "path",
                 "name": "ocm-cluster",
                 "auth": [],
                 "spec": {
+                    "id": "id",
                     "product": "rosa",
                 },
                 "ocm": {
@@ -887,6 +903,7 @@ def builder(machine_pools: Iterable[dict]) -> ClusterV1:
         return gql_class_factory(
             ClusterV1,
             {
+                "path": "path",
                 "name": "hypershift-cluster",
                 "auth": [],
                 "ocm": {
@@ -896,6 +913,7 @@ def builder(machine_pools: Iterable[dict]) -> ClusterV1:
                     },
                 },
                 "spec": {
+                    "id": "id",
                     "product": "rosa",
                     "hypershift": True,
                 },
@@ -912,6 +930,7 @@ def default_hypershift_worker_machine_pool() -> dict:
         "id": "workers",
         "instance_type": "m5.xlarge",
         "replicas": 2,
+        "subnet": "subnet-1",
     }
 
 
@@ -932,7 +951,7 @@ def expected_node_pool_create_payload() -> dict:
         "id": "workers",
         "labels": None,
         "replicas": 2,
-        "subnet": None,
+        "subnet": "subnet-1",
         "taints": [],
     }
 
@@ -959,6 +978,7 @@ def existing_updated_hypershift_node_pools() -> list[dict]:
             "id": "workers",
             "aws_node_pool": {"instance_type": "m5.xlarge"},
             "replicas": 3,
+            "subnet": "subnet-1",
         }
     ]
 
@@ -1009,34 +1029,21 @@ def existing_multiple_hypershift_node_pools() -> list[dict]:
             "id": "workers",
             "aws_node_pool": {"instance_type": "m5.xlarge"},
             "replicas": 3,
+            "subnet": "subnet-1",
         },
         {
             "id": "new-workers",
             "aws_node_pool": {"instance_type": "m5.xlarge"},
             "replicas": 3,
+            "subnet": "subnet-2",
         },
     ]
 
 
-@pytest.fixture
-def expected_hypershift_node_pool_delete_payload() -> dict:
-    return {
-        "autoscaling": None,
-        "cluster": "hypershift-cluster",
-        "id": "new-workers",
-        "aws_node_pool": {"instance_type": "m5.xlarge"},
-        "labels": None,
-        "replicas": 3,
-        "subnet": None,
-        "taints": None,
-    }
-
-
 def test_run_delete_node_pool(
     mocker: MockerFixture,
     hypershift_cluster: ClusterV1,
     existing_multiple_hypershift_node_pools: list[dict],
-    expected_hypershift_node_pool_delete_payload: dict,
 ) -> None:
     mocks = setup_mocks(
         mocker,
@@ -1048,7 +1055,7 @@ def test_run_delete_node_pool(
     run(False)
 
     mocks["OCM"].delete_node_pool.assert_called_once_with(
         hypershift_cluster.name,
-        expected_hypershift_node_pool_delete_payload,
+        "new-workers",
     )
 
@@ -1058,6 +1065,7 @@ def non_default_hypershift_worker_machine_pool() -> dict:
         "id": "new-workers",
         "instance_type": "m5.xlarge",
         "replicas": 3,
+        "subnet": "subnet-3",
     }
 
 
@@ -1076,16 +1084,19 @@ def existing_multiple_hypershift_node_pools_with_defaults() -> list[dict]:
             "id": "workers",
             "aws_node_pool": {"instance_type": "m5.xlarge"},
             "replicas": 3,
+            "subnet": "subnet-1",
         },
         {
             "id": "workers-1",
             "aws_node_pool": {"instance_type": "m5.xlarge"},
             "replicas": 3,
+            "subnet": "subnet-2",
         },
         {
             "id": "new-workers",
             "aws_node_pool": {"instance_type": "m5.xlarge"},
             "replicas": 3,
+            "subnet": "subnet-3",
         },
     ]
 
@@ -1104,3 +1115,45 @@ def test_run_delete_default_node_pool(
     run(False)
 
     mocks["OCM"].delete_node_pool.assert_called()
+
+
+def test_update_app_interface_with_subnet(
+    mocker: MockerFixture,
+    hypershift_cluster: ClusterV1,
+) -> None:
+    hypershift_cluster.machine_pools[0].subnet = None  # type: ignore
+    setup_mocks(
+        mocker,
+        clusters=[hypershift_cluster],
+        node_pools=[
+            {
+                "id": "workers",
+                "aws_node_pool": {"instance_type": "m5.xlarge"},
+                "replicas": 2,
+                "subnet": "subnet-1",
+            }
+        ],
+    )
+    update_app_interface_mock = mocker.patch(
+        "reconcile.ocm_machine_pools.update_app_interface",
+        autospec=True,
+    )
+
+    run(False)
+    update_app_interface_mock.assert_called_once_with(
+        False,
+        None,
+        {
+            hypershift_cluster.path: [
+                ClusterMachinePoolV1(
+                    id="workers",
+                    instance_type="m5.xlarge",
+                    replicas=2,
+                    autoscale=None,
+                    labels=None,
+                    taints=None,
+                    subnet="subnet-1",
+                ),
+            ]
+        },
+    )
diff --git a/reconcile/test/utils/test_mr_machine_pool_updates.py b/reconcile/test/utils/test_mr_machine_pool_updates.py
new file mode 100644
index 0000000000..46ce7b7f0b
--- /dev/null
+++ b/reconcile/test/utils/test_mr_machine_pool_updates.py
@@ -0,0 +1,114 @@
+from io import StringIO
+from typing import Any
+from unittest.mock import MagicMock, Mock, patch
+
+import pytest
+from ruamel.yaml import YAML
+
+from reconcile.test.fixtures import Fixtures
+from reconcile.utils.mr.cluster_machine_pool_updates import ClustersMachinePoolUpdates
+from reconcile.utils.ruamel import create_ruamel_instance
+
+
+@pytest.fixture
+def fxt() -> Fixtures:
+    return Fixtures("clusters")
+
+
+@pytest.fixture
+def cluster_spec(fxt: Fixtures) -> dict[str, Any]:
+    return fxt.get_anymarkup("rosa_hcp_spec_ai.yml")
+
+
+@pytest.fixture
+def cluster_spec_raw(fxt: Fixtures) -> str:
+    return fxt.get("rosa_hcp_spec_ai.yml")
+
+
+@pytest.fixture
+def yaml() -> YAML:
+    return create_ruamel_instance(explicit_start=True, width=4096)
+
+
+@pytest.fixture
+def gitlab_cli(cluster_spec_raw: str) -> MagicMock:
+    cli = MagicMock()
+    cli.project.files.get.return_value = cluster_spec_raw.encode()
+    return cli
+
+
+def test_normalized_machine_pool_updates() -> None:
+    mr = ClustersMachinePoolUpdates({
+        "/path": [
+            {
+                "id": "worker",
+                "instance_type": "m5.xlarge",
+                "autoscaling": None,
+                "subnet": "subnet-1",
+            }
+        ]
+    })
+    assert mr.normalized_machine_pool_updates() == {
+        "/path": [
+            {
+                "id": "worker",
+                "instance_type": "m5.xlarge",
+                "subnet": "subnet-1",
+            }
+        ]
+    }
+
+
+@patch.object(ClustersMachinePoolUpdates, "cancel", autospec=True, return_value=None)
+def test_mr_machine_pool_update_changes_to_spec(
+    cancel_mock: Mock, cluster_spec_raw: str, gitlab_cli: MagicMock, yaml: YAML
+) -> None:
+    mr = ClustersMachinePoolUpdates({
+        "/path": [
+            {
+                "id": "worker",
+                "instance_type": "m5.xlarge",
+                "autoscaling": None,
+                "subnet": "subnet-1",
+            }
+        ]
+    })
+    mr.branch = "abranch"
+    mr.process(gitlab_cli)
+
+    with StringIO() as stream:
+        cluster_spec = yaml.load(cluster_spec_raw)
+        cluster_spec["machinePools"][0]["subnet"] = "subnet-1"
+        yaml.dump(cluster_spec, stream)
+        new_content = stream.getvalue()
+
+    gitlab_cli.update_file.assert_called_once_with(
+        branch_name="abranch",
+        file_path="data/path",
+        commit_message="update cluster data/path machine-pool fields",
+        content=new_content,
+    )
+    cancel_mock.assert_not_called()
+
+
+@pytest.mark.parametrize(
+    "no_op_changes",
+    [
+        {"/path": []},
+        {},
+    ],
+)
+@patch.object(ClustersMachinePoolUpdates, "cancel", autospec=True, return_value=None)
+def test_mr_machine_pool_update_no_changes(
+    cancel_mock: Mock,
+    cluster_spec_raw: str,
+    gitlab_cli: MagicMock,
+    yaml: YAML,
+    no_op_changes: dict,
+) -> None:
+    mr = ClustersMachinePoolUpdates(no_op_changes)
+    mr.branch = "abranch"
+    mr.process(gitlab_cli)
+
+    gitlab_cli.update_file.assert_not_called()
+    cancel_mock.assert_called()
diff --git a/reconcile/utils/mr/cluster_machine_pool_updates.py b/reconcile/utils/mr/cluster_machine_pool_updates.py
new file mode 100644
index 0000000000..bc26238001
--- /dev/null
+++ b/reconcile/utils/mr/cluster_machine_pool_updates.py
@@ -0,0 +1,79 @@
+from io import StringIO
+from typing import Any
+
+from reconcile.change_owners.decision import DecisionCommand
+from reconcile.utils.gitlab_api import GitLabApi
+from reconcile.utils.mr.base import MergeRequestBase
+from reconcile.utils.ruamel import create_ruamel_instance
+
+
+class ClustersMachinePoolUpdates(MergeRequestBase):
+    name = "create_cluster_machine_pool_updates_mr"
+
+    def __init__(self, machine_pool_updates: dict[str, list[dict[str, Any]]]) -> None:
+        self.machine_pool_updates: dict[str, list[dict[str, Any]]] = (
+            machine_pool_updates
+        )
+
+        super().__init__()
+
+        self.labels = []
+
+    @property
+    def title(self) -> str:
+        return f"[{self.name}] machine pool updates"
+
+    @property
+    def description(self) -> str:
+        return DecisionCommand.APPROVED.value
+
+    def normalized_machine_pool_updates(self) -> dict[str, list[dict[str, Any]]]:
+        return {
+            cluster_path: [
+                clean_none_values_from_dict(update) for update in pool_updates
+            ]
+            for cluster_path, pool_updates in self.machine_pool_updates.items()
+        }
+
+    def process(self, gitlab_cli: GitLabApi) -> None:
+        yaml = create_ruamel_instance(explicit_start=True, width=4096)
+        changes = False
+        for (
+            cluster_path,
+            machine_pool_updates,
+        ) in self.normalized_machine_pool_updates().items():
+            cluster_fs_path = f"data{cluster_path}"
+            if not machine_pool_updates:
+                continue
+
+            raw_file = gitlab_cli.project.files.get(
+                file_path=cluster_fs_path, ref=gitlab_cli.main_branch
+            )
+            content = yaml.load(raw_file.decode())
+            if "machinePools" not in content:
+                self.cancel(f"{cluster_fs_path} misses machinePools. Nothing to do.")
+
+            for machine_pool in machine_pool_updates:
+                for mp in content["machinePools"]:
+                    if mp["id"] == machine_pool["id"]:
+                        mp.update(machine_pool)
+                        changes = True
+
+            with StringIO() as stream:
+                yaml.dump(content, stream)
+                new_content = stream.getvalue()
+
+            msg = f"update cluster {cluster_fs_path} machine-pool fields"
+            gitlab_cli.update_file(
+                branch_name=self.branch,
+                file_path=cluster_fs_path,
+                commit_message=msg,
+                content=new_content,
+            )
+
+        if not changes:
+            self.cancel("Clusters are up to date. Nothing to do.")
+
+
+def clean_none_values_from_dict(d: dict[str, Any]) -> dict[str, Any]:
+    return {k: v for k, v in d.items() if v is not None}
diff --git a/reconcile/utils/ocm/ocm.py b/reconcile/utils/ocm/ocm.py
index f96c72df24..77b38a2584 100644
--- a/reconcile/utils/ocm/ocm.py
+++ b/reconcile/utils/ocm/ocm.py
@@ -575,7 +575,7 @@ def get_node_pools(self, cluster):
 
         return get_node_pools(self._ocm_client, cluster_id)
 
-    def delete_node_pool(self, cluster, spec):
+    def delete_node_pool(self, cluster, node_pool_id):
         """Deletes an existing Node Pool
 
         :param cluster: cluster name
@@ -585,7 +585,6 @@ def delete_node_pool(self, cluster, spec):
         :type spec: dictionary
         """
         cluster_id = self.cluster_ids[cluster]
-        node_pool_id = spec["id"]
        api = f"{CS_API_BASE}/v1/clusters/{cluster_id}/node_pools/" + f"{node_pool_id}"
         self._delete(api)
 