Move assert statements out of run_smoke_test and into the actual test (for graceful shutdown in case of failure) #318

Merged · 31 commits · Jan 24, 2025
(The diff below shows changes from 21 of the 31 commits.)

Commits
2046b24
mv assertions outside of run_smoke_tests
nerdai Jan 23, 2025
a65329c
add workflow_dispatch trigger for testing
nerdai Jan 23, 2025
8dfe3a7
get rid of ALL assert used in and raise Custom exception instead
nerdai Jan 23, 2025
a2490f1
add verbose flag
nerdai Jan 23, 2025
aead025
change scope to session
nerdai Jan 23, 2025
e8e6d5e
use module instead of session, should still work the same
nerdai Jan 23, 2025
a2cdef1
subset of tests, to test cleanup
nerdai Jan 23, 2025
4b8ebc9
cleanup func
nerdai Jan 23, 2025
2635e79
smaller subset
nerdai Jan 23, 2025
f898bc7
wip
nerdai Jan 23, 2025
6399c72
test entire
nerdai Jan 23, 2025
6eee21b
handle TimeoutError
nerdai Jan 23, 2025
32ef5b1
add cancel cleanup
nerdai Jan 23, 2025
f166d74
unlock all
nerdai Jan 23, 2025
157e18b
rm workflow_dispatch trigger
nerdai Jan 23, 2025
c527c05
clean up getting output from stdout
nerdai Jan 24, 2025
8962e33
working
nerdai Jan 24, 2025
94ab853
add comment
nerdai Jan 24, 2025
69afc0b
test on runner
nerdai Jan 24, 2025
c095753
add retry for flaky test
nerdai Jan 24, 2025
810dd30
removed unnecessary event_loop from tests
nerdai Jan 24, 2025
ab8d3a7
cr
nerdai Jan 24, 2025
f6d3cea
rm workflow_dispatch
nerdai Jan 24, 2025
01fdbd2
use function scope
nerdai Jan 24, 2025
643ed0d
add workflow_dispatch to test function scope
nerdai Jan 24, 2025
791ca2d
rm workflow_dispatch
nerdai Jan 24, 2025
9a5b0e1
revert back to module
nerdai Jan 24, 2025
6d2ea4a
add graceful shutdown of processes
nerdai Jan 24, 2025
d61fc5c
comment
nerdai Jan 24, 2025
1e9b944
increase attempts of flaky test
nerdai Jan 24, 2025
5c5576c
rm workflow_dispatch
nerdai Jan 24, 2025
3 changes: 2 additions & 1 deletion .github/workflows/smoke_tests.yaml
@@ -7,6 +7,7 @@ on:
pull_request:
branches:
- main
workflow_dispatch:

jobs:
test:
@@ -46,4 +47,4 @@ jobs:
- name: Run Script
run: |
source .venv/bin/activate
pytest -m "smoketest"
pytest -m "smoketest" -v
1 change: 0 additions & 1 deletion pyproject.toml
@@ -108,4 +108,3 @@ markers = [
"smoketest: marks tests as smoke tests (deselect with '-m \"not smoketest\"')",
]
asyncio_default_fixture_loop_scope = "session"
asyncio_mode = "auto"
193 changes: 121 additions & 72 deletions tests/smoke_tests/run_smoke_test.py
@@ -1,5 +1,4 @@
import asyncio
import datetime
import json
import logging
import re
@@ -22,6 +21,20 @@
logger = logging.getLogger()

DEFAULT_TOLERANCE = 0.0005
DEFAULT_READ_LOGS_TIMEOUT = 300


# Custom Errors
class SmokeTestAssertError(Exception):
pass


class SmokeTestExecutionError(Exception):
pass


class SmokeTestTimeoutError(Exception):
pass


def postprocess_logs(logs: str) -> str:
@@ -50,7 +63,8 @@ async def run_smoke_test(
client_metrics: dict[str, Any] | None = None,
# assertion params
tolerance: float = DEFAULT_TOLERANCE,
) -> None:
read_logs_timeout: int = DEFAULT_READ_LOGS_TIMEOUT,
) -> tuple[list[str], list[str]]:
nerdai (Collaborator, Author) commented:

This now returns server_errors and client_errors to the caller.
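
To illustrate the intent, a hypothetical test-side sketch (everything except run_smoke_test, the smoketest marker, and the error-list return is illustrative, including the paths and argument names):

import pytest

from tests.smoke_tests.run_smoke_test import run_smoke_test


@pytest.mark.smoketest
async def test_basic_example() -> None:  # hypothetical test; asyncio setup omitted
    server_errors, client_errors = await run_smoke_test(
        server_python_path="examples.basic_example.server",  # illustrative arguments
        client_python_path="examples.basic_example.client",
        config_path="tests/smoke_tests/basic_config.yaml",
        dataset_path="examples/datasets/mnist_data/",
    )
    # Asserting in the test body, rather than inside run_smoke_test, means a
    # failed check surfaces here while fixture teardown can still shut the
    # server and client processes down gracefully.
    assert len(server_errors) == 0, f"Server metrics check failed. Errors: {server_errors}"
    assert len(client_errors) == 0, f"Client metrics check failed. Errors: {client_errors}"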

"""Runs a smoke test for a given server, client, and dataset configuration.

Uses asyncio to kick off one server instance defined by the `server_python_path` module and N client instances
Expand Down Expand Up @@ -141,6 +155,10 @@ async def run_smoke_test(
client_metrics (dict[str, Any] | None): A dictionary of metrics to be checked against the metrics file
saved by the clients. Should be in the same format as fl4health.reporting.metrics.MetricsReporter.
Default is None.

Returns:
(server_errors, client_errors): (list[str], list[str]): lists of errors from the server and client processes,
respectively.
"""
clear_metrics_folder()

@@ -191,7 +209,8 @@
output_found = False
while not output_found:
try:
assert server_process.stdout is not None, "Server's process stdout is None"
if not (server_process.stdout is not None):
raise SmokeTestExecutionError("Server's process stdout is None")
server_output_in_bytes = await asyncio.wait_for(server_process.stdout.readline(), 20)
server_output = server_output_in_bytes.decode()
logger.debug(f"Server output: {server_output}")
@@ -201,16 +220,16 @@
break

return_code = server_process.returncode
assert return_code is None or (return_code is not None and return_code == 0), (
nerdai (Collaborator, Author) commented on Jan 23, 2025:

In order to remove these assert statements in a way that avoids doing any dangerous Boolean algebra, I employ the following pattern. We can change this if we want...

# old
assert <cond>, <fail_msg>

# new
if not <cond>:
    raise SmokeTestError(<fail_msg>)
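
For a concrete instance of the risk, here is a self-contained sketch built around the return-code check from this diff (check_return_code is an illustrative name, not the PR's code):

class SmokeTestAssertError(Exception):
    pass


def check_return_code(return_code: int | None, msg: str) -> None:
    # The original condition is kept verbatim and only wrapped in `if not (...)`.
    # Hand-negating it via De Morgan ("return_code is not None and
    # return_code != 0") would also work, but that rewrite is exactly where
    # sign errors creep in.
    if not (return_code is None or return_code == 0):
        raise SmokeTestAssertError(msg)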

f"Full output:\n{full_server_output}\n" f"[ASSERT ERROR] Server exited with code {return_code}."
)
if not (return_code is None or (return_code is not None and return_code == 0)):
msg = f"Full output:\n{full_server_output}\n" f"[ASSERT ERROR] Server exited with code {return_code}."
raise SmokeTestAssertError(msg)

if any(startup_message in server_output for startup_message in startup_messages):
output_found = True

assert output_found, (
f"Full output:\n{full_server_output}\n" f"[ASSERT_ERROR] Startup log message not found in server output."
)
if not output_found:
msg = f"Full output:\n{full_server_output}\n" f"[ASSERT_ERROR] Startup log message not found in server output."
raise SmokeTestAssertError(msg)

logger.info("Server started")

@@ -240,74 +259,96 @@
# Collecting the clients output when their processes finish
client_result_tasks = []
for i, client_process in enumerate(client_processes):
client_result_tasks.append(_wait_for_process_to_finish_and_retrieve_logs(client_process, f"Client {i}"))
client_result_tasks.append(
_wait_for_process_to_finish_and_retrieve_logs(client_process, f"Client {i}", read_logs_timeout),
)

full_client_outputs = await asyncio.gather(*client_result_tasks)
logger.info("All clients finished execution")

# Collecting the server output when its process finish
full_server_output = await _wait_for_process_to_finish_and_retrieve_logs(server_process, "Server")
full_server_output = await _wait_for_process_to_finish_and_retrieve_logs(
server_process, "Server", read_logs_timeout
)
full_server_output = postprocess_logs(full_server_output)

logger.info("Server has finished execution")

# server assertions
assert "error" not in full_server_output.lower(), (
f"Full output:\n{full_server_output}\n" "[ASSERT ERROR] Error message found for server."
)
if not ("error" not in full_server_output.lower()):
msg = f"Full output:\n{full_server_output}\n" "[ASSERT ERROR] Error message found for server."
raise SmokeTestAssertError(msg)

if assert_evaluation_logs:
assert f"Federated Evaluation received {config['n_clients']} results and 0 failures" in full_server_output, (
f"Full output:\n{full_server_output}\n" "[ASSERT ERROR] Last FL round message not found for server."
)
assert "Federated Evaluation Finished" in full_server_output, (
f"Full output:\n{full_server_output}\n"
"[ASSERT ERROR] Federated Evaluation Finished message not found for server."
)
if not (f"Federated Evaluation received {config['n_clients']} results and 0 failures" in full_server_output):
msg = f"Full output:\n{full_server_output}\n" "[ASSERT ERROR] Last FL round message not found for server."
raise SmokeTestAssertError(msg)

if not ("Federated Evaluation Finished" in full_server_output):
msg = (
f"Full output:\n{full_server_output}\n"
"[ASSERT ERROR] Federated Evaluation Finished message not found for server."
)
raise SmokeTestAssertError(msg)

else:
assert "[SUMMARY]" in full_server_output, (
f"Full output:\n{full_server_output}\n" "[ASSERT ERROR] [SUMMARY] message not found for server."
)
if not ("[SUMMARY]" in full_server_output):
msg = f"Full output:\n{full_server_output}\n" "[ASSERT ERROR] [SUMMARY] message not found for server."
raise SmokeTestAssertError(msg)
if not assert_evaluation_logs:
assert all(
message in full_server_output
for message in [
"History (loss, distributed):",
"History (metrics, distributed, fit):",
]
), f"Full output:\n{full_server_output}\n[ASSERT ERROR] Metrics message not found for server."
if not (
all(
message in full_server_output
for message in [
"History (loss, distributed):",
"History (metrics, distributed, fit):",
]
)
):
msg = f"Full output:\n{full_server_output}\n[ASSERT ERROR] Metrics message not found for server."
raise SmokeTestAssertError(msg)

else:
assert all(
message in full_server_output for message in ["History (metrics, distributed, evaluate):"]
), f"Full output:\n{full_server_output}\n[ASSERT ERROR] Metrics message not found for server."
if not (all(message in full_server_output for message in ["History (metrics, distributed, evaluate):"])):
msg = f"Full output:\n{full_server_output}\n[ASSERT ERROR] Metrics message not found for server."
raise SmokeTestAssertError(msg)

server_errors = _assert_metrics(MetricType.SERVER, server_metrics, tolerance)
assert len(server_errors) == 0, f"Server metrics check failed. Errors: {server_errors}"

# client assertions
client_errors = []
for i, full_client_output in enumerate(full_client_outputs):
full_client_output = postprocess_logs(full_client_output)
assert "error" not in full_client_output.lower(), (
f"Full client output:\n{full_client_output}\n" f"[ASSERT ERROR] Error message found for client {i}."
)
assert "Disconnect and shut down" in full_client_output, (
f"Full client output:\n{full_client_output}\n" f"[ASSERT ERROR] Shutdown message not found for client {i}."
)
if assert_evaluation_logs:
assert "Client Evaluation Local Model Metrics" in full_client_output, (
if not ("error" not in full_client_output.lower()):
msg = f"Full client output:\n{full_client_output}\n" f"[ASSERT ERROR] Error message found for client {i}."
raise SmokeTestAssertError(msg)

if not ("Disconnect and shut down" in full_client_output):
msg = (
f"Full client output:\n{full_client_output}\n"
f"[ASSERT ERROR] 'Client Evaluation Local Model Metrics' message not found for client {i}."
f"[ASSERT ERROR] Shutdown message not found for client {i}."
)
raise SmokeTestAssertError(msg)

if assert_evaluation_logs:
if not ("Client Evaluation Local Model Metrics" in full_client_output):
msg = (
f"Full client output:\n{full_client_output}\n"
f"[ASSERT ERROR] 'Client Evaluation Local Model Metrics' message not found for client {i}."
)
raise SmokeTestAssertError(msg)
nerdai (Collaborator, Author) commented:

Tbh, I am not sure if I like the name SmokeTestAssertError; it makes me feel like it's asserting a good output against an expected value.

I think these are probably better as SmokeTestExecutionError? I was just following our naming with [ASSERT ERROR], but I think this convention is confusing.

A collaborator replied:

I'm good with either. I'll leave it to you to decide 😂

nerdai (Collaborator, Author) replied:

I'm lazy now. I'm going to leave it as is lol.


elif not skip_assert_client_fl_rounds:
assert f"Current FL Round: {config['n_server_rounds']}" in full_client_output, (
f"Full client output:\n{full_client_output}\n"
f"[ASSERT ERROR] Last FL round message not found for client {i}."
)
if not (f"Current FL Round: {config['n_server_rounds']}" in full_client_output):
msg = (
f"Full client output:\n{full_client_output}\n"
f"[ASSERT ERROR] Last FL round message not found for client {i}."
)
raise SmokeTestAssertError(msg)

client_errors.extend(_assert_metrics(MetricType.CLIENT, client_metrics, tolerance))
assert len(client_errors) == 0, f"Client metrics check failed. Errors: {client_errors}"

logger.info("All checks passed. Test finished.")
return server_errors, client_errors


async def run_fault_tolerance_smoke_test(
@@ -322,7 +363,8 @@
intermediate_checkpoint_dir: str = "./",
server_name: str = "server",
tolerance: float = DEFAULT_TOLERANCE,
) -> None:
read_logs_timeout: int = DEFAULT_READ_LOGS_TIMEOUT,
) -> tuple[list[str], list[str]]:
"""Runs a smoke test for a given server, client, and dataset configuration.

Args:
Expand All @@ -349,6 +391,10 @@ async def run_fault_tolerance_smoke_test(
saved by the server. Should be in the same format as fl4health.reporting.metrics.MetricsReporter.
client_metrics (dict[str, Any]): A dictionary of metrics to be checked against the metrics file
saved by the clients. Should be in the same format as fl4health.reporting.metrics.MetricsReporter.

Returns:
(server_errors, client_errors): (list[str], list[str]): lists of errors from the server and client processes,
respectively.
"""
clear_metrics_folder()

@@ -426,13 +472,15 @@

client_output_tasks = []
for i in range(len(client_processes)):
client_output_tasks.append(_wait_for_process_to_finish_and_retrieve_logs(client_processes[i], f"Client {i}"))
client_output_tasks.append(
_wait_for_process_to_finish_and_retrieve_logs(client_processes[i], f"Client {i}", read_logs_timeout),
)

_ = await asyncio.gather(*client_output_tasks)

logger.info("All clients finished execution")

await _wait_for_process_to_finish_and_retrieve_logs(server_process, "Server")
await _wait_for_process_to_finish_and_retrieve_logs(server_process, "Server", read_logs_timeout)

logger.info("Server has finished execution")

@@ -461,24 +509,22 @@
client_processes.append(client_process)

for i in range(len(client_processes)):
await _wait_for_process_to_finish_and_retrieve_logs(client_processes[i], f"Client {i}")
await _wait_for_process_to_finish_and_retrieve_logs(client_processes[i], f"Client {i}", read_logs_timeout)

logger.info("All clients finished execution")

await _wait_for_process_to_finish_and_retrieve_logs(server_process, "Server")
await _wait_for_process_to_finish_and_retrieve_logs(server_process, "Server", read_logs_timeout)

logger.info("Server has finished execution")

server_errors = _assert_metrics(MetricType.SERVER, server_metrics, tolerance)
assert len(server_errors) == 0, f"Server metrics check failed. Errors: {server_errors}"

# client assertions
client_errors = []
for i in range(len(client_processes)):
client_errors.extend(_assert_metrics(MetricType.CLIENT, client_metrics, tolerance))
assert len(client_errors) == 0, f"Client metrics check failed. Errors: {client_errors}"

logger.info("All checks passed. Test finished.")
return server_errors, client_errors


def _preload_dataset(dataset_path: str, config: Config, seed: int | None = None) -> None:
Expand Down Expand Up @@ -521,14 +567,12 @@ async def _wait_for_process_to_finish_and_retrieve_logs(
process_name: str,
timeout: int = 300, # timeout for the whole process to complete
) -> str:
logger.info(f"Collecting output for {process_name}...")
full_output = ""
try:
assert process.stdout
start_time = datetime.datetime.now()

async def get_output_from_stdout(stream_reader: asyncio.streams.StreamReader) -> tuple[str, int | None]:
full_output = ""
while True:
# giving a smaller timeout here just in case it hangs for a long time waiting for a single log line
nerdai (Collaborator, Author) commented on Jan 24, 2025:

I cleaned this up, but note that we weren't actually giving this "inner task" a smaller timeout.

Instead, I outsource this logic to a contained method, get_output_from_stdout(), which reads the stream until completion. This whole process of reading from the stream is what the timeout now applies to. (No more need for the manual elapsed_time = datetime.datetime.now() - start_time computation to check whether the timeout was reached.)
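
For anyone skimming, a minimal self-contained sketch of the pattern (drain_stdout and read_logs are illustrative names, not the PR's exact code):

import asyncio


async def drain_stdout(reader: asyncio.StreamReader) -> str:
    # Read line by line until EOF; no per-line timeout bookkeeping needed.
    chunks = []
    while True:
        line = await reader.readline()
        if not line:
            break
        chunks.append(line.decode())
    return "".join(chunks)


async def read_logs(process: asyncio.subprocess.Process, timeout: int = 300) -> str:
    if process.stdout is None:
        raise RuntimeError("Process stdout is None")  # the PR raises SmokeTestExecutionError
    try:
        # One asyncio.wait_for covers the entire read, replacing the manual
        # elapsed-time check against a start timestamp.
        return await asyncio.wait_for(drain_stdout(process.stdout), timeout=timeout)
    except asyncio.TimeoutError as e:
        raise TimeoutError("Timeout for reading logs reached.") from e  # PR: SmokeTestTimeoutError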

output_in_bytes = await asyncio.wait_for(process.stdout.readline(), timeout=timeout)
output_in_bytes = await stream_reader.readline()
await asyncio.sleep(0) # give control back to loop manager
output = output_in_bytes.decode().replace("\\n", "\n")
logger.debug(f"{process_name} output: {output}")
full_output += output
Expand All @@ -537,21 +581,26 @@ async def _wait_for_process_to_finish_and_retrieve_logs(
if output == "" and return_code is not None:
break

elapsed_time = datetime.datetime.now() - start_time
if elapsed_time.seconds > timeout:
raise Exception(f"Timeout limit of {timeout}s exceeded waiting for {process_name} to finish execution")
return full_output, return_code

logger.info(f"Collecting output for {process_name}...")

try:
if not (process.stdout is not None):
raise SmokeTestExecutionError("Process stdout is None")
full_output, return_code = await asyncio.wait_for(get_output_from_stdout(process.stdout), timeout=timeout)
except asyncio.exceptions.TimeoutError as e:
raise SmokeTestTimeoutError("Timeout for reading logs reached.") from e
except Exception as ex:
logger.error(f"{process_name} output:\n{full_output}")
logger.exception(f"Error collecting {process_name} log messages:")
raise ex

logger.info(f"Output collected for {process_name}")

# checking for clients with failure exit codes
assert return_code is None or (return_code is not None and return_code == 0), (
f"Full output:\n{full_output}\n" f"[ASSERT ERROR] {process_name} exited with code {return_code}."
)
if not (return_code is None or (return_code is not None and return_code == 0)):
msg = f"Full output:\n{full_output}\n" f"[ASSERT ERROR] {process_name} exited with code {return_code}."
raise SmokeTestAssertError(msg)

return full_output
