polars expression serialization #308

Merged: 4 commits, Jan 6, 2025

11 changes: 5 additions & 6 deletions python/pdstools/adm/Plots.py
@@ -137,7 +137,7 @@ def get_nonperforming_models(df: pl.LazyFrame):
else:
print(fig.data, i)
return fig
num_models = df.select(pl.first().count()).collect().item()
num_models = df.select(pl.first().len()).collect().item()
bottomleft = get_nonperforming_models(df)
newtext = f"{num_models} models: {bottomleft} ({round(bottomleft/num_models*100, 2)}%) at (50,0)"
fig.layout.title.text += f"<br><sup>{newtext}</sup>"
@@ -486,7 +486,7 @@ def score_distribution(
)
).sort("BinIndex")

if df.select(pl.first().count()).collect().item() == 0:
if df.select(pl.first().len()).collect().item() == 0:
raise ValueError(f"There is no data for the provided modelid {model_id}")

if return_df:
@@ -578,7 +578,7 @@ def predictor_binning(
)
).sort("BinIndex")

if df.select(pl.first().count()).collect().item() == 0:
if df.select(pl.first().len()).collect().item() == 0:
raise ValueError(
f"There is no data for the provided modelid {model_id} and predictor {predictor_name}"
)
@@ -1194,7 +1194,7 @@ def binning_lift(
return plot_df

fig = px.bar(
plot_df.collect(), #.to_pandas(use_pyarrow_extension_array=False),
plot_df.collect(), # .to_pandas(use_pyarrow_extension_array=False),
x="Lift",
y="BinSymbolAbbreviated",
color="Direction",
@@ -1260,8 +1260,7 @@ def partitioned_plot(
fig.show()
return figs


# TODO I took the propensity distrib plot out of the HC as
# TODO I took the propensity distrib plot out of the HC as
# it wasn't very clear, also didn't look great visually.

@requires(
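
The `count()` to `len()` swaps above (repeated in `polars_ext.py` at the bottom of this diff) track a polars API change: `Expr.count()` now counts only non-null values, while `Expr.len()` counts all rows. A minimal sketch of the difference, on a made-up frame:

```python
import polars as pl

df = pl.LazyFrame({"a": [1, None, 3], "b": [4, 5, 6]})

# Expr.count() returns the number of non-null values in the first column -> 2
non_null = df.select(pl.first().count()).collect().item()

# Expr.len() returns the number of rows, nulls included -> 3
num_rows = df.select(pl.first().len()).collect().item()

print(non_null, num_rows)  # 2 3
```

With `len()`, the row-count helpers stay correct even when the first column happens to contain nulls.
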
126 changes: 6 additions & 120 deletions python/pdstools/adm/Reports.py
@@ -1,20 +1,18 @@
__all__ = ["Reports"]
import logging
import os
import re
import shutil
import subprocess
import sys
from os import PathLike
from pathlib import Path
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union

import polars as pl

from ..utils import cdh_utils
from ..utils.namespaces import LazyNamespace
from ..utils.types import QUERY
from ..prediction import Prediction
from ..utils.report_utils import _serialize_query, get_quarto_with_version

if TYPE_CHECKING:
from .ADMDatamart import ADMDatamart
@@ -257,7 +255,7 @@ def health_check(
and (self.datamart.predictor_data is not None)
):
model_file_path, predictor_file_path = self.datamart.save_data(temp_dir)

serialized_query = _serialize_query(query)
self.run_quarto(
qmd_file=qmd_file,
output_filename=output_filename,
@@ -267,7 +265,7 @@
"model_file_path": str(model_file_path),
"predictor_file_path": str(predictor_file_path),
"prediction_file_path": str(prediction_file_path),
"query": query,
"query": serialized_query,
"title": title,
"subtitle": subtitle,
},
@@ -331,17 +329,6 @@ def _copy_quarto_file(qmd_file: str, temp_dir: Path) -> None:

shutil.copy(__reports__ / qmd_file, temp_dir)

# Never used?
# def _verify_cached_files(self, temp_dir: Path) -> None:
# """Verify that cached data files exist."""
# modeldata_files = list(temp_dir.glob("cached_modelData*"))
# predictordata_files = list(temp_dir.glob("cached_predictorData*"))

# if not modeldata_files:
# raise FileNotFoundError("No cached model data found.")
# if not predictordata_files:
# logger.warning("No cached predictor data found.")

@staticmethod
def _write_params_files(
temp_dir: Path,
@@ -353,15 +340,13 @@ def _write_params_files(
import yaml

# Parameters to python code

with open(temp_dir / "params.yml", "w") as f:
yaml.dump(
params,
f,
)

# Project/rendering options to quarto

with open(temp_dir / "_quarto.yml", "w") as f:
yaml.dump(
{
@@ -371,103 +356,6 @@
f,
)

@staticmethod
def _find_executable(exec_name: str) -> Path:
"""Find the executable on the system."""

# First find in path
exec_in_path = shutil.which(exec_name) # pragma: no cover
if exec_in_path: # pragma: no cover
return Path(exec_in_path)

# If not in path try find explicitly. TODO not sure this is wise
# maybe we should not try be smart and assume quarto/pandoc are
# properly installed.

if sys.platform == "win32": # pragma: no cover
possible_paths = [
Path(
os.environ.get("USERPROFILE", ""),
"AppData",
"Local",
"Programs",
f"{exec_name}", # assume windows is still case insensitive (NTFS changes this...)
"bin",
f"{exec_name}.cmd",
),
Path(
os.environ.get("PROGRAMFILES", ""),
f"{exec_name}",
"bin",
f"{exec_name}.cmd",
),
]
else: # pragma: no cover
possible_paths = [
Path(f"/usr/local/bin/{exec_name}"),
Path(f"/opt/{exec_name}/bin/{exec_name}"),
Path(os.environ.get("HOME", ""), ".local", "bin", exec_name),
]

for path in possible_paths:
if path.exists():
return path

raise FileNotFoundError(
"Quarto executable not found. Please ensure Quarto is installed and in the system PATH."
) # pragma: no cover

# TODO not convinced about below. This isn't necessarily the same path resolution
# as the os does. What's wrong with just assuming quarto is in the path so we can
# just test for version w code like
# def get_cmd_output(args):
# result = (
# subprocess.run(args, stdout=subprocess.PIPE).stdout.decode("utf-8").split("\n")
# )
# return result
# get_version_only(get_cmd_output(["quarto", "--version"])[0])

@staticmethod
def _get_executable_with_version(
exec_name: str, verbose: bool = False
) -> Tuple[Path, str]:
def get_version_only(versionstr):
return re.sub("[^.0-9]", "", versionstr)

try:
executable = Reports._find_executable(exec_name=exec_name)
except FileNotFoundError as e: # pragma: no cover
logger.error(e)
raise

# Check version
try:
version_result = subprocess.run(
[str(executable), "--version"],
capture_output=True,
text=True,
check=True,
)
version_string = get_version_only(
version_result.stdout.split("\n")[0].strip()
)
message = f"{exec_name} version: {version_string}"
logger.info(message)
if verbose:
print(message)
except subprocess.CalledProcessError as e: # pragma: no cover
logger.warning(f"Failed to check {exec_name} version: {e}")

return (executable, version_string)

@staticmethod
def get_quarto_with_version(verbose: bool = True) -> Tuple[Path, str]:
return Reports._get_executable_with_version("quarto", verbose=verbose)

@staticmethod
def get_pandoc_with_version(verbose: bool = True) -> Tuple[Path, str]:
return Reports._get_executable_with_version("pandoc", verbose=verbose)

@staticmethod
def run_quarto(
qmd_file: str,
@@ -488,7 +376,7 @@ def run_quarto(
analysis=analysis,
)

quarto_exec, _ = Reports.get_quarto_with_version(verbose)
quarto_exec, _ = get_quarto_with_version(verbose)

command = [
str(quarto_exec),
@@ -565,9 +453,7 @@ def excel_report(
}

if self.datamart.predictor_data is not None:
tabs["predictors_overview"] = (
self.datamart.aggregates.predictors_overview()
)
tabs["predictors_overview"] = self.datamart.aggregates.predictors_overview()

if predictor_binning and self.datamart.predictor_data is not None:
tabs["predictor_binning"] = self.datamart.aggregates.last(
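
`_serialize_query` itself lives in `pdstools.utils.report_utils` and is not part of this diff. The motivation: `run_quarto` hands parameters to quarto through `params.yml`, which can hold only plain YAML values, not live `pl.Expr` objects. A hypothetical sketch of the helper, inferred from the `_deserialize_query` counterpart added to HealthCheck.qmd further down; the real implementation may differ:

```python
import json
import polars as pl

def _serialize_query(query):
    # Sketch only: mirrors the "expr_list"/"dict" branches that
    # _deserialize_query in HealthCheck.qmd expects.
    if query is None:
        return None
    if isinstance(query, pl.Expr):
        query = [query]
    if isinstance(query, (list, tuple)):
        return {
            "type": "expr_list",
            "expressions": {
                i: json.loads(expr.meta.serialize(format="json"))
                for i, expr in enumerate(query)
            },
        }
    if isinstance(query, dict):
        return {"type": "dict", "data": query}
    raise ValueError(f"Unsupported query type: {type(query)}")
```
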
14 changes: 10 additions & 4 deletions python/pdstools/app/health_check/pages/2_Data_Filters.py
@@ -18,9 +18,14 @@
"Upload Filters You Downloaded Earlier", type=["json"]
)
if uploaded_file:
import io

imported_filters = json.load(uploaded_file)
for key, val in imported_filters.items():
expr_list.append(pl.Expr.from_json(json.dumps(val)))
# Convert the JSON string to a StringIO object and specify the format as 'json'
json_str = json.dumps(val)
str_io = io.StringIO(json_str)
expr_list.append(pl.Expr.deserialize(str_io, format="json"))

st.session_state["filters"] = filter_dataframe(
st.session_state["dm"].model_data, queries=expr_list
@@ -33,10 +38,11 @@
filtered_modelid_count, filtered_row_count = model_and_row_counts(
_apply_query(st.session_state["dm"].model_data, st.session_state["filters"])
)
deserialize_exprs = {}
serialized_exprs = {}
for i, expr in enumerate(st.session_state["filters"]):
deserialize_exprs[i] = json.loads(expr.meta.write_json())
data = json.dumps(deserialize_exprs)
serialized = expr.meta.serialize(format="json")
serialized_exprs[i] = json.loads(serialized)
data = json.dumps(serialized_exprs)
st.download_button(
label="Download Filters",
data=data,
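
The old `pl.Expr.from_json` and `expr.meta.write_json` used on this page were removed in polars 1.x; the replacement pair round-trips through a JSON string and a file-like object. A minimal sketch of the round trip:

```python
import io
import polars as pl

expr = pl.col("ResponseCount") > 100

serialized = expr.meta.serialize(format="json")  # Expr -> JSON string
restored = pl.Expr.deserialize(io.StringIO(serialized), format="json")

assert restored.meta.eq(expr)  # same expression tree
```
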
25 changes: 22 additions & 3 deletions python/pdstools/reports/HealthCheck.qmd
@@ -48,6 +48,8 @@ import polars as pl

import numpy as np
import math
import json
import io

cdh_guidelines = CDHGuidelines()

@@ -73,6 +75,22 @@ def fig_set_xaxis_modelperformance(fig, label="Model Performance"):
        .update_xaxes(title=label, showticklabels=True, visible=True)
    )
    return fig

def _deserialize_query(serialized_query):
    if serialized_query is None:
        return None

    if serialized_query["type"] == "expr_list":
        expr_list = []
        for _, val in serialized_query["expressions"].items():
            json_str = json.dumps(val)
            str_io = io.StringIO(json_str)
            expr_list.append(pl.Expr.deserialize(str_io, format="json"))
        return expr_list

    elif serialized_query["type"] == "dict":
        return serialized_query["data"]

    raise ValueError(f"Unknown query type: {serialized_query['type']}")
```

```{python}
@@ -84,7 +102,7 @@
title = "ADM Model Overview"
subtitle = "Sample data"

# Insert the paths to your data files here to run the notebook from your IDE.
# Insert the paths to your data files here to run the notebook from your IDE.
# Edit the _quarto.yml to enable/disable specific sections of the quarto output.
# Parameters will be overridden by quarto when a parameters yaml is provided

@@ -116,6 +134,7 @@ if query and query == "None":
query = None


query = _deserialize_query(query)
responsecount_analysis_query = (
pl.col("ResponseCount") > responsecount_analysis_threshold
)
@@ -1390,7 +1409,7 @@ if datamart.predictor_data is not None:
# The default of observed=False is deprecated...

fig = px.treemap(
missing,
missing,
path=path,
color="Percentage without responses",
template="pega",
@@ -1840,4 +1859,4 @@ except Exception as e:
report_utils.show_credits("pega-datascientist-tools/python/pdstools/reports/HealthCheck.qmd")


```
```
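
Taken together with the Streamlit page above, the handoff works like this: the app serializes each filter expression to JSON, `Reports.run_quarto` writes the result into `params.yml`, and `_deserialize_query` rebuilds live expressions when the report renders. A hypothetical payload for a single filter (the column name and value are made up for illustration):

```python
import json
import polars as pl

# What a serialized single-filter query could look like once it reaches
# the report via params.yml:
serialized_query = {
    "type": "expr_list",
    "expressions": {
        "0": json.loads((pl.col("Channel") == "Web").meta.serialize(format="json"))
    },
}

# Inside the report, the _deserialize_query helper defined above turns
# this back into [pl.col("Channel") == "Web"].
```
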
2 changes: 1 addition & 1 deletion python/pdstools/utils/polars_ext.py
@@ -29,7 +29,7 @@ def sample_it(s: pl.Series, n) -> pl.Series:
)

def height(self):
return self._ldf.select(pl.first().count()).collect().item()
return self._ldf.select(pl.first().len()).collect().item()

def shape(self):
return (self.height(), len(self._ldf.columns))