diff --git a/python/pdstools/adm/Plots.py b/python/pdstools/adm/Plots.py
index aa511c3f..a5215c43 100644
--- a/python/pdstools/adm/Plots.py
+++ b/python/pdstools/adm/Plots.py
@@ -137,7 +137,7 @@ def get_nonperforming_models(df: pl.LazyFrame):
else:
print(fig.data, i)
return fig
- num_models = df.select(pl.first().count()).collect().item()
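+ # len() counts every row including nulls; count() skips nulls, so len() is the
+ # robust way to get a row count here.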
+ num_models = df.select(pl.first().len()).collect().item()
bottomleft = get_nonperforming_models(df)
newtext = f"{num_models} models: {bottomleft} ({round(bottomleft/num_models*100, 2)}%) at (50,0)"
fig.layout.title.text += f"<br>{newtext}"
@@ -486,7 +486,7 @@ def score_distribution(
)
).sort("BinIndex")
- if df.select(pl.first().count()).collect().item() == 0:
+ if df.select(pl.first().len()).collect().item() == 0:
raise ValueError(f"There is no data for the provided modelid {model_id}")
if return_df:
@@ -578,7 +578,7 @@ def predictor_binning(
)
).sort("BinIndex")
- if df.select(pl.first().count()).collect().item() == 0:
+ if df.select(pl.first().len()).collect().item() == 0:
raise ValueError(
f"There is no data for the provided modelid {model_id} and predictor {predictor_name}"
)
@@ -1194,7 +1194,7 @@ def binning_lift(
return plot_df
fig = px.bar(
- plot_df.collect(), #.to_pandas(use_pyarrow_extension_array=False),
+ plot_df.collect(), # .to_pandas(use_pyarrow_extension_array=False),
x="Lift",
y="BinSymbolAbbreviated",
color="Direction",
@@ -1260,8 +1260,7 @@ def partitioned_plot(
fig.show()
return figs
-
- # TODO I took the propensity distrib plot out of the HC as
+ # TODO I took the propensity distrib plot out of the HC as
# it wasn't very clear, also didn't look great visually.
@requires(
diff --git a/python/pdstools/adm/Reports.py b/python/pdstools/adm/Reports.py
index b3626cef..ecf01d69 100644
--- a/python/pdstools/adm/Reports.py
+++ b/python/pdstools/adm/Reports.py
@@ -1,20 +1,18 @@
__all__ = ["Reports"]
import logging
import os
-import re
import shutil
import subprocess
-import sys
from os import PathLike
from pathlib import Path
-from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
import polars as pl
from ..utils import cdh_utils
from ..utils.namespaces import LazyNamespace
from ..utils.types import QUERY
-from ..prediction import Prediction
+from ..utils.report_utils import _serialize_query, get_quarto_with_version
if TYPE_CHECKING:
from .ADMDatamart import ADMDatamart
@@ -257,7 +255,7 @@ def health_check(
and (self.datamart.predictor_data is not None)
):
model_file_path, predictor_file_path = self.datamart.save_data(temp_dir)
-
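+ # A pl.Expr query cannot pass through the YAML params file as-is, so it is
+ # serialized to plain JSON-compatible dicts here; HealthCheck.qmd rebuilds it
+ # with its _deserialize_query helper.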
+ serialized_query = _serialize_query(query)
self.run_quarto(
qmd_file=qmd_file,
output_filename=output_filename,
@@ -267,7 +265,7 @@ def health_check(
"model_file_path": str(model_file_path),
"predictor_file_path": str(predictor_file_path),
"prediction_file_path": str(prediction_file_path),
- "query": query,
+ "query": serialized_query,
"title": title,
"subtitle": subtitle,
},
@@ -331,17 +329,6 @@ def _copy_quarto_file(qmd_file: str, temp_dir: Path) -> None:
shutil.copy(__reports__ / qmd_file, temp_dir)
- # Never used?
- # def _verify_cached_files(self, temp_dir: Path) -> None:
- # """Verify that cached data files exist."""
- # modeldata_files = list(temp_dir.glob("cached_modelData*"))
- # predictordata_files = list(temp_dir.glob("cached_predictorData*"))
-
- # if not modeldata_files:
- # raise FileNotFoundError("No cached model data found.")
- # if not predictordata_files:
- # logger.warning("No cached predictor data found.")
-
@staticmethod
def _write_params_files(
temp_dir: Path,
@@ -353,7 +340,6 @@ def _write_params_files(
import yaml
# Parameters to python code
-
with open(temp_dir / "params.yml", "w") as f:
yaml.dump(
params,
@@ -361,7 +347,6 @@ def _write_params_files(
)
# Project/rendering options to quarto
-
with open(temp_dir / "_quarto.yml", "w") as f:
yaml.dump(
{
@@ -371,103 +356,6 @@ def _write_params_files(
f,
)
- @staticmethod
- def _find_executable(exec_name: str) -> Path:
- """Find the executable on the system."""
-
- # First find in path
- exec_in_path = shutil.which(exec_name) # pragma: no cover
- if exec_in_path: # pragma: no cover
- return Path(exec_in_path)
-
- # If not in path try to find explicitly. TODO not sure this is wise
- # maybe we should not try to be smart and assume quarto/pandoc are
- # properly installed.
-
- if sys.platform == "win32": # pragma: no cover
- possible_paths = [
- Path(
- os.environ.get("USERPROFILE", ""),
- "AppData",
- "Local",
- "Programs",
- f"{exec_name}", # assume windows is still case insensitive (NTFS changes this...)
- "bin",
- f"{exec_name}.cmd",
- ),
- Path(
- os.environ.get("PROGRAMFILES", ""),
- f"{exec_name}",
- "bin",
- f"{exec_name}.cmd",
- ),
- ]
- else: # pragma: no cover
- possible_paths = [
- Path(f"/usr/local/bin/{exec_name}"),
- Path(f"/opt/{exec_name}/bin/{exec_name}"),
- Path(os.environ.get("HOME", ""), ".local", "bin", exec_name),
- ]
-
- for path in possible_paths:
- if path.exists():
- return path
-
- raise FileNotFoundError(
- "Quarto executable not found. Please ensure Quarto is installed and in the system PATH."
- ) # pragma: no cover
-
- # TODO not convinced about below. This isn't necessarily the same path resolution
- # as the os does. What's wrong with just assuming quarto is in the path so we can
- # just test for version w code like
- # def get_cmd_output(args):
- # result = (
- # subprocess.run(args, stdout=subprocess.PIPE).stdout.decode("utf-8").split("\n")
- # )
- # return result
- # get_version_only(get_cmd_output(["quarto", "--version"])[0])
-
- @staticmethod
- def _get_executable_with_version(
- exec_name: str, verbose: bool = False
- ) -> Tuple[Path, str]:
- def get_version_only(versionstr):
- return re.sub("[^.0-9]", "", versionstr)
-
- try:
- executable = Reports._find_executable(exec_name=exec_name)
- except FileNotFoundError as e: # pragma: no cover
- logger.error(e)
- raise
-
- # Check version
- try:
- version_result = subprocess.run(
- [str(executable), "--version"],
- capture_output=True,
- text=True,
- check=True,
- )
- version_string = get_version_only(
- version_result.stdout.split("\n")[0].strip()
- )
- message = f"{exec_name} version: {version_string}"
- logger.info(message)
- if verbose:
- print(message)
- except subprocess.CalledProcessError as e: # pragma: no cover
- logger.warning(f"Failed to check {exec_name} version: {e}")
-
- return (executable, version_string)
-
- @staticmethod
- def get_quarto_with_version(verbose: bool = True) -> Tuple[Path, str]:
- return Reports._get_executable_with_version("quarto", verbose=verbose)
-
- @staticmethod
- def get_pandoc_with_version(verbose: bool = True) -> Tuple[Path, str]:
- return Reports._get_executable_with_version("pandoc", verbose=verbose)
-
@staticmethod
def run_quarto(
qmd_file: str,
@@ -488,7 +376,7 @@ def run_quarto(
analysis=analysis,
)
- quarto_exec, _ = Reports.get_quarto_with_version(verbose)
+ quarto_exec, _ = get_quarto_with_version(verbose)
command = [
str(quarto_exec),
@@ -565,9 +453,7 @@ def excel_report(
}
if self.datamart.predictor_data is not None:
- tabs["predictors_overview"] = (
- self.datamart.aggregates.predictors_overview()
- )
+ tabs["predictors_overview"] = self.datamart.aggregates.predictors_overview()
if predictor_binning and self.datamart.predictor_data is not None:
tabs["predictor_binning"] = self.datamart.aggregates.last(
diff --git a/python/pdstools/app/health_check/pages/2_Data_Filters.py b/python/pdstools/app/health_check/pages/2_Data_Filters.py
index 1eb8543f..6382b50e 100644
--- a/python/pdstools/app/health_check/pages/2_Data_Filters.py
+++ b/python/pdstools/app/health_check/pages/2_Data_Filters.py
@@ -18,9 +18,14 @@
"Upload Filters You Downloaded Earlier", type=["json"]
)
if uploaded_file:
+ import io
+
imported_filters = json.load(uploaded_file)
for key, val in imported_filters.items():
- expr_list.append(pl.Expr.from_json(json.dumps(val)))
+ # Convert the JSON string to a StringIO object and specify the format as 'json'
+ json_str = json.dumps(val)
+ str_io = io.StringIO(json_str)
+ expr_list.append(pl.Expr.deserialize(str_io, format="json"))
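+ # Note: pl.Expr.from_json was removed in newer Polars; Expr.deserialize with
+ # format="json" is its replacement and pairs with meta.serialize below.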
st.session_state["filters"] = filter_dataframe(
st.session_state["dm"].model_data, queries=expr_list
@@ -33,10 +38,11 @@
filtered_modelid_count, filtered_row_count = model_and_row_counts(
_apply_query(st.session_state["dm"].model_data, st.session_state["filters"])
)
- deserialize_exprs = {}
+ serialized_exprs = {}
for i, expr in enumerate(st.session_state["filters"]):
- deserialize_exprs[i] = json.loads(expr.meta.write_json())
- data = json.dumps(deserialize_exprs)
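+ # meta.serialize(format="json") supersedes the removed meta.write_json; the
+ # downloaded JSON round-trips through Expr.deserialize on upload above.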
+ serialized = expr.meta.serialize(format="json")
+ serialized_exprs[i] = json.loads(serialized)
+ data = json.dumps(serialized_exprs)
st.download_button(
label="Download Filters",
data=data,
diff --git a/python/pdstools/reports/HealthCheck.qmd b/python/pdstools/reports/HealthCheck.qmd
index d817674a..ef64e473 100644
--- a/python/pdstools/reports/HealthCheck.qmd
+++ b/python/pdstools/reports/HealthCheck.qmd
@@ -48,6 +48,8 @@ import polars as pl
import numpy as np
import math
+import json
+import io
cdh_guidelines = CDHGuidelines()
@@ -73,6 +75,22 @@ def fig_set_xaxis_modelperformance(fig, label="Model Performance"):
.update_xaxes(title=label, showticklabels=True, visible=True)
)
return fig
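+# Counterpart of report_utils._serialize_query: the YAML params file can only
+# carry plain data, so expression queries arrive as JSON dicts and are rebuilt
+# into pl.Expr objects here.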
+def _deserialize_query(serialized_query):
+ if serialized_query is None:
+ return None
+
+ if serialized_query["type"] == "expr_list":
+ expr_list = []
+ for _, val in serialized_query["expressions"].items():
+ json_str = json.dumps(val)
+ str_io = io.StringIO(json_str)
+ expr_list.append(pl.Expr.deserialize(str_io, format="json"))
+ return expr_list
+
+ elif serialized_query["type"] == "dict":
+ return serialized_query["data"]
+
+ raise ValueError(f"Unknown query type: {serialized_query['type']}")
```
```{python}
@@ -84,7 +102,7 @@ def fig_set_xaxis_modelperformance(fig, label="Model Performance"):
title = "ADM Model Overview"
subtitle = "Sample data"
-# Insert the paths to your data files here to run the notebook from your IDE.
+# Insert the paths to your data files here to run the notebook from your IDE.
# Edit the _quarto.yml to enable/disable specific sections of the quarto output.
# Parameters will be overriden by quarto when a parameters yaml is provided
@@ -116,6 +134,7 @@ if query and query == "None":
query = None
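+# The query parameter arrives in the serialized form written by the Reports
+# class; turn it back into Polars expressions (or a plain dict) before use.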
+query = _deserialize_query(query)
responsecount_analysis_query = (
pl.col("ResponseCount") > responsecount_analysis_threshold
)
@@ -1390,7 +1409,7 @@ if datamart.predictor_data is not None:
# The default of observed=False is deprecated...
fig = px.treemap(
- missing,
+ missing,
path=path,
color="Percentage without responses",
template="pega",
@@ -1840,4 +1859,4 @@ except Exception as e:
report_utils.show_credits("pega-datascientist-tools/python/pdstools/reports/HealthCheck.qmd")
-```
\ No newline at end of file
+```
diff --git a/python/pdstools/utils/polars_ext.py b/python/pdstools/utils/polars_ext.py
index e2e54428..1856281d 100644
--- a/python/pdstools/utils/polars_ext.py
+++ b/python/pdstools/utils/polars_ext.py
@@ -29,7 +29,7 @@ def sample_it(s: pl.Series, n) -> pl.Series:
)
def height(self):
- return self._ldf.select(pl.first().count()).collect().item()
+ return self._ldf.select(pl.first().len()).collect().item()
def shape(self):
return (self.height(), len(self._ldf.columns))
diff --git a/python/pdstools/utils/report_utils.py b/python/pdstools/utils/report_utils.py
index 5275c344..d7b7793c 100644
--- a/python/pdstools/utils/report_utils.py
+++ b/python/pdstools/utils/report_utils.py
@@ -1,13 +1,79 @@
import re
import traceback
-from typing import Dict, List, Literal, Optional, Union
+import shutil
+import subprocess
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional, Union, Tuple
from IPython.display import display, Markdown
from great_tables import GT, style, loc
from ..adm.CDH_Guidelines import CDHGuidelines
from ..utils.show_versions import show_versions
-from ..adm.Reports import Reports
+from ..utils.types import QUERY
import polars as pl
import datetime
+import json
+
+logger = logging.getLogger(__name__)
+
+
+def _get_cmd_output(args: List[str]) -> List[str]:
+ """Get command output in an OS-agnostic way."""
+ try:
+ result = subprocess.run(
+ args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True
+ )
+ return result.stdout.split("\n")
+ except subprocess.CalledProcessError as e:
+ logger.error(f"Failed to run command {' '.join(args)}: {e}")
+ raise FileNotFoundError(
+ f"Command failed. Make sure {args[0]} is installed and in the system PATH."
+ )
+
+
+def _get_version_only(versionstr: str) -> str:
+ """Extract version number from version string."""
+ return re.sub("[^.0-9]", "", versionstr)
+
+
+def get_quarto_with_version(verbose: bool = True) -> Tuple[Path, str]:
+ """Get Quarto executable path and version."""
+ try:
+ quarto_path = shutil.which("quarto")
+ if not quarto_path:
+ raise FileNotFoundError(
+ "Quarto executable not found. Please ensure Quarto is installed and in the system PATH."
+ )
+ executable = Path(quarto_path)
+
+ version_string = _get_version_only(_get_cmd_output(["quarto", "--version"])[0])
+ message = f"quarto version: {version_string}"
+ logger.info(message)
+ if verbose:
+ print(message)
+ return executable, version_string
+ except Exception as e:
+ logger.error(f"Error getting quarto version: {e}")
+ raise
+
+
+def get_pandoc_with_version(verbose: bool = True) -> Tuple[Path, str]:
+ """Get Pandoc executable path and version."""
+ try:
+ pandoc_path = shutil.which("pandoc")
+ if not pandoc_path:
+ raise FileNotFoundError(
+ "Pandoc executable not found. Please ensure Pandoc is installed and in the system PATH."
+ )
+ executable = Path(pandoc_path)
+
+ version_string = _get_version_only(_get_cmd_output(["pandoc", "--version"])[0])
+ message = f"pandoc version: {version_string}"
+ logger.info(message)
+ if verbose:
+ print(message)
+ return executable, version_string
+ except Exception as e:
+ logger.error(f"Error getting pandoc version: {e}")
+ raise
def quarto_print(text):
@@ -65,52 +131,46 @@ def polars_subset_to_existing_cols(all_columns, cols):
return [col for col in cols if col in all_columns]
-def rag_background_styler(
- rag: Optional[Literal["Red", "Amber", "Yellow", "Green"]] = None
-):
- match rag[0].upper() if len(rag) > 0 else None:
- case "R":
+def rag_background_styler(rag: Optional[str] = None):
+ if rag is not None and len(rag) > 0:
+ rag_upper = rag[0].upper()
+ if rag_upper == "R":
return style.fill(color="orangered")
- case "A":
+ elif rag_upper == "A":
return style.fill(color="orange")
- case "Y":
+ elif rag_upper == "Y":
return style.fill(color="yellow")
- case "G":
+ elif rag_upper == "G":
return None # no green background to keep it light
- case _:
- raise ValueError(f"Not a supported RAG value: {rag}")
+ raise ValueError(f"Not a supported RAG value: {rag}")
-def rag_background_styler_dense(
- rag: Optional[Literal["Red", "Amber", "Yellow", "Green"]] = None
-):
- match rag[0].upper() if len(rag) > 0 else None:
- case "R":
+def rag_background_styler_dense(rag: Optional[str] = None):
+ if rag is not None and len(rag) > 0:
+ rag_upper = rag[0].upper()
+ if rag_upper == "R":
return style.fill(color="orangered")
- case "A":
+ elif rag_upper == "A":
return style.fill(color="orange")
- case "Y":
+ elif rag_upper == "Y":
return style.fill(color="yellow")
- case "G":
+ elif rag_upper == "G":
return style.fill(color="green")
- case _:
- raise ValueError(f"Not a supported RAG value: {rag}")
+ raise ValueError(f"Not a supported RAG value: {rag}")
-def rag_textcolor_styler(
- rag: Optional[Literal["Red", "Amber", "Yellow", "Green"]] = None
-):
- match rag[0].upper() if len(rag) > 0 else None:
- case "R":
+def rag_textcolor_styler(rag: Optional[str] = None):
+ if rag is not None and len(rag) > 0:
+ rag_upper = rag[0].upper()
+ if rag_upper == "R":
return style.text(color="orangered")
- case "A":
+ elif rag_upper == "A":
return style.text(color="orange")
- case "Y":
+ elif rag_upper == "Y":
return style.text(color="yellow")
- case "G":
+ elif rag_upper == "G":
return style.text(color="green")
- case _:
- raise ValueError(f"Not a supported RAG value: {rag}")
+ raise ValueError(f"Not a supported RAG value: {rag}")
def table_standard_formatting(
@@ -177,7 +237,6 @@ def apply_rag_styling(gt, col_name, metric):
and (best_practice_min is None or v >= best_practice_min)
and (best_practice_max is None or v <= best_practice_max)
]
- # TODO consider that bad / warning rows are exclusive
gt = apply_style(gt, "green", good_rows)
gt = apply_style(gt, "amber", warning_rows)
@@ -202,37 +261,36 @@ def apply_rag_styling(gt, col_name, metric):
gt = apply_rag_styling(gt, col_name=col_name, metric=metric)
# Value formatting
- match metric:
- case "Model Performance":
- gt = gt.fmt_number(
- decimals=2,
- columns=cols,
- )
- case "Engagement Lift":
- gt = gt.fmt_percent(
- decimals=0,
- columns=cols,
- )
- case "OmniChannel":
- gt = gt.fmt_percent(
- decimals=0,
- columns=cols,
- )
- case "CTR":
- gt = gt.fmt_percent(
- decimals=3,
- columns=cols,
- )
- case _:
- gt = gt.fmt_number(
- decimals=0,
- compact=True,
- columns=cols,
- )
+ if metric == "Model Performance":
+ gt = gt.fmt_number(
+ decimals=2,
+ columns=cols,
+ )
+ elif metric == "Engagement Lift":
+ gt = gt.fmt_percent(
+ decimals=0,
+ columns=cols,
+ )
+ elif metric == "OmniChannel":
+ gt = gt.fmt_percent(
+ decimals=0,
+ columns=cols,
+ )
+ elif metric == "CTR":
+ gt = gt.fmt_percent(
+ decimals=3,
+ columns=cols,
+ )
+ else:
+ gt = gt.fmt_number(
+ decimals=0,
+ compact=True,
+ columns=cols,
+ )
# Highlight columns with non-standard values
def simplify_name(x: str) -> str:
- if x is None:
+ if x is None:
return x
return re.sub("\\W", "", x, flags=re.IGNORECASE).upper()
@@ -346,8 +404,8 @@ def sample_values(dm, all_dm_cols, fld, n=6):
def show_credits(quarto_source: str):
- _, quarto_version = Reports.get_quarto_with_version(verbose=False)
- _, pandoc_version = Reports.get_pandoc_with_version(verbose=False)
+ _, quarto_version = get_quarto_with_version(verbose=False)
+ _, pandoc_version = get_pandoc_with_version(verbose=False)
timestamp_str = datetime.datetime.now().strftime("%d %b %Y %H:%M:%S")
@@ -357,10 +415,10 @@ def show_credits(quarto_source: str):
Document created at: {timestamp_str}
This notebook: {quarto_source}
-
+
Quarto runtime: {quarto_version}
Pandoc: {pandoc_version}
-
+
Additional details from 'pdstools.show_versions()':
"""
@@ -371,3 +429,24 @@ def show_credits(quarto_source: str):
quarto_print(
"For more information please see the [Pega Data Scientist Tools](https://github.com/pegasystems/pega-datascientist-tools)."
)
+
+
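+# A query may be a pl.Expr, a list/tuple of them, or a plain dict; none of the
+# expression forms survive the YAML params file as-is. Wrap everything in a
+# tagged JSON-compatible envelope that _deserialize_query in HealthCheck.qmd
+# can unpack again.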
+def _serialize_query(query: Optional[QUERY]) -> Optional[Dict]:
+ if query is None:
+ return None
+
+ if isinstance(query, pl.Expr):
+ query = [query]
+
+ if isinstance(query, (list, tuple)):
+ serialized_exprs = {}
+ for i, expr in enumerate(query):
+ if not isinstance(expr, pl.Expr):
+ raise ValueError("All items in query list must be Expressions")
+ serialized_exprs[str(i)] = json.loads(expr.meta.serialize(format="json"))
+ return {"type": "expr_list", "expressions": serialized_exprs}
+
+ elif isinstance(query, dict):
+ return {"type": "dict", "data": query}
+
+ raise ValueError(f"Unsupported query type: {type(query)}")
diff --git a/python/pdstools/utils/streamlit_utils.py b/python/pdstools/utils/streamlit_utils.py
index 0e062fdb..07e3d915 100644
--- a/python/pdstools/utils/streamlit_utils.py
+++ b/python/pdstools/utils/streamlit_utils.py
@@ -209,14 +209,14 @@ def filter_dataframe(
"""
to_filter_columns = st.multiselect(
- "Filter dataframe on", df.columns, key="multiselect"
+ "Filter dataframe on", df.collect_schema().names(), key="multiselect"
)
for column in to_filter_columns:
left, right = st.columns((1, 20))
left.write("## ↳")
-
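+ # collect_schema() resolves the lazy schema once; reading .schema or .columns
+ # off a LazyFrame re-resolves it each time (and warns in newer Polars).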
+ col_dtype = df.collect_schema()[column]
# Treat columns with < 20 unique values as categorical
- if (df.schema[column] == pl.Categorical) or (df.schema[column] == pl.Utf8):
+ if (col_dtype == pl.Categorical) or (col_dtype == pl.Utf8):
if f"categories_{column}" not in st.session_state.keys():
st.session_state[f"categories_{column}"] = (
df.select(pl.col(column).unique()).collect().to_series().to_list()
@@ -245,7 +245,7 @@ def filter_dataframe(
if user_text_input:
queries.append(pl.col(column).str.contains(user_text_input))
- elif df.schema[column] in pl.NUMERIC_DTYPES:
+ elif col_dtype in pl.NUMERIC_DTYPES:
min_col, max_col = right.columns((1, 1))
_min = float(df.select(pl.min(column)).collect().item())
_max = float(df.select(pl.max(column)).collect().item())
@@ -272,7 +272,7 @@ def filter_dataframe(
user_num_input = [user_min, user_max]
if user_num_input[0] != _min or user_num_input[1] != _max:
queries.append(pl.col(column).is_between(*user_num_input))
- elif df.schema[column] in pl.TEMPORAL_DTYPES:
+ elif col_dtype in pl.TEMPORAL_DTYPES:
user_date_input = right.date_input(
f"Values for {column}",
value=(
diff --git a/python/tests/test_ValueFinder.py b/python/tests/test_ValueFinder.py
index 5964c349..9e25c2c9 100644
--- a/python/tests/test_ValueFinder.py
+++ b/python/tests/test_ValueFinder.py
@@ -70,8 +70,8 @@ def test_query(vf: ValueFinder):
query=pl.col("Stage") != "Arbitration",
)
assert (
- _vf.df.select(pl.first().count()).collect().item()
- != vf.df.select(pl.first().count()).collect().item()
+ _vf.df.select(pl.first().len()).collect().item()
+ != vf.df.select(pl.first().len()).collect().item()
)