diff --git a/python/pdstools/adm/Plots.py b/python/pdstools/adm/Plots.py
index aa511c3f..a5215c43 100644
--- a/python/pdstools/adm/Plots.py
+++ b/python/pdstools/adm/Plots.py
@@ -137,7 +137,7 @@ def get_nonperforming_models(df: pl.LazyFrame):
             else:
                 print(fig.data, i)
         return fig
-    num_models = df.select(pl.first().count()).collect().item()
+    num_models = df.select(pl.first().len()).collect().item()
     bottomleft = get_nonperforming_models(df)
     newtext = f"{num_models} models: {bottomleft} ({round(bottomleft/num_models*100, 2)}%) at (50,0)"
     fig.layout.title.text += f"<br>{newtext}"
@@ -486,7 +486,7 @@ score_distribution(
             )
         ).sort("BinIndex")
 
-        if df.select(pl.first().count()).collect().item() == 0:
+        if df.select(pl.first().len()).collect().item() == 0:
             raise ValueError(f"There is no data for the provided modelid {model_id}")
 
         if return_df:
@@ -578,7 +578,7 @@ predictor_binning(
             )
         ).sort("BinIndex")
 
-        if df.select(pl.first().count()).collect().item() == 0:
+        if df.select(pl.first().len()).collect().item() == 0:
             raise ValueError(
                 f"There is no data for the provided modelid {model_id} and predictor {predictor_name}"
             )
@@ -1194,7 +1194,7 @@ binning_lift(
         return plot_df
 
     fig = px.bar(
-        plot_df.collect(),  #.to_pandas(use_pyarrow_extension_array=False),
+        plot_df.collect(),  # .to_pandas(use_pyarrow_extension_array=False),
        x="Lift",
        y="BinSymbolAbbreviated",
        color="Direction",
@@ -1260,8 +1260,7 @@ partitioned_plot(
             fig.show()
         return figs
 
-
-    # TODO I took the propensity distrib plot out of the HC as
+    # TODO I took the propensity distrib plot out of the HC as
     # it wasn't very clear, also didn't look great visually.
 
     @requires(
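Note on the `count()` → `len()` swaps in this and the later files: on recent Polars versions `Expr.count` skips nulls (and warns when used for plain row counting), while `Expr.len` counts every row. A minimal sketch with a hypothetical frame standing in for the ADM model data:

```python
import polars as pl

# Hypothetical stand-in for the ADM model data.
lf = pl.LazyFrame({"ModelID": ["a", "b", None], "Performance": [52.0, 71.0, 55.0]})

# len() counts all rows of the first column, nulls included;
# count() would return 2 here because it skips the null ModelID.
num_models = lf.select(pl.first().len()).collect().item()
print(num_models)  # 3
```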
{newtext}" @@ -486,7 +486,7 @@ def score_distribution( ) ).sort("BinIndex") - if df.select(pl.first().count()).collect().item() == 0: + if df.select(pl.first().len()).collect().item() == 0: raise ValueError(f"There is no data for the provided modelid {model_id}") if return_df: @@ -578,7 +578,7 @@ def predictor_binning( ) ).sort("BinIndex") - if df.select(pl.first().count()).collect().item() == 0: + if df.select(pl.first().len()).collect().item() == 0: raise ValueError( f"There is no data for the provided modelid {model_id} and predictor {predictor_name}" ) @@ -1194,7 +1194,7 @@ def binning_lift( return plot_df fig = px.bar( - plot_df.collect(), #.to_pandas(use_pyarrow_extension_array=False), + plot_df.collect(), # .to_pandas(use_pyarrow_extension_array=False), x="Lift", y="BinSymbolAbbreviated", color="Direction", @@ -1260,8 +1260,7 @@ def partitioned_plot( fig.show() return figs - - # TODO I took the propensity distrib plot out of the HC as + # TODO I took the propensity distrib plot out of the HC as # it wasn't very clear, also didn't look great visually. @requires( diff --git a/python/pdstools/adm/Reports.py b/python/pdstools/adm/Reports.py index b3626cef..ecf01d69 100644 --- a/python/pdstools/adm/Reports.py +++ b/python/pdstools/adm/Reports.py @@ -1,20 +1,18 @@ __all__ = ["Reports"] import logging import os -import re import shutil import subprocess -import sys from os import PathLike from pathlib import Path -from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union import polars as pl from ..utils import cdh_utils from ..utils.namespaces import LazyNamespace from ..utils.types import QUERY -from ..prediction import Prediction +from ..utils.report_utils import _serialize_query, get_quarto_with_version if TYPE_CHECKING: from .ADMDatamart import ADMDatamart @@ -257,7 +255,7 @@ def health_check( and (self.datamart.predictor_data is not None) ): model_file_path, predictor_file_path = self.datamart.save_data(temp_dir) - + serialized_query = _serialize_query(query) self.run_quarto( qmd_file=qmd_file, output_filename=output_filename, @@ -267,7 +265,7 @@ def health_check( "model_file_path": str(model_file_path), "predictor_file_path": str(predictor_file_path), "prediction_file_path": str(prediction_file_path), - "query": query, + "query": serialized_query, "title": title, "subtitle": subtitle, }, @@ -331,17 +329,6 @@ def _copy_quarto_file(qmd_file: str, temp_dir: Path) -> None: shutil.copy(__reports__ / qmd_file, temp_dir) - # Never used? 
-    # def _verify_cached_files(self, temp_dir: Path) -> None:
-    #     """Verify that cached data files exist."""
-    #     modeldata_files = list(temp_dir.glob("cached_modelData*"))
-    #     predictordata_files = list(temp_dir.glob("cached_predictorData*"))
-
-    #     if not modeldata_files:
-    #         raise FileNotFoundError("No cached model data found.")
-    #     if not predictordata_files:
-    #         logger.warning("No cached predictor data found.")
-
     @staticmethod
     def _write_params_files(
         temp_dir: Path,
@@ -353,7 +340,6 @@
         import yaml
 
         # Parameters to python code
-
         with open(temp_dir / "params.yml", "w") as f:
             yaml.dump(
                 params,
@@ -361,7 +347,6 @@
             )
 
         # Project/rendering options to quarto
-
         with open(temp_dir / "_quarto.yml", "w") as f:
             yaml.dump(
                 {
@@ -371,103 +356,6 @@
                 f,
             )
 
-    @staticmethod
-    def _find_executable(exec_name: str) -> Path:
-        """Find the executable on the system."""
-
-        # First find in path
-        exec_in_path = shutil.which(exec_name)  # pragma: no cover
-        if exec_in_path:  # pragma: no cover
-            return Path(exec_in_path)
-
-        # If not in path try find explicitly. TODO not sure this is wise
-        # maybe we should not try be smart and assume quarto/pandoc are
-        # properly installed.
-
-        if sys.platform == "win32":  # pragma: no cover
-            possible_paths = [
-                Path(
-                    os.environ.get("USERPROFILE", ""),
-                    "AppData",
-                    "Local",
-                    "Programs",
-                    f"{exec_name}",  # assume windows is still case insensitive (NTFS changes this...)
-                    "bin",
-                    f"{exec_name}.cmd",
-                ),
-                Path(
-                    os.environ.get("PROGRAMFILES", ""),
-                    f"{exec_name}",
-                    "bin",
-                    f"{exec_name}.cmd",
-                ),
-            ]
-        else:  # pragma: no cover
-            possible_paths = [
-                Path(f"/usr/local/bin/{exec_name}"),
-                Path(f"/opt/{exec_name}/bin/{exec_name}"),
-                Path(os.environ.get("HOME", ""), ".local", "bin", exec_name),
-            ]
-
-        for path in possible_paths:
-            if path.exists():
-                return path
-
-        raise FileNotFoundError(
-            "Quarto executable not found. Please ensure Quarto is installed and in the system PATH."
-        )  # pragma: no cover
-
-    # TODO not conviced about below. This isn't necessarily the same path resolution
-    # as the os does. What's wrong with just assuming quarto is in the path so we can
-    # just test for version w code like
-    # def get_cmd_output(args):
-    #     result = (
-    #         subprocess.run(args, stdout=subprocess.PIPE).stdout.decode("utf-8").split("\n")
-    #     )
-    #     return result
-    # get_version_only(get_cmd_output(["quarto", "--version"])[0])
-
-    @staticmethod
-    def _get_executable_with_version(
-        exec_name: str, verbose: bool = False
-    ) -> Tuple[Path, str]:
-        def get_version_only(versionstr):
-            return re.sub("[^.0-9]", "", versionstr)
-
-        try:
-            executable = Reports._find_executable(exec_name=exec_name)
-        except FileNotFoundError as e:  # pragma: no cover
-            logger.error(e)
-            raise
-
-        # Check version
-        try:
-            version_result = subprocess.run(
-                [str(executable), "--version"],
-                capture_output=True,
-                text=True,
-                check=True,
-            )
-            version_string = get_version_only(
-                version_result.stdout.split("\n")[0].strip()
-            )
-            message = f"{exec_name} version: {version_string}"
-            logger.info(message)
-            if verbose:
-                print(message)
-        except subprocess.CalledProcessError as e:  # pragma: no cover
-            logger.warning(f"Failed to check {exec_name} version: {e}")
-
-        return (executable, version_string)
-
-    @staticmethod
-    def get_quarto_with_version(verbose: bool = True) -> Tuple[Path, str]:
-        return Reports._get_executable_with_version("quarto", verbose=verbose)
-
-    @staticmethod
-    def get_pandoc_with_version(verbose: bool = True) -> Tuple[Path, str]:
-        return Reports._get_executable_with_version("pandoc", verbose=verbose)
-
     @staticmethod
     def run_quarto(
         qmd_file: str,
@@ -488,7 +376,7 @@
             analysis=analysis,
         )
 
-        quarto_exec, _ = Reports.get_quarto_with_version(verbose)
+        quarto_exec, _ = get_quarto_with_version(verbose)
 
         command = [
             str(quarto_exec),
@@ -565,9 +453,7 @@ excel_report(
         }
 
         if self.datamart.predictor_data is not None:
-            tabs["predictors_overview"] = (
-                self.datamart.aggregates.predictors_overview()
-            )
+            tabs["predictors_overview"] = self.datamart.aggregates.predictors_overview()
 
         if predictor_binning and self.datamart.predictor_data is not None:
             tabs["predictor_binning"] = self.datamart.aggregates.last(
diff --git a/python/pdstools/app/health_check/pages/2_Data_Filters.py b/python/pdstools/app/health_check/pages/2_Data_Filters.py
index 1eb8543f..6382b50e 100644
--- a/python/pdstools/app/health_check/pages/2_Data_Filters.py
+++ b/python/pdstools/app/health_check/pages/2_Data_Filters.py
@@ -18,9 +18,14 @@
     "Upload Filters You Downloaded Earlier", type=["json"]
 )
 if uploaded_file:
+    import io
+
     imported_filters = json.load(uploaded_file)
     for key, val in imported_filters.items():
-        expr_list.append(pl.Expr.from_json(json.dumps(val)))
+        # Convert the JSON string to a StringIO object and specify the format as 'json'
+        json_str = json.dumps(val)
+        str_io = io.StringIO(json_str)
+        expr_list.append(pl.Expr.deserialize(str_io, format="json"))
 
 st.session_state["filters"] = filter_dataframe(
     st.session_state["dm"].model_data, queries=expr_list
@@ -33,10 +38,11 @@
 filtered_modelid_count, filtered_row_count = model_and_row_counts(
     _apply_query(st.session_state["dm"].model_data, st.session_state["filters"])
 )
-deserialize_exprs = {}
+serialized_exprs = {}
 for i, expr in enumerate(st.session_state["filters"]):
-    deserialize_exprs[i] = json.loads(expr.meta.write_json())
-    data = json.dumps(deserialize_exprs)
+    serialized = expr.meta.serialize(format="json")
+    serialized_exprs[i] = json.loads(serialized)
+    data = json.dumps(serialized_exprs)
 st.download_button(
     label="Download Filters",
     data=data,
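The filter page now uses the `Expr.meta.serialize`/`Expr.deserialize` pair that replaces the removed `meta.write_json`/`Expr.from_json` API. A small round trip, assuming a Polars version where `deserialize` takes a file-like source:

```python
import io
import polars as pl

expr = pl.col("ResponseCount") > 100

# Serialize the expression to a JSON string...
serialized = expr.meta.serialize(format="json")

# ...and read it back; deserialize expects a file-like object,
# hence the StringIO wrapper used in the page above.
roundtripped = pl.Expr.deserialize(io.StringIO(serialized), format="json")
assert expr.meta.eq(roundtripped)
```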
diff --git a/python/pdstools/reports/HealthCheck.qmd b/python/pdstools/reports/HealthCheck.qmd
index d817674a..ef64e473 100644
--- a/python/pdstools/reports/HealthCheck.qmd
+++ b/python/pdstools/reports/HealthCheck.qmd
@@ -48,6 +48,8 @@
 import polars as pl
 import numpy as np
 import math
+import json
+import io
 
 cdh_guidelines = CDHGuidelines()
 
@@ -73,6 +75,22 @@ def fig_set_xaxis_modelperformance(fig, label="Model Performance"):
         .update_xaxes(title=label, showticklabels=True, visible=True)
     )
     return fig
+def _deserialize_query(serialized_query):
+    if serialized_query is None:
+        return None
+
+    if serialized_query["type"] == "expr_list":
+        expr_list = []
+        for _, val in serialized_query["expressions"].items():
+            json_str = json.dumps(val)
+            str_io = io.StringIO(json_str)
+            expr_list.append(pl.Expr.deserialize(str_io, format="json"))
+        return expr_list
+
+    elif serialized_query["type"] == "dict":
+        return serialized_query["data"]
+
+    raise ValueError(f"Unknown query type: {serialized_query['type']}")
 ```
 
 ```{python}
@@ -84,7 +102,7 @@
 title = "ADM Model Overview"
 subtitle = "Sample data"
 
-# Insert the paths to your data files here to run the notebook from your IDE.
+# Insert the paths to your data files here to run the notebook from your IDE.
 # Edit the _quarto.yml to enable/disable specific sections of the quarto output.
 # Parameters will be overriden by quarto when a parameters yaml is provided
 
@@ -116,6 +134,7 @@
 if query and query == "None":
     query = None
 
+query = _deserialize_query(query)
 responsecount_analysis_query = (
     pl.col("ResponseCount") > responsecount_analysis_threshold
 )
@@ -1390,7 +1409,7 @@ if datamart.predictor_data is not None:
     # The default of observed=False is deprecated...
 
     fig = px.treemap(
-        missing,
+        missing,
         path=path,
         color="Percentage without responses",
         template="pega",
@@ -1840,4 +1859,4 @@ except Exception as e:
 
 report_utils.show_credits("pega-datascientist-tools/python/pdstools/reports/HealthCheck.qmd")
 
-```
\ No newline at end of file
+```
diff --git a/python/pdstools/utils/polars_ext.py b/python/pdstools/utils/polars_ext.py
index e2e54428..1856281d 100644
--- a/python/pdstools/utils/polars_ext.py
+++ b/python/pdstools/utils/polars_ext.py
@@ -29,7 +29,7 @@ def sample_it(s: pl.Series, n) -> pl.Series:
         )
 
     def height(self):
-        return self._ldf.select(pl.first().count()).collect().item()
+        return self._ldf.select(pl.first().len()).collect().item()
 
     def shape(self):
         return (self.height(), len(self._ldf.columns))
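A `pl.Expr` cannot ride along in the YAML params file that Quarto hands to the notebook, which is why the health check gains the `_deserialize_query` helper above and the `_serialize_query` helper added to `report_utils` below. This sketch mimics their `{"type": "expr_list", ...}` envelope outside pdstools; the expressions are illustrative:

```python
import io
import json
import polars as pl

exprs = [pl.col("Channel") == "Web", pl.col("ResponseCount") > 0]

# Wrap the meta-serialized expressions in a plain-data envelope...
envelope = {
    "type": "expr_list",
    "expressions": {
        str(i): json.loads(e.meta.serialize(format="json"))
        for i, e in enumerate(exprs)
    },
}

# ...which survives json/yaml dumping unchanged...
payload = json.dumps(envelope)

# ...and can be turned back into expressions on the notebook side.
restored = [
    pl.Expr.deserialize(io.StringIO(json.dumps(v)), format="json")
    for v in json.loads(payload)["expressions"].values()
]
```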
result.stdout.split("\n") + except subprocess.CalledProcessError as e: + logger.error(f"Failed to run command {' '.join(args)}: {e}") + raise FileNotFoundError( + f"Command failed. Make sure {args[0]} is installed and in the system PATH." + ) + + +def _get_version_only(versionstr: str) -> str: + """Extract version number from version string.""" + return re.sub("[^.0-9]", "", versionstr) + + +def get_quarto_with_version(verbose: bool = True) -> Tuple[Path, str]: + """Get Quarto executable path and version.""" + try: + executable = Path(shutil.which("quarto")) + if not executable: + raise FileNotFoundError( + "Quarto executable not found. Please ensure Quarto is installed and in the system PATH." + ) + + version_string = _get_version_only(_get_cmd_output(["quarto", "--version"])[0]) + message = f"quarto version: {version_string}" + logger.info(message) + if verbose: + print(message) + return executable, version_string + except Exception as e: + logger.error(f"Error getting quarto version: {e}") + raise + + +def get_pandoc_with_version(verbose: bool = True) -> Tuple[Path, str]: + """Get Pandoc executable path and version.""" + try: + executable = Path(shutil.which("pandoc")) + if not executable: + raise FileNotFoundError( + "Pandoc executable not found. Please ensure Pandoc is installed and in the system PATH." + ) + + version_string = _get_version_only(_get_cmd_output(["pandoc", "--version"])[0]) + message = f"pandoc version: {version_string}" + logger.info(message) + if verbose: + print(message) + return executable, version_string + except Exception as e: + logger.error(f"Error getting pandoc version: {e}") + raise def quarto_print(text): @@ -65,52 +131,46 @@ def polars_subset_to_existing_cols(all_columns, cols): return [col for col in cols if col in all_columns] -def rag_background_styler( - rag: Optional[Literal["Red", "Amber", "Yellow", "Green"]] = None -): - match rag[0].upper() if len(rag) > 0 else None: - case "R": +def rag_background_styler(rag: Optional[str] = None): + if rag is not None and len(rag) > 0: + rag_upper = rag[0].upper() + if rag_upper == "R": return style.fill(color="orangered") - case "A": + elif rag_upper == "A": return style.fill(color="orange") - case "Y": + elif rag_upper == "Y": return style.fill(color="yellow") - case "G": + elif rag_upper == "G": return None # no green background to keep it light - case _: - raise ValueError(f"Not a supported RAG value: {rag}") + raise ValueError(f"Not a supported RAG value: {rag}") -def rag_background_styler_dense( - rag: Optional[Literal["Red", "Amber", "Yellow", "Green"]] = None -): - match rag[0].upper() if len(rag) > 0 else None: - case "R": +def rag_background_styler_dense(rag: Optional[str] = None): + if rag is not None and len(rag) > 0: + rag_upper = rag[0].upper() + if rag_upper == "R": return style.fill(color="orangered") - case "A": + elif rag_upper == "A": return style.fill(color="orange") - case "Y": + elif rag_upper == "Y": return style.fill(color="yellow") - case "G": + elif rag_upper == "G": return style.fill(color="green") - case _: - raise ValueError(f"Not a supported RAG value: {rag}") + raise ValueError(f"Not a supported RAG value: {rag}") -def rag_textcolor_styler( - rag: Optional[Literal["Red", "Amber", "Yellow", "Green"]] = None -): - match rag[0].upper() if len(rag) > 0 else None: - case "R": +def rag_textcolor_styler(rag: Optional[str] = None): + if rag is not None and len(rag) > 0: + rag_upper = rag[0].upper() + if rag_upper == "R": return style.text(color="orangered") - case "A": + elif rag_upper == 
"A": return style.text(color="orange") - case "Y": + elif rag_upper == "Y": return style.text(color="yellow") - case "G": + elif rag_upper == "G": return style.text(color="green") - case _: - raise ValueError(f"Not a supported RAG value: {rag}") + raise ValueError(f"Not a supported RAG value: {rag}") def table_standard_formatting( @@ -177,7 +237,6 @@ def apply_rag_styling(gt, col_name, metric): and (best_practice_min is None or v >= best_practice_min) and (best_practice_max is None or v <= best_practice_max) ] - # TODO consider that bad / warning rows are exclusive gt = apply_style(gt, "green", good_rows) gt = apply_style(gt, "amber", warning_rows) @@ -202,37 +261,36 @@ def apply_rag_styling(gt, col_name, metric): gt = apply_rag_styling(gt, col_name=col_name, metric=metric) # Value formatting - match metric: - case "Model Performance": - gt = gt.fmt_number( - decimals=2, - columns=cols, - ) - case "Engagement Lift": - gt = gt.fmt_percent( - decimals=0, - columns=cols, - ) - case "OmniChannel": - gt = gt.fmt_percent( - decimals=0, - columns=cols, - ) - case "CTR": - gt = gt.fmt_percent( - decimals=3, - columns=cols, - ) - case _: - gt = gt.fmt_number( - decimals=0, - compact=True, - columns=cols, - ) + if metric == "Model Performance": + gt = gt.fmt_number( + decimals=2, + columns=cols, + ) + elif metric == "Engagement Lift": + gt = gt.fmt_percent( + decimals=0, + columns=cols, + ) + elif metric == "OmniChannel": + gt = gt.fmt_percent( + decimals=0, + columns=cols, + ) + elif metric == "CTR": + gt = gt.fmt_percent( + decimals=3, + columns=cols, + ) + else: + gt = gt.fmt_number( + decimals=0, + compact=True, + columns=cols, + ) # Highlight columns with non-standard values def simplify_name(x: str) -> str: - if x is None: + if x is None: return x return re.sub("\\W", "", x, flags=re.IGNORECASE).upper() @@ -346,8 +404,8 @@ def sample_values(dm, all_dm_cols, fld, n=6): def show_credits(quarto_source: str): - _, quarto_version = Reports.get_quarto_with_version(verbose=False) - _, pandoc_version = Reports.get_pandoc_with_version(verbose=False) + _, quarto_version = get_quarto_with_version(verbose=False) + _, pandoc_version = get_pandoc_with_version(verbose=False) timestamp_str = datetime.datetime.now().strftime("%d %b %Y %H:%M:%S") @@ -357,10 +415,10 @@ def show_credits(quarto_source: str): Document created at: {timestamp_str} This notebook: {quarto_source} - + Quarto runtime: {quarto_version} Pandoc: {pandoc_version} - + Additional details from 'pdstools.show_versions()': """ @@ -371,3 +429,24 @@ def show_credits(quarto_source: str): quarto_print( "For more information please see the [Pega Data Scientist Tools](https://github.com/pegasystems/pega-datascientist-tools)." 
     )
+
+
+def _serialize_query(query: Optional[QUERY]) -> Optional[Dict]:
+    if query is None:
+        return None
+
+    if isinstance(query, pl.Expr):
+        query = [query]
+
+    if isinstance(query, (list, tuple)):
+        serialized_exprs = {}
+        for i, expr in enumerate(query):
+            if not isinstance(expr, pl.Expr):
+                raise ValueError("All items in query list must be Expressions")
+            serialized_exprs[str(i)] = json.loads(expr.meta.serialize(format="json"))
+        return {"type": "expr_list", "expressions": serialized_exprs}
+
+    elif isinstance(query, dict):
+        return {"type": "dict", "data": query}
+
+    raise ValueError(f"Unsupported query type: {type(query)}")
diff --git a/python/pdstools/utils/streamlit_utils.py b/python/pdstools/utils/streamlit_utils.py
index 0e062fdb..07e3d915 100644
--- a/python/pdstools/utils/streamlit_utils.py
+++ b/python/pdstools/utils/streamlit_utils.py
@@ -209,14 +209,14 @@ def filter_dataframe(
     """
 
     to_filter_columns = st.multiselect(
-        "Filter dataframe on", df.columns, key="multiselect"
+        "Filter dataframe on", df.collect_schema().names(), key="multiselect"
    )
     for column in to_filter_columns:
         left, right = st.columns((1, 20))
         left.write("## ↳")
-
+        col_dtype = df.collect_schema()[column]
         # Treat columns with < 20 unique values as categorical
-        if (df.schema[column] == pl.Categorical) or (df.schema[column] == pl.Utf8):
+        if (col_dtype == pl.Categorical) or (col_dtype == pl.Utf8):
             if f"categories_{column}" not in st.session_state.keys():
                 st.session_state[f"categories_{column}"] = (
                     df.select(pl.col(column).unique()).collect().to_series().to_list()
@@ -245,7 +245,7 @@
             if user_text_input:
                 queries.append(pl.col(column).str.contains(user_text_input))
-        elif df.schema[column] in pl.NUMERIC_DTYPES:
+        elif col_dtype in pl.NUMERIC_DTYPES:
             min_col, max_col = right.columns((1, 1))
             _min = float(df.select(pl.min(column)).collect().item())
             _max = float(df.select(pl.max(column)).collect().item())
@@ -272,7 +272,7 @@
             user_num_input = [user_min, user_max]
         if user_num_input[0] != _min or user_num_input[1] != _max:
             queries.append(pl.col(column).is_between(*user_num_input))
-        elif df.schema[column] in pl.TEMPORAL_DTYPES:
+        elif col_dtype in pl.TEMPORAL_DTYPES:
             user_date_input = right.date_input(
                 f"Values for {column}",
                 value=(
diff --git a/python/tests/test_ValueFinder.py b/python/tests/test_ValueFinder.py
index 5964c349..9e25c2c9 100644
--- a/python/tests/test_ValueFinder.py
+++ b/python/tests/test_ValueFinder.py
@@ -70,8 +70,8 @@ def test_query(vf: ValueFinder):
         query=pl.col("Stage") != "Arbitration",
     )
     assert (
-        _vf.df.select(pl.first().count()).collect().item()
-        != vf.df.select(pl.first().count()).collect().item()
+        _vf.df.select(pl.first().len()).collect().item()
+        != vf.df.select(pl.first().len()).collect().item()
     )
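Finally, the `streamlit_utils` changes above replace repeated `df.schema[column]` lookups on a LazyFrame with a single `collect_schema()` call: on recent Polars, touching `.schema` or `.columns` on a lazy frame resolves the schema each time and emits a performance warning. A small sketch with a hypothetical frame:

```python
import polars as pl

lf = pl.LazyFrame({"Channel": ["Web", "Mobile"], "ResponseCount": [10, 25]})

# Resolve the schema once instead of reading lf.schema on every loop pass.
schema = lf.collect_schema()
print(schema.names())                                # ['Channel', 'ResponseCount']
print(schema["ResponseCount"])                       # Int64
print(schema["ResponseCount"] in pl.NUMERIC_DTYPES)  # True (as used above)
```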