polars expression serialization #308

Merged: 4 commits, Jan 6, 2025

11 changes: 5 additions & 6 deletions python/pdstools/adm/Plots.py
@@ -137,7 +137,7 @@ def get_nonperforming_models(df: pl.LazyFrame):
else:
print(fig.data, i)
return fig
num_models = df.select(pl.first().count()).collect().item()
num_models = df.select(pl.first().len()).collect().item()
bottomleft = get_nonperforming_models(df)
newtext = f"{num_models} models: {bottomleft} ({round(bottomleft/num_models*100, 2)}%) at (50,0)"
fig.layout.title.text += f"<br><sup>{newtext}</sup>"
@@ -486,7 +486,7 @@ def score_distribution(
)
).sort("BinIndex")

if df.select(pl.first().count()).collect().item() == 0:
if df.select(pl.first().len()).collect().item() == 0:
raise ValueError(f"There is no data for the provided modelid {model_id}")

if return_df:
@@ -578,7 +578,7 @@ def predictor_binning(
)
).sort("BinIndex")

if df.select(pl.first().count()).collect().item() == 0:
if df.select(pl.first().len()).collect().item() == 0:
raise ValueError(
f"There is no data for the provided modelid {model_id} and predictor {predictor_name}"
)
@@ -1194,7 +1194,7 @@ def binning_lift(
return plot_df

fig = px.bar(
plot_df.collect(), #.to_pandas(use_pyarrow_extension_array=False),
plot_df.collect(), # .to_pandas(use_pyarrow_extension_array=False),
x="Lift",
y="BinSymbolAbbreviated",
color="Direction",
@@ -1260,8 +1260,7 @@ def partitioned_plot(
fig.show()
return figs


# TODO I took the propensity distrib plot out of the HC as
# TODO I took the propensity distrib plot out of the HC as
# it wasn't very clear, also didn't look great visually.

@requires(
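
The `count()` to `len()` swaps above (repeated in `polars_ext.py` at the bottom of this diff) track a polars API change: `Expr.count()` now counts only non-null values, while `Expr.len()` counts all rows. A minimal sketch of the difference, on a made-up frame:

```python
import polars as pl

df = pl.LazyFrame({"a": [1, None, 3], "b": [4, 5, 6]})

# Expr.count() returns the number of non-null values in the first column -> 2
non_null = df.select(pl.first().count()).collect().item()

# Expr.len() returns the number of rows, nulls included -> 3
num_rows = df.select(pl.first().len()).collect().item()

print(non_null, num_rows)  # 2 3
```

With `len()`, the row-count helpers stay correct even when the first column happens to contain nulls.
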
126 changes: 6 additions & 120 deletions python/pdstools/adm/Reports.py
@@ -1,20 +1,18 @@
__all__ = ["Reports"]
import logging
import os
import re
import shutil
import subprocess
import sys
from os import PathLike
from pathlib import Path
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union

import polars as pl

from ..utils import cdh_utils
from ..utils.namespaces import LazyNamespace
from ..utils.types import QUERY
from ..prediction import Prediction
from ..utils.report_utils import _serialize_query, get_quarto_with_version

if TYPE_CHECKING:
from .ADMDatamart import ADMDatamart
@@ -257,7 +255,7 @@ def health_check(
and (self.datamart.predictor_data is not None)
):
model_file_path, predictor_file_path = self.datamart.save_data(temp_dir)

serialized_query = _serialize_query(query)
self.run_quarto(
qmd_file=qmd_file,
output_filename=output_filename,
@@ -267,7 +265,7 @@
"model_file_path": str(model_file_path),
"predictor_file_path": str(predictor_file_path),
"prediction_file_path": str(prediction_file_path),
"query": query,
"query": serialized_query,
"title": title,
"subtitle": subtitle,
},
@@ -331,17 +329,6 @@ def _copy_quarto_file(qmd_file: str, temp_dir: Path) -> None:

shutil.copy(__reports__ / qmd_file, temp_dir)

# Never used?
# def _verify_cached_files(self, temp_dir: Path) -> None:
# """Verify that cached data files exist."""
# modeldata_files = list(temp_dir.glob("cached_modelData*"))
# predictordata_files = list(temp_dir.glob("cached_predictorData*"))

# if not modeldata_files:
# raise FileNotFoundError("No cached model data found.")
# if not predictordata_files:
# logger.warning("No cached predictor data found.")

@staticmethod
def _write_params_files(
temp_dir: Path,
@@ -353,15 +340,13 @@ def _write_params_files(
import yaml

# Parameters to python code

with open(temp_dir / "params.yml", "w") as f:
yaml.dump(
params,
f,
)

# Project/rendering options to quarto

with open(temp_dir / "_quarto.yml", "w") as f:
yaml.dump(
{
@@ -371,103 +356,6 @@
f,
)

@staticmethod
def _find_executable(exec_name: str) -> Path:
"""Find the executable on the system."""

# First find in path
exec_in_path = shutil.which(exec_name) # pragma: no cover
if exec_in_path: # pragma: no cover
return Path(exec_in_path)

# If not in path try find explicitly. TODO not sure this is wise
# maybe we should not try be smart and assume quarto/pandoc are
# properly installed.

if sys.platform == "win32": # pragma: no cover
possible_paths = [
Path(
os.environ.get("USERPROFILE", ""),
"AppData",
"Local",
"Programs",
f"{exec_name}", # assume windows is still case insensitive (NTFS changes this...)
"bin",
f"{exec_name}.cmd",
),
Path(
os.environ.get("PROGRAMFILES", ""),
f"{exec_name}",
"bin",
f"{exec_name}.cmd",
),
]
else: # pragma: no cover
possible_paths = [
Path(f"/usr/local/bin/{exec_name}"),
Path(f"/opt/{exec_name}/bin/{exec_name}"),
Path(os.environ.get("HOME", ""), ".local", "bin", exec_name),
]

for path in possible_paths:
if path.exists():
return path

raise FileNotFoundError(
"Quarto executable not found. Please ensure Quarto is installed and in the system PATH."
) # pragma: no cover

# TODO not convinced about below. This isn't necessarily the same path resolution
# as the os does. What's wrong with just assuming quarto is in the path so we can
# just test for version w code like
# def get_cmd_output(args):
# result = (
# subprocess.run(args, stdout=subprocess.PIPE).stdout.decode("utf-8").split("\n")
# )
# return result
# get_version_only(get_cmd_output(["quarto", "--version"])[0])

@staticmethod
def _get_executable_with_version(
exec_name: str, verbose: bool = False
) -> Tuple[Path, str]:
def get_version_only(versionstr):
return re.sub("[^.0-9]", "", versionstr)

try:
executable = Reports._find_executable(exec_name=exec_name)
except FileNotFoundError as e: # pragma: no cover
logger.error(e)
raise

# Check version
try:
version_result = subprocess.run(
[str(executable), "--version"],
capture_output=True,
text=True,
check=True,
)
version_string = get_version_only(
version_result.stdout.split("\n")[0].strip()
)
message = f"{exec_name} version: {version_string}"
logger.info(message)
if verbose:
print(message)
except subprocess.CalledProcessError as e: # pragma: no cover
logger.warning(f"Failed to check {exec_name} version: {e}")

return (executable, version_string)

@staticmethod
def get_quarto_with_version(verbose: bool = True) -> Tuple[Path, str]:
return Reports._get_executable_with_version("quarto", verbose=verbose)

@staticmethod
def get_pandoc_with_version(verbose: bool = True) -> Tuple[Path, str]:
return Reports._get_executable_with_version("pandoc", verbose=verbose)

@staticmethod
def run_quarto(
qmd_file: str,
@@ -488,7 +376,7 @@ def run_quarto(
analysis=analysis,
)

quarto_exec, _ = Reports.get_quarto_with_version(verbose)
quarto_exec, _ = get_quarto_with_version(verbose)

command = [
str(quarto_exec),
@@ -565,9 +453,7 @@ def excel_report(
}

if self.datamart.predictor_data is not None:
tabs["predictors_overview"] = (
self.datamart.aggregates.predictors_overview()
)
tabs["predictors_overview"] = self.datamart.aggregates.predictors_overview()

if predictor_binning and self.datamart.predictor_data is not None:
tabs["predictor_binning"] = self.datamart.aggregates.last(
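
`_serialize_query` itself lives in `pdstools.utils.report_utils` and is not part of this diff. The motivation: `run_quarto` hands parameters to quarto through `params.yml`, which can hold only plain YAML values, not live `pl.Expr` objects. A hypothetical sketch of the helper, inferred from the `_deserialize_query` counterpart added to HealthCheck.qmd further down; the real implementation may differ:

```python
import json
import polars as pl

def _serialize_query(query):
    # Sketch only: mirrors the "expr_list"/"dict" branches that
    # _deserialize_query in HealthCheck.qmd expects.
    if query is None:
        return None
    if isinstance(query, pl.Expr):
        query = [query]
    if isinstance(query, (list, tuple)):
        return {
            "type": "expr_list",
            "expressions": {
                i: json.loads(expr.meta.serialize(format="json"))
                for i, expr in enumerate(query)
            },
        }
    if isinstance(query, dict):
        return {"type": "dict", "data": query}
    raise ValueError(f"Unsupported query type: {type(query)}")
```
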
14 changes: 10 additions & 4 deletions python/pdstools/app/health_check/pages/2_Data_Filters.py
@@ -18,9 +18,14 @@
"Upload Filters You Downloaded Earlier", type=["json"]
)
if uploaded_file:
import io

imported_filters = json.load(uploaded_file)
for key, val in imported_filters.items():
expr_list.append(pl.Expr.from_json(json.dumps(val)))
# Convert the JSON string to a StringIO object and specify the format as 'json'
json_str = json.dumps(val)
str_io = io.StringIO(json_str)
expr_list.append(pl.Expr.deserialize(str_io, format="json"))

st.session_state["filters"] = filter_dataframe(
st.session_state["dm"].model_data, queries=expr_list
@@ -33,10 +38,11 @@
filtered_modelid_count, filtered_row_count = model_and_row_counts(
_apply_query(st.session_state["dm"].model_data, st.session_state["filters"])
)
deserialize_exprs = {}
serialized_exprs = {}
for i, expr in enumerate(st.session_state["filters"]):
deserialize_exprs[i] = json.loads(expr.meta.write_json())
data = json.dumps(deserialize_exprs)
serialized = expr.meta.serialize(format="json")
serialized_exprs[i] = json.loads(serialized)
data = json.dumps(serialized_exprs)
st.download_button(
label="Download Filters",
data=data,
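
The old `pl.Expr.from_json` and `expr.meta.write_json` used on this page were removed in polars 1.x; the replacement pair round-trips through a JSON string and a file-like object. A minimal sketch of the round trip:

```python
import io
import polars as pl

expr = pl.col("ResponseCount") > 100

serialized = expr.meta.serialize(format="json")  # Expr -> JSON string
restored = pl.Expr.deserialize(io.StringIO(serialized), format="json")

assert restored.meta.eq(expr)  # same expression tree
```
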
25 changes: 22 additions & 3 deletions python/pdstools/reports/HealthCheck.qmd
@@ -48,6 +48,8 @@ import polars as pl

import numpy as np
import math
import json
import io

cdh_guidelines = CDHGuidelines()

@@ -73,6 +75,22 @@ def fig_set_xaxis_modelperformance(fig, label="Model Performance"):
        .update_xaxes(title=label, showticklabels=True, visible=True)
    )
    return fig

def _deserialize_query(serialized_query):
    if serialized_query is None:
        return None

    if serialized_query["type"] == "expr_list":
        expr_list = []
        for _, val in serialized_query["expressions"].items():
            json_str = json.dumps(val)
            str_io = io.StringIO(json_str)
            expr_list.append(pl.Expr.deserialize(str_io, format="json"))
        return expr_list

    elif serialized_query["type"] == "dict":
        return serialized_query["data"]

    raise ValueError(f"Unknown query type: {serialized_query['type']}")
```

```{python}
@@ -84,7 +102,7 @@
title = "ADM Model Overview"
subtitle = "Sample data"

# Insert the paths to your data files here to run the notebook from your IDE.
# Insert the paths to your data files here to run the notebook from your IDE.
# Edit the _quarto.yml to enable/disable specific sections of the quarto output.
# Parameters will be overridden by quarto when a parameters yaml is provided

@@ -116,6 +134,7 @@ if query and query == "None":
query = None


query = _deserialize_query(query)
responsecount_analysis_query = (
pl.col("ResponseCount") > responsecount_analysis_threshold
)
@@ -1390,7 +1409,7 @@ if datamart.predictor_data is not None:
# The default of observed=False is deprecated...

fig = px.treemap(
missing,
missing,
path=path,
color="Percentage without responses",
template="pega",
@@ -1840,4 +1859,4 @@ except Exception as e:
report_utils.show_credits("pega-datascientist-tools/python/pdstools/reports/HealthCheck.qmd")


```
```
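
Taken together with the Streamlit page above, the handoff works like this: the app serializes each filter expression to JSON, `Reports.run_quarto` writes the result into `params.yml`, and `_deserialize_query` rebuilds live expressions when the report renders. A hypothetical payload for a single filter (the column name and value are made up for illustration):

```python
import json
import polars as pl

# What a serialized single-filter query could look like once it reaches
# the report via params.yml:
serialized_query = {
    "type": "expr_list",
    "expressions": {
        "0": json.loads((pl.col("Channel") == "Web").meta.serialize(format="json"))
    },
}

# Inside the report, the _deserialize_query helper defined above turns
# this back into [pl.col("Channel") == "Web"].
```
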
2 changes: 1 addition & 1 deletion python/pdstools/utils/polars_ext.py
@@ -29,7 +29,7 @@ def sample_it(s: pl.Series, n) -> pl.Series:
)

def height(self):
return self._ldf.select(pl.first().count()).collect().item()
return self._ldf.select(pl.first().len()).collect().item()

def shape(self):
return (self.height(), len(self._ldf.columns))