diff --git a/examples/ih/Conversion_Reporting.ipynb b/examples/ih/Conversion_Reporting.ipynb index 077263fa..ddfd1813 100644 --- a/examples/ih/Conversion_Reporting.ipynb +++ b/examples/ih/Conversion_Reporting.ipynb @@ -33,17 +33,12 @@ "from pathlib import Path\n", "import polars as pl\n", "\n", - "ih_export_file = Path(\n", - " \"./Data-pxStrategyResult_InteractionFiles_20241213T091932_GMT.zip\"\n", - ")\n", "\n", - "if not ih_export_file.exists():\n", - " ih = IH.from_mock_data()\n", - "else:\n", - " ih = IH.from_ds_export(\n", - " ih_export_file,\n", - " query=pl.col.ExperimentGroup.is_not_null() & (pl.col.ExperimentGroup != \"\"),\n", - " )\n", + "ih = IH.from_mock_data()\n", + "# ih = IH.from_ds_export(\n", + "# \"./Data-pxStrategyResult_InteractionFiles_20241213T091932_GMT.zip\",\n", + "# query=pl.col.ExperimentGroup.is_not_null() & (pl.col.ExperimentGroup != \"\"),\n", + "# )\n", "\n", "ih.aggregates.summary_success_rates(by=[\"ExperimentGroup\", \"Channel\"]).drop(\n", " \"Outcomes\"\n", @@ -79,7 +74,7 @@ "metadata": {}, "outputs": [], "source": [ - "ih.plot.success_rates_tree_map(metric=\"Conversion\")\n" + "ih.plot.success_rate_tree_map(metric=\"Conversion\")\n" ] }, { @@ -97,20 +92,7 @@ "metadata": {}, "outputs": [], "source": [ - "ih.plot.success_rates_trend_bar(\n", - " metric=\"Conversion\",\n", - " condition=\"ExperimentGroup\",\n", - " every=\"1w\",\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ih.plot.success_rates_trend(metric=\"Conversion\", every=\"1d\")" + "ih.plot.success_rate(metric=\"Conversion\", every=\"1d\")" ] }, { @@ -139,8 +121,8 @@ "metadata": {}, "outputs": [], "source": [ - "ih.plot.success_rates_trend(\n", - " by=\"Channel\"\n", + "ih.plot.success_rate(\n", + " facet=\"Channel\"\n", ")" ] } @@ -160,7 +142,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.12.3" } }, "nbformat": 4, diff --git a/examples/ih/Example_IH_Analysis.ipynb b/examples/ih/Example_IH_Analysis.ipynb index 30188e6e..86b479d0 100644 --- a/examples/ih/Example_IH_Analysis.ipynb +++ b/examples/ih/Example_IH_Analysis.ipynb @@ -32,13 +32,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "ih = IH.from_ds_export(\n", - " \"../../data/Data-pxStrategyResult_pxInteractionHistory_20210101T010000_GMT.zip\"\n", - ")" + "# ih = IH.from_ds_export(\n", + "# \"../../data/Data-pxStrategyResult_pxInteractionHistory_20210101T010000_GMT.zip\"\n", + "# )\n", + "ih = IH.from_mock_data()" ] }, { @@ -57,6 +58,11 @@ "ih.data.head().collect()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -115,7 +121,7 @@ "metadata": {}, "outputs": [], "source": [ - "ih.plot.response_counts(every=\"1d\")" + "ih.plot.response_count(every=\"1d\")" ] }, { @@ -131,8 +137,8 @@ "metadata": {}, "outputs": [], "source": [ - "ih.plot.response_counts(\n", - " by=\"Channel\",\n", + "ih.plot.response_count(\n", + " facet=\"Channel\",\n", " query=pl.col.Channel != \"\",\n", ")" ] @@ -159,8 +165,8 @@ "metadata": {}, "outputs": [], "source": [ - "ih.plot.success_rates_trend(\n", - " by=\"Channel\", query=pl.col.Channel.is_not_null() & (pl.col.Channel != \"\")\n", + "ih.plot.success_rate(\n", + " facet=\"Channel\", query=pl.col.Channel.is_not_null() & (pl.col.Channel != \"\")\n", ")" ] }, @@ -218,10 +224,9 @@ 
" .to_list()\n", " for c in channels\n", "]\n", - "\n", "fig = ff.create_distplot(plot_data, group_labels=channels, show_hist=False)\n", - "fig.update_layout(title=\"Propensity Distribution\")\n", - "fig" + "fig.update_layout(title=\"Propensity Distribution\", yaxis=dict(showticklabels=False))\n", + "fig\n" ] }, { @@ -250,23 +255,44 @@ " .to_list()\n", " if c is not None and c != \"\"\n", "]\n", - "plot_data=ih.data.filter(pl.col.OutcomeTime.is_not_null()).group_by(\"InteractionID\").agg(\n", - " [pl.col.OutcomeTime.min().alias(\"Decision_Time\")]+\n", - " [pl.col.OutcomeTime.filter(pl.col.Outcome == o).max().alias(o) for o in outcomes],\n", - ").collect().unpivot(\n", - " index=[\"InteractionID\", \"Decision_Time\"],\n", - " variable_name=\"Outcome\",\n", - " value_name=\"Time\",\n", - ").with_columns(\n", - " Duration = (pl.col.Time - pl.col.Decision_Time).dt.total_seconds()\n", - ").filter(pl.col.Duration > 0)\n", + "plot_data = (\n", + " ih.data.filter(pl.col.OutcomeTime.is_not_null())\n", + " .group_by(\"InteractionID\")\n", + " .agg(\n", + " [pl.col.OutcomeTime.min().alias(\"Decision_Time\")]\n", + " + [\n", + " pl.col.OutcomeTime.filter(pl.col.Outcome == o).max().alias(o)\n", + " for o in outcomes\n", + " ],\n", + " )\n", + " .collect()\n", + " .unpivot(\n", + " index=[\"InteractionID\", \"Decision_Time\"],\n", + " variable_name=\"Outcome\",\n", + " value_name=\"Time\",\n", + " )\n", + " .with_columns(Duration=(pl.col.Time - pl.col.Decision_Time).dt.total_seconds())\n", + " .filter(pl.col.Duration > 0)\n", + ")\n", + "\n", + "ordered_outcomes = (\n", + " plot_data.group_by(\"Outcome\")\n", + " .agg(Duration=pl.col(\"Duration\").median())\n", + " .sort(\"Duration\")[\"Outcome\"]\n", + " .to_list()\n", + ")\n", + "\n", "fig = px.box(\n", " plot_data,\n", " x=\"Duration\",\n", " y=\"Outcome\",\n", " color=\"Outcome\",\n", - " template=\"pega\"\n", + " template=\"pega\",\n", + " category_orders={\"Outcome\": ordered_outcomes},\n", + " points=False,\n", + " title=\"Duration of Responses\"\n", ")\n", + "fig.update_layout(xaxis_title=\"Duration (seconds)\")\n", "fig" ] } @@ -286,7 +312,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.12.3" } }, "nbformat": 4, diff --git a/python/pdstools/ih/IH.py b/python/pdstools/ih/IH.py index 0ded1472..7b23ef0d 100644 --- a/python/pdstools/ih/IH.py +++ b/python/pdstools/ih/IH.py @@ -24,6 +24,7 @@ def __init__(self, data: pl.LazyFrame): self.positive_outcome_labels = { "Engagement": ["Accepted", "Accept", "Clicked", "Click"], "Conversion": ["Conversion"], + "OpenRate": ["Opened", "Open"], } self.negative_outcome_labels = { "Engagement": [ @@ -33,6 +34,7 @@ def __init__(self, data: pl.LazyFrame): "NoResponse", ], "Conversion": ["Impression", "Pending"], + "OpenRate": ["Impression", "Pending"], } @classmethod @@ -41,8 +43,21 @@ def from_ds_export( ih_filename: Union[os.PathLike, str], query: Optional[QUERY] = None, ): - """Import from a Pega Dataset Export""" + """Create an IH instance from a file with Pega Dataset Export + Parameters + ---------- + ih_filename : Union[os.PathLike, str] + The full path to the dataset files + query : Optional[QUERY], optional + An optional argument to filter out selected data, by default None + + Returns + ------- + IH + The properly initialized IH object + + """ data = read_ds_export(ih_filename).with_columns( # TODO this should come from some polars func in utils 
pl.col("pxOutcomeTime").str.strptime(pl.Datetime, "%Y%m%dT%H%M%S%.3f %Z") @@ -52,13 +67,31 @@ def from_ds_export( return IH(data) + @classmethod + def from_s3(cls): + """Not implemented yet. Please let us know if you would like this functionality!""" + ... + @classmethod def from_mock_data(cls, days=90, n=100000): - """Generate sample data""" - accept_rate = 0.2 - accept_avg_duration_minutes = 10 - convert_over_accept_rate_test = 0.5 - convert_over_accept_rate_control = 0.3 + """Initialize an IH instance with sample data + + Parameters + ---------- + days : number of days, defaults to 90 days + n : number of interaction data records, defaults to 100k + + Returns + ------- + IH + The properly initialized IH object + """ + click_rate = 0.2 + accept_rate = 0.15 + click_avg_duration_minutes = 2 + accept_avg_duration_minutes = 30 + convert_over_accept_click_rate_test = 0.5 + convert_over_accept_click_rate_control = 0.3 convert_avg_duration_days = 2 now = datetime.datetime.now() @@ -70,15 +103,39 @@ def from_mock_data(cls, days=90, n=100000): ih_fake_impressions = pl.DataFrame( { "pxInteractionID": [str(int(1e9 + i)) for i in range(n)], + "pyDirection": "", # will be set later from channel "pyChannel": random.choices(["Web", "Email"], k=n), - "pyIssue": "Acquisition", - "pyGroup": "Phones", - "pyName": "AppleIPhone1564GB", + "pyIssue": random.choices( + ["Acquisition", "Retention", "Risk", "Service"], k=n + ), + "pyGroup": random.choices( + [ + "Pension", + "Lending", + "Mortgages", + "Investments", + "Insurance", + "Savings", + ], + k=n, + ), + "pyName": [ + random.randint(1, 10) for _ in range(n) + ], # nr 1-10 will be appended to group + "pyTreatment": [ + random.randint(1, 2) for _ in range(n) + ], # nr will be appended to group/channel + # https://stackoverflow.com/questions/40351791/how-to-hash-strings-into-a-float-in-01 + "__PropensityInbound__": [random.betavariate(1, 10) for _ in range(n)], + "__PropensityOutbound__": [random.betavariate(1, 20) for _ in range(n)], "ExperimentGroup": ["Conversion-Test", "Conversion-Control"] * int(n / 2), "pxOutcomeTime": [ (now - datetime.timedelta(days=i * days / n)) for i in range(n) ], + "__ClickDurationMinutes__": [ + random.uniform(0, 2 * click_avg_duration_minutes) for i in range(n) + ], "__AcceptDurationMinutes__": [ random.uniform(0, 2 * accept_avg_duration_minutes) for i in range(n) ], @@ -87,29 +144,76 @@ def from_mock_data(cls, days=90, n=100000): ], } ).with_columns( + pyDirection=pl.when(pl.col("pyChannel") == "Web") + .then(pl.lit("Inbound")) + .otherwise(pl.lit("Outbound")), + pyName=pl.format("{}_{}", pl.col("pyGroup"), pl.col("pyName")), + pyTreatment=pl.format( + "{}_{}_{}Treatment{}", + pl.col("pyGroup"), + pl.col("pyName"), + pl.col("pyChannel"), + pl.col("pyTreatment"), + ), pyOutcome=pl.when(pl.col.pyChannel == "Web") .then(pl.lit("Impression")) - .otherwise(pl.lit("Pending")) + .otherwise(pl.lit("Pending")), + pyPropensity=pl.when(pl.col("pyChannel") == "Web") + .then(pl.col("__PropensityInbound__") + * pl.col("pyName").hash() + / pl.datatypes.UInt64.max()) + .otherwise(pl.col("__PropensityOutbound__") + * pl.col("pyName").hash() + / pl.datatypes.UInt64.max()), ) - ih_fake_accepts = ih_fake_impressions.sample(fraction=accept_rate).with_columns( - pl.col.pxOutcomeTime - + pl.duration(minutes=pl.col("__AcceptDurationMinutes__")), - pyOutcome=pl.when(pl.col.pyChannel == "Web") - .then(pl.lit("Clicked")) - .otherwise(pl.lit("Accepted")), + ih_fake_clicks = ( + ih_fake_impressions.filter(pl.col.pyDirection == "Inbound") + 
.sample(fraction=click_rate) + .with_columns( + pl.col.pxOutcomeTime + + pl.duration(minutes=pl.col("__ClickDurationMinutes__")), + pyOutcome=pl.lit("Clicked"), + ) ) - ih_fake_converts_test = ( + ih_fake_accepts = ( + ih_fake_impressions.filter(pl.col.pyDirection == "Outbound") + .sample(fraction=accept_rate) + .with_columns( + pl.col.pxOutcomeTime + + pl.duration(minutes=pl.col("__AcceptDurationMinutes__")), + pyOutcome=pl.lit("Accepted"), + ) + ) + ih_fake_converts_over_clicks_test = ( + ih_fake_clicks.filter(pl.col.ExperimentGroup == "Conversion-Test") + .sample(fraction=convert_over_accept_click_rate_test) + .with_columns( + pl.col.pxOutcomeTime + + pl.duration(days=pl.col("__ConvertDurationDays__")), + pyOutcome=pl.lit("Conversion"), + ) + ) + ih_fake_converts_over_accepts_test = ( ih_fake_accepts.filter(pl.col.ExperimentGroup == "Conversion-Test") - .sample(fraction=convert_over_accept_rate_test) + .sample(fraction=convert_over_accept_click_rate_test) + .with_columns( + pl.col.pxOutcomeTime + + pl.duration(days=pl.col("__ConvertDurationDays__")), + pyOutcome=pl.lit("Conversion"), + ) + ) + ih_fake_converts_over_clicks_control = ( + ih_fake_clicks.filter(pl.col.ExperimentGroup == "Conversion-Control") + .sample(fraction=convert_over_accept_click_rate_control) .with_columns( pl.col.pxOutcomeTime + pl.duration(days=pl.col("__ConvertDurationDays__")), pyOutcome=pl.lit("Conversion"), ) ) - ih_fake_converts_control = ( + ih_fake_converts_over_accepts_control = ( ih_fake_accepts.filter(pl.col.ExperimentGroup == "Conversion-Control") - .sample(fraction=convert_over_accept_rate_control) + .sample(fraction=convert_over_accept_click_rate_control) .with_columns( pl.col.pxOutcomeTime + pl.duration(days=pl.col("__ConvertDurationDays__")), @@ -121,14 +225,20 @@ def from_mock_data(cls, days=90, n=100000): pl.concat( [ ih_fake_impressions, + ih_fake_clicks, ih_fake_accepts, - ih_fake_converts_test, - ih_fake_converts_control, + ih_fake_converts_over_clicks_test, + ih_fake_converts_over_clicks_control, + ih_fake_converts_over_accepts_test, + ih_fake_converts_over_accepts_control, ] ) - .filter(pl.col("pxOutcomeTime") < pl.lit(now)) + .filter(pl.col("pxOutcomeTime") <= pl.lit(now)) .drop( [ + "__PropensityInbound__", + "__PropensityOutbound__", + "__ClickDurationMinutes__", "__AcceptDurationMinutes__", "__ConvertDurationDays__", ] diff --git a/python/pdstools/ih/Plots.py b/python/pdstools/ih/Plots.py index 7db952ec..2acbbc10 100644 --- a/python/pdstools/ih/Plots.py +++ b/python/pdstools/ih/Plots.py @@ -22,11 +22,13 @@ def __init__(self, ih: "IH_Class"): def overall_gauges( self, condition: Union[str, pl.Expr], + *, metric: Optional[str] = "Engagement", by: Optional[str] = "Channel", reference_values: Optional[Dict[str, float]] = None, title: Optional[str] = None, query: Optional[QUERY] = None, + # facet: Optional[str] = None, return_df: Optional[bool] = False, ): plot_data = self.ih.aggregates.summary_success_rates( @@ -44,7 +46,7 @@ def overall_gauges( cols = plot_data[by].unique().shape[0] # TODO can be None rows = ( plot_data[condition].unique().shape[0] - ) # TODO generalize to support pl expression + ) # TODO generalize to support pl expression, see ADM plots, eg facet in bubble chart fig = make_subplots( rows=rows, @@ -104,9 +106,11 @@ def overall_gauges( def response_count_tree_map( self, + *, by: Optional[List[str]] = None, title: Optional[str] = None, query: Optional[QUERY] = None, + facet: Optional[str] = None, return_df: Optional[bool] = False, ): @@ -143,12 +147,14 @@ def 
response_count_tree_map( return fig - def success_rates_tree_map( + def success_rate_tree_map( self, + *, metric: Optional[str] = "Engagement", by: Optional[List[str]] = None, title: Optional[str] = None, query: Optional[QUERY] = None, + facet: Optional[str] = None, return_df: Optional[bool] = False, ): if by is None: @@ -193,11 +199,13 @@ def success_rates_tree_map( def action_distribution( self, + *, # TODO change - one is the by, when multiple join together # other is the facet dimension/condition by: Optional[str] = "Name", title: Optional[str] = "Action Distribution", query: Optional[QUERY] = None, + facet: Optional[str] = None, return_df: Optional[bool] = False, ): plot_data = self.ih.aggregates.summary_outcomes(by=by, query=query) @@ -214,55 +222,58 @@ def action_distribution( ) return fig - def success_rates_trend_bar( + # def success_rates_trend_bar( + # self, + # condition: Union[str, pl.Expr], + # *, + # metric: Optional[str] = "Engagement", + # every: Union[str, timedelta] = "1d", + # by: Optional[str] = None, + # title: Optional[str] = None, + # query: Optional[QUERY] = None, + # facet: Optional[str] = None, + # return_df: Optional[bool] = False, + # ): + + # plot_data = self.ih.aggregates.summary_success_rates( + # every=every, + # by=[condition] + [by], # TODO generalize to support pl expression + # query=query, + # ) + + # if return_df: + # return plot_data + + # if title is None: + # title = f"{metric} Rates over Time" + + # fig = px.bar( + # plot_data.collect(), + # x="OutcomeTime", + # y=f"SuccessRate_{metric}", + # color=condition, + # error_y=f"StdErr_{metric}", + # facet_row=by, + # barmode="group", + # custom_data=[condition], + # template="pega", + # title=title, + # ) + # fig.update_yaxes(tickformat=",.3%").update_layout(xaxis_title=None) + # return fig + + def success_rate( self, - condition: Union[str, pl.Expr], + *, metric: Optional[str] = "Engagement", every: Union[str, timedelta] = "1d", - by: Optional[str] = None, title: Optional[str] = None, query: Optional[QUERY] = None, + facet: Optional[str] = None, return_df: Optional[bool] = False, ): - plot_data = self.ih.aggregates.summary_success_rates( - every=every, - by=[condition] + [by], # TODO generalize to support pl expression - query=query, - ) - - if return_df: - return plot_data - - if title is None: - title = f"{metric} Rates over Time" - - fig = px.bar( - plot_data.collect(), - x="OutcomeTime", - y=f"SuccessRate_{metric}", - color=condition, - error_y=f"StdErr_{metric}", - facet_row=by, - barmode="group", - custom_data=[condition], - template="pega", - title=title, - ) - fig.update_yaxes(tickformat=",.3%").update_layout(xaxis_title=None) - return fig - - def success_rates_trend( - self, - metric: Optional[str] = "Engagement", - every: Union[str, timedelta] = "1d", - by: Optional[str] = None, - title: Optional[str] = None, - query: Optional[QUERY] = None, - return_df: Optional[bool] = False, - ): - plot_data = self.ih.aggregates.summary_success_rates( - every=every, by=by, query=query + every=every, by=facet, query=query ) if return_df: @@ -272,8 +283,8 @@ def success_rates_trend( plot_data.collect(), x="OutcomeTime", y=f"SuccessRate_{metric}", - color=by, - facet_row=by, + color=facet, + facet_row=facet, # custom_data=[experiment_field] if experiment_field is not None else None, template="pega", title=title, @@ -282,16 +293,17 @@ def success_rates_trend( fig.update_yaxes(tickformat=",.3%").update_layout(xaxis_title=None) return fig - def response_counts( + def response_count( self, + *, every: Union[str, 
timedelta] = "1d", - by: Optional[str] = None, title: Optional[str] = "Responses", query: Optional[QUERY] = None, + facet: Optional[str] = None, return_df: Optional[bool] = False, ): plot_data = self.ih.aggregates.ih.aggregates.summary_outcomes( - every=every, by=by, query=query + every=every, by=facet, query=query ).collect() if return_df: @@ -304,7 +316,7 @@ def response_counts( color="Outcome", template="pega", title=title, - facet_row=by, + facet_row=facet, ) fig.update_layout(xaxis_title=None) @@ -312,11 +324,13 @@ def response_counts( def model_performance_trend( self, + *, metric: Optional[str] = "Engagement", every: Union[str, timedelta] = "1d", by: Optional[str] = None, title: Optional[str] = "Model Performance over Time", query: Optional[QUERY] = None, + facet: Optional[str] = None, return_df: Optional[bool] = False, ): @@ -336,7 +350,7 @@ def model_performance_trend( ).alias("Performance") ) .sort(["OutcomeTime"]) - ) + ).with_columns(pl.col("Performance") * 100) if return_df: return plot_data @@ -349,4 +363,7 @@ def model_performance_trend( template="pega", title=title, ) + + fig.update_layout(yaxis=dict(range=[50, 100]), xaxis_title=None) + return fig diff --git a/python/tests/test_IH.py b/python/tests/test_IH.py new file mode 100644 index 00000000..dbcd843c --- /dev/null +++ b/python/tests/test_IH.py @@ -0,0 +1,30 @@ +""" +Testing the functionality of the IH class +""" + +import os +import pathlib + +import polars as pl +import pytest +from pdstools import IH +import plotly.express as px +from plotly.graph_objs import Figure + +def test_mockdata(): + ih = IH.from_mock_data() + assert ih.data.collect().shape[0] > 100000 # interactions + assert ih.data.collect().shape[1] == 11 # nr of IH properties in the sample data + + summary = ih.aggregates._summary_interactions().collect() + assert summary.shape[0] == 100000 + +def test_plots(): + ih = IH.from_mock_data() + assert isinstance(ih.plot.overall_gauges(condition="ExperimentGroup"), Figure) + assert isinstance(ih.plot.response_count_tree_map(), Figure) + assert isinstance(ih.plot.success_rate_tree_map(), Figure) + assert isinstance(ih.plot.action_distribution(), Figure) + assert isinstance(ih.plot.success_rate(), Figure) + assert isinstance(ih.plot.response_count(), Figure) + assert isinstance(ih.plot.model_performance_trend(), Figure)
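For reference, a minimal usage sketch of the plotting API as renamed in this diff: success_rates_trend and response_counts become success_rate and response_count, and their former "by" argument becomes the keyword-only "facet". This is not part of the diff; it assumes a pdstools build that already includes these changes, the calls mirror the notebooks and test_IH.py above, and the variable names are only illustrative.

# Hedged sketch (assumes this patch is applied): exercises the renamed IH
# plotting API using the extended mock data generator from IH.from_mock_data.
import polars as pl
from pdstools import IH

# Mock data now includes clicks, accepts and conversions across channels,
# issues and groups, plus an ExperimentGroup split.
ih = IH.from_mock_data(days=90, n=100000)

# Trend and count plots: "facet" replaces the old "by" argument.
ih.plot.success_rate(metric="Conversion", every="1d", facet="Channel")
ih.plot.response_count(every="1d", facet="Channel", query=pl.col.Channel != "")

# Tree map and gauges keep their grouping arguments; only names were renamed.
ih.plot.success_rate_tree_map(metric="Engagement")
ih.plot.overall_gauges(condition="ExperimentGroup")

# Reading a real Dataset Export instead of mock data (path is the example file
# name used in the notebooks above; substitute your own export):
# ih = IH.from_ds_export(
#     "./Data-pxStrategyResult_InteractionFiles_20241213T091932_GMT.zip",
#     query=pl.col.ExperimentGroup.is_not_null() & (pl.col.ExperimentGroup != ""),
# )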