pegasystems · operdeck · Jan 10, 2025 · Jan 10, 2025
diff --git a/examples/ih/Example_IH_Analysis.ipynb b/examples/ih/Example_IH_Analysis.ipynb
@@ -14,7 +14,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {
     "nbsphinx": "hidden"
    },
@@ -31,7 +31,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -64,14 +64,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
     "# ih = IH.from_ds_export(\n",
     "#     \"../../data/Data-pxStrategyResult_pxInteractionHistory_20210101T010000_GMT.zip\"\n",
     "# )\n",
-    "ih = IH.from_mock_data()"
+    "ih = IH.from_mock_data(n=1000000)"
    ]
   },
   {
@@ -131,10 +131,11 @@
    "outputs": [],
    "source": [
     "fig = ih.plot.action_distribution(\n",
-    "    query=pl.col.Outcome.is_in([\"Clicked\", \"Accepted\"]), facet=\"Channel\",\n",
-    "    title=\"Distribution of Clicked or Accepted actions\"\n",
+    "    query=pl.col.Outcome.is_in([\"Clicked\", \"Accepted\"]), \n",
+    "    title=\"Distribution of Actions\",\n",
+    "    color=\"Outcome\",\n",
     ")\n",
-    "fig.update_layout(yaxis=dict(tickmode=\"linear\"))\n",
+    "# fig.update_layout(yaxis=dict(tickmode=\"linear\")) # to show all names\n",
     "fig"
    ]
   },
@@ -370,7 +371,8 @@
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3"
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,

diff --git a/python/pdstools/ih/IH.py b/python/pdstools/ih/IH.py
@@ -87,15 +87,17 @@ def from_mock_data(cls, days=90, n=100000):
             The properly initialized IH object
         """
         n_actions = 10
-        click_rate = 0.2
-        accept_rate = 0.15
         click_avg_duration_minutes = 2
         accept_avg_duration_minutes = 30
         convert_over_accept_click_rate_test = 0.5
         convert_over_accept_click_rate_control = 0.3
         convert_avg_duration_days = 2
         inbound_base_propensity = 0.02
         outbound_base_propensity = 0.01
+        inbound_modelnoise_NaiveBayes = 0.2  # relative amount of extra noise added to models
+        inbound_modelnoise_GradientBoost = 0.0
+        outbound_modelnoise_NaiveBayes = 0.3
+        outbound_modelnoise_GradientBoost = 0.1
 
         now = datetime.datetime.now()
 
@@ -119,7 +121,9 @@ def thompson_sampler(propensity, responses=10000):
         ih_fake_impressions = pl.DataFrame(
             {
                 "pxInteractionID": [str(int(1e9 + i)) for i in range(n)],
-                "pyChannel": random.choices(["Web", "Email"], k=n),
+                "pyChannel": random.choices(
+                    ["Web", "Email"], k=n
+                ),  # Direction will be derived from this later
                 "pyIssue": random.choices(
                     ["Acquisition", "Retention", "Risk", "Service"], k=n
                 ),
@@ -170,6 +174,7 @@ def thompson_sampler(propensity, responses=10000):
                 "Temp.ConvertDurationDays": [
                     random.uniform(0, 2 * convert_avg_duration_days) for i in range(n)
                 ],
+                "Temp.RandomUniform": [random.uniform(0, 1) for i in range(n)],
             }
         ).with_columns(
             pyDirection=pl.when(pl.col("pyChannel") == "Web")
@@ -206,20 +211,16 @@ def thompson_sampler(propensity, responses=10000):
                 pl.col("Temp.Zipf")
                 .mean()
                 .over(["pyChannel", "pyDirection"])
-                .alias("Temp.ZipfMean")
+                .alias("Temp.ZipfMean"),
+                pl.when(pl.col("pyDirection") == "Inbound")
+                .then(pl.lit(inbound_base_propensity))
+                .otherwise(pl.lit(outbound_base_propensity))
+                .alias("Temp.ChannelBasePropensity"),
             )
             .with_columns(
-                BasePropensity=pl.when(pl.col("pyDirection") == "Inbound")
-                .then(
-                    pl.col("Temp.Zipf")
-                    * inbound_base_propensity
-                    / pl.col("Temp.ZipfMean")
-                )
-                .otherwise(
-                    pl.col("Temp.Zipf")
-                    * outbound_base_propensity
-                    / pl.col("Temp.ZipfMean")
-                )
+                BasePropensity=pl.col("Temp.Zipf")
+                * pl.col("Temp.ChannelBasePropensity")
+                / pl.col("Temp.ZipfMean")
             )
             .with_columns(
                 pyPropensity=pl.col("BasePropensity").map_elements(
@@ -228,9 +229,23 @@ def thompson_sampler(propensity, responses=10000):
             )
         )
 
+        # Add artificial noise to the models to manipulate some scenarios
+        ih_fake_impressions = ih_fake_impressions.with_columns(
+            pl.when((pl.col.pyModelTechnique == "NaiveBayes") & (pl.col.pyDirection == "Inbound"))
+            .then(pl.col("Temp.ChannelBasePropensity") * inbound_modelnoise_NaiveBayes)
+            .when((pl.col.pyModelTechnique == "GradientBoost") & (pl.col.pyDirection == "Inbound"))
+            .then(pl.col("Temp.ChannelBasePropensity") * inbound_modelnoise_GradientBoost)
+            .when((pl.col.pyModelTechnique == "NaiveBayes") & (pl.col.pyDirection == "Outbound"))
+            .then(pl.col("Temp.ChannelBasePropensity") * outbound_modelnoise_NaiveBayes)
+            .when((pl.col.pyModelTechnique == "GradientBoost") & (pl.col.pyDirection == "Outbound"))
+            .then(pl.col("Temp.ChannelBasePropensity") * outbound_modelnoise_GradientBoost)
+            .otherwise(pl.lit(0.0))
+            .alias("Temp.ExtraModelNoise")
+        )
+
         ih_fake_clicks = (
             ih_fake_impressions.filter(pl.col.pyDirection == "Inbound")
-            .sample(fraction=click_rate)
+            .filter(pl.col("Temp.RandomUniform") < (pl.col("pyPropensity") + pl.col("Temp.ExtraModelNoise")))
             .with_columns(
                 pxOutcomeTime=pl.col.pxOutcomeTime
                 + pl.duration(minutes=pl.col("Temp.ClickDurationMinutes")),
@@ -239,7 +254,7 @@ def thompson_sampler(propensity, responses=10000):
         )
         ih_fake_accepts = (
             ih_fake_impressions.filter(pl.col.pyDirection == "Outbound")
-            .sample(fraction=accept_rate)
+            .filter(pl.col("Temp.RandomUniform") < (pl.col("pyPropensity") + pl.col("Temp.ExtraModelNoise")))
             .with_columns(
                 pxOutcomeTime=pl.col.pxOutcomeTime
                 + pl.duration(minutes=pl.col("Temp.AcceptDurationMinutes")),

diff --git a/python/pdstools/ih/Plots.py b/python/pdstools/ih/Plots.py
@@ -208,16 +208,16 @@ def success_rate_tree_map(
     def action_distribution(
         self,
         *,
-        # TODO change - one is the by, when multiple join together
-        # other is the facet dimension/condition
         by: Optional[str] = "Name",
         title: Optional[str] = "Action Distribution",
         query: Optional[QUERY] = None,
+        color: Optional[str] = None,
         facet: Optional[str] = None,
         return_df: Optional[bool] = False,
     ):
-        group_by_clause = cdh_utils.safe_flatten_list([by, facet])
-        plot_data = self.ih.aggregates.summary_outcomes(by=group_by_clause, query=query)
+        plot_data = self.ih.aggregates.summary_outcomes(
+            by=[by, color, facet], query=query
+        )
 
         if return_df:
             return plot_data
@@ -226,11 +226,15 @@ def action_distribution(
             plot_data.collect(),
             x="Count",
             y=by,
+            color=color,
             facet_col=facet,
             template="pega",
             title=title,
         )
 
+        fig.update_layout(barmode="stack")
+        fig.update_yaxes(categoryorder="total ascending")
+        fig.update_layout(yaxis=dict(title=""))
         fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
 
         return fig

diff --git a/python/pdstools/utils/cdh_utils.py b/python/pdstools/utils/cdh_utils.py
@@ -1168,4 +1168,10 @@ def safe_flatten_list(alist: List) -> List:
         for item in sublist
     ]
     alist = list(filter(partial(is_not, None), alist))
-    return alist if len(alist) > 0 else None
+    seen = set()
+    unique_alist = []
+    for item in alist:
+        if item not in seen:
+            unique_alist.append(item)
+            seen.add(item)
+    return unique_alist if len(unique_alist) > 0 else None