From 88b7bacb34de8f016cafdad3830473bd2d894b6c Mon Sep 17 00:00:00 2001
From: haxscramper <haxscramper@gmail.com>
Date: Thu, 14 Mar 2024 21:16:16 +0400
Subject: [PATCH] py: count words per period

---
 .../scratch_scripts/activity_analysis.py      | 122 +++++++++++++-----
 .../py_exporters/export_sqlite.py             |  85 ++++++++----
 2 files changed, 149 insertions(+), 58 deletions(-)

diff --git a/scripts/py_cli/py_cli/scratch_scripts/activity_analysis.py b/scripts/py_cli/py_cli/scratch_scripts/activity_analysis.py
index fd1a2d62c..ce48e794c 100755
--- a/scripts/py_cli/py_cli/scratch_scripts/activity_analysis.py
+++ b/scripts/py_cli/py_cli/scratch_scripts/activity_analysis.py
@@ -47,31 +47,74 @@ def plot_timestamped_events_with_pandas(
 ) -> Optional[Tuple[matplotlib_figure.Figure, Any]]:
     session = sessionmaker(bind=engine)()
 
-    # log(CAT).info(
-    #     render_rich(
-    #         format_rich_query(engine, select(sql.Block.timestamp, literal("Block")))))
-
     union_query = union_all(
         select(
             sql.PriorityModified.timestamp.label("timestamp"),
             literal("PriorityModified").label("event"),
+            literal(1).label("count"),
+        ),
+        select(
+            sql.StateModified.timestamp,
+            literal("StateModified"),
+            literal(1).label("count"),
+        ),
+        select(
+            sql.TagModified.timestamp,
+            literal("TagModified"),
+            literal(1).label("count"),
+        ),
+        select(
+            sql.ClockModified.from_.label("timestamp"),
+            literal("ClockModified"),
+            literal(1).label("count"),
+        ),
+        select(
+            sql.NoteModified.timestamp,
+            literal("NoteModified"),
+            literal(1).label("count"),
+        ),
+        select(
+            sql.Block.timestamp,
+            literal("Block"),
+            literal(1).label("count"),
+        ),
+        select(
+            sql.Subtree.created.label("timestamp"),
+            literal("SubtreeCreated"),
+            literal(1).label("count"),
+        ),
+        select(
+            sql.Subtree.scheduled,
+            literal("SubtreeScheduled"),
+            literal(1).label("count"),
+        ),
+        select(
+            sql.Subtree.deadline,
+            literal("SubtreeDeadline"),
+            literal(1).label("count"),
         ),
-        select(sql.StateModified.timestamp, literal("StateModified")),
-        select(sql.TagModified.timestamp, literal("TagModified")),
-        select(sql.ClockModified.from_.label("timestamp"), literal("ClockModified")),
-        select(sql.NoteModified.timestamp, literal("NoteModified")),
-        select(sql.Block.timestamp, literal("Block")),
-        select(sql.Subtree.created.label("timestamp"), literal("SubtreeCreated")),
-        select(sql.Subtree.scheduled, literal("SubtreeScheduled")),
-        select(sql.Subtree.deadline, literal("SubtreeDeadline")),
         select(
             sql.Subtree.closed,
             literal("SubtreeClosed"),
+            literal(1).label("count"),
+        ),
+        select(
+            sql.Subtree.created.label("timestamp"),
+            literal("SubtreeWordcount"),
+            sql.Subtree.wordcount.label("count"),
+        ),
+        select(
+            sql.Block.timestamp,
+            literal("BlockWordcount"),
+            sql.Block.wordcount.label("count"),
         ),
     ).alias("union_query")
 
-    query = select(union_query.c.timestamp,
-                   union_query.c.event).where(union_query.c.timestamp.is_not(None))
+    query = select(
+        union_query.c.timestamp,
+        union_query.c.event,
+        union_query.c.count,
+    ).where(union_query.c.timestamp.is_not(None))
 
     df = pd.read_sql(query, engine)
 
@@ -94,31 +137,41 @@ def plot_timestamped_events_with_pandas(
 
     df["bin"] = pd.cut(df["timestamp"], bins=bins, right=False)
 
-    grouped = df.groupby(["bin", "event"]).size().unstack(fill_value=0)
+    grouped = df.groupby(["bin", "event"])["count"].sum().unstack(fill_value=0)
+    grouped.index = pd.to_datetime(grouped.index.map(lambda it: it.left))
 
-    fig, ax = plt.subplots(figsize=figsize)
-    grouped.plot.area(stacked=True, ax=ax)
+    if grouped.empty:
+        return None
 
-    ax.set_xlabel("Time")
-    ax.set_ylabel("Number of Events")
-    ax.xaxis_date()
-    ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
+    # log(CAT).info("\n" + render_rich(dataframe_to_rich_table(grouped.head(20))))
+    n_plots = len(grouped.columns)
+    fig, axes = plt.subplots(n_plots, 1, sharex=True, figsize=(10, 2 * n_plots))
 
-    grouped.index = pd.to_datetime(grouped.index.map(lambda it: it.left))
-    ticklabels = [""] * len(grouped.index)
-    ticklabels[::time_tick_skip] = [
-        item.strftime("%Y-%m-%d") for item in grouped.index[::time_tick_skip]
-    ]
+    if n_plots == 1:
+        axes = [axes]
+
+
+    for ax, column in zip(axes, grouped.columns):
+        ax.fill_between(grouped.index, grouped[column], alpha=0.5)
+        ax.plot(grouped.index, grouped[column], label=column)
+        ax.margins(y=0.1)  # Add some margin to the y-axis for aesthetics
+        ax.set_ylabel(column)
+        ax.legend(loc="upper left")
+
+    ticklabels = [item.strftime("%Y-%m-%d") for item in grouped.index]
+    axes[0].set_xticklabels(ticklabels[::time_tick_skip])
+    axes[0].set_xticks(grouped.index[::time_tick_skip])
 
-    ax.xaxis.set_major_formatter(mticker.FixedFormatter(ticklabels))
-    ax.grid(True)
-    ax.grid(color='gray', linestyle='--', linewidth=0.5)
+    for ax in axes:
+        ax.xaxis.grid(True)
+        ax.yaxis.grid(False)
+        ax.grid(color='gray', linestyle='--', linewidth=0.5)
 
-    fig.autofmt_xdate()
+    fig.autofmt_xdate(rotation=45)
     fig.tight_layout()
 
     session.close()
-    return (fig, ax)
+    return (fig, axes)
 
 
 @click.command()
@@ -145,6 +198,7 @@ def cli(ctx: click.Context, config: str, **kwargs) -> None:
         if sql_db.exists():
             sql_db.unlink()
 
+        log(CAT).info("Registering DB")
         engine: Engine = create_engine("sqlite:///" + str(sql_db))
         sql.Base.metadata.create_all(engine)
         for node, file in nodes:
@@ -157,10 +211,10 @@ def cli(ctx: click.Context, config: str, **kwargs) -> None:
     log(CAT).info("Plotting data")
     plot = plot_timestamped_events_with_pandas(
         engine,
-        bin_size=25,
-        min_time=datetime(year=2017, month=1, day=1),
+        bin_size=30,
+        # min_time=datetime(year=2017, month=1, day=1),
         figsize=(20, 12),
-        time_tick_skip=1,
+        time_tick_skip=5,
     )
 
     if plot:
diff --git a/scripts/py_exporters/py_exporters/export_sqlite.py b/scripts/py_exporters/py_exporters/export_sqlite.py
index 44f2eb31c..690142f72 100644
--- a/scripts/py_exporters/py_exporters/export_sqlite.py
+++ b/scripts/py_exporters/py_exporters/export_sqlite.py
@@ -43,6 +43,7 @@ class Subtree(Base):
     deadline = DateTimeColumn(nullable=True)
     closed = DateTimeColumn(nullable=True)
     location = ForeignId(name="Location.id", nullable=True)
+    wordcount = IntColumn(nullable=True)
 
 
 class BlockKind(enum.Enum):
@@ -124,9 +125,9 @@ class RefileModified(Base):
 
 CAT = "haxorg.export.sqlite"
 
-
 subtree_count = 0
 
+
 @beartype
 def registerDocument(node: org.Org, engine: Engine, file: str):
     Base.metadata.bind = engine
@@ -143,7 +144,7 @@ def registerDocument(node: org.Org, engine: Engine, file: str):
     def get_location(node: org.Org) -> Optional[int]:
         if not node.loc:
             return None
-        
+
         nonlocal counter
         result = file_record.id * 1E6 + counter
         counter += 1
@@ -246,37 +247,81 @@ def aux_subtree_log(node: org.SubtreeLog, subtree_id: int):
                 session.add(
                     NoteModified(
                         subtree=subtree_id,
-                        plaintext=ExporterUltraplain.getStr(note.desc) if note.desc else "",
+                        plaintext=ExporterUltraplain.getStr(note.desc)
+                        if note.desc else "",
                     ))
 
+    @beartype
+    def getSubtreeTime(node: org.Subtree,
+                       kind: org.SubtreePeriodKind) -> Optional[datetime]:
+        """Return the start of the last `kind` period with a static start, else None."""
+        found: Optional[datetime] = None
+        for period in node.getTimePeriods(org.IntSetOfSubtreePeriodKindIntVec([kind])):
+            start = period.from_
+            if start.getTimeKind() != org.TimeTimeKind.Static:
+                continue
+            found = evalDateTime(start.getStatic().time)
+        return found
+
+    @beartype
+    def getCreationTime(node: org.Org) -> Optional[datetime]:
+        """Creation time of a subtree (Created, else Titled) or timestamped paragraph."""
+        if isinstance(node, org.Subtree):
+            created = getSubtreeTime(node, org.SubtreePeriodKind.Created)
+            return created or getSubtreeTime(node, org.SubtreePeriodKind.Titled)
+        if isinstance(node, org.AnnotatedParagraph):
+            annotation = node.getAnnotationKind()
+            # NOTE(review): getStatic() is called without a Static-kind check -- confirm.
+            if annotation == org.AnnotatedParagraphAnnotationKind.Timestamp:
+                return evalDateTime(node.getTimestamp().time.getStatic().time)
+        return None
+
     @beartype
     def aux(node: org.Org, parent: Optional[int] = None):
         global subtree_count
         match node:
             case org.Subtree():
-                def getTime(kind: org.SubtreePeriodKind) -> Optional[datetime]:
-                    result: Optional[datetime] = None
-                    time: org.SubtreePeriod
-                    for time in node.getTimePeriods(
-                            org.IntSetOfSubtreePeriodKindIntVec([kind])):
 
-                        if time.from_.getTimeKind() == org.TimeTimeKind.Static:
-                            result = evalDateTime(time.from_.getStatic().time)
+                def getNestedWordcount(node: org.Org) -> int:
+                    """Count word-like leaves under `node`, skipping dated nodes.
+
+                    Falsy nodes and nodes with a creation time contribute zero.
+                    """
+                    # NOTE(review): if org.Org defines __len__, childless leaves are
+                    # falsy and would be skipped by `not node` -- confirm.
+                    if not node or getCreationTime(node) is not None:
+                        return 0
+                    word_like = (org.Word, org.BigIdent, org.RawText, org.HashTag,
+                                 org.AtMention)
+                    if isinstance(node, word_like):
+                        return 1
+
+                    # Any other node: total over its children.
+                    return sum(getNestedWordcount(child) for child in node)
+
+                count = 0
+                for sub in node:
+                    count += getNestedWordcount(sub)
+
+                # log(CAT).info("{} {} {}:{}".format(
+                #     ExporterUltraplain.getStr(node.title),
+                #     count,
+                #     node.loc.line if node.loc else -1,
+                #     node.loc.column if node.loc else -1,
+                # ))
 
-                    return result
-                
                 session.add(
                     Subtree(
                         id=subtree_count,
                         parent=parent,
-                        created=getTime(org.SubtreePeriodKind.Created),
-                        scheduled=getTime(org.SubtreePeriodKind.Scheduled),
+                        created=getCreationTime(node),
+                        scheduled=getSubtreeTime(node, org.SubtreePeriodKind.Scheduled),
                         level=node.level,
                         plaintext_title=ExporterUltraplain.getStr(node.title),
                         location=get_location(node),
+                        wordcount=count,
                     ))
-                
+
                 subtree_count += 1
 
                 for item in node.logbook:
@@ -302,18 +347,11 @@ def getTime(kind: org.SubtreePeriodKind) -> Optional[datetime]:
                         if sub.getKind() == osk.Word:
                             wordcount += 1
 
-                    timestamp: Optional[datetime] = None
-                    if isinstance(node, org.AnnotatedParagraph):
-                        if node.getAnnotationKind(
-                        ) == org.AnnotatedParagraphAnnotationKind.Timestamp:
-                            timestamp = evalDateTime(
-                                node.getTimestamp().time.getStatic().time)
-
                     session.add(
                         Block(
                             kind=BlockKind.Paragraph,
                             wordcount=wordcount,
-                            timestamp=timestamp,
+                            timestamp=getCreationTime(node),
                             plaintext=ExporterUltraplain.getStr(node),
                             location=get_location(node),
                         ))
@@ -336,6 +374,5 @@ def getTime(kind: org.SubtreePeriodKind) -> Optional[datetime]:
             case osk.Table:
                 pass
 
-
     aux(node)
     session.commit()