From 88b7bacb34de8f016cafdad3830473bd2d894b6c Mon Sep 17 00:00:00 2001 From: haxscramper Date: Thu, 14 Mar 2024 21:16:16 +0400 Subject: [PATCH] py: count words per period --- .../scratch_scripts/activity_analysis.py | 122 +++++++++++++----- .../py_exporters/export_sqlite.py | 85 ++++++++---- 2 files changed, 149 insertions(+), 58 deletions(-) diff --git a/scripts/py_cli/py_cli/scratch_scripts/activity_analysis.py b/scripts/py_cli/py_cli/scratch_scripts/activity_analysis.py index fd1a2d62c..ce48e794c 100755 --- a/scripts/py_cli/py_cli/scratch_scripts/activity_analysis.py +++ b/scripts/py_cli/py_cli/scratch_scripts/activity_analysis.py @@ -47,31 +47,74 @@ def plot_timestamped_events_with_pandas( ) -> Optional[Tuple[matplotlib_figure.Figure, Any]]: session = sessionmaker(bind=engine)() - # log(CAT).info( - # render_rich( - # format_rich_query(engine, select(sql.Block.timestamp, literal("Block"))))) - union_query = union_all( select( sql.PriorityModified.timestamp.label("timestamp"), literal("PriorityModified").label("event"), + literal(1).label("count"), + ), + select( + sql.StateModified.timestamp, + literal("StateModified"), + literal(1).label("count"), + ), + select( + sql.TagModified.timestamp, + literal("TagModified"), + literal(1).label("count"), + ), + select( + sql.ClockModified.from_.label("timestamp"), + literal("ClockModified"), + literal(1).label("count"), + ), + select( + sql.NoteModified.timestamp, + literal("NoteModified"), + literal(1).label("count"), + ), + select( + sql.Block.timestamp, + literal("Block"), + literal(1).label("count"), + ), + select( + sql.Subtree.created.label("timestamp"), + literal("SubtreeCreated"), + literal(1).label("count"), + ), + select( + sql.Subtree.scheduled, + literal("SubtreeScheduled"), + literal(1).label("count"), + ), + select( + sql.Subtree.deadline, + literal("SubtreeDeadline"), + literal(1).label("count"), ), - select(sql.StateModified.timestamp, literal("StateModified")), - select(sql.TagModified.timestamp, literal("TagModified")), - select(sql.ClockModified.from_.label("timestamp"), literal("ClockModified")), - select(sql.NoteModified.timestamp, literal("NoteModified")), - select(sql.Block.timestamp, literal("Block")), - select(sql.Subtree.created.label("timestamp"), literal("SubtreeCreated")), - select(sql.Subtree.scheduled, literal("SubtreeScheduled")), - select(sql.Subtree.deadline, literal("SubtreeDeadline")), select( sql.Subtree.closed, literal("SubtreeClosed"), + literal(1).label("count"), + ), + select( + sql.Subtree.created.label("timestamp"), + literal("SubtreeWordcount"), + sql.Subtree.wordcount.label("count"), + ), + select( + sql.Block.timestamp, + literal("BlockWordcount"), + sql.Block.wordcount.label("count"), ), ).alias("union_query") - query = select(union_query.c.timestamp, - union_query.c.event).where(union_query.c.timestamp.is_not(None)) + query = select( + union_query.c.timestamp, + union_query.c.event, + union_query.c.count, + ).where(union_query.c.timestamp.is_not(None)) df = pd.read_sql(query, engine) @@ -94,31 +137,41 @@ def plot_timestamped_events_with_pandas( df["bin"] = pd.cut(df["timestamp"], bins=bins, right=False) - grouped = df.groupby(["bin", "event"]).size().unstack(fill_value=0) + grouped = df.groupby(["bin", "event"])["count"].sum().unstack(fill_value=0) + grouped.index = pd.to_datetime(grouped.index.map(lambda it: it.left)) - fig, ax = plt.subplots(figsize=figsize) - grouped.plot.area(stacked=True, ax=ax) + if grouped.empty: + return None - ax.set_xlabel("Time") - ax.set_ylabel("Number of Events") - ax.xaxis_date() - ax.legend(loc="upper left", bbox_to_anchor=(1, 1)) + # log(CAT).info("\n" + render_rich(dataframe_to_rich_table(grouped.head(20)))) + n_plots = len(grouped.columns) + fig, axes = plt.subplots(n_plots, 1, sharex=True, figsize=(10, 2 * n_plots)) - grouped.index = pd.to_datetime(grouped.index.map(lambda it: it.left)) - ticklabels = [""] * len(grouped.index) - ticklabels[::time_tick_skip] = [ - item.strftime("%Y-%m-%d") for item in grouped.index[::time_tick_skip] - ] + if n_plots == 1: + axes = [axes] + + + for ax, column in zip(axes, grouped.columns): + ax.fill_between(grouped.index, grouped[column], alpha=0.5) + ax.plot(grouped.index, grouped[column], label=column) + ax.margins(y=0.1) # Add some margin to the y-axis for aesthetics + ax.set_ylabel(column) + ax.legend(loc="upper left") + + ticklabels = [item.strftime("%Y-%m-%d") for item in grouped.index] + axes[0].set_xticklabels(ticklabels[::time_tick_skip]) + axes[0].set_xticks(grouped.index[::time_tick_skip]) - ax.xaxis.set_major_formatter(mticker.FixedFormatter(ticklabels)) - ax.grid(True) - ax.grid(color='gray', linestyle='--', linewidth=0.5) + for ax in axes: + ax.xaxis.grid(True) + ax.yaxis.grid(False) + ax.grid(color='gray', linestyle='--', linewidth=0.5) - fig.autofmt_xdate() + fig.autofmt_xdate(rotation=45) fig.tight_layout() session.close() - return (fig, ax) + return (fig, axes) @click.command() @@ -145,6 +198,7 @@ def cli(ctx: click.Context, config: str, **kwargs) -> None: if sql_db.exists(): sql_db.unlink() + log(CAT).info("Registering DB") engine: Engine = create_engine("sqlite:///" + str(sql_db)) sql.Base.metadata.create_all(engine) for node, file in nodes: @@ -157,10 +211,10 @@ def cli(ctx: click.Context, config: str, **kwargs) -> None: log(CAT).info("Plotting data") plot = plot_timestamped_events_with_pandas( engine, - bin_size=25, - min_time=datetime(year=2017, month=1, day=1), + bin_size=30, + # min_time=datetime(year=2017, month=1, day=1), figsize=(20, 12), - time_tick_skip=1, + time_tick_skip=5, ) if plot: diff --git a/scripts/py_exporters/py_exporters/export_sqlite.py b/scripts/py_exporters/py_exporters/export_sqlite.py index 44f2eb31c..690142f72 100644 --- a/scripts/py_exporters/py_exporters/export_sqlite.py +++ b/scripts/py_exporters/py_exporters/export_sqlite.py @@ -43,6 +43,7 @@ class Subtree(Base): deadline = DateTimeColumn(nullable=True) closed = DateTimeColumn(nullable=True) location = ForeignId(name="Location.id", nullable=True) + wordcount = IntColumn(nullable=True) class BlockKind(enum.Enum): @@ -124,9 +125,9 @@ class RefileModified(Base): CAT = "haxorg.export.sqlite" - subtree_count = 0 + @beartype def registerDocument(node: org.Org, engine: Engine, file: str): Base.metadata.bind = engine @@ -143,7 +144,7 @@ def registerDocument(node: org.Org, engine: Engine, file: str): def get_location(node: org.Org) -> Optional[int]: if not node.loc: return None - + nonlocal counter result = file_record.id * 1E6 + counter counter += 1 @@ -246,37 +247,81 @@ def aux_subtree_log(node: org.SubtreeLog, subtree_id: int): session.add( NoteModified( subtree=subtree_id, - plaintext=ExporterUltraplain.getStr(note.desc) if note.desc else "", + plaintext=ExporterUltraplain.getStr(note.desc) + if note.desc else "", )) + @beartype + def getSubtreeTime(node: org.Subtree, + kind: org.SubtreePeriodKind) -> Optional[datetime]: + result: Optional[datetime] = None + time: org.SubtreePeriod + for time in node.getTimePeriods(org.IntSetOfSubtreePeriodKindIntVec([kind])): + + if time.from_.getTimeKind() == org.TimeTimeKind.Static: + result = evalDateTime(time.from_.getStatic().time) + + return result + + @beartype + def getCreationTime(node: org.Org) -> Optional[datetime]: + match node: + case org.Subtree(): + return getSubtreeTime(node, + org.SubtreePeriodKind.Created) or getSubtreeTime( + node, org.SubtreePeriodKind.Titled) + + case org.AnnotatedParagraph(): + if node.getAnnotationKind( + ) == org.AnnotatedParagraphAnnotationKind.Timestamp: + return evalDateTime(node.getTimestamp().time.getStatic().time) + @beartype def aux(node: org.Org, parent: Optional[int] = None): global subtree_count match node: case org.Subtree(): - def getTime(kind: org.SubtreePeriodKind) -> Optional[datetime]: - result: Optional[datetime] = None - time: org.SubtreePeriod - for time in node.getTimePeriods( - org.IntSetOfSubtreePeriodKindIntVec([kind])): - if time.from_.getTimeKind() == org.TimeTimeKind.Static: - result = evalDateTime(time.from_.getStatic().time) + def getNestedWordcount(node: org.Org) -> int: + if not node or getCreationTime(node) is not None: + return 0 + else: + result = 0 + match node: + case org.Word() | org.BigIdent() | org.RawText( + ) | org.HashTag() | org.AtMention(): + result += 1 + + case _: + for sub in node: + result += getNestedWordcount(sub) + + return result + + count = 0 + for sub in node: + count += getNestedWordcount(sub) + + # log(CAT).info("{} {} {}:{}".format( + # ExporterUltraplain.getStr(node.title), + # count, + # node.loc.line if node.loc else -1, + # node.loc.column if node.loc else -1, + # )) - return result - session.add( Subtree( id=subtree_count, parent=parent, - created=getTime(org.SubtreePeriodKind.Created), - scheduled=getTime(org.SubtreePeriodKind.Scheduled), + created=getCreationTime(node), + scheduled=getSubtreeTime(node, org.SubtreePeriodKind.Scheduled), level=node.level, plaintext_title=ExporterUltraplain.getStr(node.title), location=get_location(node), + wordcount=count, )) - + subtree_count += 1 for item in node.logbook: @@ -302,18 +347,11 @@ def getTime(kind: org.SubtreePeriodKind) -> Optional[datetime]: if sub.getKind() == osk.Word: wordcount += 1 - timestamp: Optional[datetime] = None - if isinstance(node, org.AnnotatedParagraph): - if node.getAnnotationKind( - ) == org.AnnotatedParagraphAnnotationKind.Timestamp: - timestamp = evalDateTime( - node.getTimestamp().time.getStatic().time) - session.add( Block( kind=BlockKind.Paragraph, wordcount=wordcount, - timestamp=timestamp, + timestamp=getCreationTime(node), plaintext=ExporterUltraplain.getStr(node), location=get_location(node), )) @@ -336,6 +374,5 @@ def getTime(kind: org.SubtreePeriodKind) -> Optional[datetime]: case osk.Table: pass - aux(node) session.commit()