Skip to content

Commit

Permalink
py: count words per period
Browse files Browse the repository at this point in the history
  • Loading branch information
haxscramper committed Mar 14, 2024
1 parent 4ffa2eb commit 88b7bac
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 58 deletions.
122 changes: 88 additions & 34 deletions scripts/py_cli/py_cli/scratch_scripts/activity_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,31 +47,74 @@ def plot_timestamped_events_with_pandas(
) -> Optional[Tuple[matplotlib_figure.Figure, Any]]:
session = sessionmaker(bind=engine)()

# log(CAT).info(
# render_rich(
# format_rich_query(engine, select(sql.Block.timestamp, literal("Block")))))

union_query = union_all(
select(
sql.PriorityModified.timestamp.label("timestamp"),
literal("PriorityModified").label("event"),
literal(1).label("count"),
),
select(
sql.StateModified.timestamp,
literal("StateModified"),
literal(1).label("count"),
),
select(
sql.TagModified.timestamp,
literal("TagModified"),
literal(1).label("count"),
),
select(
sql.ClockModified.from_.label("timestamp"),
literal("ClockModified"),
literal(1).label("count"),
),
select(
sql.NoteModified.timestamp,
literal("NoteModified"),
literal(1).label("count"),
),
select(
sql.Block.timestamp,
literal("Block"),
literal(1).label("count"),
),
select(
sql.Subtree.created.label("timestamp"),
literal("SubtreeCreated"),
literal(1).label("count"),
),
select(
sql.Subtree.scheduled,
literal("SubtreeScheduled"),
literal(1).label("count"),
),
select(
sql.Subtree.deadline,
literal("SubtreeDeadline"),
literal(1).label("count"),
),
select(sql.StateModified.timestamp, literal("StateModified")),
select(sql.TagModified.timestamp, literal("TagModified")),
select(sql.ClockModified.from_.label("timestamp"), literal("ClockModified")),
select(sql.NoteModified.timestamp, literal("NoteModified")),
select(sql.Block.timestamp, literal("Block")),
select(sql.Subtree.created.label("timestamp"), literal("SubtreeCreated")),
select(sql.Subtree.scheduled, literal("SubtreeScheduled")),
select(sql.Subtree.deadline, literal("SubtreeDeadline")),
select(
sql.Subtree.closed,
literal("SubtreeClosed"),
literal(1).label("count"),
),
select(
sql.Subtree.created.label("timestamp"),
literal("SubtreeWordcount"),
sql.Subtree.wordcount.label("count"),
),
select(
sql.Block.timestamp,
literal("BlockWordcount"),
sql.Block.wordcount.label("count"),
),
).alias("union_query")

query = select(union_query.c.timestamp,
union_query.c.event).where(union_query.c.timestamp.is_not(None))
query = select(
union_query.c.timestamp,
union_query.c.event,
union_query.c.count,
).where(union_query.c.timestamp.is_not(None))

df = pd.read_sql(query, engine)

Expand All @@ -94,31 +137,41 @@ def plot_timestamped_events_with_pandas(

df["bin"] = pd.cut(df["timestamp"], bins=bins, right=False)

grouped = df.groupby(["bin", "event"]).size().unstack(fill_value=0)
grouped = df.groupby(["bin", "event"])["count"].sum().unstack(fill_value=0)
grouped.index = pd.to_datetime(grouped.index.map(lambda it: it.left))

fig, ax = plt.subplots(figsize=figsize)
grouped.plot.area(stacked=True, ax=ax)
if grouped.empty:
return None

ax.set_xlabel("Time")
ax.set_ylabel("Number of Events")
ax.xaxis_date()
ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
# log(CAT).info("\n" + render_rich(dataframe_to_rich_table(grouped.head(20))))
n_plots = len(grouped.columns)
fig, axes = plt.subplots(n_plots, 1, sharex=True, figsize=(10, 2 * n_plots))

grouped.index = pd.to_datetime(grouped.index.map(lambda it: it.left))
ticklabels = [""] * len(grouped.index)
ticklabels[::time_tick_skip] = [
item.strftime("%Y-%m-%d") for item in grouped.index[::time_tick_skip]
]
if n_plots == 1:
axes = [axes]


for ax, column in zip(axes, grouped.columns):
ax.fill_between(grouped.index, grouped[column], alpha=0.5)
ax.plot(grouped.index, grouped[column], label=column)
ax.margins(y=0.1) # Add some margin to the y-axis for aesthetics
ax.set_ylabel(column)
ax.legend(loc="upper left")

ticklabels = [item.strftime("%Y-%m-%d") for item in grouped.index]
axes[0].set_xticklabels(ticklabels[::time_tick_skip])
axes[0].set_xticks(grouped.index[::time_tick_skip])

ax.xaxis.set_major_formatter(mticker.FixedFormatter(ticklabels))
ax.grid(True)
ax.grid(color='gray', linestyle='--', linewidth=0.5)
for ax in axes:
ax.xaxis.grid(True)
ax.yaxis.grid(False)
ax.grid(color='gray', linestyle='--', linewidth=0.5)

fig.autofmt_xdate()
fig.autofmt_xdate(rotation=45)
fig.tight_layout()

session.close()
return (fig, ax)
return (fig, axes)


@click.command()
Expand All @@ -145,6 +198,7 @@ def cli(ctx: click.Context, config: str, **kwargs) -> None:
if sql_db.exists():
sql_db.unlink()

log(CAT).info("Registering DB")
engine: Engine = create_engine("sqlite:///" + str(sql_db))
sql.Base.metadata.create_all(engine)
for node, file in nodes:
Expand All @@ -157,10 +211,10 @@ def cli(ctx: click.Context, config: str, **kwargs) -> None:
log(CAT).info("Plotting data")
plot = plot_timestamped_events_with_pandas(
engine,
bin_size=25,
min_time=datetime(year=2017, month=1, day=1),
bin_size=30,
# min_time=datetime(year=2017, month=1, day=1),
figsize=(20, 12),
time_tick_skip=1,
time_tick_skip=5,
)

if plot:
Expand Down
85 changes: 61 additions & 24 deletions scripts/py_exporters/py_exporters/export_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class Subtree(Base):
deadline = DateTimeColumn(nullable=True)
closed = DateTimeColumn(nullable=True)
location = ForeignId(name="Location.id", nullable=True)
wordcount = IntColumn(nullable=True)


class BlockKind(enum.Enum):
Expand Down Expand Up @@ -124,9 +125,9 @@ class RefileModified(Base):

CAT = "haxorg.export.sqlite"


subtree_count = 0


@beartype
def registerDocument(node: org.Org, engine: Engine, file: str):
Base.metadata.bind = engine
Expand All @@ -143,7 +144,7 @@ def registerDocument(node: org.Org, engine: Engine, file: str):
def get_location(node: org.Org) -> Optional[int]:
if not node.loc:
return None

nonlocal counter
result = file_record.id * 1E6 + counter
counter += 1
Expand Down Expand Up @@ -246,37 +247,81 @@ def aux_subtree_log(node: org.SubtreeLog, subtree_id: int):
session.add(
NoteModified(
subtree=subtree_id,
plaintext=ExporterUltraplain.getStr(note.desc) if note.desc else "",
plaintext=ExporterUltraplain.getStr(note.desc)
if note.desc else "",
))

@beartype
def getSubtreeTime(
    node: org.Subtree,
    kind: org.SubtreePeriodKind,
) -> Optional[datetime]:
    """Return the evaluated start time of a `kind` period on `node`.

    Only periods whose start has the static time kind are considered. When
    several qualify, the last one yielded by `getTimePeriods` is returned;
    when none qualify, the result is `None`.
    """
    kind_filter = org.IntSetOfSubtreePeriodKindIntVec([kind])
    found: Optional[datetime] = None
    period: org.SubtreePeriod
    for period in node.getTimePeriods(kind_filter):
        start = period.from_
        if start.getTimeKind() == org.TimeTimeKind.Static:
            # No early exit: a later static period overwrites an earlier one.
            found = evalDateTime(start.getStatic().time)

    return found

@beartype
def getCreationTime(node: org.Org) -> Optional[datetime]:
    """Return the creation time recorded on `node`, or `None` if it has none.

    Subtrees use their `Created` period and fall back to the `Titled` period.
    Annotated paragraphs use their timestamp annotation. Every other node
    kind yields `None`.
    """
    if isinstance(node, org.Subtree):
        created = getSubtreeTime(node, org.SubtreePeriodKind.Created)
        return created or getSubtreeTime(node, org.SubtreePeriodKind.Titled)

    if isinstance(node, org.AnnotatedParagraph):
        annotation = node.getAnnotationKind()
        if annotation == org.AnnotatedParagraphAnnotationKind.Timestamp:
            # NOTE(review): unlike getSubtreeTime, the time kind is not
            # checked before getStatic() -- confirm that paragraph
            # timestamps are always static.
            return evalDateTime(node.getTimestamp().time.getStatic().time)

    return None

@beartype
def aux(node: org.Org, parent: Optional[int] = None):
global subtree_count
match node:
case org.Subtree():
def getTime(kind: org.SubtreePeriodKind) -> Optional[datetime]:
result: Optional[datetime] = None
time: org.SubtreePeriod
for time in node.getTimePeriods(
org.IntSetOfSubtreePeriodKindIntVec([kind])):

if time.from_.getTimeKind() == org.TimeTimeKind.Static:
result = evalDateTime(time.from_.getStatic().time)
def getNestedWordcount(node: org.Org) -> int:
if not node or getCreationTime(node) is not None:
return 0

else:
result = 0
match node:
case org.Word() | org.BigIdent() | org.RawText(
) | org.HashTag() | org.AtMention():
result += 1

case _:
for sub in node:
result += getNestedWordcount(sub)

return result

count = 0
for sub in node:
count += getNestedWordcount(sub)

# log(CAT).info("{} {} {}:{}".format(
# ExporterUltraplain.getStr(node.title),
# count,
# node.loc.line if node.loc else -1,
# node.loc.column if node.loc else -1,
# ))

return result

session.add(
Subtree(
id=subtree_count,
parent=parent,
created=getTime(org.SubtreePeriodKind.Created),
scheduled=getTime(org.SubtreePeriodKind.Scheduled),
created=getCreationTime(node),
scheduled=getSubtreeTime(node, org.SubtreePeriodKind.Scheduled),
level=node.level,
plaintext_title=ExporterUltraplain.getStr(node.title),
location=get_location(node),
wordcount=count,
))

subtree_count += 1

for item in node.logbook:
Expand All @@ -302,18 +347,11 @@ def getTime(kind: org.SubtreePeriodKind) -> Optional[datetime]:
if sub.getKind() == osk.Word:
wordcount += 1

timestamp: Optional[datetime] = None
if isinstance(node, org.AnnotatedParagraph):
if node.getAnnotationKind(
) == org.AnnotatedParagraphAnnotationKind.Timestamp:
timestamp = evalDateTime(
node.getTimestamp().time.getStatic().time)

session.add(
Block(
kind=BlockKind.Paragraph,
wordcount=wordcount,
timestamp=timestamp,
timestamp=getCreationTime(node),
plaintext=ExporterUltraplain.getStr(node),
location=get_location(node),
))
Expand All @@ -336,6 +374,5 @@ def getTime(kind: org.SubtreePeriodKind) -> Optional[datetime]:
case osk.Table:
pass


aux(node)
session.commit()

0 comments on commit 88b7bac

Please sign in to comment.