forked from valefras/italian_parliament_corpus
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathplot_stats.py
91 lines (67 loc) · 2.57 KB
/
plot_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from rdflib import Graph, URIRef
from utils import ordered_leg_names
from rdflib.plugins.sparql import prepareQuery
from rdflib.namespace import FOAF, XSD, RDF, DC, RDFS, OWL
from datetime import datetime
# Figure text uses the generic sans-serif family, with Helvetica registered
# as that family's font.
plt.rcParams.update(
    {
        "font.family": "sans-serif",
        "font.sans-serif": "Helvetica",
    }
)
# Per-legislature corpus statistics. The "legislature" column supplies the
# identifier that is appended to the legislature URI when querying the graph.
stats = pd.read_csv("stats_def.csv", encoding="utf-8")

# RDF graph from which the start/end date of every legislature is read.
# NOTE(review): the file is named *.rdf but is parsed as N-Triples
# (format="nt") -- confirm this matches the file's actual serialization.
dataset = Graph()
dataset.parse(os.path.join("tagging_modules/rdf/", "legislatura.rdf"), format="nt")

# The query is prepared once and ?leg is bound per legislature, instead of
# re-parsing an f-string query on every iteration. The dc:date value is split
# on "-" and both halves are cast to integers.
date_query = prepareQuery(
    """
    SELECT ?start ?end
    WHERE {
        ?leg <http://purl.org/dc/elements/1.1/date> ?dates .
        BIND(xsd:integer(strbefore(?dates, "-")) AS ?start) .
        BIND(xsd:integer(strafter(?dates, "-")) AS ?end) .
    }
    """,
    # xsd is declared explicitly rather than relying on default prefix bindings.
    initNs={"ocd": URIRef("http://dati.camera.it/ocd/"), "xsd": XSD},
)

starts = []
ends = []
for leg in stats["legislature"]:
    leg_uri = URIRef(f"http://dati.camera.it/ocd/legislatura.rdf/{leg}")
    rows = list(dataset.query(date_query, initBindings={"leg": leg_uri}))
    if not rows:
        # Fail loudly: without this check the previous legislature's dates
        # (or an undefined name on the first iteration) would be recorded.
        raise LookupError(f"no dc:date found for legislature {leg!r} ({leg_uri})")

    print(f"found {leg}")
    # If the graph holds several date triples for one legislature, the last
    # returned row wins.
    last_row = rows[-1]
    starts.append(last_row.start)
    ends.append(last_row.end)

# One start/end value per CSV row, in the same order as the CSV.
stats["start"] = starts
stats["end"] = ends
def _parse_yyyymmdd(value):
    """Parse a yyyymmdd value into a datetime."""
    # The explicit str() is deliberate: passing the queried value straight to
    # the parser raised "TypeError: Expected unicode, got Literal".
    return datetime.strptime(str(value), "%Y%m%d")


# Turn the yyyymmdd start/end values into datetimes.
for bound in ("start", "end"):
    stats[bound] = stats[bound].apply(_parse_yyyymmdd)

# Length of each legislature in whole days.
stats["span"] = (stats["end"] - stats["start"]).dt.days
print(stats["span"])

# Normalise the token count by legislature length: from here on token_num
# holds tokens per day.
stats["token_num"] = stats["token_num"].div(stats["span"])

# Hand out the "magma" palette in order of increasing tokens per day (one
# colour per row), then put the rows back in chronological order.
stats = stats.sort_values(by=["token_num"])
stats["color"] = sns.color_palette("magma", len(stats))
stats = stats.sort_values(by=["start"])
# Single-axis figure for the bar chart.
fig, ax = plt.subplots(figsize=(9, 5.25))

# One bar per legislature: the left edge sits at the start date, the width is
# the span column, the height is token_num (already divided by span earlier
# in the script) and the fill is the row's colour.
for leg_start, tokens_per_day, leg_span, bar_color in zip(
    stats["start"], stats["token_num"], stats["span"], stats["color"]
):
    ax.bar(leg_start, tokens_per_day, width=leg_span, align="edge", color=bar_color)

# Axis labels and title.
ax.set(
    xlabel="Time",
    ylabel="Number of tokens per day of legislature",
    title="Number of tokens published per day of legislature",
)

# Tighten the layout and write the chart to disk.
fig.tight_layout()
fig.savefig("tokens_per_leg.png", dpi=200)