-
Notifications
You must be signed in to change notification settings - Fork 138
/
Copy pathgenerate_table.py
60 lines (47 loc) · 2.66 KB
/
generate_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python
"""Script for generation table with data/models from list.json in .md format"""
import argparse
import io
import json
import os
# Default index file; used as the default value of --input-file and quoted in its help text.
DATASETS_SOURCE = "list.json"
def _html_list(items):
    """Render *items* as an HTML ``<ul>`` that fits inside one Markdown table cell."""
    return "<ul>" + " ".join("<li>{}</li>".format(item) for item in items) + "</ul>"


def _size_in_mb(num_bytes):
    """Format a byte count as whole mebibytes, rounded down (e.g. ``"3 MB"``)."""
    return "{} MB".format(num_bytes // 2 ** 20)


def generate_table(fn):
    """Print Markdown tables describing the datasets and models listed in *fn*.

    The file must contain a JSON object with two mappings, ``"corpora"`` and
    ``"models"``, each keyed by entry name. Entries are printed sorted by name;
    entries whose name starts with ``"__testing_"`` are skipped.

    Every entry must provide ``read_more`` (an iterable of links),
    ``description``, ``file_size`` (in bytes) and ``license``. Model entries
    must additionally provide ``num_records``, ``base_dataset`` and
    ``parameters`` (a mapping); ``preprocessing`` is optional and rendered as
    ``"-"`` when absent.

    Args:
        fn: Path to the JSON index file.

    Raises:
        KeyError: If a required top-level section or entry field is missing.
    """
    # JSON is UTF-8 by specification, so do not depend on the platform's default
    # encoding. io.open (rather than the builtin) accepts ``encoding`` on
    # Python 2 as well as Python 3.
    with io.open(fn, encoding="utf-8") as infile:
        data = json.load(infile)

    datasets = sorted(data["corpora"].items(), key=lambda kv: kv[0])
    models = sorted(data["models"].items(), key=lambda kv: kv[0])

    print("## Available data")
    print("")
    print("### Datasets")
    print("")
    print("| name | file size | read_more | description | license |")
    print("|------|-----------|-----------|-------------|---------|")
    for name, other in datasets:
        if name.startswith("__testing_"):
            continue

        print("| {name} | {size} | {links} | {description} | {license} |".format(
            name=name,
            size=_size_in_mb(other["file_size"]),
            links=_html_list(other["read_more"]),
            description=other["description"],
            license=other["license"],
        ))

    print("")
    print("### Models")
    print("")
    print("| name | num vectors | file size | base dataset | read_more | description | parameters | preprocessing | license |")
    print("|------|-------------|-----------|--------------|------------|-------------|------------|---------------|---------|")
    for name, other in models:
        if name.startswith("__testing_"):
            continue

        parameters = _html_list("{} - {}".format(k, v) for (k, v) in other["parameters"].items())
        print("| {name} | {num_vectors} | {size} | {base_dataset} | {links} | {description} | {parameters} | {preprocessing} | {license} |".format(
            name=name,
            num_vectors=other["num_records"],
            size=_size_in_mb(other["file_size"]),
            base_dataset=other["base_dataset"],
            links=_html_list(other["read_more"]),
            description=other["description"],
            parameters=parameters,
            preprocessing=other.get("preprocessing", "-"),
            license=other["license"],
        ))

    # The footer links point at files in the repository root, so only the bare
    # file names may be used: __file__ can be an absolute path (it always is for
    # the __main__ module on Python 3.9+), and fn can carry a directory part.
    print(
        "\n(generated by [{script_name}](https://github.com/RaRe-Technologies/gensim-data/blob/master/{script_name})"
        " based on [{list_name}](https://github.com/RaRe-Technologies/gensim-data/blob/master/{list_name}))".format(
            script_name=os.path.basename(__file__), list_name=os.path.basename(fn)
        )
    )
def main():
    """Parse the command line and print the tables for the selected index file."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--input-file",
        default=DATASETS_SOURCE,
        help="Path to {}".format(DATASETS_SOURCE),
    )
    options = cli.parse_args()
    generate_table(options.input_file)
# Run the generator only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()