Skip to content

Commit

Permalink
Add command index-stats
Browse files Browse the repository at this point in the history
The command now has more stats (time and space) and options (show only
time, show only space, ignore the text index) than in the previous
version.
  • Loading branch information
Hannah Bast committed Mar 10, 2024
1 parent b4d7daf commit 58336df
Show file tree
Hide file tree
Showing 3 changed files with 253 additions and 8 deletions.
4 changes: 2 additions & 2 deletions src/qlever/commands/add-text-index.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def additional_arguments(self, subparser) -> None:

def execute(self, args) -> bool:
# Check that there is actually something to add.
if args.with_text_index is None:
if args.with_text_index == "none":
log.error("You specified `--with_text_index none`, nothing to add")
return False

Expand All @@ -49,7 +49,7 @@ def execute(self, args) -> bool:
if args.with_text_index in \
["from_literals", "from_text_records_and_literals"]:
add_text_index_cmd += " --text-words-from-literals"
add_text_index_cmd += f" | tee {args.name}.add-text-index-log.txt"
add_text_index_cmd += f" | tee {args.name}.text-index-log.txt"

# Run the command in a container (if so desired).
if args.system in Containerize.supported_systems():
Expand Down
243 changes: 243 additions & 0 deletions src/qlever/commands/index-stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
from __future__ import annotations

import re
from datetime import datetime
from pathlib import Path

from qlever.command import QleverCommand
from qlever.log import log
from qlever.util import get_total_file_size


class IndexStatsCommand(QleverCommand):
"""
Class for executing the `index-stats` command.
"""

def __init__(self):
pass

def description(self) -> str:
return ("Breakdown of the time and space used for the index build")

def should_have_qleverfile(self) -> bool:
return False

def relevant_qleverfile_arguments(self) -> dict[str: list[str]]:
return {"data": ["name"]}

def additional_arguments(self, subparser) -> None:
subparser.add_argument("--only-time", action="store_true",
default=False,
help="Show only the time used")
subparser.add_argument("--only-space", action="store_true",
default=False,
help="Show only the space used")
subparser.add_argument("--ignore-text-index", action="store_true",
default=False,
help="Ignore the text index")

def execute_time(self, args, log_file_name) -> bool:
"""
Part of `execute` that shows the time used.
"""

# Read the content of `log_file_name` into a list of lines.
try:
with open(log_file_name, "r") as log_file:
lines = log_file.readlines()
except Exception as e:
log.error(f"Problem reading index log file {log_file_name}: {e}")
return False
# If there is a separate `add-text-index-log.txt` file, append those
# lines.
try:
text_log_file_name = f"{args.name}.text-index-log.txt"
if Path(text_log_file_name).exists():
with open(text_log_file_name, "r") as text_log_file:
lines.extend(text_log_file.readlines())
except Exception as e:
log.error(f"Problem reading text index log file "
f"{text_log_file_name}: {e}")
return False

# Helper function that finds the next line matching the given `regex`,
# starting from `current_line`, and extracts the time. Returns a tuple
# of the time and the regex match object. If a match is found,
# `current_line` is updated to the line after the match. Otherwise,
# `current_line` will be one beyond the last line, unless
# `line_is_optional` is true, in which case it will be the same as when
# the function was entered.
current_line = 0

def find_next_line(regex, line_is_optional=False):
nonlocal lines
nonlocal current_line
current_line_backup = current_line
# Find starting from `current_line`.
while current_line < len(lines):
line = lines[current_line]
current_line += 1
timestamp_regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"
timestamp_format = "%Y-%m-%d %H:%M:%S"
regex_match = re.search(regex, line)
if regex_match:
try:
return datetime.strptime(
re.match(timestamp_regex, line).group(),
timestamp_format), regex_match
except Exception as e:
log.error(f"Could not parse timestamp of form "
f"\"{timestamp_regex}\" from line "
f" \"{line.rstrip()}\" ({e})")
# If we get here, we did not find a matching line.
if line_is_optional:
current_line = current_line_backup
return None, None

# Find the lines matching the key_lines_regex and extract the time
# information from them.
overall_begin, _ = find_next_line(r"INFO:\s*Processing")
merge_begin, _ = find_next_line(r"INFO:\s*Merging partial vocab")
convert_begin, _ = find_next_line(r"INFO:\s*Converting triples")
perm_begin_and_info = []
while True:
perm_begin, _ = find_next_line(r"INFO:\s*Creating a pair", True)
if perm_begin is None:
break
_, perm_info = find_next_line(r"INFO:\s*Writing meta data for"
r" ([A-Z]+ and [A-Z]+)", True)
# if perm_info is None:
# break
perm_begin_and_info.append((perm_begin, perm_info))
convert_end = (perm_begin_and_info[0][0] if
len(perm_begin_and_info) > 0 else None)
normal_end, _ = find_next_line(r"INFO:\s*Index build completed")
text_begin, _ = find_next_line(r"INFO:\s*Adding text index", True)
text_end, _ = find_next_line(r"INFO:\s*Text index build comp", True)
if args.ignore_text_index:
text_begin = text_end = None
# print("DEBUG:", len(perm_begin_and_info), perm_begin_and_info)
# print("DEBUG:", overall_begin)
# print("DEBUG:", normal_end)

# Check whether at least the first phase is done.
if overall_begin is None:
log.error("Missing line that index build has started")
return False
if overall_begin and not merge_begin:
log.error("According to the log file, the index build "
"has started, but is still in its first "
"phase (parsing the input)")
return False

# Helper function that shows the duration for a phase (if the start and
# end timestamps are available).
def show_duration(heading, start_end_pairs):
nonlocal unit
num_start_end_pairs = 0
diff_seconds = 0
for start, end in start_end_pairs:
if start and end:
diff_seconds += (end - start).total_seconds()
num_start_end_pairs += 1
if num_start_end_pairs > 0:
if unit == "h":
diff = diff_seconds / 3600
elif unit == "min":
diff = diff_seconds / 60
else:
diff = diff_seconds
log.info(f"{heading:<21} : {diff:>6.1f} {unit}")

# Get the times of the various phases (hours or minutes, depending on
# how long the first phase took).
unit = "h"
if merge_begin and overall_begin:
parse_duration = (merge_begin - overall_begin).total_seconds()
if parse_duration < 200:
unit = "s"
elif parse_duration < 3600:
unit = "min"
show_duration("Parse input", [(overall_begin, merge_begin)])
show_duration("Build vocabularies", [(merge_begin, convert_begin)])
show_duration("Convert to global IDs", [(convert_begin, convert_end)])
for i in range(len(perm_begin_and_info)):
perm_begin, perm_info = perm_begin_and_info[i]
perm_end = perm_begin_and_info[i + 1][0] if i + 1 < len(
perm_begin_and_info) else normal_end
perm_info_text = (perm_info.group(1).replace(" and ", " & ")
if perm_info else f"#{i + 1}")
show_duration(f"Permutation {perm_info_text}",
[(perm_begin, perm_end)])
show_duration("Text index", [(text_begin, text_end)])
if text_begin and text_end:
log.info("")
show_duration("TOTAL time",
[(overall_begin, normal_end),
(text_begin, text_end)])
elif normal_end:
log.info("")
show_duration("TOTAL time",
[(overall_begin, normal_end)])
return True

def execute_space(self, args) -> bool:
"""
Part of `execute` that shows the space used.
"""

# Get the sizes for the various groups of index files.
index_size = get_total_file_size([f"{args.name}.index.*"])
vocab_size = get_total_file_size([f"{args.name}.vocabulary.*"])
text_size = get_total_file_size([f"{args.name}.text.*"])
if args.ignore_text_index:
text_size = 0
total_size = index_size + vocab_size + text_size

# Determing the proper unit for the size.
unit = "GB"
if total_size < 1e6:
unit = "B"
elif total_size < 1e9:
unit = "MB"

# Helper function for showing the size in a uniform way.
def show_size(heading, size):
nonlocal unit
if unit == "GB":
size /= 1e9
elif unit == "MB":
size /= 1e6
log.info(f"{heading:<21} : {size:>6.1f} {unit}")

show_size("Files index.*", index_size)
show_size("Files vocabulary.*", vocab_size)
if text_size > 0:
show_size("Files text.*", text_size)
log.info("")
show_size("TOTAL size", total_size)
return True

def execute(self, args) -> bool:
ret_value = args.show

# The "time" part of the command.
if not args.only_space:
log_file_name = f"{args.name}.index-log.txt"
self.show(f"Breakdown of the time used for "
f"building the index, based on the timestamps for key "
f"lines in \"{log_file_name}\"", only_show=args.show)
if not args.show:
ret_value &= self.execute_time(args, log_file_name)
if not args.only_time:
log.info("")

# The "space" part of the command.
if not args.only_time:
self.show("Breakdown of the space used for building the index",
only_show=args.show)
if not args.show:
ret_value &= self.execute_space(args)

return ret_value
14 changes: 8 additions & 6 deletions src/qlever/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@

import argcomplete

from qlever import command_objects
from qlever import script_name
from qlever import command_objects, script_name
from qlever.log import log
from qlever.qleverfile import Qleverfile

Expand Down Expand Up @@ -136,9 +135,9 @@ def add_qleverfile_option(parser):
qleverfile_parser.add_argument("command", type=str, nargs="?")
qleverfile_args, _ = qleverfile_parser.parse_known_args()
qleverfile_path_name = qleverfile_args.qleverfile
command = qleverfile_args.command
parse_qleverfile = command in command_objects \
and command_objects[command].should_have_qleverfile()
# command = qleverfile_args.command
# should_have_qleverfile = command in command_objects \
# and command_objects[command].should_have_qleverfile()

# Check if the Qleverfile exists and if we are using the default name.
# We need this again further down in the code, so remember it.
Expand All @@ -156,7 +155,10 @@ def add_qleverfile_option(parser):
#
# IMPORTANT: No need to parse the Qleverfile in autocompletion mode and
# it would be unnecessarily expensive to do so.
if qleverfile_exists and parse_qleverfile and not autocomplete_mode:
#
# TODO: What if `command.should_have_qleverfile()` is `False`, should
# we then parse the Qleverfile or not.
if qleverfile_exists and not autocomplete_mode:
try:
qleverfile_config = Qleverfile.read(qleverfile_path)
except Exception as e:
Expand Down

0 comments on commit 58336df

Please sign in to comment.