From 58336df94439353edd4804935094a6fd526f3f17 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sun, 10 Mar 2024 15:33:54 +0100 Subject: [PATCH] Add command `index-stats` The command now has more stats (time and space) and options (show only time, show only space, ignore the text index) than in the previous version. --- src/qlever/commands/add-text-index.py | 4 +- src/qlever/commands/index-stats.py | 243 ++++++++++++++++++++++++++ src/qlever/config.py | 14 +- 3 files changed, 253 insertions(+), 8 deletions(-) create mode 100644 src/qlever/commands/index-stats.py diff --git a/src/qlever/commands/add-text-index.py b/src/qlever/commands/add-text-index.py index 8300794f..87f88639 100644 --- a/src/qlever/commands/add-text-index.py +++ b/src/qlever/commands/add-text-index.py @@ -36,7 +36,7 @@ def additional_arguments(self, subparser) -> None: def execute(self, args) -> bool: # Check that there is actually something to add. - if args.with_text_index is None: + if args.with_text_index == "none": log.error("You specified `--with_text_index none`, nothing to add") return False @@ -49,7 +49,7 @@ def execute(self, args) -> bool: if args.with_text_index in \ ["from_literals", "from_text_records_and_literals"]: add_text_index_cmd += " --text-words-from-literals" - add_text_index_cmd += f" | tee {args.name}.add-text-index-log.txt" + add_text_index_cmd += f" | tee {args.name}.text-index-log.txt" # Run the command in a container (if so desired). if args.system in Containerize.supported_systems(): diff --git a/src/qlever/commands/index-stats.py b/src/qlever/commands/index-stats.py new file mode 100644 index 00000000..39e10d65 --- /dev/null +++ b/src/qlever/commands/index-stats.py @@ -0,0 +1,243 @@ +from __future__ import annotations + +import re +from datetime import datetime +from pathlib import Path + +from qlever.command import QleverCommand +from qlever.log import log +from qlever.util import get_total_file_size + + +class IndexStatsCommand(QleverCommand): + """ + Class for executing the `index-stats` command. + """ + + def __init__(self): + pass + + def description(self) -> str: + return ("Breakdown of the time and space used for the index build") + + def should_have_qleverfile(self) -> bool: + return False + + def relevant_qleverfile_arguments(self) -> dict[str: list[str]]: + return {"data": ["name"]} + + def additional_arguments(self, subparser) -> None: + subparser.add_argument("--only-time", action="store_true", + default=False, + help="Show only the time used") + subparser.add_argument("--only-space", action="store_true", + default=False, + help="Show only the space used") + subparser.add_argument("--ignore-text-index", action="store_true", + default=False, + help="Ignore the text index") + + def execute_time(self, args, log_file_name) -> bool: + """ + Part of `execute` that shows the time used. + """ + + # Read the content of `log_file_name` into a list of lines. + try: + with open(log_file_name, "r") as log_file: + lines = log_file.readlines() + except Exception as e: + log.error(f"Problem reading index log file {log_file_name}: {e}") + return False + # If there is a separate `add-text-index-log.txt` file, append those + # lines. + try: + text_log_file_name = f"{args.name}.text-index-log.txt" + if Path(text_log_file_name).exists(): + with open(text_log_file_name, "r") as text_log_file: + lines.extend(text_log_file.readlines()) + except Exception as e: + log.error(f"Problem reading text index log file " + f"{text_log_file_name}: {e}") + return False + + # Helper function that finds the next line matching the given `regex`, + # starting from `current_line`, and extracts the time. Returns a tuple + # of the time and the regex match object. If a match is found, + # `current_line` is updated to the line after the match. Otherwise, + # `current_line` will be one beyond the last line, unless + # `line_is_optional` is true, in which case it will be the same as when + # the function was entered. + current_line = 0 + + def find_next_line(regex, line_is_optional=False): + nonlocal lines + nonlocal current_line + current_line_backup = current_line + # Find starting from `current_line`. + while current_line < len(lines): + line = lines[current_line] + current_line += 1 + timestamp_regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}" + timestamp_format = "%Y-%m-%d %H:%M:%S" + regex_match = re.search(regex, line) + if regex_match: + try: + return datetime.strptime( + re.match(timestamp_regex, line).group(), + timestamp_format), regex_match + except Exception as e: + log.error(f"Could not parse timestamp of form " + f"\"{timestamp_regex}\" from line " + f" \"{line.rstrip()}\" ({e})") + # If we get here, we did not find a matching line. + if line_is_optional: + current_line = current_line_backup + return None, None + + # Find the lines matching the key_lines_regex and extract the time + # information from them. + overall_begin, _ = find_next_line(r"INFO:\s*Processing") + merge_begin, _ = find_next_line(r"INFO:\s*Merging partial vocab") + convert_begin, _ = find_next_line(r"INFO:\s*Converting triples") + perm_begin_and_info = [] + while True: + perm_begin, _ = find_next_line(r"INFO:\s*Creating a pair", True) + if perm_begin is None: + break + _, perm_info = find_next_line(r"INFO:\s*Writing meta data for" + r" ([A-Z]+ and [A-Z]+)", True) + # if perm_info is None: + # break + perm_begin_and_info.append((perm_begin, perm_info)) + convert_end = (perm_begin_and_info[0][0] if + len(perm_begin_and_info) > 0 else None) + normal_end, _ = find_next_line(r"INFO:\s*Index build completed") + text_begin, _ = find_next_line(r"INFO:\s*Adding text index", True) + text_end, _ = find_next_line(r"INFO:\s*Text index build comp", True) + if args.ignore_text_index: + text_begin = text_end = None + # print("DEBUG:", len(perm_begin_and_info), perm_begin_and_info) + # print("DEBUG:", overall_begin) + # print("DEBUG:", normal_end) + + # Check whether at least the first phase is done. + if overall_begin is None: + log.error("Missing line that index build has started") + return False + if overall_begin and not merge_begin: + log.error("According to the log file, the index build " + "has started, but is still in its first " + "phase (parsing the input)") + return False + + # Helper function that shows the duration for a phase (if the start and + # end timestamps are available). + def show_duration(heading, start_end_pairs): + nonlocal unit + num_start_end_pairs = 0 + diff_seconds = 0 + for start, end in start_end_pairs: + if start and end: + diff_seconds += (end - start).total_seconds() + num_start_end_pairs += 1 + if num_start_end_pairs > 0: + if unit == "h": + diff = diff_seconds / 3600 + elif unit == "min": + diff = diff_seconds / 60 + else: + diff = diff_seconds + log.info(f"{heading:<21} : {diff:>6.1f} {unit}") + + # Get the times of the various phases (hours or minutes, depending on + # how long the first phase took). + unit = "h" + if merge_begin and overall_begin: + parse_duration = (merge_begin - overall_begin).total_seconds() + if parse_duration < 200: + unit = "s" + elif parse_duration < 3600: + unit = "min" + show_duration("Parse input", [(overall_begin, merge_begin)]) + show_duration("Build vocabularies", [(merge_begin, convert_begin)]) + show_duration("Convert to global IDs", [(convert_begin, convert_end)]) + for i in range(len(perm_begin_and_info)): + perm_begin, perm_info = perm_begin_and_info[i] + perm_end = perm_begin_and_info[i + 1][0] if i + 1 < len( + perm_begin_and_info) else normal_end + perm_info_text = (perm_info.group(1).replace(" and ", " & ") + if perm_info else f"#{i + 1}") + show_duration(f"Permutation {perm_info_text}", + [(perm_begin, perm_end)]) + show_duration("Text index", [(text_begin, text_end)]) + if text_begin and text_end: + log.info("") + show_duration("TOTAL time", + [(overall_begin, normal_end), + (text_begin, text_end)]) + elif normal_end: + log.info("") + show_duration("TOTAL time", + [(overall_begin, normal_end)]) + return True + + def execute_space(self, args) -> bool: + """ + Part of `execute` that shows the space used. + """ + + # Get the sizes for the various groups of index files. + index_size = get_total_file_size([f"{args.name}.index.*"]) + vocab_size = get_total_file_size([f"{args.name}.vocabulary.*"]) + text_size = get_total_file_size([f"{args.name}.text.*"]) + if args.ignore_text_index: + text_size = 0 + total_size = index_size + vocab_size + text_size + + # Determing the proper unit for the size. + unit = "GB" + if total_size < 1e6: + unit = "B" + elif total_size < 1e9: + unit = "MB" + + # Helper function for showing the size in a uniform way. + def show_size(heading, size): + nonlocal unit + if unit == "GB": + size /= 1e9 + elif unit == "MB": + size /= 1e6 + log.info(f"{heading:<21} : {size:>6.1f} {unit}") + + show_size("Files index.*", index_size) + show_size("Files vocabulary.*", vocab_size) + if text_size > 0: + show_size("Files text.*", text_size) + log.info("") + show_size("TOTAL size", total_size) + return True + + def execute(self, args) -> bool: + ret_value = args.show + + # The "time" part of the command. + if not args.only_space: + log_file_name = f"{args.name}.index-log.txt" + self.show(f"Breakdown of the time used for " + f"building the index, based on the timestamps for key " + f"lines in \"{log_file_name}\"", only_show=args.show) + if not args.show: + ret_value &= self.execute_time(args, log_file_name) + if not args.only_time: + log.info("") + + # The "space" part of the command. + if not args.only_time: + self.show("Breakdown of the space used for building the index", + only_show=args.show) + if not args.show: + ret_value &= self.execute_space(args) + + return ret_value diff --git a/src/qlever/config.py b/src/qlever/config.py index 0f348c3f..06268cfa 100644 --- a/src/qlever/config.py +++ b/src/qlever/config.py @@ -7,8 +7,7 @@ import argcomplete -from qlever import command_objects -from qlever import script_name +from qlever import command_objects, script_name from qlever.log import log from qlever.qleverfile import Qleverfile @@ -136,9 +135,9 @@ def add_qleverfile_option(parser): qleverfile_parser.add_argument("command", type=str, nargs="?") qleverfile_args, _ = qleverfile_parser.parse_known_args() qleverfile_path_name = qleverfile_args.qleverfile - command = qleverfile_args.command - parse_qleverfile = command in command_objects \ - and command_objects[command].should_have_qleverfile() + # command = qleverfile_args.command + # should_have_qleverfile = command in command_objects \ + # and command_objects[command].should_have_qleverfile() # Check if the Qleverfile exists and if we are using the default name. # We need this again further down in the code, so remember it. @@ -156,7 +155,10 @@ def add_qleverfile_option(parser): # # IMPORTANT: No need to parse the Qleverfile in autocompletion mode and # it would be unnecessarily expensive to do so. - if qleverfile_exists and parse_qleverfile and not autocomplete_mode: + # + # TODO: What if `command.should_have_qleverfile()` is `False`, should + # we then parse the Qleverfile or not. + if qleverfile_exists and not autocomplete_mode: try: qleverfile_config = Qleverfile.read(qleverfile_path) except Exception as e: