From 58336df94439353edd4804935094a6fd526f3f17 Mon Sep 17 00:00:00 2001
From: Hannah Bast <bast@cs.uni-freiburg.de>
Date: Sun, 10 Mar 2024 15:33:54 +0100
Subject: [PATCH] Add command `index-stats`

The command now has more stats (time and space) and options (show only
time, show only space, ignore the text index) than in the previous
version.
---
 src/qlever/commands/add-text-index.py |   4 +-
 src/qlever/commands/index-stats.py    | 243 ++++++++++++++++++++++++++
 src/qlever/config.py                  |  14 +-
 3 files changed, 253 insertions(+), 8 deletions(-)
 create mode 100644 src/qlever/commands/index-stats.py

diff --git a/src/qlever/commands/add-text-index.py b/src/qlever/commands/add-text-index.py
index 8300794f..87f88639 100644
--- a/src/qlever/commands/add-text-index.py
+++ b/src/qlever/commands/add-text-index.py
@@ -36,7 +36,7 @@ def additional_arguments(self, subparser) -> None:
 
     def execute(self, args) -> bool:
         # Check that there is actually something to add.
-        if args.with_text_index is None:
+        if args.with_text_index == "none":
             log.error("You specified `--with_text_index none`, nothing to add")
             return False
 
@@ -49,7 +49,7 @@ def execute(self, args) -> bool:
         if args.with_text_index in \
                 ["from_literals", "from_text_records_and_literals"]:
             add_text_index_cmd += " --text-words-from-literals"
-        add_text_index_cmd += f" | tee {args.name}.add-text-index-log.txt"
+        add_text_index_cmd += f" | tee {args.name}.text-index-log.txt"
 
         # Run the command in a container (if so desired).
         if args.system in Containerize.supported_systems():
diff --git a/src/qlever/commands/index-stats.py b/src/qlever/commands/index-stats.py
new file mode 100644
index 00000000..39e10d65
--- /dev/null
+++ b/src/qlever/commands/index-stats.py
@@ -0,0 +1,243 @@
+from __future__ import annotations
+
+import re
+from datetime import datetime
+from pathlib import Path
+
+from qlever.command import QleverCommand
+from qlever.log import log
+from qlever.util import get_total_file_size
+
+
+class IndexStatsCommand(QleverCommand):
+    """
+    Class for executing the `index-stats` command.
+    """
+
+    def __init__(self):
+        pass
+
+    def description(self) -> str:
+        return ("Breakdown of the time and space used for the index build")
+
+    def should_have_qleverfile(self) -> bool:
+        return False
+
+    def relevant_qleverfile_arguments(self) -> dict[str: list[str]]:
+        return {"data": ["name"]}
+
+    def additional_arguments(self, subparser) -> None:
+        subparser.add_argument("--only-time", action="store_true",
+                               default=False,
+                               help="Show only the time used")
+        subparser.add_argument("--only-space", action="store_true",
+                               default=False,
+                               help="Show only the space used")
+        subparser.add_argument("--ignore-text-index", action="store_true",
+                               default=False,
+                               help="Ignore the text index")
+
+    def execute_time(self, args, log_file_name) -> bool:
+        """
+        Part of `execute` that shows the time used.
+        """
+
+        # Read the content of `log_file_name` into a list of lines.
+        try:
+            with open(log_file_name, "r") as log_file:
+                lines = log_file.readlines()
+        except Exception as e:
+            log.error(f"Problem reading index log file {log_file_name}: {e}")
+            return False
+        # If there is a separate `add-text-index-log.txt` file, append those
+        # lines.
+        try:
+            text_log_file_name = f"{args.name}.text-index-log.txt"
+            if Path(text_log_file_name).exists():
+                with open(text_log_file_name, "r") as text_log_file:
+                    lines.extend(text_log_file.readlines())
+        except Exception as e:
+            log.error(f"Problem reading text index log file "
+                      f"{text_log_file_name}: {e}")
+            return False
+
+        # Helper function that finds the next line matching the given `regex`,
+        # starting from `current_line`, and extracts the time. Returns a tuple
+        # of the time and the regex match object. If a match is found,
+        # `current_line` is updated to the line after the match. Otherwise,
+        # `current_line` will be one beyond the last line, unless
+        # `line_is_optional` is true, in which case it will be the same as when
+        # the function was entered.
+        current_line = 0
+
+        def find_next_line(regex, line_is_optional=False):
+            nonlocal lines
+            nonlocal current_line
+            current_line_backup = current_line
+            # Find starting from `current_line`.
+            while current_line < len(lines):
+                line = lines[current_line]
+                current_line += 1
+                timestamp_regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"
+                timestamp_format = "%Y-%m-%d %H:%M:%S"
+                regex_match = re.search(regex, line)
+                if regex_match:
+                    try:
+                        return datetime.strptime(
+                                re.match(timestamp_regex, line).group(),
+                                timestamp_format), regex_match
+                    except Exception as e:
+                        log.error(f"Could not parse timestamp of form "
+                                  f"\"{timestamp_regex}\" from line "
+                                  f" \"{line.rstrip()}\" ({e})")
+            # If we get here, we did not find a matching line.
+            if line_is_optional:
+                current_line = current_line_backup
+            return None, None
+
+        # Find the lines matching the key_lines_regex and extract the time
+        # information from them.
+        overall_begin, _ = find_next_line(r"INFO:\s*Processing")
+        merge_begin, _ = find_next_line(r"INFO:\s*Merging partial vocab")
+        convert_begin, _ = find_next_line(r"INFO:\s*Converting triples")
+        perm_begin_and_info = []
+        while True:
+            perm_begin, _ = find_next_line(r"INFO:\s*Creating a pair", True)
+            if perm_begin is None:
+                break
+            _, perm_info = find_next_line(r"INFO:\s*Writing meta data for"
+                                          r" ([A-Z]+ and [A-Z]+)", True)
+            # if perm_info is None:
+            #     break
+            perm_begin_and_info.append((perm_begin, perm_info))
+        convert_end = (perm_begin_and_info[0][0] if
+                       len(perm_begin_and_info) > 0 else None)
+        normal_end, _ = find_next_line(r"INFO:\s*Index build completed")
+        text_begin, _ = find_next_line(r"INFO:\s*Adding text index", True)
+        text_end, _ = find_next_line(r"INFO:\s*Text index build comp", True)
+        if args.ignore_text_index:
+            text_begin = text_end = None
+        # print("DEBUG:", len(perm_begin_and_info), perm_begin_and_info)
+        # print("DEBUG:", overall_begin)
+        # print("DEBUG:", normal_end)
+
+        # Check whether at least the first phase is done.
+        if overall_begin is None:
+            log.error("Missing line that index build has started")
+            return False
+        if overall_begin and not merge_begin:
+            log.error("According to the log file, the index build "
+                      "has started, but is still in its first "
+                      "phase (parsing the input)")
+            return False
+
+        # Helper function that shows the duration for a phase (if the start and
+        # end timestamps are available).
+        def show_duration(heading, start_end_pairs):
+            nonlocal unit
+            num_start_end_pairs = 0
+            diff_seconds = 0
+            for start, end in start_end_pairs:
+                if start and end:
+                    diff_seconds += (end - start).total_seconds()
+                    num_start_end_pairs += 1
+            if num_start_end_pairs > 0:
+                if unit == "h":
+                    diff = diff_seconds / 3600
+                elif unit == "min":
+                    diff = diff_seconds / 60
+                else:
+                    diff = diff_seconds
+                log.info(f"{heading:<21} : {diff:>6.1f} {unit}")
+
+        # Get the times of the various phases (hours or minutes, depending on
+        # how long the first phase took).
+        unit = "h"
+        if merge_begin and overall_begin:
+            parse_duration = (merge_begin - overall_begin).total_seconds()
+            if parse_duration < 200:
+                unit = "s"
+            elif parse_duration < 3600:
+                unit = "min"
+        show_duration("Parse input", [(overall_begin, merge_begin)])
+        show_duration("Build vocabularies", [(merge_begin, convert_begin)])
+        show_duration("Convert to global IDs", [(convert_begin, convert_end)])
+        for i in range(len(perm_begin_and_info)):
+            perm_begin, perm_info = perm_begin_and_info[i]
+            perm_end = perm_begin_and_info[i + 1][0] if i + 1 < len(
+                    perm_begin_and_info) else normal_end
+            perm_info_text = (perm_info.group(1).replace(" and ", " & ")
+                              if perm_info else f"#{i + 1}")
+            show_duration(f"Permutation {perm_info_text}",
+                          [(perm_begin, perm_end)])
+        show_duration("Text index", [(text_begin, text_end)])
+        if text_begin and text_end:
+            log.info("")
+            show_duration("TOTAL time",
+                          [(overall_begin, normal_end),
+                           (text_begin, text_end)])
+        elif normal_end:
+            log.info("")
+            show_duration("TOTAL time",
+                          [(overall_begin, normal_end)])
+        return True
+
+    def execute_space(self, args) -> bool:
+        """
+        Part of `execute` that shows the space used.
+        """
+
+        # Get the sizes for the various groups of index files.
+        index_size = get_total_file_size([f"{args.name}.index.*"])
+        vocab_size = get_total_file_size([f"{args.name}.vocabulary.*"])
+        text_size = get_total_file_size([f"{args.name}.text.*"])
+        if args.ignore_text_index:
+            text_size = 0
+        total_size = index_size + vocab_size + text_size
+
+        # Determing the proper unit for the size.
+        unit = "GB"
+        if total_size < 1e6:
+            unit = "B"
+        elif total_size < 1e9:
+            unit = "MB"
+
+        # Helper function for showing the size in a uniform way.
+        def show_size(heading, size):
+            nonlocal unit
+            if unit == "GB":
+                size /= 1e9
+            elif unit == "MB":
+                size /= 1e6
+            log.info(f"{heading:<21} : {size:>6.1f} {unit}")
+
+        show_size("Files index.*", index_size)
+        show_size("Files vocabulary.*", vocab_size)
+        if text_size > 0:
+            show_size("Files text.*", text_size)
+        log.info("")
+        show_size("TOTAL size", total_size)
+        return True
+
+    def execute(self, args) -> bool:
+        ret_value = args.show
+
+        # The "time" part of the command.
+        if not args.only_space:
+            log_file_name = f"{args.name}.index-log.txt"
+            self.show(f"Breakdown of the time used for "
+                      f"building the index, based on the timestamps for key "
+                      f"lines in \"{log_file_name}\"", only_show=args.show)
+            if not args.show:
+                ret_value &= self.execute_time(args, log_file_name)
+            if not args.only_time:
+                log.info("")
+
+        # The "space" part of the command.
+        if not args.only_time:
+            self.show("Breakdown of the space used for building the index",
+                      only_show=args.show)
+            if not args.show:
+                ret_value &= self.execute_space(args)
+
+        return ret_value
diff --git a/src/qlever/config.py b/src/qlever/config.py
index 0f348c3f..06268cfa 100644
--- a/src/qlever/config.py
+++ b/src/qlever/config.py
@@ -7,8 +7,7 @@
 
 import argcomplete
 
-from qlever import command_objects
-from qlever import script_name
+from qlever import command_objects, script_name
 from qlever.log import log
 from qlever.qleverfile import Qleverfile
 
@@ -136,9 +135,9 @@ def add_qleverfile_option(parser):
         qleverfile_parser.add_argument("command", type=str, nargs="?")
         qleverfile_args, _ = qleverfile_parser.parse_known_args()
         qleverfile_path_name = qleverfile_args.qleverfile
-        command = qleverfile_args.command
-        parse_qleverfile = command in command_objects \
-            and command_objects[command].should_have_qleverfile()
+        # command = qleverfile_args.command
+        # should_have_qleverfile = command in command_objects \
+        #     and command_objects[command].should_have_qleverfile()
 
         # Check if the Qleverfile exists and if we are using the default name.
         # We need this again further down in the code, so remember it.
@@ -156,7 +155,10 @@ def add_qleverfile_option(parser):
         #
         # IMPORTANT: No need to parse the Qleverfile in autocompletion mode and
         # it would be unnecessarily expensive to do so.
-        if qleverfile_exists and parse_qleverfile and not autocomplete_mode:
+        #
+        # TODO: What if `command.should_have_qleverfile()` is `False`, should
+        # we then parse the Qleverfile or not.
+        if qleverfile_exists and not autocomplete_mode:
             try:
                 qleverfile_config = Qleverfile.read(qleverfile_path)
             except Exception as e: