diff --git a/pyproject.toml b/pyproject.toml index 132febd..295ae2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ packages=["ragdaemon"] [project] name = "ragdaemon" -version = "0.4.7" +version = "0.5.0" description = "Generate and render a call graph for a Python project." readme = "README.md" dependencies = [ diff --git a/ragdaemon/__init__.py b/ragdaemon/__init__.py index a34b2f6..3d18726 100644 --- a/ragdaemon/__init__.py +++ b/ragdaemon/__init__.py @@ -1 +1 @@ -__version__ = "0.4.7" +__version__ = "0.5.0" diff --git a/ragdaemon/annotators/call_graph.py b/ragdaemon/annotators/call_graph.py index c2e4497..5de1b86 100644 --- a/ragdaemon/annotators/call_graph.py +++ b/ragdaemon/annotators/call_graph.py @@ -9,7 +9,7 @@ from spice.models import TextModel from ragdaemon.annotators.base_annotator import Annotator -from ragdaemon.database import Database +from ragdaemon.database import Database, remove_update_db_duplicates from ragdaemon.graph import KnowledgeGraph from ragdaemon.errors import RagdaemonError from ragdaemon.utils import ( @@ -155,13 +155,11 @@ async def get_file_call_data( node: str, data: dict, graph: KnowledgeGraph, - db: Database, retries: int = 1, ): - """Generate and save call data for a file node to graph and db""" + """Generate and save call data for a file node to graph""" calls = {} - record = db.get(data["checksum"]) - document = record["documents"][0] + document = data["document"] # Insert line numbers lines = document.split("\n") @@ -184,10 +182,6 @@ async def get_file_call_data( else "Skipping." ) - # Save to db and graph - metadatas = record["metadatas"][0] - metadatas[self.call_field_id] = json.dumps(calls) - db.update(data["checksum"], metadatas=metadatas) data[self.call_field_id] = calls async def annotate( @@ -212,17 +206,27 @@ async def annotate( files_with_calls.append((node, data)) # Generate/add call data for nodes that don't have it tasks = [] + files_just_updated = set() for node, data in files_with_calls: if refresh or data.get(self.call_field_id, None) is None: checksum = data.get("checksum") if checksum is None: raise RagdaemonError(f"Node {node} has no checksum.") - tasks.append(self.get_file_call_data(node, data, graph, db)) + tasks.append(self.get_file_call_data(node, data, graph)) + files_just_updated.add(node) if len(tasks) > 0: if self.verbose: await tqdm.gather(*tasks, desc="Generating call graph") else: await asyncio.gather(*tasks) + update_db = {"ids": [], "metadatas": []} + for node in files_just_updated: + data = graph.nodes[node] + update_db["ids"].append(data["checksum"]) + metadatas = {self.call_field_id: json.dumps(data[self.call_field_id])} + update_db["metadatas"].append(metadatas) + update_db = remove_update_db_duplicates(**update_db) + db.update(**update_db) # Add call edges to graph. Each call should have only ONE source; if there are # chunks, the source is the matching chunk, otherwise it's the file. 
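Review note on the batched-write pattern introduced above: instead of one `db.update` per file inside `get_file_call_data`, `annotate` now collects checksum/metadata pairs from the graph and issues a single update after the async tasks finish. A minimal, self-contained sketch of that pattern, assuming hypothetical sample data and a hypothetical `call_field_id` value; the dedup helper is copied from the `chroma_database.py` hunk further down in this diff:

```python
import json

def remove_update_db_duplicates(ids: list[str], metadatas: list[dict]) -> dict:
    # Keep the first metadata seen for each id (mirrors the helper added in
    # ragdaemon/database/chroma_database.py later in this diff).
    seen, output = set(), {"ids": [], "metadatas": []}
    for id, metadata in zip(ids, metadatas):
        if id not in seen:
            output["ids"].append(id)
            output["metadatas"].append(metadata)
            seen.add(id)
    return output

# Hypothetical per-file call data gathered by the async tasks above.
call_field_id = "calls"
files_just_updated = {
    "a.py": {"checksum": "abc123", call_field_id: {"foo": [3]}},
    "b.py": {"checksum": "abc123", call_field_id: {}},  # same checksum -> dropped
}
update_db = {"ids": [], "metadatas": []}
for node, data in files_just_updated.items():
    update_db["ids"].append(data["checksum"])
    update_db["metadatas"].append({call_field_id: json.dumps(data[call_field_id])})
update_db = remove_update_db_duplicates(**update_db)
print(update_db["ids"])  # ['abc123'] -> one batched db.update(**update_db) call
```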
@@ -244,8 +248,7 @@ async def annotate( checksum = data.get("checksum") if checksum is None: raise RagdaemonError(f"File node {file} is missing checksum field.") - record = db.get(checksum) - document = record["documents"][0] + document = data["document"] for i in range(1, len(document.split("\n")) + 1): line_index[i] = file else: diff --git a/ragdaemon/annotators/chunker.py b/ragdaemon/annotators/chunker.py index 0e8423b..0d4e4c0 100644 --- a/ragdaemon/annotators/chunker.py +++ b/ragdaemon/annotators/chunker.py @@ -17,13 +17,18 @@ import asyncio import json +from copy import deepcopy from pathlib import Path -from typing import Any, Coroutine, Optional +from typing import Any, Optional from tqdm.asyncio import tqdm from ragdaemon.annotators.base_annotator import Annotator -from ragdaemon.database import Database, remove_add_to_db_duplicates +from ragdaemon.database import ( + Database, + remove_add_to_db_duplicates, + remove_update_db_duplicates, +) from ragdaemon.errors import RagdaemonError from ragdaemon.graph import KnowledgeGraph from ragdaemon.utils import DEFAULT_CODE_EXTENSIONS, get_document, hash_str, truncate @@ -64,92 +69,18 @@ async def chunk_document(self, document: str) -> list[dict[str, Any]]: """Return a list of {id, ref} chunks for the given document.""" raise NotImplementedError() - async def get_file_chunk_data(self, node, data, db): + async def get_file_chunk_data(self, node, data): """Generate and save chunk data for a file node to graph and db""" - record = db.get(data["checksum"]) - document = record["documents"][0] + document = data["document"] try: chunks = await self.chunk_document(document) except RagdaemonError: if self.verbose: print(f"Error chunking {node}; skipping.") chunks = [] - # Save to db and graph - metadatas = record["metadatas"][0] - metadatas[self.chunk_field_id] = json.dumps(chunks) - db.update(data["checksum"], metadatas=metadatas) + chunks = sorted(chunks, key=lambda x: len(x["id"])) data[self.chunk_field_id] = chunks - def add_file_chunks_to_graph( - self, - file: str, - data: dict, - graph: KnowledgeGraph, - db: Database, - refresh: bool = False, - ) -> dict[str, list[Any]]: - """Load chunks from file data into db/graph""" - - # Grab and validate chunks for given file - chunks = data.get(self.chunk_field_id) - if chunks is None: - raise RagdaemonError(f"Node {file} missing {self.chunk_field_id}") - if isinstance(chunks, str): - chunks = json.loads(chunks) - data[self.chunk_field_id] = chunks - - add_to_db = {"ids": [], "documents": [], "metadatas": []} - if len(chunks) == 0: - return add_to_db - base_id = f"{file}:BASE" - if not any(chunk["id"] == base_id for chunk in chunks): - raise RagdaemonError(f"Node {file} missing base chunk") - edges_to_add = {(file, base_id)} - for chunk in chunks: - # Locate or create record for chunk - id, ref = chunk["id"], chunk["ref"] - document = get_document(ref, Path(graph.graph["cwd"])) - checksum = hash_str(document) - records = db.get(checksum)["metadatas"] - if not refresh and len(records) > 0: - record = records[0] - else: - record = { - "id": id, - "type": "chunk", - "ref": chunk["ref"], - "checksum": checksum, - "active": False, - } - document, truncate_ratio = truncate(document, db.embedding_model) - if truncate_ratio > 0 and self.verbose: - print(f"Truncated {id} by {truncate_ratio:.2%}") - add_to_db["ids"].append(checksum) - add_to_db["documents"].append(document) - add_to_db["metadatas"].append(record) - - # Add chunk to graph and connect hierarchy edges - graph.add_node(record["id"], **record) - - 
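Review note: the `sorted(chunks, key=lambda x: len(x["id"]))` line added to `get_file_chunk_data` above is what lets `annotate` later assume parent chunks appear before their children, since a parent id is a strict prefix of (and therefore shorter than) its children's ids. A tiny illustration with made-up chunk ids:

```python
# Hypothetical chunks as returned by chunk_document(); "module.py:Foo" is a
# prefix of "module.py:Foo.bar", so sorting by id length orders parents first.
chunks = [
    {"id": "module.py:Foo.bar", "ref": "module.py:10-20"},
    {"id": "module.py:Foo", "ref": "module.py:4-25"},
    {"id": "module.py:Foo.baz", "ref": "module.py:21-25"},
]
chunks = sorted(chunks, key=lambda x: len(x["id"]))
print([c["id"] for c in chunks])
# ['module.py:Foo', 'module.py:Foo.bar', 'module.py:Foo.baz']
```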
def _link_to_base_chunk(_id): - """Recursively create links from _id to base chunk.""" - path_str, chunk_str = _id.split(":") - chunk_list = chunk_str.split(".") - _parent = ( - f"{path_str}:{'.'.join(chunk_list[:-1])}" - if len(chunk_list) > 1 - else base_id - ) - edges_to_add.add((_parent, _id)) - if _parent != base_id: - _link_to_base_chunk(_parent) - - if id != base_id: - _link_to_base_chunk(id) - for source, target in edges_to_add: - graph.add_edge(source, target, type="hierarchy") - return add_to_db - async def annotate( self, graph: KnowledgeGraph, db: Database, refresh: bool = False ) -> KnowledgeGraph: @@ -174,37 +105,98 @@ async def annotate( files_just_chunked = set() for node, data in files_with_chunks: if refresh or data.get(self.chunk_field_id, None) is None: - tasks.append(self.get_file_chunk_data(node, data, db)) + tasks.append(self.get_file_chunk_data(node, data)) files_just_chunked.add(node) + elif isinstance(data[self.chunk_field_id], str): + data[self.chunk_field_id] = json.loads(data[self.chunk_field_id]) if len(tasks) > 0: if self.verbose: await tqdm.gather(*tasks, desc="Chunking files...") else: await asyncio.gather(*tasks) + update_db = {"ids": [], "metadatas": []} + for node in files_just_chunked: + data = graph.nodes[node] + update_db["ids"].append(data["checksum"]) + metadatas = {self.chunk_field_id: json.dumps(data[self.chunk_field_id])} + update_db["metadatas"].append(metadatas) + update_db = remove_update_db_duplicates(**update_db) + db.update(**update_db) # Process chunks - add_to_db = {"ids": [], "documents": [], "metadatas": []} - remove_from_db = set() + # 1. Add all chunks to graph + all_chunk_ids = set() for file, data in files_with_chunks: - try: - refresh = refresh or file in files_just_chunked - _add_to_db = self.add_file_chunks_to_graph( - file, data, graph, db, refresh - ) - for field, values in _add_to_db.items(): - add_to_db[field].extend(values) - except RagdaemonError as e: - # If there's a problem with the chunks, remove the file from the db. - # This, along with 'files_just_chunked', prevents invalid database - # records perpetuating. - if self.verbose: - print(f"Error adding chunks for {file}:\n{e}. Removing db record.") - remove_from_db.add(data["checksum"]) + if len(data[self.chunk_field_id]) == 0: continue - if len(remove_from_db) > 0: - db.delete(list(remove_from_db)) - raise RagdaemonError(f"Chunking error, try again.") + # Sort such that "parents" are added before "children" + base_id = f"{file}:BASE" + chunks = [c for c in data[self.chunk_field_id] if c["id"] != base_id] + chunks.sort(key=lambda x: len(x["id"])) + base_chunk = [c for c in data[self.chunk_field_id] if c["id"] == base_id] + if len(base_chunk) != 1: + raise RagdaemonError(f"Node {file} missing base chunk") + chunks = base_chunk + chunks + # Load chunks into graph + for chunk in chunks: + id, ref = chunk["id"], chunk["ref"] + document = get_document(ref, Path(graph.graph["cwd"])) + chunk_data = { + "id": id, + "ref": ref, + "type": "chunk", + "document": document, + "checksum": hash_str(document), + "active": False, + } + graph.add_node(id, **chunk_data) + all_chunk_ids.add(id) + # Locate the parent and add hierarchy edge + chunk_str = id.split(":")[1] + if chunk_str == "BASE": + parent = file + elif "." not in chunk_str: + parent = base_id + else: + parts = chunk_str.split(".") + while True: + parent = f"{file}:{'.'.join(parts[:-1])}" + if parent in graph: + break + parent_str = parent.split(":")[1] + if "." 
not in parent_str: + # If we can't find a parent, use the base node. + if self.verbose: + print(f"No parent node found for {id}") + parent = base_id + break + # If intermediate parents are missing, skip them + parts = parent_str.split(".") + graph.add_edge(parent, id, type="hierarchy") + + # 2. Get metadata for all chunks from db + all_chunk_checksums = [ + graph.nodes[chunk]["checksum"] for chunk in all_chunk_ids + ] + response = db.get(ids=all_chunk_checksums, include=["metadatas"]) + db_data = {data["id"]: data for data in response["metadatas"]} + add_to_db = {"ids": [], "documents": [], "metadatas": []} + for chunk in all_chunk_ids: + if chunk in db_data: + # 3. Add db metadata for nodes that have it + graph.nodes[chunk].update(db_data[chunk]) + else: + # 4. Add to db nodes that don't + data = deepcopy(graph.nodes[chunk]) + document = data.pop("document") + document, truncate_ratio = truncate(document, db.embedding_model) + if truncate_ratio > 0 and self.verbose: + print(f"Truncated {chunk} by {truncate_ratio:.2%}") + add_to_db["ids"].append(data["checksum"]) + add_to_db["documents"].append(document) + add_to_db["metadatas"].append(data) if len(add_to_db["ids"]) > 0: add_to_db = remove_add_to_db_duplicates(**add_to_db) - db.upsert(**add_to_db) + db.add(**add_to_db) + return graph diff --git a/ragdaemon/annotators/chunker_llm.py b/ragdaemon/annotators/chunker_llm.py index 2132773..77b4ee3 100644 --- a/ragdaemon/annotators/chunker_llm.py +++ b/ragdaemon/annotators/chunker_llm.py @@ -37,10 +37,12 @@ class ChunkerLLM(Chunker): def __init__( self, *args, + batch_size: int = 800, model: Optional[TextModel | str] = DEFAULT_COMPLETION_MODEL, **kwargs, ): super().__init__(*args, **kwargs) + self.batch_size = batch_size self.model = model async def get_llm_response( @@ -88,7 +90,7 @@ async def get_llm_response( return chunks async def chunk_document( - self, document: str, batch_size: int = 1000, retries: int = 1 + self, document: str, retries: int = 1 ) -> list[dict[str, Any]]: """Parse file_lines into a list of {id, ref} chunks.""" lines = document.split("\n") @@ -100,9 +102,9 @@ async def chunk_document( # Get raw llm output: {id, start_line, end_line} chunks = list[dict[str, Any]]() - n_batches = (len(file_lines) + batch_size - 1) // batch_size + n_batches = (len(file_lines) + self.batch_size - 1) // self.batch_size for i in range(n_batches): - batch_lines = file_lines[i * batch_size : (i + 1) * batch_size] + batch_lines = file_lines[i * self.batch_size : (i + 1) * self.batch_size] last_chunk = chunks.pop() if chunks else None for j in range(retries + 1, 0, -1): try: diff --git a/ragdaemon/annotators/diff.py b/ragdaemon/annotators/diff.py index bf4e0ce..8b6ac88 100644 --- a/ragdaemon/annotators/diff.py +++ b/ragdaemon/annotators/diff.py @@ -1,5 +1,6 @@ import json import re +from copy import deepcopy from pathlib import Path from ragdaemon.annotators.base_annotator import Annotator @@ -94,86 +95,79 @@ async def annotate( if data and data.get("type") == "diff" } graph.remove_nodes_from(graph_nodes) + + checksums = dict[str, str]() document = get_document(self.diff_args, cwd, type="diff") checksum = hash_str(document) - existing_records = db.get(checksum) - if refresh or len(existing_records["ids"]) == 0: - chunks = get_chunks_from_diff(id=self.id, diff=document) - data = { - "id": self.id, - "ref": self.diff_args, - "type": "diff", - "checksum": checksum, - "chunks": json.dumps(chunks), - "active": False, - } - - # If the full diff is too long to embed, it is truncated. 
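Review note on the parent-resolution loop above (the `while True` block in `Chunker.annotate`): chunk ids encode hierarchy with dots, so the parent of `file.py:A.b` is `file.py:A`, missing intermediate levels are skipped, and the file's BASE chunk is the fallback. A standalone sketch of the same idea with hypothetical ids; `resolve_parent` is an illustrative helper, not part of the codebase:

```python
def resolve_parent(chunk_id: str, existing: set[str], base_id: str) -> str:
    """Walk up the dotted chunk id until an existing ancestor is found,
    falling back to the file's BASE chunk."""
    path_str, chunk_str = chunk_id.split(":")
    if chunk_str == "BASE":
        return path_str                      # base chunk hangs off the file node
    parts = chunk_str.split(".")
    while len(parts) > 1:
        parent = f"{path_str}:{'.'.join(parts[:-1])}"
        if parent in existing:
            return parent                    # nearest existing ancestor
        parts = parts[:-1]                   # skip missing intermediate levels
    return base_id

existing = {"src/app.py:BASE", "src/app.py:Server"}
print(resolve_parent("src/app.py:Server.run.inner", existing, "src/app.py:BASE"))
# 'src/app.py:Server' -- the missing 'Server.run' level is skipped
```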
Anything - # removed will be captured in chunks. - document, truncate_ratio = truncate(document, db.embedding_model) - if truncate_ratio > 0 and self.verbose: - print(f"Truncated diff by {truncate_ratio:.2%}") - db.upsert(ids=checksum, documents=document, metadatas=data) - else: - data = existing_records["metadatas"][0] - data["chunks"] = json.loads(data["chunks"]) + chunks = get_chunks_from_diff(id=self.id, diff=document) + data = { + "id": self.id, + "ref": self.diff_args, + "type": "diff", + "document": document, + "checksum": checksum, + "chunks": chunks, + "active": False, + } graph.add_node(self.id, **data) + checksums[self.id] = checksum - # Add chunks - add_to_db = {"ids": [], "documents": [], "metadatas": []} - edges_to_add = set() - for chunk_id, chunk_ref in data["chunks"].items(): + for chunk_id, chunk_ref in chunks.items(): document = get_document(chunk_ref, cwd, type="diff") chunk_checksum = hash_str(document) - existing_records = db.get(chunk_checksum) - if refresh or len(existing_records["ids"]) == 0: - data = { - "id": chunk_id, - "ref": chunk_ref, - "type": "diff", - "checksum": chunk_checksum, - "active": False, - } - document, truncate_ratio = truncate(document, db.embedding_model) - if truncate_ratio > 0 and self.verbose: - print(f"Truncated diff chunk {chunk_id} by {truncate_ratio:.2%}") - add_to_db["ids"].append(chunk_checksum) - add_to_db["documents"].append(document) - add_to_db["metadatas"].append(data) - else: - data = existing_records["metadatas"][0] + data = { + "id": chunk_id, + "ref": chunk_ref, + "type": "diff", + "document": document, + "checksum": chunk_checksum, + "active": False, + } graph.add_node(chunk_id, **data) - edges_to_add.add((self.id, chunk_id)) - # Match file/chunk nodes in graph - path_ref = chunk_id.split(":", 1)[1] - file, lines = parse_path_ref(path_ref) - file_str = str(file) - if file_str not in graph: # Removed files + graph.add_edge(self.id, chunk_id, type="diff") + checksums[chunk_id] = chunk_checksum + + # Link it to all overlapping chunks (if file has chunks) or to the file + _, path, lines = parse_diff_id(chunk_id) + if not path: + continue + path_str = path.as_posix() + if path_str not in graph: # Removed files if self.verbose: - print(f"File {file_str} not in graph") + print(f"File {path_str} not in graph") + continue + link_to = set() + for node, data in graph.nodes(data=True): + if not node.startswith(f"{path_str}:") or data.get("type") != "chunk": + continue + _, _lines = parse_path_ref(data["ref"]) + if lines and _lines and lines.intersection(_lines): + link_to.add(node) + if len(link_to) == 0: + link_to.add(path_str) + for node in link_to: + graph.add_edge(node, chunk_id, type="link") + + # Sync with remote DB + ids = list(set(checksums.values())) + response = db.get(ids=ids, include=[]) + db_data = set(response["ids"]) + add_to_db = {"ids": [], "documents": [], "metadatas": []} + for id, checksum in checksums.items(): + if checksum in db_data: continue - edges_to_add.add((chunk_id, file_str)) - - def _link_to_successors(_node, visited=set()): - for successor in graph.successors(_node): - if successor in visited: - continue - visited.add(successor) - edge = (chunk_id, successor) - _data = graph.nodes[successor] - if _data.get("type") not in ["file", "chunk"]: - continue - _, _lines = parse_path_ref(_data["ref"]) - if lines and _lines and lines.intersection(_lines): - edges_to_add.add(edge) - _link_to_successors(successor, visited) - - _link_to_successors(file_str) - - for source, target in edges_to_add: - 
graph.add_edge(source, target, type="diff") + data = deepcopy(graph.nodes[id]) + document = data.pop("document") + if "chunks" in data: + data["chunks"] = json.dumps(data["chunks"]) + document, truncate_ratio = truncate(document, db.embedding_model) + if self.verbose and truncate_ratio > 0: + print(f"Truncated {id} by {truncate_ratio:.2%}") + add_to_db["ids"].append(checksum) + add_to_db["documents"].append(document) + add_to_db["metadatas"].append(data) if len(add_to_db["ids"]) > 0: add_to_db = remove_add_to_db_duplicates(**add_to_db) - db.upsert(**add_to_db) + db.add(**add_to_db) return graph diff --git a/ragdaemon/annotators/hierarchy.py b/ragdaemon/annotators/hierarchy.py index 397a151..b22919b 100644 --- a/ragdaemon/annotators/hierarchy.py +++ b/ragdaemon/annotators/hierarchy.py @@ -1,3 +1,4 @@ +from copy import deepcopy from pathlib import Path from ragdaemon.annotators.base_annotator import Annotator @@ -8,94 +9,6 @@ from ragdaemon.utils import get_document, hash_str, truncate -def get_active_checksums( - cwd: Path, - db: Database, - refresh: bool = False, - verbose: bool = False, - ignore_patterns: set[Path] = set(), -) -> dict[Path, str]: - # Get checksums for all active files - checksums: dict[Path, str] = {} - paths = get_paths_for_directory(cwd, exclude_patterns=ignore_patterns) - add_to_db = { - "ids": [], - "documents": [], - "metadatas": [], - } - for path in paths: - try: - path_str = path.as_posix() - ref = path_str - document = get_document(ref, cwd) - checksum = hash_str(document) - existing_record = len(db.get(checksum)["ids"]) > 0 - if refresh or not existing_record: - # add new items to db (will generate embeddings) - metadatas = { - "id": path_str, - "type": "file", - "ref": ref, - "checksum": checksum, - "active": False, - } - document, truncate_ratio = truncate(document, db.embedding_model) - if truncate_ratio > 0 and verbose: - print(f"Truncated {path_str} by {truncate_ratio:.2%}") - add_to_db["ids"].append(checksum) - add_to_db["documents"].append(document) - add_to_db["metadatas"].append(metadatas) - checksums[path] = checksum - except UnicodeDecodeError: # Ignore non-text files - pass - except RagdaemonError as e: - if verbose: - print(f"Error processing path {path}: {e}") - - # Get checksums for all active directories - directories = set() - for path in paths: - for parent in path.parents: - if parent not in paths: - directories.add(parent) - for path in directories: - ref = path.as_posix() - document = get_document(ref, cwd, type="directory") - - # The checksum for a directory is the hash of the checksums of its subpaths, - # which are listed in the document and were computed above. 
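Review note: both the removed `get_active_checksums` comment above and the rewritten `Hierarchy.annotate` below compute a directory checksum as the hash of the concatenation of its children's checksums, taken in the order the directory document lists them. A small sketch of that computation; the paths are hypothetical and `hash_str` here is an md5 stand-in for `ragdaemon.utils.hash_str`:

```python
import hashlib

def hash_str(s: str) -> str:
    # Stand-in for ragdaemon.utils.hash_str (the real hash function may differ).
    return hashlib.md5(s.encode()).hexdigest()

# Hypothetical file checksums computed while walking the tree.
checksums = {
    "src/__init__.py": hash_str("src/__init__.py\n"),
    "src/interface.py": hash_str("src/interface.py\nimport argparse\n..."),
}
# A directory document lists its children, one per line after the header, so
# hashing the concatenated child checksums makes the directory checksum change
# whenever any file under it changes.
document = "src\n__init__.py\ninterface.py"
dir_checksum = hash_str(
    "".join(checksums[f"src/{subpath}"] for subpath in document.split("\n")[1:])
)
print(dir_checksum[:8])
```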
- subdir_checksums = "" - for subpath in document.split("\n")[1:]: - subpath = Path(ref) / subpath - if subpath in checksums: - subdir_checksums += checksums[subpath] - else: - raise RagdaemonError(f"Checksum not found for {subpath}") - checksum = hash_str(subdir_checksums) - - existing_record = len(db.get(checksum)["ids"]) > 0 - if refresh or not existing_record: - metadatas = { - "id": ref, - "type": "directory", - "ref": ref, - "checksum": checksum, - "active": False, - } - document, truncate_ratio = truncate(document, db.embedding_model) - if truncate_ratio > 0 and verbose: - print(f"Truncated {ref} by {truncate_ratio:.2%}") - add_to_db["ids"].append(checksum) - add_to_db["documents"].append(document) - add_to_db["metadatas"].append(metadatas) - checksums[path] = checksum - - if len(add_to_db["ids"]) > 0: - add_to_db = remove_add_to_db_duplicates(**add_to_db) - db.upsert(**add_to_db) - return checksums - - def files_checksum(cwd: Path, ignore_patterns: set[Path] = set()) -> str: timestamps = "" for path in get_paths_for_directory(cwd, exclude_patterns=ignore_patterns): @@ -124,44 +37,88 @@ async def annotate( self, graph: KnowledgeGraph, db: Database, refresh: bool = False ) -> KnowledgeGraph: """Build a graph of active files and directories with hierarchy edges.""" - cwd = Path(graph.graph["cwd"]) - checksums = get_active_checksums( - cwd, - db, - refresh=refresh, - verbose=self.verbose, - ignore_patterns=self.ignore_patterns, - ) - _files_checksum = files_checksum(cwd, self.ignore_patterns) - # Initialize an empty graph. We'll build it from scratch. + # Initialize a new graph from scratch with same cwd + cwd = Path(graph.graph["cwd"]) graph = KnowledgeGraph() graph.graph["cwd"] = str(cwd) - edges_to_add = set() - for path, checksum in checksums.items(): - # add db reecord - id = path.as_posix() if len(path.parts) > 0 else "ROOT" - results = db.get(checksum) - data = results["metadatas"][0] - graph.add_node(id, **data) - if id == "ROOT": - continue - # add hierarchy edges - def _link_to_cwd(_path: Path): - _parent = _path.parent.as_posix() if len(_path.parts) > 1 else "ROOT" - edges_to_add.add((_parent, _path.as_posix())) - if _parent != "ROOT": - _link_to_cwd(_path.parent) - - _link_to_cwd(path) + # Load active files/dirs and checksums + checksums = dict[Path, str]() + paths = get_paths_for_directory(cwd, exclude_patterns=self.ignore_patterns) + directories = set() + edges = set() + for path in paths: + path_str = path.as_posix() + document = get_document(path_str, cwd) + checksum = hash_str(document) + data = { + "id": path_str, + "type": "file", + "ref": path_str, + "document": document, + "checksum": checksum, + "active": False, + } + graph.add_node(path_str, **data) + checksums[path] = checksum + # Record parents & edges + _last = path + for parent in path.parents: + if len(parent.parts) == 0: + parent = Path("ROOT") + directories.add(parent) + edges.add((parent.as_posix(), _last.as_posix())) + _last = parent + + for dir in directories: + dir_str = dir.as_posix() + dir_path = dir if dir != Path("ROOT") else Path(".") + document = get_document(dir_str, cwd, type="directory") + checksum = hash_str( + "".join( + checksums[dir_path / subpath] + for subpath in document.split("\n")[1:] + ) + ) + data = { + "id": dir_str, + "type": "directory", + "ref": dir_str, + "document": document, + "checksum": checksum, + "active": False, + } + graph.add_node(dir_str, **data) + checksums[dir] = checksum - # Add directory nodes with checksums - for source, target in edges_to_add: + for source, 
target in edges: for id in (source, target): if id not in graph: raise RagdaemonError(f"Node {id} not found in graph") graph.add_edge(source, target, type="hierarchy") - graph.graph["files_checksum"] = _files_checksum + # Sync with remote DB + ids = list(set(checksums.values())) + response = db.get(ids=ids, include=["metadatas"]) + db_data = {id: data for id, data in zip(response["ids"], response["metadatas"])} + add_to_db = {"ids": [], "documents": [], "metadatas": []} + for path, checksum in checksums.items(): + if checksum in db_data: + data = db_data[checksum] + graph.nodes[path.as_posix()].update(data) + else: + data = deepcopy(graph.nodes[path.as_posix()]) + document = data.pop("document") + document, truncate_ratio = truncate(document, db.embedding_model) + if self.verbose and truncate_ratio > 0: + print(f"Truncated {path} by {truncate_ratio:.2%}") + add_to_db["ids"].append(checksum) + add_to_db["documents"].append(document) + add_to_db["metadatas"].append(data) + if len(add_to_db["ids"]) > 0: + add_to_db = remove_add_to_db_duplicates(**add_to_db) + db.add(**add_to_db) + + graph.graph["files_checksum"] = files_checksum(cwd, self.ignore_patterns) return graph diff --git a/ragdaemon/annotators/summarizer.py b/ragdaemon/annotators/summarizer.py index 66fb16a..ab467f6 100644 --- a/ragdaemon/annotators/summarizer.py +++ b/ragdaemon/annotators/summarizer.py @@ -8,7 +8,7 @@ from ragdaemon.annotators.base_annotator import Annotator from ragdaemon.context import ContextBuilder -from ragdaemon.database import Database +from ragdaemon.database import Database, remove_update_db_duplicates from ragdaemon.graph import KnowledgeGraph from ragdaemon.errors import RagdaemonError from ragdaemon.utils import DEFAULT_COMPLETION_MODEL, hash_str, semaphore, truncate @@ -70,7 +70,6 @@ def build_filetree( def get_document_and_context( node: str, graph: KnowledgeGraph, - db: Database, summary_field_id: str = "summary", model: Optional[TextModel] = None, ) -> tuple[str, str]: @@ -85,12 +84,12 @@ def get_document_and_context( if data.get("type") == "directory": document = f"Directory: {node}" else: - cb = ContextBuilder(graph, db) + cb = ContextBuilder(graph) cb.add_id(node) document = cb.render() if data.get("type") == "chunk": - cb = ContextBuilder(graph, db) + cb = ContextBuilder(graph) # Parent chunks back to the file def get_hierarchical_parents(target: str, cb: ContextBuilder): @@ -212,7 +211,6 @@ def is_complete(self, graph: KnowledgeGraph, db: Database) -> bool: document, context = get_document_and_context( node, graph, - db, summary_field_id=self.summary_field_id, model=self.model, ) @@ -225,16 +223,15 @@ async def generate_summary( self, node: str, graph: KnowledgeGraph, - db: Database, loading_bar: Optional[tqdm] = None, refresh: bool = False, ): - """Asynchronously generate summary and update graph and db""" + """Asynchronously generate summary and update graph""" if self.spice_client is None: raise RagdaemonError("Spice client not initialized") document, context = get_document_and_context( - node, graph, db, summary_field_id=self.summary_field_id, model=self.model + node, graph, summary_field_id=self.summary_field_id, model=self.model ) summary_checksum = hash_str(document + context) data = graph.nodes[node] @@ -263,14 +260,9 @@ async def generate_summary( ) summary = response.text - record = db.get(data["checksum"]) - metadatas = record["metadatas"][0] if summary != "PASS": - metadatas[self.summary_field_id] = summary data[self.summary_field_id] = summary - metadatas[self.checksum_field_id] = 
summary_checksum data[self.checksum_field_id] = summary_checksum - db.update(data["checksum"], metadatas=metadatas) if loading_bar is not None: loading_bar.update(1) @@ -279,7 +271,6 @@ async def dfs( self, node: str, graph: KnowledgeGraph, - db: Database, loading_bar: Optional[tqdm] = None, refresh: bool = False, ): @@ -291,29 +282,40 @@ async def dfs( and graph.nodes[edge[1]].get("type") in self.summarize_nodes ] if children: - tasks = [ - self.dfs(child, graph, db, loading_bar, refresh) for child in children - ] + tasks = [self.dfs(child, graph, loading_bar, refresh) for child in children] await asyncio.gather(*tasks) - await self.generate_summary(node, graph, db, loading_bar, refresh) + await self.generate_summary(node, graph, loading_bar, refresh) async def annotate( self, graph: KnowledgeGraph, db: Database, refresh: bool = False ) -> KnowledgeGraph: """Asynchronously generate or fetch summaries and add to graph/db""" + summaries = dict[str, str]() + for node, data in graph.nodes(data=True): + if data is not None and data.get("type") in self.summarize_nodes: + summaries[node] = data.get(self.checksum_field_id, "") + if self.verbose: - n = len( - [ - node - for node, data in graph.nodes(data=True) - if data is not None and data.get("type") in self.summarize_nodes - ] - ) - loading_bar = tqdm(total=n, desc="Summarizing code...") + loading_bar = tqdm(total=len(summaries), desc="Summarizing code...") else: loading_bar = None - await self.dfs("ROOT", graph, db, loading_bar, refresh) + await self.dfs("ROOT", graph, loading_bar, refresh) + + update_db = {"ids": [], "metadatas": []} + for node, summary_checksum in summaries.items(): + if graph.nodes[node].get(self.checksum_field_id) != summary_checksum: + data = graph.nodes[node] + update_db["ids"].append(data["checksum"]) + update_db["metadatas"].append( + { + self.summary_field_id: data[self.summary_field_id], + self.checksum_field_id: data[self.checksum_field_id], + } + ) + if len(update_db["ids"]) > 1: + update_db = remove_update_db_duplicates(**update_db) + db.update(**update_db) if loading_bar is not None: loading_bar.close() diff --git a/ragdaemon/context.py b/ragdaemon/context.py index 650d907..d51f982 100644 --- a/ragdaemon/context.py +++ b/ragdaemon/context.py @@ -37,16 +37,15 @@ def render_comments(comments: list[Comment]) -> str: class ContextBuilder: """Renders items from a graph into an llm-readable string.""" - def __init__(self, graph: KnowledgeGraph, db: Database, verbose: bool = False): + def __init__(self, graph: KnowledgeGraph, verbose: bool = False): self.graph = graph - self.db = db self.verbose = verbose self.context = dict[ str, dict[str, Any] ]() # {path: {lines, tags, document, diff}} def copy(self): - duplicate = ContextBuilder(self.graph, self.db, self.verbose) + duplicate = ContextBuilder(self.graph, self.verbose) duplicate.context = deepcopy(self.context) return duplicate @@ -69,20 +68,17 @@ def _add_path(self, path_str: str): """Create a new record in the context for the given path.""" document = None if path_str in self.graph: - checksum = self.graph.nodes[path_str]["checksum"] - document = self.db.get(checksum)["documents"][0] + document = self.graph.nodes[path_str]["document"] if document.endswith("[TRUNCATED]"): document = None if document is None: # Truncated or deleted try: - # Could be an ignored file, in which case load it into graph/db # TODO: Add ignored files to the graph/database cwd = Path(self.graph.graph["cwd"]) document = get_document(path_str, cwd, type="file") except FileNotFoundError: # Or 
could be deleted but have a diff document = f"{path_str}\n[DELETED]" - checksum = hash_str(document) message = { "lines": set(), "tags": set(), @@ -258,8 +254,7 @@ def render_diffs(self, ids: set[str]) -> str: git_command += f" {diff_str}" output += f"{git_command}\n" for id in sorted(ids): - checksum = self.graph.nodes[id]["checksum"] - document = self.db.get(checksum)["documents"][0] + document = self.graph.nodes[id]["document"] # TODO: Add line numbers without_git_command = "\n".join(document.split("\n")[1:]) output += without_git_command + "\n" diff --git a/ragdaemon/daemon.py b/ragdaemon/daemon.py index d0bcc57..93d0dfc 100644 --- a/ragdaemon/daemon.py +++ b/ragdaemon/daemon.py @@ -144,9 +144,7 @@ def search( return self.db.query_graph(query, self.graph, n=n, node_types=node_types) def get_document(self, filename: str) -> str: - checksum = self.graph.nodes[filename]["checksum"] - document = self.db.get(checksum)["documents"][0] - return document + return self.graph.nodes[filename]["document"] def get_context( self, @@ -157,7 +155,7 @@ def get_context( model: Model | str = DEFAULT_COMPLETION_MODEL, ) -> ContextBuilder: if context_builder is None: - context = ContextBuilder(self.graph, self.db, self.verbose) + context = ContextBuilder(self.graph, self.verbose) else: # TODO: Compare graph hashes, reconcile changes context = context_builder diff --git a/ragdaemon/database/__init__.py b/ragdaemon/database/__init__.py index dbafd62..5eb1f1b 100644 --- a/ragdaemon/database/__init__.py +++ b/ragdaemon/database/__init__.py @@ -3,9 +3,12 @@ from typing import Optional from spice import Spice -from spice.errors import SpiceError -from ragdaemon.database.chroma_database import ChromaDB, remove_add_to_db_duplicates +from ragdaemon.database.chroma_database import ( + ChromaDB, + remove_add_to_db_duplicates, + remove_update_db_duplicates, +) from ragdaemon.database.database import Database from ragdaemon.database.lite_database import LiteDB from ragdaemon.utils import mentat_dir_path diff --git a/ragdaemon/database/chroma_database.py b/ragdaemon/database/chroma_database.py index f34f44c..78bf7b4 100644 --- a/ragdaemon/database/chroma_database.py +++ b/ragdaemon/database/chroma_database.py @@ -13,12 +13,6 @@ MAX_INPUTS_PER_CALL = 2048 -if TYPE_CHECKING: - from chromadb.api.types import ( - GetResult, - Metadata, - ) - def remove_add_to_db_duplicates( ids: list[str], documents: list[str], metadatas: list[dict] @@ -34,6 +28,19 @@ def remove_add_to_db_duplicates( return output +def remove_update_db_duplicates( + ids: list[str], metadatas: list[dict] +) -> dict[str, Any]: + seen = set() + output = {"ids": [], "metadatas": []} + for id, metadata in zip(ids, metadatas): + if id not in seen: + output["ids"].append(id) + output["metadatas"].append(metadata) + seen.add(id) + return output + + class ChromaDB(Database): def __init__( self, @@ -105,39 +112,44 @@ def __call__(self, input_texts: Embeddable) -> Embeddings: ) def query(self, query: str, active_checksums: list[str]) -> list[dict]: + """ + Since we add many different versions of each file to Chroma, we can't do a + straightforward query, because it'd return multiple version of the same file. + + The best workaround I've found for this is using the 'active' flag in metadata. + The downside is that it requires 2 additional calls to the database each time: + one to set it, another to unset it. The extra time is negligible for local DBs + and hopefully not unreasonable for remote. + + There's a third "extra" call to validate the active_checksums. 
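Review note: the new `ChromaDB.query` docstring above describes the flag → query → unflag workaround. A condensed sketch of that flow against a bare chromadb collection; the `query_active` wrapper and its try/finally are illustrative additions, not the shipped implementation:

```python
def query_active(collection, query: str, active_checksums: list[str]) -> list[dict]:
    # Validate ids first so chroma doesn't warn about missing records.
    valid = collection.get(ids=active_checksums, include=[])["ids"]
    # Flag the current versions as active.
    collection.update(ids=valid, metadatas=[{"active": True} for _ in valid])
    try:
        response = collection.query(
            query_texts=query,
            where={"active": True},
            n_results=len(valid),
            include=["distances"],
        )
    finally:
        # Always unset the flag so stale 'active' records can't leak into later queries.
        collection.update(ids=valid, metadatas=[{"active": False} for _ in valid])
    return [
        {"checksum": id, "distance": distance}
        for id, distance in zip(response["ids"][0], response["distances"][0])
    ]
```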
If we don't do + this it will still function properly but it will print a lot of warnings. + """ + valid_checksums = self._collection.get(ids=active_checksums, include=[])["ids"] # Flag active records - result: GetResult = self._collection.get(active_checksums) - metadatas: Optional[list[Metadata]] = result["metadatas"] - if not metadatas or len(metadatas) == 0: - return [] - updates = {"ids": [], "metadatas": []} - for metadata in metadatas: - updates["ids"].append(metadata["checksum"]) - updates["metadatas"].append({**metadata, "active": True}) + updates = { + "ids": valid_checksums, + "metadatas": [{"active": True} for _ in valid_checksums], + } self._collection.update(**updates) # Query response = self._collection.query( query_texts=query, where={"active": True}, - n_results=len(metadatas), + n_results=len(valid_checksums), + include=["distances"], ) # Remove flags - updates["metadatas"] = [{**metadata, "active": False} for metadata in metadatas] + updates = { + "ids": valid_checksums, + "metadatas": [{"active": False} for _ in valid_checksums], + } self._collection.update(**updates) - # Parse results. Return results for the 'first query' only - if ( - response is None - or response["metadatas"] is None - or response["documents"] is None - or response["distances"] is None - ): + if response is None or response["distances"] is None: return [] - _metadatas = response["metadatas"][0] - _documents = response["documents"][0] - _distances = response["distances"][0] + # Parse results. Return results for the 'first query' only results = [ - {**m, "document": do, "distance": di} - for m, do, di in zip(_metadatas, _documents, _distances) + {"checksum": id, "distance": distance} + for id, distance in zip(response["ids"][0], response["distances"][0]) ] results = sorted(results, key=lambda x: x["distance"]) return results diff --git a/ragdaemon/database/database.py b/ragdaemon/database/database.py index 95bca7c..2b36c26 100644 --- a/ragdaemon/database/database.py +++ b/ragdaemon/database/database.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Iterable, Optional +from typing import Any, Iterable, Optional from ragdaemon.graph import KnowledgeGraph @@ -30,19 +30,24 @@ def query_graph( Chroma's default search covers all records, including inactive ones, so we manually flag the active records, query them, and then unflag them. 
""" - active_checksums = list( - { - data["checksum"] - for _, data in graph.nodes(data=True) - if data and "checksum" in data and data["type"] in node_types - } - ) - results = self.query(query, active_checksums) + checksum_index = { + data["checksum"]: node + for node, data in graph.nodes(data=True) + if data and "checksum" in data and data["type"] in node_types + } + response = self.query(query, list(checksum_index.keys())) + + # Add (local) metadata to results + results = list[dict[str, Any]]() + for result in response: + node = checksum_index[result["checksum"]] + data = graph.nodes[node] + result = {**result, **data} + results.append(result) # Add exact-match multiplier for result in results: distance = result["distance"] - # Multiply by 2 if query is in the NAME type = result["type"] if type == "file": name = Path(result["id"]).name diff --git a/ragdaemon/database/lite_database.py b/ragdaemon/database/lite_database.py index 821cf4c..ddea1e9 100644 --- a/ragdaemon/database/lite_database.py +++ b/ragdaemon/database/lite_database.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Any +from typing import Any, Optional from ragdaemon.database.database import Database @@ -13,12 +13,8 @@ def __init__(self, cwd: Path, db_path: Path): def query(self, query: str, active_checksums: list[str]) -> list[dict]: response = self._collection.query(query, active_checksums) results = [ - {**data, "document": document, "distance": distance} - for data, document, distance in zip( - response["metadatas"][0], - response["documents"][0], - response["distances"][0], - ) + {"checksum": id, "distance": distance} + for id, distance in zip(response["ids"][0], response["distances"][0]) ] results = sorted(results, key=lambda x: x["distance"]) return results @@ -30,13 +26,13 @@ class LiteCollection: Matches the chroma Collection API except: - No embeddings - In-memory - - A basic hand-coded search algo + - Query returns all distances=1 """ def __init__(self): self.data = dict[str, dict[str, Any]]() # {id: {metadatas, document}} - def get(self, ids: list[str] | str) -> dict: + def get(self, ids: list[str] | str, include: Optional[list[str]] = None) -> dict: if isinstance(ids, str): ids = [ids] output = {"ids": [], "metadatas": [], "documents": []} @@ -45,6 +41,8 @@ def get(self, ids: list[str] | str) -> dict: output["ids"].append(id) output["metadatas"].append(self.data[id]["metadatas"]) output["documents"].append(self.data[id]["document"]) + if include: + output = {k: v for k, v in output.items() if k in include or k == "ids"} return output def count(self) -> int: @@ -65,12 +63,10 @@ def query(self, query: str, active_checksums: list[str]) -> dict[str, list[Any]] ] return { "ids": [[r["id"] for r in records]], - "metadatas": [[r["metadatas"] for r in records]], - "documents": [[r["document"] for r in records]], "distances": [[1] * len(records)], } - def upsert( + def add( self, ids: list[str] | str, metadatas: list[dict] | dict, diff --git a/ragdaemon/utils.py b/ragdaemon/utils.py index 0156039..1dd81b1 100644 --- a/ragdaemon/utils.py +++ b/ragdaemon/utils.py @@ -6,7 +6,7 @@ from pathlib import Path from spice import Spice -from spice.models import GPT_4_TURBO, Model, UnknownModel +from spice.models import GPT_4o_2024_05_13, Model, UnknownModel from spice.spice import get_model_from_name from ragdaemon.errors import RagdaemonError @@ -41,7 +41,7 @@ ] -DEFAULT_COMPLETION_MODEL = GPT_4_TURBO +DEFAULT_COMPLETION_MODEL = GPT_4o_2024_05_13 def hash_str(string: str) -> str: diff --git 
a/tests/annotators/test_chunker_llm.py b/tests/annotators/test_chunker_llm.py index badc8a9..5eee24a 100644 --- a/tests/annotators/test_chunker_llm.py +++ b/tests/annotators/test_chunker_llm.py @@ -37,7 +37,7 @@ def expected_chunks(): async def test_chunker_llm_edge_cases(cwd, expected_chunks): # NOTE: TO RUN THIS YOU HAVE TO COMMENT_OUT tests/conftest.py/mock_openai_api_key daemon = Daemon(cwd, annotators={"hierarchy": {}}) - chunker = ChunkerLLM(spice_client=daemon.spice_client) + chunker = ChunkerLLM(spice_client=daemon.spice_client, batch_size=10) # One example with all the edge cases (when batch_size = 10 lines): # - First batch ends mid-class, so second batch needs 'call path' @@ -45,7 +45,7 @@ async def test_chunker_llm_edge_cases(cwd, expected_chunks): # - Third batch is all inside one function, so needs to pass call forward. text = Path("tests/data/hard_to_chunk.txt").read_text() document = f"src/calculator.py\n{text}" - actual_chunks = await chunker.chunk_document(document, batch_size=10) + actual_chunks = await chunker.chunk_document(document) print(actual_chunks) diff --git a/tests/annotators/test_diff.py b/tests/annotators/test_diff.py index c86a083..bfedfd2 100644 --- a/tests/annotators/test_diff.py +++ b/tests/annotators/test_diff.py @@ -59,7 +59,7 @@ async def test_diff_render(git_history, mock_db): await daemon.update(refresh=True) # Only diffs - context = ContextBuilder(daemon.graph, daemon.db) + context = ContextBuilder(daemon.graph) context.add_diff("DEFAULT:main.py") context.add_diff("DEFAULT:src/operations.py:1-5") context.add_diff("DEFAULT:src/operations.py:8-10") diff --git a/tests/annotators/test_hierarchy.py b/tests/annotators/test_hierarchy.py index f86c257..686967a 100644 --- a/tests/annotators/test_hierarchy.py +++ b/tests/annotators/test_hierarchy.py @@ -4,30 +4,10 @@ from networkx.readwrite import json_graph import pytest -from ragdaemon.annotators.hierarchy import Hierarchy, get_active_checksums +from ragdaemon.annotators.hierarchy import Hierarchy from ragdaemon.graph import KnowledgeGraph -def test_get_active_checksums(cwd, mock_db): - checksums = get_active_checksums(cwd, mock_db) - assert isinstance(checksums, dict), "Checksums is not a dict" - assert all(isinstance(k, Path) for k in checksums), "Keys are not all Paths" - assert all( - isinstance(v, str) for v in checksums.values() - ), "Values are not all strings" - - hierarchy_graph = KnowledgeGraph.load("tests/data/hierarchy_graph.json") - expected = { - (node, data["checksum"]) - for node, data in hierarchy_graph.nodes(data=True) - if data and "checksum" in data - } - # Replace checksums "." 
with "ROOT" - checksums[Path("ROOT")] = checksums.pop(Path(".")) - actual = {(path.as_posix(), checksum) for path, checksum in checksums.items()} - assert actual == expected, "Checksums are not equal" - - def test_hierarchy_is_complete(cwd, mock_db): empty_graph = KnowledgeGraph() empty_graph.graph["cwd"] = cwd.as_posix() diff --git a/tests/annotators/test_summarizer.py b/tests/annotators/test_summarizer.py index 87c6a9c..0e97a82 100644 --- a/tests/annotators/test_summarizer.py +++ b/tests/annotators/test_summarizer.py @@ -1,6 +1,8 @@ +import json from pathlib import Path import pytest +from networkx.readwrite import json_graph from ragdaemon.annotators.summarizer import ( build_filetree, @@ -46,14 +48,13 @@ async def test_build_filetree(cwd): @pytest.mark.asyncio async def test_get_document_and_context(cwd): graph = KnowledgeGraph.load("tests/data/summarizer_graph.json") # Chunk data - db = LiteDB(cwd=cwd, db_path=Path(".")) for _, data in graph.nodes(data=True): document = get_document(data["ref"], cwd=cwd, type=data["type"]) - db._collection.upsert(ids=data["checksum"], documents=document, metadatas=data) + data["document"] = document # A chunk document, context = get_document_and_context( - "src/interface.py:parse_arguments", graph, db + "src/interface.py:parse_arguments", graph ) assert ( document @@ -107,7 +108,7 @@ async def test_get_document_and_context(cwd): ) # A file - document, context = get_document_and_context("src/interface.py", graph, db) + document, context = get_document_and_context("src/interface.py", graph) assert document.startswith("src/interface.py\n") assert ( context @@ -131,7 +132,7 @@ async def test_get_document_and_context(cwd): ) # A directory - document, context = get_document_and_context("src", graph, db) + document, context = get_document_and_context("src", graph) assert document == "Directory: src" assert ( context diff --git a/tests/data/summarizer_graph.json b/tests/data/summarizer_graph.json index 3456932..9c33771 100644 --- a/tests/data/summarizer_graph.json +++ b/tests/data/summarizer_graph.json @@ -28,7 +28,8 @@ "ref": "src/interface.py", "summary": "Parse command-line input to extract operands and an operator for arithmetic operations and display the output to the console.", "summary_checksum": "156e6b95a939cc690524c7e96448c787", - "type": "file" + "type": "file", + "document": "src/interface.py\nimport argparse\nimport re\n\n\ndef parse_arguments():\n parser = argparse.ArgumentParser(description=\"Basic Calculator\")\n parser.add_argument(\"operation\", type=str, help=\"Calculation operation\")\n args = parser.parse_args()\n\n # use re to parse symbol, nubmer before, nubmer after\n match = re.match(r\"(\\d+)(\\D)(\\d+)\", args.operation)\n if match is None:\n raise ValueError(\"Invalid operation\")\n return int(match.group(1)), match.group(2), int(match.group(3))\n\n\ndef render_response(result):\n print(result)\n" }, { "active": false, @@ -37,7 +38,8 @@ "ref": "README.md", "summary": "Describe the application's experimental purpose in testing the limits of the treesitter parser.", "summary_checksum": "f512afb951427a1494eecd927607aa42", - "type": "file" + "type": "file", + "document": "README.md\nAn unnecessarily convoluted app to test the boundaries of the treesitter parser\n" }, { "active": false, @@ -48,7 +50,8 @@ "ref": "src/__init__.py", "summary": "Establish the 'src' as a Python package to organize related modules concerning command-line based arithmetic operations, without adding any explicit functionality.", "summary_checksum": 
"207e3de4ed658542202ca6ccc3376a96", - "type": "file" + "type": "file", + "document": "src/__init__.py\n" }, { "active": false, @@ -57,7 +60,8 @@ "ref": ".gitignore", "summary": "Manage exclusions for version control by specifying files and directories that Git should ignore, while ensuring the .gitignore file itself remains tracked.", "summary_checksum": "5f3c1aebfa8418a5845a2c5ddc2b33cf", - "type": "file" + "type": "file", + "document": ".gitignore\n.ragdaemon\n**/.*\n**/__pycache__\n!.gitignore\n" }, { "active": false, @@ -93,7 +97,8 @@ "ref": "src/operations.py", "summary": "Define basic arithmetic operations including addition, subtraction, multiplication, division, and square root calculation utilizing Python's math library.", "summary_checksum": "f32593b8091a214cc0042312abb4626c", - "type": "file" + "type": "file", + "document": "src/operations.py\nimport math\n\n\ndef add(a, b):\n return a + b\n\n\ndef subtract(a, b):\n return a - b\n\n\ndef multiply(a, b):\n return a * b\n\n\ndef divide(a, b):\n return a / b\n\n\ndef sqrt(a):\n return math.sqrt(a)\n" }, { "active": false, @@ -113,7 +118,8 @@ "ref": "main.py", "summary": "Execute arithmetic operations based on command-line input and produce an output.", "summary_checksum": "23112504dd4d8d6daf28cb234eb9a7f3", - "type": "file" + "type": "file", + "document": "main.py\nfrom src.interface import parse_arguments, render_response\nfrom src.operations import add, divide, multiply, subtract\n\n\ndef main():\n a, op, b = parse_arguments()\n\n if op == \"+\":\n result = add(a, b)\n elif op == \"-\":\n result = subtract(a, b)\n elif op == \"*\":\n result = multiply(a, b)\n elif op == \"/\":\n result = divide(a, b)\n else:\n raise ValueError(\"Unsupported operation\")\n\n render_response(result)\n\n\nif __name__ == \"__main__\":\n main()\n" }, { "active": false, @@ -122,7 +128,8 @@ "ref": ".", "summary": "Execute simple arithmetic operations from command-line input and explore the capabilities of the treesitter parser, organizing the code into clear modules within the source directory.", "summary_checksum": "53a5d32f0cc62b73d0129d17ca1a64e5", - "type": "directory" + "type": "directory", + "document": ".\n.gitignore\nREADME.md\nmain.py\nsrc/__init__.py\nsrc/interface.py\nsrc/operations.py" }, { "active": false, @@ -131,7 +138,8 @@ "ref": "src", "summary": "Organize code modules for a simple arithmetic operations application. 
It includes files for initializing the package, parsing command-line input, and defining arithmetic operations.", "summary_checksum": "0d9635c66b257d6f31f165fea667e3fd", - "type": "directory" + "type": "directory", + "document": "src\n__init__.py\ninterface.py\noperations.py" }, { "active": false, @@ -140,7 +148,8 @@ "ref": "src/interface.py:1-4,15-16,19", "summary": "No action is described as the provided code only includes import statements.", "summary_checksum": "775efab314470411a3b831802154edff", - "type": "chunk" + "type": "chunk", + "document": "src/interface.py:1-4,15-16,19\nimport argparse\nimport re\n\n\n\n\n\n" }, { "active": false, @@ -149,7 +158,8 @@ "ref": "src/interface.py:5-14", "summary": "Parse command-line arguments into three components: an integer, a symbol representing a mathematical operation, and a second integer.", "summary_checksum": "eaca0178f36ef4919aad47f57682c7e7", - "type": "chunk" + "type": "chunk", + "document": "src/interface.py:5-14\ndef parse_arguments():\n parser = argparse.ArgumentParser(description=\"Basic Calculator\")\n parser.add_argument(\"operation\", type=str, help=\"Calculation operation\")\n args = parser.parse_args()\n\n # use re to parse symbol, nubmer before, nubmer after\n match = re.match(r\"(\\d+)(\\D)(\\d+)\", args.operation)\n if match is None:\n raise ValueError(\"Invalid operation\")\n return int(match.group(1)), match.group(2), int(match.group(3))\n" }, { "active": false, @@ -158,7 +168,8 @@ "ref": "src/interface.py:17-18", "summary": "Display the result of a mathematical operation to standard output.", "summary_checksum": "8d69c71cda68ed3dad02d2c6f8e31503", - "type": "chunk" + "type": "chunk", + "document": "src/interface.py:17-18\ndef render_response(result):\n print(result)\n" }, { "active": false, @@ -167,7 +178,8 @@ "ref": "src/operations.py:1-3,6-7,10-11,14-15,18-19,22", "summary": "No operation is performed in the provided code snippet as it only includes an import statement for the math library and no other executable code.", "summary_checksum": "d41598bfcfe2731338d393cd640de305", - "type": "chunk" + "type": "chunk", + "document": "src/operations.py:1-3,6-7,10-11,14-15,18-19,22\nimport math\n\n\n\n\n\n\n\n\n\n\n\n" }, { "active": false, @@ -176,7 +188,8 @@ "ref": "src/operations.py:4-5", "summary": "Define an addition function that takes two arguments and returns their sum.", "summary_checksum": "55c3d4fb8a3bb7f1e5c414d11d08ade8", - "type": "chunk" + "type": "chunk", + "document": "src/operations.py:4-5\ndef add(a, b):\n return a + b\n" }, { "active": false, @@ -185,7 +198,8 @@ "ref": "src/operations.py:8-9", "summary": "Perform subtraction by taking two inputs, 'a' and 'b', and returning the result of 'a' - 'b'.", "summary_checksum": "64eb46f64d0361a7d7a2ef7e4afaf0f5", - "type": "chunk" + "type": "chunk", + "document": "src/operations.py:8-9\ndef subtract(a, b):\n return a - b\n" }, { "active": false, @@ -194,7 +208,8 @@ "ref": "src/operations.py:12-13", "summary": "Multiply two numbers and return the result.", "summary_checksum": "a1502efd32ac3ff71c767a1a31d359ef", - "type": "chunk" + "type": "chunk", + "document": "src/operations.py:12-13\ndef multiply(a, b):\n return a * b\n" }, { "active": false, @@ -203,7 +218,8 @@ "ref": "src/operations.py:16-17", "summary": "Perform division on two numbers, `a` and `b`, and return the result.", "summary_checksum": "108512db41db799615544198f287815d", - "type": "chunk" + "type": "chunk", + "document": "src/operations.py:16-17\ndef divide(a, b):\n return a / b\n" }, { "active": false, @@ 
-212,7 +228,8 @@ "ref": "src/operations.py:20-21", "summary": "Calculate the square root of a number using the math library's sqrt function.", "summary_checksum": "bff8880d50f1e5c5d011135b8c720b43", - "type": "chunk" + "type": "chunk", + "document": "src/operations.py:20-21\ndef sqrt(a):\n return math.sqrt(a)\n" }, { "active": false, @@ -221,7 +238,8 @@ "ref": "main.py:1-4,20-24", "summary": "Execute the main function if the script is run as the main program.", "summary_checksum": "5536373063c7333d4c35c6497b3862d6", - "type": "chunk" + "type": "chunk", + "document": "main.py:1-4,20-24\nfrom src.interface import parse_arguments, render_response\nfrom src.operations import add, divide, multiply, subtract\n\n\n\n\nif __name__ == \"__main__\":\n main()\n\n" }, { "active": false, @@ -230,7 +248,8 @@ "ref": "main.py:5-19", "summary": "Parse command-line arguments for a basic arithmetic operation, perform the corresponding calculation, and print the result. If an unsupported operation is specified, raise a ValueError.", "summary_checksum": "6be54ffb7a09c09c621330f1edef8687", - "type": "chunk" + "type": "chunk", + "document": "main.py:5-19\ndef main():\n a, op, b = parse_arguments()\n\n if op == \"+\":\n result = add(a, b)\n elif op == \"-\":\n result = subtract(a, b)\n elif op == \"*\":\n result = multiply(a, b)\n elif op == \"/\":\n result = divide(a, b)\n else:\n raise ValueError(\"Unsupported operation\")\n\n render_response(result)\n" } ], "links": [ diff --git a/tests/test_comments.py b/tests/test_comments.py index 5b147ce..478c4d3 100644 --- a/tests/test_comments.py +++ b/tests/test_comments.py @@ -10,7 +10,7 @@ async def test_comment_render(git_history, mock_db): daemon = Daemon(cwd=git_history) await daemon.update(refresh=True) - context = ContextBuilder(daemon.graph, daemon.db) + context = ContextBuilder(daemon.graph) context.add_ref("src/operations.py") context.add_comment( "src/operations.py", {"comment": "What is this file for?"}, tags=["test-flag"] diff --git a/tests/test_context.py b/tests/test_context.py index 3191309..89003fc 100644 --- a/tests/test_context.py +++ b/tests/test_context.py @@ -8,12 +8,12 @@ from ragdaemon.utils import get_document -def test_daemon_render_context(cwd, mock_db): +def test_daemon_render_context(cwd): path_str = Path("src/interface.py").as_posix() ref = path_str # Base Chunk - context = ContextBuilder(KnowledgeGraph(), mock_db) + context = ContextBuilder(KnowledgeGraph()) context.context = { path_str: { "lines": set([1, 2, 3, 4, 15]), @@ -99,7 +99,7 @@ def test_to_refs(cwd, mock_db): ref = path_str # Setup Context - context = ContextBuilder(KnowledgeGraph(), mock_db) + context = ContextBuilder(KnowledgeGraph()) context.context = { path_str: { "lines": set([1, 2, 3, 4, 15]),