feat: KnowledgeGraph builder #34

Merged
merged 23 commits on Jul 5, 2024
Changes from 2 commits
Commits (23)
b94f185
added GraphRag folder that contains code that allows users to build K…
debrupf2946 Jul 2, 2024
d52b69c
Updated README.MD
debrupf2946 Jul 2, 2024
7c89fa4
New colab file that shows implementation of GraphRag on keras-nlp rea…
debrupf2946 Jul 3, 2024
8ea8a6d
formatted code with black
debrupf2946 Jul 3, 2024
bb16353
updated save_index function in KnowledgeGraph.py by taking output_dir…
debrupf2946 Jul 3, 2024
790b70f
added docstrings to each file to describe its content
debrupf2946 Jul 3, 2024
d4b105a
changed and renamed GraphRag to graph_rag and moved all files to grap…
debrupf2946 Jul 4, 2024
e0e5331
added exception handling to the codes
debrupf2946 Jul 4, 2024
e9790e3
added reference of example notebook in README.MD
debrupf2946 Jul 5, 2024
e3b14ac
fixed reference bug of example notebook in README.MD
debrupf2946 Jul 5, 2024
b2e76fa
updated new path reference in example note-book
debrupf2946 Jul 5, 2024
28df289
signed and added GraphRag folder that contains code that allows users…
debrupf2946 Jul 2, 2024
aa64569
signed dated README.MD
debrupf2946 Jul 2, 2024
cc60870
signed colab file that shows implementation of GraphRag on keras-nlp …
debrupf2946 Jul 3, 2024
79fff3a
signed formatted code with black
debrupf2946 Jul 3, 2024
4887231
signed updated save_index function in KnowledgeGraph.py by taking out…
debrupf2946 Jul 3, 2024
4acd239
signed added docstrings to each file to describe its content
debrupf2946 Jul 3, 2024
4fa89ae
signed changed and renamed GraphRag to graph_rag and moved all files …
debrupf2946 Jul 4, 2024
12d2e04
signed added exception handling to the codes
debrupf2946 Jul 4, 2024
80e9a40
signed added reference of example notebook in README.MD
debrupf2946 Jul 5, 2024
59ebcc9
signed fixed reference bug of example notebook in README.MD
debrupf2946 Jul 5, 2024
458e036
signed updated new path reference in example note-book
debrupf2946 Jul 5, 2024
d4763ef
Merge branch 'test' into dev
debrupf2946 Jul 5, 2024
64 changes: 64 additions & 0 deletions GraphRag/README.MD
@@ -0,0 +1,64 @@
# Knowledge Graph Builder

This project lets you build a Knowledge Graph from unstructured data (.md, .py files for now).

## Table of Contents

- [Installation from Source](#installation-from-source)
- [Usage](#usage)
<!--- [Data Preparation](#data-preparation)-->
<!--- [LLM Setup](#llm-setup)-->
<!--- [Build Graph Index](#build-graph-index)-->

## Installation from Source

Follow these instructions to set up the project:

```bash
git clone https://github.com/debrupf2946/KnowledgeGraphBuilder.git
cd KnowledgeGraphBuilder
pip3 install -r requirements.txt
```

## Usage

### Data Preparation

1. First, create or import a data directory at the root folder containing documents (.md files).
2. Copy the path of the directory.
3. Load and chunk the documents using `load_directory(PATH)`.

```python
documents = load_directory("/data")
```

### LLM Setup

You need a local LLM (llama3, served through Ollama) to build the Knowledge Graph.

1. Initialize the LLM with `initialize_llm()`.
2. The default parameters are:
- `base_url="http://localhost:11434"` (Ollama server)
- `model="llama3"`
   - `chunk_size=512`
3. Change the parameters as needed.

```python
initialize_llm()
```
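
The defaults above can also be overridden explicitly. For example (the values shown are the defaults from `tools.py`; adjust them to your own Ollama setup):

```python
# Override the defaults explicitly; replace the values with your own setup.
initialize_llm(
    base_url="http://localhost:11434",  # Ollama server URL
    model="llama3",                     # model name served by Ollama
    chunk_size=512,                     # document chunk size
)
```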

### Build Graph Index

1. Build the Knowledge Graph using the [documents](#data-preparation).
2. Call `build_graph(documents)` to create an index.
3. This will also save `Graph_visualization.html`, which can be opened in a browser to visualize the Knowledge Graph.

```python
index = build_graph(documents)
```
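
`build_graph` also accepts `max_triplets_per_chunk` and a Hugging Face `embeddings` model name if you want to tune extraction; a minimal sketch using the defaults from `knowledgeGraph.py`:

```python
# Optional overrides; the values below are the defaults in knowledgeGraph.py.
index = build_graph(
    documents,
    max_triplets_per_chunk=10,
    embeddings="microsoft/codebert-base",
)
```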

4. Save the `index` as a pickle file.

```python
save_index(index)
```
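
To reuse the index later, it can be loaded back with `pickle`; a minimal sketch, assuming the default `results/graphIndex` path used by `save_index`:

```python
import pickle

# Load the previously saved Knowledge Graph index from disk.
with open("results/graphIndex", "rb") as f:
    index = pickle.load(f)
```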
54 changes: 54 additions & 0 deletions GraphRag/knowledgeGraph.py
@@ -0,0 +1,54 @@
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import KnowledgeGraphIndex
from llama_index.core.graph_stores import SimpleGraphStore
from pyvis.network import Network
import os
import pickle




def build_graph(
    documents,
    llm=None,
    max_triplets_per_chunk: int = 10,
    embeddings: str = "microsoft/codebert-base",
):
    """
    Builds a KnowledgeGraph index that can be queried.

    Args:
        documents: llama-index Document objects to index.
        llm: LLM used to extract triplets; if None, the model configured via Settings is used.
        max_triplets_per_chunk: Maximum triplets extracted from each document chunk. Default: 10.
        embeddings: Hugging Face embedding model name. Default: "microsoft/codebert-base".

    Returns:
        The Knowledge Graph index; also saves an HTML visualization file.
    """
    # Keep the extracted triplets in an in-memory graph store.
    graph_store = SimpleGraphStore()
    storage_context = StorageContext.from_defaults(graph_store=graph_store)
    index = KnowledgeGraphIndex.from_documents(
        documents,
        max_triplets_per_chunk=max_triplets_per_chunk,
        llm=llm,
        embed_model=HuggingFaceEmbedding(model_name=embeddings),
        storage_context=storage_context,
    )
    print("KG built successfully!")
    # Render the graph with pyvis so it can be inspected in a browser.
    os.makedirs("results", exist_ok=True)
    g = index.get_networkx_graph()
    net = Network(notebook=True, cdn_resources="in_line", directed=True)
    net.from_nx(g)
    net.show("Graph_visualization.html")
    return index


def save_index(index):
    """
    Serializes the index object so that it can be loaded and reused later.

    Args:
        index: Graph index object.

    Returns:
        None; saves a pickle file of the graph index to results/graphIndex.
    """
    os.makedirs("results", exist_ok=True)
    with open("results/graphIndex", "wb") as f:
        pickle.dump(index, f)
    print("Index saved successfully!")
8 changes: 8 additions & 0 deletions GraphRag/main.py
@@ -0,0 +1,8 @@
from tools import initialize_llm, load_directory
from knowledgeGraph import build_graph, save_index


initialize_llm()
documents = load_directory("/data")
index = build_graph(documents)
save_index(index)
6 changes: 6 additions & 0 deletions GraphRag/requirements.txt
@@ -0,0 +1,6 @@
llama-index-embeddings-huggingface
llama-index-llms-ollama
llama-index
pyvis
tree-sitter==0.21.3
tree-sitter-languages
80 changes: 80 additions & 0 deletions GraphRag/tools.py
@@ -0,0 +1,80 @@
from llama_index.llms.ollama import Ollama
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import CodeSplitter
from llama_index.core import Document
from llama_index.core import Settings




def initialize_llm(
    base_url: str = "http://localhost:11434",
    model: str = "llama3",
    chunk_size: int = 512,
):
    """
    Initializes the LLM used for building the KnowledgeGraph.

    Args:
        base_url: The Ollama server URL where the model is listening.
        model: The model that Ollama is hosting and that will be used to build the KnowledgeGraph.
        chunk_size: Size of the chunks that loaded documents are split into.

    Returns:
        None
    """
    llm = Ollama(base_url=base_url, model=model)
    Settings.llm = llm
    Settings.chunk_size = chunk_size
    print(f"{model} initialized successfully!")

def code_splitting(documents, language: str = "python"):
    """
    Splits code files into chunks when the KnowledgeGraph is built from code files.

    Args:
        documents: llama-index Document objects containing code files.
        language: The programming language of the code files.

    Returns:
        nodes: Split code chunks as llama-index Node objects.
    """
    splitter = CodeSplitter(
        language=language,
        chunk_lines=30,  # lines per chunk
        chunk_lines_overlap=6,  # lines of overlap between chunks
        max_chars=1500,  # max chars per chunk
    )
    nodes = splitter.get_nodes_from_documents(documents)
    print(f"{len(nodes)} nodes created successfully!")
    return nodes

def convert_nodes_to_docs(nodes):
    """
    Converts llama-index Node objects to llama-index Document objects.

    Args:
        nodes: llama-index Node objects.

    Returns:
        llama-index Document objects.
    """
    documents_from_nodes = [
        Document(text=node.text, metadata=node.metadata) for node in nodes
    ]
    print(f"{len(documents_from_nodes)} documents converted successfully!")
    return documents_from_nodes

def load_directory(directory_path: str, code_file: bool = False, language: str = "python"):
    """
    Loads the document directory and does preprocessing and chunking depending on the code_file parameter.

    Args:
        directory_path: Path to the directory of files from which the Knowledge Graph is to be built.
        code_file: Whether the given directory contains code files.
        language: Language of the code files.

    Returns:
        llama-index Document objects.
    """
    documents = SimpleDirectoryReader(directory_path).load_data()

    if code_file:
        nodes = code_splitting(documents, language)
        docs = convert_nodes_to_docs(nodes)
        print(f"{len(documents)} documents loaded successfully!")
        return docs

    print(f"{len(documents)} documents loaded successfully!")
    return documents
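
For reference, a minimal end-to-end sketch for a directory of code files, assuming a local Ollama server is running and a hypothetical `data/` directory of `.py` files exists at the project root:

```python
from tools import initialize_llm, load_directory
from knowledgeGraph import build_graph, save_index

# Use the locally served llama3 model (defaults from initialize_llm).
initialize_llm()

# Load and chunk code files via the CodeSplitter-based path.
documents = load_directory("data", code_file=True, language="python")

# Build the Knowledge Graph index and persist it for later reuse.
index = build_graph(documents)
save_index(index)
```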