-
Notifications
You must be signed in to change notification settings - Fork 86
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #260 from topoteretes/COG-505-data-dataset-model-c…
…hanges Cog 505 data dataset model changes
- Loading branch information
Showing
21 changed files
with
344 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# Runs the deduplication test against a pgvector-enabled Postgres service.
name: test | deduplication

on:
  workflow_dispatch:
  pull_request:
    branches:
      - main
    types: [labeled, synchronize]


# One run per PR (or ref): a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  RUNTIME__LOG_LEVEL: ERROR

jobs:
  get_docs_changes:
    name: docs changes
    uses: ./.github/workflows/get_docs_changes.yml

  run_deduplication_test:
    name: test
    needs: get_docs_changes
    # The whole condition must live inside a single ${{ }}. Mixing a bare
    # expression with an embedded ${{ }} turns the value into a non-empty
    # template string, which is always truthy.
    # NOTE(review): `github.event.label` is presumably only populated on
    # `labeled` events, so `synchronize` / `workflow_dispatch` runs would skip
    # this job - confirm that is intended.
    if: ${{ needs.get_docs_changes.outputs.changes_outside_docs == 'true' && github.event.label.name == 'run-checks' }}
    runs-on: ubuntu-latest
    defaults:
      run:
        shell: bash
    services:
      postgres:
        image: pgvector/pgvector:pg17
        env:
          POSTGRES_USER: cognee
          POSTGRES_PASSWORD: cognee
          POSTGRES_DB: cognee_db
        # Wait until Postgres accepts connections before the steps start.
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - 5432:5432

    steps:
      - name: Check out
        # Pinned to a release tag instead of the mutable `master` branch.
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11.x'

      - name: Install Poetry
        # NOTE(review): the exact version pin was lost in extraction
        # (rendered as "[email protected]"); restore the intended v1.x.y pin.
        uses: snok/install-poetry@v1
        with:
          virtualenvs-create: true
          virtualenvs-in-project: true
          installer-parallel: true

      - name: Install dependencies
        run: poetry install -E postgres --no-interaction

      - name: Run deduplication test
        env:
          ENV: 'dev'
          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: poetry run python ./cognee/tests/test_deduplication.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,11 @@ | ||
from uuid import NAMESPACE_OID, UUID, uuid5
from .data_types import IngestionData
|
||
from cognee.modules.users.models import User


def identify(data: IngestionData, user: User) -> UUID:
    """Derive a deterministic, per-owner UUID for a piece of ingested data.

    Args:
        data: Ingestion payload; the value of its ``get_identifier()`` is the
            content part of the key (presumably a content hash - verify
            against the ``IngestionData`` implementations).
        user: Owner of the data; ``user.id`` is the owner part of the key.

    Returns:
        The version-5 UUID, in ``NAMESPACE_OID``, of the content identifier
        concatenated with the owner id.
    """
    data_content_hash: str = data.get_identifier()

    # Including the owner id means the same identifier yields a different
    # UUID for each user.
    return uuid5(NAMESPACE_OID, f"{data_content_hash}{user.id}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,28 @@ | ||
import string | ||
import random | ||
import os.path | ||
import hashlib | ||
from typing import BinaryIO, Union | ||
from cognee.base_config import get_base_config | ||
from cognee.infrastructure.files.storage import LocalStorage | ||
from .classify import classify | ||
|
||
def save_data_to_file(data: Union[str, BinaryIO], filename: Union[str, None] = None) -> str:
    """Store ingested data under the configured data root and return its location.

    The data is classified with ``classify(data, filename)`` and written to
    ``<data_root_directory>/data``. When the classified metadata carries no
    name, the file is named ``text_<md5 of the UTF-8 encoded contents>.txt``,
    so identical text always maps to the same file name.

    Args:
        data: Text or a binary stream to persist.
        filename: Optional file name passed on to ``classify``.

    Returns:
        A ``file://`` location string pointing at the stored file.
    """
    base_config = get_base_config()
    data_directory_path = base_config.data_root_directory

    classified_data = classify(data, filename)

    storage_path = os.path.join(data_directory_path, "data")
    LocalStorage.ensure_directory_exists(storage_path)

    file_metadata = classified_data.get_metadata()
    if "name" not in file_metadata or file_metadata["name"] is None:
        # Name unnamed data after a hash of its contents instead of a random
        # string, so saving the same text twice resolves to the same file.
        data_contents = classified_data.get_data().encode("utf-8")
        hash_contents = hashlib.md5(data_contents).hexdigest()
        file_metadata["name"] = "text_" + hash_contents + ".txt"
    file_name = file_metadata["name"]

    # Don't save the file if it already exists.
    # NOTE(review): an existing file is never overwritten, even when a
    # caller-named file arrives with different contents - confirm intended.
    if not os.path.isfile(os.path.join(storage_path, file_name)):
        LocalStorage(storage_path).store(file_name, classified_data.get_data())

    return "file://" + storage_path + "/" + file_name
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
""" | ||
Custom exceptions for the Cognee API. | ||
This module defines a set of exceptions for handling data ingestion errors. | ||
""" | ||
|
||
from .exceptions import ( | ||
IngestionError, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from cognee.exceptions import CogneeApiError | ||
from fastapi import status | ||
|
||
class IngestionError(CogneeApiError):
    """Error raised when data cannot be loaded during ingestion.

    All arguments are forwarded positionally to ``CogneeApiError``; the
    defaults describe a generic load failure reported as HTTP 422.
    """

    def __init__(
        self,
        message: str = "Failed to load data.",
        name: str = "IngestionError",
        status_code: int = status.HTTP_422_UNPROCESSABLE_ENTITY,
    ):
        super().__init__(message, name, status_code)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.