Skip to content

Commit

Permalink
Release v1.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
nboyse authored Jan 31, 2025
2 parents 70dd3db + d866e42 commit 0480273
Show file tree
Hide file tree
Showing 11 changed files with 97 additions and 75 deletions.
2 changes: 2 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,5 @@ REDBOX_API_KEY = myapi
# AUTHBROKER_CLIENT_ID=REPLACE_WITH_GITLAB_SECRET
# AUTHBROKER_CLIENT_SECRET=REPLACE_WITH_GITLAB_SECRET
# AUTHBROKER_URL=https://sso.trade.gov.uk

ENABLE_METADATA_EXTRACTION = True
1 change: 1 addition & 0 deletions django_app/redbox_app/jinja2.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def environment(**options):
"waffle_flag": waffle.flag_is_active,
"google_analytics_tag": settings.GOOGLE_ANALYTICS_TAG,
"google_analytics_link": settings.GOOGLE_ANALYTICS_LINK,
"google_analytics_iframe_src": settings.GOOGLE_ANALYTICS_IFRAME_SRC,
}
)
return env
18 changes: 11 additions & 7 deletions django_app/redbox_app/redbox_core/views/misc_views.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import logging
import waffle
from http import HTTPStatus

from django.conf import settings
from django.http import HttpRequest, HttpResponse
from django.shortcuts import render
from django.shortcuts import render, redirect
from django.views.decorators.http import require_http_methods
from django.views.generic.base import RedirectView

Expand All @@ -14,11 +15,14 @@

@require_http_methods(["GET"])
def homepage_view(request: HttpRequest) -> HttpResponse:
    """Serve the homepage, sending anonymous users to SSO login when SSO is enabled.

    Args:
        request: The incoming GET request.

    Returns:
        A redirect to the ``authbroker_client:login`` route when the user is
        not authenticated and ``settings.LOGIN_METHOD`` is ``"sso"``;
        otherwise the rendered ``homepage.html`` template.
    """
    # Guard clause: anonymous users under SSO are redirected before anything
    # is rendered.
    if not request.user.is_authenticated and settings.LOGIN_METHOD == "sso":
        return redirect("authbroker_client:login")

    return render(
        request,
        template_name="homepage.html",
        context={"request": request, "allow_sign_ups": settings.ALLOW_SIGN_UPS},
    )


@require_http_methods(["GET"])
Expand Down Expand Up @@ -48,4 +52,4 @@ def faq_view(request):
request,
template_name="faq.html",
context={"request": request},
)
)
22 changes: 16 additions & 6 deletions django_app/redbox_app/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,9 @@
CSP_DEFAULT_SRC = (
"'self'",
"s3.amazonaws.com",
"https://www.google-analytics.com/",
"https://region1.google-analytics.com/",
"https://www.googletagmanager.com/",
)

CSP_SCRIPT_SRC = (
Expand All @@ -195,18 +198,22 @@
"'sha256-1NTuHcjvzzB6D69Pb9lbxI5pMJNybP/SwBliv3OvOOE='",
"'sha256-DrkvIvFj5cNADO03twE83GwgAKgP224E5UyyxXFfvTc='",
"https://*.googletagmanager.com",
"https://tagmanager.google.com/",
"https://www.googletagmanager.com/",
"ajax.googleapis.com/",
"sha256-T/1K73p+yppfXXw/AfMZXDh5VRDNaoEh3enEGFmZp8M="
)
CSP_OBJECT_SRC = ("'none'",)
CSP_REQUIRE_TRUSTED_TYPES_FOR = ("'script'",)
CSP_TRUSTED_TYPES = ("dompurify", "default")
CSP_TRUSTED_TYPES = ("dompurify", "default", "goog#html")
CSP_REPORT_TO = "csp-endpoint"
CSP_FONT_SRC = (
"'self'",
"s3.amazonaws.com",
)
CSP_INCLUDE_NONCE_IN = ("script-src",)
CSP_STYLE_SRC = (
"'self'",
# "https://tagmanager.google.com/",
"https://tagmanager.google.com/",
)
CSP_FRAME_ANCESTORS = ("'none'",)

Expand All @@ -216,9 +223,9 @@
f"{WEBSOCKET_SCHEME}://{ENVIRONMENT.hosts[0]}/ws/chat/",
"eu.i.posthog.com",
"eu-assets.i.posthog.com",
"https://*.google-analytics.com",
"https://*.analytics.google.com",
"https://*.googletagmanager.com",
"https://www.google-analytics.com/",
"https://region1.google-analytics.com/",
"https://www.googletagmanager.com/",
]


Expand Down Expand Up @@ -418,6 +425,7 @@ def filter_transactions(event):

GOOGLE_ANALYTICS_TAG = env.str("GOOGLE_ANALYTICS_TAG", " ")
GOOGLE_ANALYTICS_LINK = env.str("GOOGLE_ANALYTICS_LINK", " ")
GOOGLE_ANALYTICS_IFRAME_SRC = env.str("GOOGLE_ANALYTICS_IFRAME_SRC", " ")
# TEST_SSO_PROVIDER_SET_RETURNED_ACCESS_TOKEN = 'someCode'

REST_FRAMEWORK = {
Expand All @@ -431,3 +439,5 @@ def filter_transactions(event):


REDBOX_API_KEY = env.str("REDBOX_API_KEY")

# Parsed as a boolean: read as a raw string, a value such as "False" would be
# truthy. Defaults to enabled when the variable is unset, matching the
# `enable_metadata_extraction` default in redbox-core's Settings.
ENABLE_METADATA_EXTRACTION = env.bool("ENABLE_METADATA_EXTRACTION", default=True)
23 changes: 7 additions & 16 deletions django_app/redbox_app/templates/base.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{% from "macros/iai-top-nav.html" import iaiTopNav %}
{% set cspNonce=request.csp_nonce %}

<!DOCTYPE html>
<html lang="en" class="govuk-template">
Expand All @@ -20,22 +21,12 @@

{% if environment | lower == "prod" and waffle_flag(request, 'uktrade') %}
<!-- Google Tag Manager -->
<script>
(function(w, d, s, l, i) {
w[l] = w[l] || [];
w[l].push({ 'gtm.start': new Date().getTime(), event: 'gtm.js' });
var f = d.getElementsByTagName(s)[0],
j = d.createElement(s),
dl = l != 'dataLayer' ? '&l=' + l : '';
j.async = true;

const policy = trustedTypes.createPolicy('default', {
createScriptURL: (url) => url
});

j.src = policy.createScriptURL('{{ google_analytics_link }}' + i + dl);
f.parentNode.insertBefore(j, f);
})(window, document, 'script', 'dataLayer', '{{ google_analytics_tag }}');
<script hash="sha256-T/1K73p+yppfXXw/AfMZXDh5VRDNaoEh3enEGFmZp8M=" nonce="{{ request.csp_nonce }}">
(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'{{ google_analytics_link }}'+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','{{ google_analytics_tag }}');
</script>
<!-- End Google Tag Manager -->
{% endif %}
Expand Down
9 changes: 4 additions & 5 deletions redbox-core/redbox/chains/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,17 @@
import re
from collections.abc import AsyncIterator
from json import JSONDecodeError
from typing import Any, Iterator, List, Optional, Type, TypeVar, Union
from typing import Any, Iterator, List, Optional, Type, Union

import jsonpatch # type: ignore[import]
import pydantic # pydantic: ignore
from langchain_core.callbacks.manager import dispatch_custom_event
from langchain_core.exceptions import OutputParserException
from langchain_core.messages import BaseMessage, BaseMessageChunk
from langchain_core.output_parsers import BaseCumulativeTransformOutputParser
from langchain_core.output_parsers.format_instructions import JSON_FORMAT_INSTRUCTIONS
from langchain_core.output_parsers.transform import BaseCumulativeTransformOutputParser
from langchain_core.outputs import ChatGenerationChunk, Generation, GenerationChunk
from langchain_core.utils.json import parse_and_check_json_markdown, parse_json_markdown, parse_partial_json
from langchain_core.utils.json import parse_json_markdown
from langchain_core.utils.pydantic import PYDANTIC_MAJOR_VERSION
from pydantic import BaseModel

Expand All @@ -32,8 +31,8 @@ class ClaudeParser(BaseCumulativeTransformOutputParser[Any]):
describing the difference between the previous and the current object.
"""

pydantic_object: Optional[Type[TBaseModel]] = None # type: ignore
"""The Pydantic object to use for validation.
pydantic_object: Optional[Type] = None # type: ignore
"""The Pydantic object to use for validation.
If None, no validation is performed."""

def _diff(self, prev: Optional[Any], next: Any) -> Any:
Expand Down
18 changes: 11 additions & 7 deletions redbox-core/redbox/loader/ingester.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import logging
from typing import TYPE_CHECKING

from langchain_core.runnables import RunnableParallel
from langchain_elasticsearch.vectorstores import BM25Strategy, ElasticsearchStore
from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain_core.embeddings import FakeEmbeddings
from langchain_core.runnables import RunnableParallel

from redbox.chains.components import get_embeddings
from redbox.chains.ingest import ingest_from_loader
from redbox.loader.loaders import MetadataLoader, UnstructuredChunkLoader
from redbox.models.settings import get_settings
from redbox.models.chain import GeneratedMetadata
from redbox.models.file import ChunkResolution
from redbox.models.settings import get_settings

if TYPE_CHECKING:
from mypy_boto3_s3.client import S3Client
Expand Down Expand Up @@ -52,7 +52,7 @@ def get_elasticsearch_store_without_embeddings(es, es_index_name: str):
return OpenSearchVectorSearch(
index_name=es_index_name,
opensearch_url=env.elastic.collection_endpoint,
embedding_function=FakeEmbeddings(size=env.embedding_backend_vector_size)
embedding_function=FakeEmbeddings(size=env.embedding_backend_vector_size),
)


Expand All @@ -67,7 +67,7 @@ def create_alias(alias: str):
es.indices.put_alias(index=chunk_index_name, name=alias)


def _ingest_file(file_name: str, es_index_name: str = alias):
def _ingest_file(file_name: str, es_index_name: str = alias, enable_metadata_extraction=env.enable_metadata_extraction):
logging.info("Ingesting file: %s", file_name)

es = env.elasticsearch_client()
Expand All @@ -84,8 +84,12 @@ def _ingest_file(file_name: str, es_index_name: str = alias):
es.indices.create(index=es_index_name, body=env.index_mapping, ignore=400)

# Extract metadata
metadata_loader = MetadataLoader(env=env, s3_client=env.s3_client(), file_name=file_name)
metadata = metadata_loader.extract_metadata()
if enable_metadata_extraction:
metadata_loader = MetadataLoader(env=env, s3_client=env.s3_client(), file_name=file_name)
metadata = metadata_loader.extract_metadata()
else:
# Metadata extraction is disabled: use placeholder metadata that carries only the file name
metadata = GeneratedMetadata(name=file_name, description="", keywords=[])

chunk_ingest_chain = ingest_from_loader(
loader=UnstructuredChunkLoader(
Expand Down
26 changes: 20 additions & 6 deletions redbox-core/redbox/loader/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import tiktoken
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from pydantic import ValidationError
from redbox_app.setting_enums import Environment

from redbox.chains.components import get_chat_llm
Expand Down Expand Up @@ -70,21 +71,29 @@ def _chunking(self) -> list[dict]:
if response.status_code != 200:
raise ValueError(response.text)

return response.json() or []
elements = response.json()

if not elements:
raise ValueError("Unstructured failed to extract text for this file")

return elements

def extract_metadata(self) -> GeneratedMetadata:
    """
    Generate metadata for the file from the start of its extracted text.

    The text of every chunk returned by ``_chunking`` is concatenated and
    truncated to its first 10_000 characters, then passed to
    ``create_file_metadata`` together with the first chunk's metadata.

    Returns:
        The generated metadata. If generation raises, a ``GeneratedMetadata``
        holding only a name: the ``filename`` from the first chunk's metadata,
        or ``self.file_name`` when that is missing or empty.
    """
    chunks = self._chunking()

    original_metadata = chunks[0]["metadata"] if chunks else {}
    leading_text = "".join(chunk["text"] for chunk in chunks)[:10_000]

    try:
        return self.create_file_metadata(leading_text, original_metadata=original_metadata)
    except Exception as e:
        # Deliberately broad: any failure degrades to a name-only result.
        logger.warning("Metadata generation failed for %s: %s", self.file_name, e)
        # `name` must be a string, so never pass a missing filename through.
        fallback_name = original_metadata.get("filename") or self.file_name
        return GeneratedMetadata(name=fallback_name)

def create_file_metadata(self, page_content: str, original_metadata: dict | None = None) -> GeneratedMetadata:
Expand Down Expand Up @@ -117,7 +126,12 @@ def trim(obj, max_length=1000):
)
metadata_chain = metadata_prompt | self.llm | parser

return metadata_chain.invoke({"page_content": page_content})
try:
return metadata_chain.invoke({"page_content": page_content})
except ValidationError as e:
# error due to LLM return incorrect response
logger.info(e.errors())
return GeneratedMetadata(name=original_metadata.get("filename"))


class UnstructuredChunkLoader:
Expand Down
14 changes: 3 additions & 11 deletions redbox-core/redbox/models/chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,7 @@
from enum import StrEnum
from functools import reduce
from types import UnionType
from typing import (
Annotated,
Literal,
NotRequired,
Required,
TypedDict,
get_args,
get_origin,
)
from typing import Annotated, Literal, NotRequired, Required, TypedDict, get_args, get_origin
from uuid import UUID, uuid4

from langchain_core.documents import Document
Expand Down Expand Up @@ -379,5 +371,5 @@ class GeneratedMetadata(BaseModel):
"""Document Metadata generated by the LLM"""

name: str = Field(description="document name", default="")
description: str | None = Field(description="document description", default=None)
keywords: list[str] = Field(description="document keywords", max_length=5, default_factory=list)
description: str | None = Field(description="document description", default="")
keywords: list[str] = Field(description="document keywords", default_factory=list)
19 changes: 14 additions & 5 deletions redbox-core/redbox/models/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,14 +138,23 @@ class Settings(BaseSettings):

model_config = SettingsConfigDict(env_file=".env", env_nested_delimiter="__", extra="allow", frozen=True)

enable_metadata_extraction: bool = env.bool("ENABLE_METADATA_EXTRACTION", default=True)

## Prompts
metadata_prompt: tuple = (
"system",
"You are an SEO specialist that must optimise the metadata of a document "
"to make it as discoverable as possible. You are about to be given the first "
"1_000 tokens of a document and any hard-coded file metadata that can be "
"recovered from it. Create SEO-optimised metadata for this document."
"Description must be less than 100 words. and no more than 5 keywords .",
"Given the first 1,000 tokens of a document and any available hard-coded file metadata, create"
"SEO-optimized metadata for the document in the following JSON format:\n\n"
'{ "name": '
', "description": '
', "keywords": ["", "", "", "", ""] }\n'
"The description should summarize the document's content in a concise and SEO-friendly manner, "
"and the keywords should represent the most relevant topics or phrases related to the document.",
# "You are an SEO specialist that must optimise the metadata of a document "
# "to make it as discoverable as possible. You are about to be given the first "
# "1_000 tokens of a document and any hard-coded file metadata that can be "
# "recovered from it. Create SEO-optimised metadata for this document."
# "Description must be less than 100 words. and maximum 5 keywords .",
)

# Define index mapping for Opensearch - this is important so that KNN search works
Expand Down
20 changes: 8 additions & 12 deletions redbox-core/tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,12 @@
from langchain_core.language_models.fake_chat_models import GenericFakeChatModel
from langchain_elasticsearch import ElasticsearchStore


from redbox.models.chain import GeneratedMetadata
from redbox.chains.ingest import document_loader, ingest_from_loader
from redbox.loader import ingester
from redbox.loader.loaders import (
MetadataLoader,
UnstructuredChunkLoader,
)
from redbox.models.file import ChunkResolution
from redbox.loader.ingester import ingest_file
from redbox.loader.loaders import MetadataLoader, UnstructuredChunkLoader
from redbox.models.chain import GeneratedMetadata
from redbox.models.file import ChunkResolution
from redbox.models.settings import Settings
from redbox.retriever.queries import build_query_filter

Expand Down Expand Up @@ -89,7 +85,7 @@ def test_extract_metadata_missing_key(
metadata_loader = MetadataLoader(env=env, s3_client=s3_client, file_name=file_name)
metadata = metadata_loader.extract_metadata()

assert metadata == GeneratedMetadata()
assert metadata == GeneratedMetadata(name=file_name)


@patch("redbox.loader.loaders.get_chat_llm")
Expand All @@ -107,7 +103,7 @@ def test_extract_metadata_extra_key(

requests_mock.post(
f"http://{env.unstructured_host}:8000/general/v0/general",
json=[{"text": "hello", "metadata": {}}],
json=[{"text": "hello", "metadata": {"filename": "something"}}],
)

"""
Expand All @@ -121,9 +117,9 @@ def test_extract_metadata_extra_key(
metadata = metadata_loader.extract_metadata()

assert metadata is not None
assert metadata.name == "foo"
assert metadata.description == "test"
assert metadata.keywords == ["abc"]
assert metadata.name == "something"
assert metadata.description == ""
assert metadata.keywords == []


@patch("redbox.loader.loaders.get_chat_llm")
Expand Down

0 comments on commit 0480273

Please sign in to comment.