Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable indexing to OpenSearch v2.16 #204

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Removed

- Removed indexing of api-spec content (REST API specification)

## [3.4.2] - 2024-03-13

## [3.4.1] - 2024-02-13
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.12-alpine3.18
FROM python:3.12-alpine3.20

ENV PYTHON_UNBUFFERED 1
ENV PYTHONWARNINGS "ignore:Unverified HTTPS request"
Expand Down
56 changes: 31 additions & 25 deletions blog.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,17 @@
import sys
from time import sleep

from elasticsearch import Elasticsearch
from elasticsearch.exceptions import NotFoundError
from opensearchpy import OpenSearch
from opensearchpy.exceptions import NotFoundError

from common import html2text
from common import index_settings

HUBSPOT_ACCESS_TOKEN = os.getenv("HUBSPOT_ACCESS_TOKEN")

ELASTICSEARCH_ENDPOINT = os.getenv("ELASTICSEARCH_ENDPOINT")
OPENSEARCH_ENDPOINT = os.getenv("OPENSEARCH_ENDPOINT")
OPENSEARCH_USERNAME = os.getenv("OPENSEARCH_USERNAME", "admin")
OPENSEARCH_PASSWORD = os.getenv("OPENSEARCH_PASSWORD", "")

# TODO: validate
BASE_URL = os.getenv("BASE_URL")
Expand Down Expand Up @@ -97,15 +99,14 @@ def parse_blog_post(post):
return ret


def index_blog_post(es, index_name, data):
def index_blog_post(osclient, index_name, data):
"""
Write content for one blog post to the index
"""
id = data['id']
try:
es.index(
osclient.index(
index=index_name,
doc_type="_doc",
id=data['id'],
body=data)
except Exception as e:
Expand All @@ -131,39 +132,37 @@ def full_index_name(dt):
return f'{INDEX_NAME_PREFIX}-{datestring}'


def create_index(es, index_name):
es.indices.create(
def create_index(osclient, index_name):
osclient.indices.create(
index=index_name,
body={
"settings" : index_settings,
"mappings": INDEX_MAPPING
},
# include_type_name=false shall be removed once we are on ES 7 or higher
include_type_name="false")
})


def set_index_alias(es, new_index_name):
def set_index_alias(osclient, new_index_name):
"""
Ensures that index alias INDEX_NAME_PREFIX points to new_index_name only,
deletes the old index/indices the alias pointed to.
"""
if es.indices.exists_alias(name=INDEX_NAME_PREFIX):
alias = es.indices.get_alias(name=INDEX_NAME_PREFIX)
if osclient.indices.exists_alias(name=INDEX_NAME_PREFIX):
alias = osclient.indices.get_alias(name=INDEX_NAME_PREFIX)
for index_name in list(alias.keys()):
logging.info(f'Removing alias {INDEX_NAME_PREFIX} => {index_name}')
try:
es.indices.delete_alias(index=index_name, name=INDEX_NAME_PREFIX)
osclient.indices.delete_alias(index=index_name, name=INDEX_NAME_PREFIX)
except NotFoundError:
logging.error(f'Could not delete index alias {INDEX_NAME_PREFIX} => {index_name} (not found)')
pass

try:
logging.info(f'Deleting index {index_name}')
es.indices.delete(index=index_name)
osclient.indices.delete(index=index_name)
except:
logging.error("Could not delete index %s" % index_name)
pass
es.indices.put_alias(index=new_index_name, name=INDEX_NAME_PREFIX)
osclient.indices.put_alias(index=new_index_name, name=INDEX_NAME_PREFIX)


def run():
Expand All @@ -174,35 +173,42 @@ def run():
logging.error(f'Environment variable HUBSPOT_ACCESS_TOKEN must be set')
sys.exit(1)

if ELASTICSEARCH_ENDPOINT is None:
logging.error("ELASTICSEARCH_ENDPOINT isn't configured.")
if OPENSEARCH_ENDPOINT is None:
logging.error("OPENSEARCH_ENDPOINT isn't configured.")
sys.exit(1)
if OPENSEARCH_USERNAME is None:
logging.error("OPENSEARCH_USERNAME isn't configured.")
sys.exit(1)
if OPENSEARCH_PASSWORD is None or OPENSEARCH_PASSWORD == "DUMMYPASS":
logging.error("OPENSEARCH_PASSWORD isn't configured.")
sys.exit(1)

# give elasticsearch some time
# give OpenSearch some time
sleep(3)
logging.info(f'Establish connection to Elasticsearch host {ELASTICSEARCH_ENDPOINT}')
es = Elasticsearch(hosts=[ELASTICSEARCH_ENDPOINT])
logging.info(f'Establish connection to OpenSearch host {OPENSEARCH_ENDPOINT}')
osclient = OpenSearch(hosts=[OPENSEARCH_ENDPOINT],
http_auth=(OPENSEARCH_USERNAME, OPENSEARCH_PASSWORD))

# Our new target index name
now_date = datetime.utcnow()
index_name = full_index_name(now_date)

logging.info(f'Creating new index {index_name}')

create_index(es, index_name)
create_index(osclient, index_name)

logging.info(f'Starting to index hubspot blog')

count = 0
for post in get_blog_posts():
doc = parse_blog_post(post)
index_blog_post(es, index_name, doc)
index_blog_post(osclient, index_name, doc)
count += 1

# Set/update index alias
if count > 0:
logging.info(f'Updating index alias {INDEX_NAME_PREFIX} to use {index_name}')
set_index_alias(es, index_name)
set_index_alias(osclient, index_name)
else:
logging.info(f'No new/updated blog posts found.')

Expand Down
2 changes: 1 addition & 1 deletion common.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from bs4 import BeautifulSoup

# Common settings for all elasticsearch indexes
# Common settings for all OpenSearch indexes
index_settings = {
"index": {
"number_of_shards" : 1,
Expand Down
5 changes: 1 addition & 4 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,14 @@ The following environment variables are accepted, by indexer sub command:

## `hugo`

- `ELASTICSEARCH_ENDPOINT`: URI for the Elasticsearch API endpoint.
- `OPENSEARCH_ENDPOINT`: URI for the OpenSearch API endpoint.
- `GITHUB_TOKEN`: If the repo is private, use this access token.
- `INDEX_NAME`: Name of the search index to maintain.
- `BASE_URL`: URL corresponding to the published root page of the site.
- `REPOSITORY_HANDLE`: GitHub organization and repository name in the format `org/repo`.
- `REPOSITORY_BRANCH`: Defaults to `main`.
- `REPOSITORY_SUBFOLDER`: Only look into this path within the repository for indexable content.
- `TYPE_LABEL`: User-friendly search result type name.
- `APIDOCS_BASE_URI`: Base URI for API documentation. Should be `https://docs.giantswarm.io/api/`.
- `APIDOCS_BASE_PATH`: Should be `/api/`
- `API_SPEC_FILES`: Comma separated list of YAML files to fetch for the OpenAPI spec.

## `blog`

Expand Down
2 changes: 1 addition & 1 deletion docs/schema.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Search index schema

This indexers create Elasticsearch indices with the mappings defined in the files `/mappings/*.json`.
These indexers create OpenSearch indices with the mappings defined in the files `/mappings/*.json`.

Here is some additional information on the index fields:

Expand Down
14 changes: 12 additions & 2 deletions helm/docs-indexer-app/templates/cronjob-blog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,18 @@ spec:
seccompProfile:
type: RuntimeDefault
env:
- name: ELASTICSEARCH_ENDPOINT
value: {{ .Values.elasticsearchEndpoint }}
- name: OPENSEARCH_ENDPOINT
value: {{ .Values.opensearchEndpoint }}
- name: OPENSEARCH_USERNAME
valueFrom:
secretKeyRef:
name: {{ .Values.name }}-credentials
key: opensearch-user-name
- name: OPENSEARCH_PASSWORD
valueFrom:
secretKeyRef:
name: {{ .Values.name }}-credentials
key: opensearch-password
- name: HUBSPOT_ACCESS_TOKEN
valueFrom:
secretKeyRef:
Expand Down
14 changes: 12 additions & 2 deletions helm/docs-indexer-app/templates/cronjob-docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,18 @@ spec:
seccompProfile:
type: RuntimeDefault
env:
- name: ELASTICSEARCH_ENDPOINT
value: {{ .Values.elasticsearchEndpoint }}
- name: OPENSEARCH_ENDPOINT
value: {{ .Values.opensearchEndpoint }}
- name: OPENSEARCH_USERNAME
valueFrom:
secretKeyRef:
name: {{ .Values.name }}-credentials
key: opensearch-user-name
- name: OPENSEARCH_PASSWORD
valueFrom:
secretKeyRef:
name: {{ .Values.name }}-credentials
key: opensearch-password
- name: BASE_URL
value: https://docs.giantswarm.io
- name: REPOSITORY_HANDLE
Expand Down
14 changes: 12 additions & 2 deletions helm/docs-indexer-app/templates/cronjob-handbook.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,18 @@ spec:
seccompProfile:
type: RuntimeDefault
env:
- name: ELASTICSEARCH_ENDPOINT
value: {{ .Values.elasticsearchEndpoint }}
- name: OPENSEARCH_ENDPOINT
value: {{ .Values.opensearchEndpoint }}
- name: OPENSEARCH_USERNAME
valueFrom:
secretKeyRef:
name: {{ .Values.name }}-credentials
key: opensearch-user-name
- name: OPENSEARCH_PASSWORD
valueFrom:
secretKeyRef:
name: {{ .Values.name }}-credentials
key: opensearch-password
- name: BASE_URL
value: https://handbook.giantswarm.io
- name: REPOSITORY_HANDLE
Expand Down
14 changes: 12 additions & 2 deletions helm/docs-indexer-app/templates/cronjob-intranet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,18 @@ spec:
seccompProfile:
type: RuntimeDefault
env:
- name: ELASTICSEARCH_ENDPOINT
value: {{ .Values.elasticsearchEndpoint }}
- name: OPENSEARCH_ENDPOINT
value: {{ .Values.opensearchEndpoint }}
- name: OPENSEARCH_USERNAME
valueFrom:
secretKeyRef:
name: {{ .Values.name }}-credentials
key: opensearch-user-name
- name: OPENSEARCH_PASSWORD
valueFrom:
secretKeyRef:
name: {{ .Values.name }}-credentials
key: opensearch-password
- name: BASE_URL
value: https://intranet.giantswarm.io/docs
- name: GITHUB_TOKEN
Expand Down
2 changes: 2 additions & 0 deletions helm/docs-indexer-app/templates/secret.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ type: Opaque
data:
github-access-token: {{ .Values.credentials.githubAccessToken | b64enc }}
hubspot-access-token: {{ .Values.credentials.hubspotAccessToken | b64enc }}
opensearch-user-name: {{ .Values.credentials.opensearchUsername | b64enc }}
opensearch-password: {{ .Values.credentials.opensearchPassword | b64enc }}
4 changes: 3 additions & 1 deletion helm/docs-indexer-app/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ image:
name: docs-indexer
tag: "[[.Version]]"
sha: "[[.SHA]]"
elasticsearchEndpoint: "http://sitesearch-app:9200/"
opensearchEndpoint: "http://sitesearch-app:9200/"

resources:
requests:
Expand All @@ -18,6 +18,8 @@ resources:
credentials:
githubAccessToken: DUMMYTOKEN
hubspotAccessToken: DUMMYTOKEN
opensearchUsername: admin
opensearchPassword: DUMMYPASS

global:
podSecurityStandards:
Expand Down
Loading