
Commit

re-add recent changes to main
jmckenna committed Dec 17, 2024
1 parent bae7f34 commit 6093b2c
Showing 9 changed files with 137 additions and 21 deletions.
8 changes: 8 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,8 @@
+# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
+
+version: 2
+updates:
+  - package-ecosystem: "github-actions" # See documentation for possible values
+    directory: "/" # Location of package manifests
+    schedule:
+      interval: "weekly"
2 changes: 1 addition & 1 deletion .github/workflows/check-crlf.yml
@@ -12,7 +12,7 @@ jobs:

     steps:
       - name: Checkout repository contents
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Use action to check for CRLF endings
         uses: erclu/[email protected]
6 changes: 3 additions & 3 deletions .github/workflows/irc_notify.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
      - name: irc push
-       uses: rectalogic/notify-irc@v1
+       uses: rectalogic/notify-irc@v2
        if: github.event_name == 'push'
        with:
          channel: "#oih"
@@ -26,7 +26,7 @@
            ${{ github.actor }} pushed ${{ github.event.ref }} ${{ github.event.compare }}
            ${{ join(github.event.commits.*.message) }}
      - name: irc pull request
-       uses: rectalogic/notify-irc@v1
+       uses: rectalogic/notify-irc@v2
        if: github.event_name == 'pull_request'
        with:
          channel: "#oih"
@@ -36,7 +36,7 @@
         message: |
           ${{ github.actor }} opened PR ${{ github.event.pull_request.html_url }}
      - name: irc tag created
-       uses: rectalogic/notify-irc@v1
+       uses: rectalogic/notify-irc@v2
        if: github.event_name == 'create' && github.event.ref_type == 'tag'
        with:
          channel: "#oih"
2 changes: 1 addition & 1 deletion LICENSE.md
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023 Ocean InfoHub
+Copyright (c) 2024 Ocean InfoHub

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
2 changes: 1 addition & 1 deletion README.md
@@ -6,6 +6,6 @@ This repo contains the code for the Ocean Info Hub Global Search Portal.
 * `/indexer` contains all of the code to ingest the OIH graph into the SOLR Instance
 * `/solr` contains the configuration required for the solr instance, including the schema.
 * `/frontend` contains the code for the static javascript app. This will produce a container in dev mode running a live server, and a static html/javascript site in production mode.
-* `/regions` contains the QGIS file defining the gographical regions.
+* `/regions` contains the QGIS file defining the geographical regions.

 See the individual README files for more information.
14 changes: 13 additions & 1 deletion indexer/conversions.py
@@ -131,6 +131,18 @@ def GeoShape(geo):
         return _geo(field, fmt % val)
     raise UnhandledFormatException("Didn't handle %s in GeoShape" % json.dumps(geo))

+def GeoCoordinates(geo):
+    #print('here [GeoCoordinates]')
+
+    lat = geo.get("latitude", None)
+    long = geo.get("longitude", None)
+    if lat is not None and long is not None:
+        print("Generating a Point from the GeoCoordinates...")
+        newPoint = "POINT (" + str(long) + " " + str(lat) + ")"
+        print(newPoint)
+        return _geo('point', newPoint)
+
+    raise UnhandledFormatException("Didn't handle %s in GeoCoordinates" % json.dumps(geo))

 def CourseInstance(data):
     atts = [_dispatch(field, data.get(field, None)) for field in ('startDate', 'endDate')]
@@ -239,7 +251,7 @@ def _parseDate(field, d):
     try:
         dt = isoparse(d)
         return [
-            Att('dt', dt.isoformat(), field),
+            Att('dt', dt.isoformat(timespec='seconds').replace('+00:00', 'Z'), field),
             Att('n', dt.year, field.replace('Date', 'Year')),
         ]
     except ValueError:
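Note: the new GeoCoordinates conversion emits a WKT point with longitude first, since WKT coordinates are (x y) ordered. A minimal standalone sketch of the behavior, with hypothetical coordinates and without the module's _geo helper:

    # hypothetical GeoCoordinates value as it appears in the graph
    geo = {"latitude": 44.67, "longitude": -63.61}

    # WKT points are ordered longitude/latitude (x y)
    newPoint = "POINT (" + str(geo["longitude"]) + " " + str(geo["latitude"]) + ")"
    print(newPoint)   # POINT (-63.61 44.67)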
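Note: the _parseDate change trims sub-second precision and rewrites a +00:00 offset as the Z suffix that Solr date fields expect. A quick sketch (assumes python-dateutil, which the indexer already uses for isoparse):

    from dateutil.parser import isoparse

    dt = isoparse("2024-12-17T10:15:30.123456+00:00")
    print(dt.isoformat())
    # 2024-12-17T10:15:30.123456+00:00
    print(dt.isoformat(timespec='seconds').replace('+00:00', 'Z'))
    # 2024-12-17T10:15:30Z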
89 changes: 89 additions & 0 deletions indexer/indexer-graph-solr.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+
+"""
+Purpose: Load a directory of JSON files, generated from the ODIS graph->Solr
+         process (mdp2solr.sh), into an existing Solr core.
+
+Steps:   1) set your Solr core endpoint variable at the command line:
+              export SOLR_URL=http://127.0.0.1:8983/solr/cioos
+         2) set the path to the directory of JSON files at the command line:
+              export DATA_DIR=/home/apps/oih-ui-jmckenna/indexer/data/test
+         3) python indexer-graph-solr.py
+
+Output:  Records indexed into the Solr core. Look for the "added resource" message
+         in the command window (which means it successfully indexed into Solr) such as:
+
+           ***Processing filename: /home/apps/oih-ui-jmckenna/indexer/data/test/ttt1.json
+           added resource https://catalogue.cioos.ca/dataset/00863729-b5a8-4ac6-b73a-523d463f9963.jsonld: schema:Dataset to index
+           ***Processing filename: /home/apps/oih-ui-jmckenna/indexer/data/test/ttt2.json
+           added resource https://catalogue.cioos.ca/dataset/d1391e91-1ed2-4600-901a-5a5408fd1a6f.jsonld: schema:Dataset to index
+
+Requires: Python 3.x
+
+Notes:   Input files are JSON (not the JSON-LD that the original "indexer.py" required)
+"""
+
+import requests
+import json
+import os
+import shutil
+from pathlib import Path
+from test_utils import test_generation, dump_exception
+
+# set urls
+BASE_SOLR_URL = os.environ.get('SOLR_URL', '')
+solr_url = BASE_SOLR_URL + "/update/json/docs"
+delete_url = BASE_SOLR_URL + "/update"
+query_url = BASE_SOLR_URL + "/select"
+
+DATA_DIR = os.environ.get('DATA_DIR')
+BASE_DIR = Path(DATA_DIR)
+
+session = requests.Session()
+
+# set Solr params
+solr_params = {
+    'commit': 'true',
+    # echo implies a dry run
+    # 'echo': 'true',
+}
+
+# index a single JSON file into Solr
+def import_file(file):
+    with open(file, 'rb') as f:
+        print("***Processing filename: " + f.name)
+        try:
+            orig = json.load(f)
+        except UnicodeDecodeError:
+            # retry with a more forgiving encoding; park the file on failure
+            f.seek(0)
+            file_bytes = f.read()
+            try:
+                file_string = file_bytes.decode('latin1')
+                orig = json.loads(file_string)
+            except Exception:
+                print("Issue decoding %s, continuing" % f.name)
+                shutil.copy(f.name, os.path.join('exceptions', os.path.basename(f.name)))
+                return

+    data = orig
+    data['keys'] = list(data.keys())
+    # print(json.dumps(data, indent=2))
+    data['json_source'] = json.dumps(data)
+    solr_post = session.post(solr_url, params=solr_params, json=data)
+    try:
+        solr_post.raise_for_status()
+        print("added resource %s: %s to index" % (data['id'], data['type']))
+    except requests.HTTPError:
+        dump_exception(orig, solr_post.text)
+        return
+    # print(solr_post.text)
+
+# loop through the directory of JSON files
+for item in os.scandir(BASE_DIR):
+    import_file(item)
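Note: to sanity-check a run, querying the core for its document count works well. A minimal sketch, reusing the same SOLR_URL environment variable as the script (the query parameters are standard Solr; the core name is whatever you exported):

    import os
    import requests

    # count all documents in the core
    resp = requests.get(os.environ['SOLR_URL'] + "/select",
                        params={'q': '*:*', 'rows': 0, 'wt': 'json'})
    resp.raise_for_status()
    print("documents indexed:", resp.json()['response']['numFound'])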

33 changes: 20 additions & 13 deletions indexer/indexer.py
@@ -151,10 +151,9 @@ def genericType_toAtts(orig, rid=None):
     if orig['@type'] == 'Project' or orig['@type'] == 'ResearchProject':
         print('***changing type:Project to type:ResearchProject')
         origType = 'ResearchProject'
-    #handle type:DigitalDocument as type:CreativeWork (see https://github.com/iodepo/odis-arch/issues/337 )
-    elif orig['@type'] == 'CreativeWork' or orig['@type'] == 'DigitalDocument':
-        print('***changing type:DigitalDocument to type:CreativeWork')
-        origType = 'CreativeWork'
+    #handle CreativeWork subsets as type:CreativeWork (see https://github.com/iodepo/odis-arch/issues/337 )
+    elif orig['@type'] == 'CreativeWork' or orig['@type'] == 'DigitalDocument' or orig['@type'] == 'Movie' or orig['@type'] == 'SoftwareSourceCode':
+        print('***changing type:' + orig['@type'] + ' to type:CreativeWork')
     else:
         origType = orig['@type']

@@ -185,14 +184,18 @@ def genericType_toAtts(orig, rid=None):
            #handle case of name as list
            for i in v:
                pos = 0
-               print(i.values())
-               for val in i.values():
-                   if val == "en":
-                       listForm = list(i.values())
-                       print('***Name: ' + listForm[pos+1])
-                       data.append(Att(None, listForm[pos+1], k))
-                       data.append(Att('txt', listForm[pos+1], k))
-                       data.append(Att('txt', regions.regionForName(listForm[pos+1]), 'region'))
+               if isinstance(i, dict) == True:
+                   print(i.values())
+                   for val in i.values():
+                       if val == "en":
+                           listForm = list(i.values())
+                           print('***Name: ' + listForm[pos+1])
+                           data.append(Att(None, listForm[pos+1], k))
+                           data.append(Att('txt', listForm[pos+1], k))
+                           data.append(Att('txt', regions.regionForName(listForm[pos+1]), 'region'))
+               else:
+                   data.append(Att(None, i, k))
+                   data.append(Att('txt', i, k))
        elif k == 'description':
            if isinstance(v, list) == False:
                #print('type is: ',type(v))
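Note: the isinstance() guard added above matters because a "name" list can mix language-tagged objects with plain strings. A sketch of the two shapes (the @language/@value keys are an assumption, inferred from how the loop scans i.values() for "en" and takes the following value):

    # hypothetical examples of the two "name" shapes the loop now handles
    names = [{"@language": "en", "@value": "Ocean Data Portal"}, "Ocean Data Portal"]

    for i in names:
        if isinstance(i, dict):
            # language-tagged form: take the value that follows "en"
            listForm = list(i.values())
            print("tagged name:", listForm[listForm.index("en") + 1])
        else:
            # plain-string form: use the string directly
            print("plain name:", i)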
@@ -330,7 +333,11 @@ def genericType_toAtts(orig, rid=None):
     if 'txt_region' in ret:
         ret['txt_region'] = list(set(ret['txt_region']))
     if 'txt_nationality' in ret:
-       ret['txt_nationality'] = list(set(ret['txt_nationality']))
+        ret['txt_nationality'] = list(set(ret['txt_nationality']))
+    if 'txt_license' in ret:
+        #remove trailing slash in urls, for performing comparison
+        stripped_vals = [url.rstrip('/') for url in ret['txt_license']]
+        ret['txt_license'] = list(set(stripped_vals))
     return ret

 def _merge_prov(orig, prov):
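Note: the txt_license normalization above makes URLs that differ only by a trailing slash deduplicate to a single entry. A quick sketch with hypothetical values:

    licenses = ["https://creativecommons.org/licenses/by/4.0/",
                "https://creativecommons.org/licenses/by/4.0"]

    stripped_vals = [url.rstrip('/') for url in licenses]
    print(list(set(stripped_vals)))
    # ['https://creativecommons.org/licenses/by/4.0']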
2 changes: 1 addition & 1 deletion solr/conf/schema.xml
@@ -104,7 +104,7 @@ schema. In this case the version should be set to the next CKAN version number.
    <field name="_version_" type="string" indexed="true" stored="true"/>
    <field name="index_id" type="string" indexed="true" stored="true" required="true" />
    <field name="id" type="string" indexed="true" stored="true" required="true" />
-   <field name="name" type="string" indexed="true" stored="true" required="false" />
+   <field name="name" type="string" indexed="true" stored="true" required="false" multiValued="true" />
    <field name="type" type="string" indexed="true" stored="true" required="true" omitNorms="true" />
    <field name="description" type="textgen" indexed="true" stored="true" required="false" />
    <!-- source graph id -->
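Note: this pairs with the indexer.py change above; once plain-string names are appended alongside language-tagged ones, a document can carry several "name" values, which a single-valued Solr field would reject. An illustrative document (field values are hypothetical):

    doc = {
        "id": "https://example.org/dataset/1",
        "type": "Dataset",
        "name": ["Ocean Data Portal", "Portail de donnees oceanographiques"],
    }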
