
Commit

re-add recent changes to main
jmckenna committed Dec 17, 2024
1 parent bae7f34 commit 6093b2c
Showing 9 changed files with 137 additions and 21 deletions.
8 changes: 8 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,8 @@
+# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
+
+version: 2
+updates:
+  - package-ecosystem: "github-actions" # See documentation for possible values
+    directory: "/" # Location of package manifests
+    schedule:
+      interval: "weekly"
2 changes: 1 addition & 1 deletion .github/workflows/check-crlf.yml
@@ -12,7 +12,7 @@ jobs:

     steps:
       - name: Checkout repository contents
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Use action to check for CRLF endings
         uses: erclu/[email protected]
6 changes: 3 additions & 3 deletions .github/workflows/irc_notify.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
      - name: irc push
-       uses: rectalogic/notify-irc@v1
+       uses: rectalogic/notify-irc@v2
        if: github.event_name == 'push'
        with:
          channel: "#oih"
@@ -26,7 +26,7 @@
            ${{ github.actor }} pushed ${{ github.event.ref }} ${{ github.event.compare }}
            ${{ join(github.event.commits.*.message) }}
      - name: irc pull request
-       uses: rectalogic/notify-irc@v1
+       uses: rectalogic/notify-irc@v2
        if: github.event_name == 'pull_request'
        with:
          channel: "#oih"
@@ -36,7 +36,7 @@
         message: |
           ${{ github.actor }} opened PR ${{ github.event.pull_request.html_url }}
      - name: irc tag created
-       uses: rectalogic/notify-irc@v1
+       uses: rectalogic/notify-irc@v2
        if: github.event_name == 'create' && github.event.ref_type == 'tag'
        with:
          channel: "#oih"
2 changes: 1 addition & 1 deletion LICENSE.md
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023 Ocean InfoHub
+Copyright (c) 2024 Ocean InfoHub

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
2 changes: 1 addition & 1 deletion README.md
@@ -6,6 +6,6 @@ This repo contains the code for the Ocean Info Hub Global Search Portal.
 * `/indexer` contains all of the code to ingest the OIH graph into the SOLR Instance
 * `/solr` contains the configuration required for the solr instance, including the schema.
 * `/frontend` contains the code for the static javascript app. This will produce a container in dev mode running a live server, and a static html/javascript site in production mode.
-* `/regions` contains the QGIS file defining the gographical regions.
+* `/regions` contains the QGIS file defining the geographical regions.

 See the individual README files for more information.
14 changes: 13 additions & 1 deletion indexer/conversions.py
@@ -131,6 +131,18 @@ def GeoShape(geo):
         return _geo(field, fmt % val)
     raise UnhandledFormatException("Didn't handle %s in GeoShape" % json.dumps(geo))

+def GeoCoordinates(geo):
+    #print('here [GeoCoordinates]')
+
+    lat = geo.get("latitude", None)
+    long = geo.get("longitude", None)
+    if lat is not None and long is not None:
+        print("Generating a Point from the GeoCoordinates...")
+        newPoint = "POINT (" + str(long) + " " + str(lat) + ")"
+        print(newPoint)
+        return _geo('point', newPoint)
+
+    raise UnhandledFormatException("Didn't handle %s in GeoCoordinates" % json.dumps(geo))

 def CourseInstance(data):
     atts = [_dispatch(field, data.get(field, None)) for field in ('startDate', 'endDate')]
@@ -239,7 +251,7 @@ def _parseDate(field, d):
     try:
         dt = isoparse(d)
         return [
-            Att('dt', dt.isoformat(), field),
+            Att('dt', dt.isoformat(timespec='seconds').replace('+00:00', 'Z'), field),
             Att('n', dt.year, field.replace('Date', 'Year')),
         ]
     except ValueError:
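Note: the new GeoCoordinates conversion emits a WKT point with longitude first, since WKT coordinates are (x y) ordered. A minimal standalone sketch of the behavior, with hypothetical coordinates and without the module's _geo helper:

    # hypothetical GeoCoordinates value as it appears in the graph
    geo = {"latitude": 44.67, "longitude": -63.61}

    # WKT points are ordered longitude/latitude (x y)
    newPoint = "POINT (" + str(geo["longitude"]) + " " + str(geo["latitude"]) + ")"
    print(newPoint)   # POINT (-63.61 44.67)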
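Note: the _parseDate change trims sub-second precision and rewrites a +00:00 offset as the Z suffix that Solr date fields expect. A quick sketch (assumes python-dateutil, which the indexer already uses for isoparse):

    from dateutil.parser import isoparse

    dt = isoparse("2024-12-17T10:15:30.123456+00:00")
    print(dt.isoformat())
    # 2024-12-17T10:15:30.123456+00:00
    print(dt.isoformat(timespec='seconds').replace('+00:00', 'Z'))
    # 2024-12-17T10:15:30Z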
89 changes: 89 additions & 0 deletions indexer/indexer-graph-solr.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+
+"""
+Purpose: Load a directory of JSON files, generated from the ODIS graph->Solr
+         process (mdp2solr.sh), into an existing Solr core.
+
+Steps:   1) set your Solr core endpoint variable at the command line:
+              export SOLR_URL=http://127.0.0.1:8983/solr/cioos
+         2) set the path to the directory of JSON files at the command line:
+              export DATA_DIR=/home/apps/oih-ui-jmckenna/indexer/data/test
+         3) python indexer-graph-solr.py
+
+Output:  Records indexed into the Solr core. Look for the "added resource" message
+         in the command window (which means it successfully indexed into Solr) such as:
+
+           ***Processing filename: /home/apps/oih-ui-jmckenna/indexer/data/test/ttt1.json
+           added resource https://catalogue.cioos.ca/dataset/00863729-b5a8-4ac6-b73a-523d463f9963.jsonld: schema:Dataset to index
+           ***Processing filename: /home/apps/oih-ui-jmckenna/indexer/data/test/ttt2.json
+           added resource https://catalogue.cioos.ca/dataset/d1391e91-1ed2-4600-901a-5a5408fd1a6f.jsonld: schema:Dataset to index
+
+Requires: Python 3.x
+
+Notes:   Input files are JSON (not the JSON-LD that the original "indexer.py" required)
+"""
+
+import requests
+import json
+import os
+import shutil
+from pathlib import Path
+from test_utils import test_generation, dump_exception
+
+# set urls
+BASE_SOLR_URL = os.environ.get('SOLR_URL', '')
+solr_url = BASE_SOLR_URL + "/update/json/docs"
+delete_url = BASE_SOLR_URL + "/update"
+query_url = BASE_SOLR_URL + "/select"
+
+DATA_DIR = os.environ.get('DATA_DIR')
+BASE_DIR = Path(DATA_DIR)
+
+session = requests.Session()
+
+# set Solr params
+solr_params = {
+    'commit': 'true',
+    # echo implies a dry run
+    # 'echo': 'true',
+}
+
+# index a single JSON file into Solr
+def import_file(file):
+    with open(file, 'rb') as f:
+        print("***Processing filename: " + f.name)
+        try:
+            orig = json.load(f)
+        except UnicodeDecodeError:
+            # retry with a more forgiving encoding; park the file on failure
+            f.seek(0)
+            file_bytes = f.read()
+            try:
+                file_string = file_bytes.decode('latin1')
+                orig = json.loads(file_string)
+            except Exception:
+                print("Issue decoding %s, continuing" % f.name)
+                shutil.copy(f.name, os.path.join('exceptions', os.path.basename(f.name)))
+                return

+    data = orig
+    data['keys'] = list(data.keys())
+    # print(json.dumps(data, indent=2))
+    data['json_source'] = json.dumps(data)
+    solr_post = session.post(solr_url, params=solr_params, json=data)
+    try:
+        solr_post.raise_for_status()
+        print("added resource %s: %s to index" % (data['id'], data['type']))
+    except requests.HTTPError:
+        dump_exception(orig, solr_post.text)
+        return
+    # print(solr_post.text)
+
+# loop through the directory of JSON files
+for item in os.scandir(BASE_DIR):
+    import_file(item)
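Note: to sanity-check a run, querying the core for its document count works well. A minimal sketch, reusing the same SOLR_URL environment variable as the script (the query parameters are standard Solr; the core name is whatever you exported):

    import os
    import requests

    # count all documents in the core
    resp = requests.get(os.environ['SOLR_URL'] + "/select",
                        params={'q': '*:*', 'rows': 0, 'wt': 'json'})
    resp.raise_for_status()
    print("documents indexed:", resp.json()['response']['numFound'])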

33 changes: 20 additions & 13 deletions indexer/indexer.py
@@ -151,10 +151,9 @@ def genericType_toAtts(orig, rid=None):
     if orig['@type'] == 'Project' or orig['@type'] == 'ResearchProject':
         print('***changing type:Project to type:ResearchProject')
         origType = 'ResearchProject'
-    #handle type:DigitalDocument as type:CreativeWork (see https://github.com/iodepo/odis-arch/issues/337 )
-    elif orig['@type'] == 'CreativeWork' or orig['@type'] == 'DigitalDocument':
-        print('***changing type:DigitalDocument to type:CreativeWork')
-        origType = 'CreativeWork'
+    #handle CreativeWork subsets as type:CreativeWork (see https://github.com/iodepo/odis-arch/issues/337 )
+    elif orig['@type'] == 'CreativeWork' or orig['@type'] == 'DigitalDocument' or orig['@type'] == 'Movie' or orig['@type'] == 'SoftwareSourceCode':
+        print('***changing type:' + orig['@type'] + ' to type:CreativeWork')
     else:
         origType = orig['@type']

@@ -185,14 +184,18 @@ def genericType_toAtts(orig, rid=None):
            #handle case of name as list
            for i in v:
                pos = 0
-               print(i.values())
-               for val in i.values():
-                   if val == "en":
-                       listForm = list(i.values())
-                       print('***Name: ' + listForm[pos+1])
-                       data.append(Att(None, listForm[pos+1], k))
-                       data.append(Att('txt', listForm[pos+1], k))
-                       data.append(Att('txt', regions.regionForName(listForm[pos+1]), 'region'))
+               if isinstance(i, dict) == True:
+                   print(i.values())
+                   for val in i.values():
+                       if val == "en":
+                           listForm = list(i.values())
+                           print('***Name: ' + listForm[pos+1])
+                           data.append(Att(None, listForm[pos+1], k))
+                           data.append(Att('txt', listForm[pos+1], k))
+                           data.append(Att('txt', regions.regionForName(listForm[pos+1]), 'region'))
+               else:
+                   data.append(Att(None, i, k))
+                   data.append(Att('txt', i, k))
        elif k == 'description':
            if isinstance(v, list) == False:
                #print('type is: ',type(v))
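Note: the isinstance() guard added above matters because a "name" list can mix language-tagged objects with plain strings. A sketch of the two shapes (the @language/@value keys are an assumption, inferred from how the loop scans i.values() for "en" and takes the following value):

    # hypothetical examples of the two "name" shapes the loop now handles
    names = [{"@language": "en", "@value": "Ocean Data Portal"}, "Ocean Data Portal"]

    for i in names:
        if isinstance(i, dict):
            # language-tagged form: take the value that follows "en"
            listForm = list(i.values())
            print("tagged name:", listForm[listForm.index("en") + 1])
        else:
            # plain-string form: use the string directly
            print("plain name:", i)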
@@ -330,7 +333,11 @@ def genericType_toAtts(orig, rid=None):
     if 'txt_region' in ret:
         ret['txt_region'] = list(set(ret['txt_region']))
     if 'txt_nationality' in ret:
-       ret['txt_nationality'] = list(set(ret['txt_nationality']))
+        ret['txt_nationality'] = list(set(ret['txt_nationality']))
+    if 'txt_license' in ret:
+        #remove trailing slash in urls, for performing comparison
+        stripped_vals = [url.rstrip('/') for url in ret['txt_license']]
+        ret['txt_license'] = list(set(stripped_vals))
     return ret

 def _merge_prov(orig, prov):
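Note: the txt_license normalization above makes URLs that differ only by a trailing slash deduplicate to a single entry. A quick sketch with hypothetical values:

    licenses = ["https://creativecommons.org/licenses/by/4.0/",
                "https://creativecommons.org/licenses/by/4.0"]

    stripped_vals = [url.rstrip('/') for url in licenses]
    print(list(set(stripped_vals)))
    # ['https://creativecommons.org/licenses/by/4.0']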
2 changes: 1 addition & 1 deletion solr/conf/schema.xml
@@ -104,7 +104,7 @@ schema. In this case the version should be set to the next CKAN version number.
    <field name="_version_" type="string" indexed="true" stored="true"/>
    <field name="index_id" type="string" indexed="true" stored="true" required="true" />
    <field name="id" type="string" indexed="true" stored="true" required="true" />
-   <field name="name" type="string" indexed="true" stored="true" required="false" />
+   <field name="name" type="string" indexed="true" stored="true" required="false" multiValued="true" />
    <field name="type" type="string" indexed="true" stored="true" required="true" omitNorms="true" />
    <field name="description" type="textgen" indexed="true" stored="true" required="false" />
    <!-- source graph id -->
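Note: this pairs with the indexer.py change above; once plain-string names are appended alongside language-tagged ones, a document can carry several "name" values, which a single-valued Solr field would reject. An illustrative document (field values are hypothetical):

    doc = {
        "id": "https://example.org/dataset/1",
        "type": "Dataset",
        "name": ["Ocean Data Portal", "Portail de donnees oceanographiques"],
    }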
