diff --git a/README.textile b/README.textile index 27b347f..e0d2309 100644 --- a/README.textile +++ b/README.textile @@ -30,29 +30,23 @@ Set up a location for the EntityFacts input data: @mkdir entityfacts ; cd entityfacts@ -Get the EntityFacts data from the DNB – for the @20200713@ release here, find current at "https://data.dnb.de/opendata/":https://data.dnb.de/opendata/: +Get the latest EntityFacts data from the DNB (see "https://data.dnb.de/opendata/":https://data.dnb.de/opendata/): -@wget https://data.dnb.de/opendata/authorities_entityfacts_20200713.jsonld.gz@ +@wget https://data.dnb.de/opendata/authorities_entityfacts.jsonld.gz@ Unpack the data: -@gunzip < authorities_entityfacts_20200713.jsonld.gz > authorities_entityfacts_20200713.jsonld@ +@gunzip < authorities_entityfacts.jsonld.gz > authorities_entityfacts.jsonld@ Go back to the root directory: @cd ..@ -Set up the data location in 'conf/application.conf' (data.entityfacts): +Index the data, passing the index name: -@data { ... entityfacts: "entityfacts/authorities_entityfacts_20200713.jsonld" ...@ +@sbt -Dindex.entityfacts.index=entityfacts_20210120 "runMain apps.Index entityfacts"@ -Set up the name for the index to create in 'conf/application.conf' (index.entityfacts.index): - -@index { ... entityfacts { index: "entityfacts_20200713" ...@ - -Index the data: - -@sbt "runMain apps.Index entityfacts"@ +For configuration details and defaults, see 'conf/application.conf'. h3. 
GND Baseline @@ -88,7 +82,7 @@ Convert the data to JSON-LD lines, the index data format: @sbt "runMain apps.ConvertBaseline"@ -To be able to log out from the server while the conversion is running, we actually use: +To be able to log out from the server while the conversion is running, we actually use (see full usage details in baseline.sh): @setsid nohup sbt "runMain apps.ConvertBaseline" &@ diff --git a/app/apps/Convert.java b/app/apps/Convert.java index 8b0d611..e5050fd 100644 --- a/app/apps/Convert.java +++ b/app/apps/Convert.java @@ -5,7 +5,6 @@ import static models.AuthorityResource.ELEMENTSET; import java.io.BufferedReader; -import java.io.File; import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; @@ -53,7 +52,6 @@ import com.github.jsonldjava.utils.JsonUtils; import com.google.common.collect.ImmutableMap; import com.typesafe.config.Config; -import com.typesafe.config.ConfigFactory; import com.typesafe.config.ConfigObject; import controllers.HomeController; @@ -64,13 +62,13 @@ public class Convert { - static final Config CONFIG = ConfigFactory.parseFile(new File("conf/application.conf")); + static final Config CONFIG = HomeController.CONFIG; static final TransportClient CLIENT = new PreBuiltTransportClient( Settings.builder().put("cluster.name", HomeController.config("index.boot.cluster")).build()); static { - ConfigFactory.parseFile(new File("conf/application.conf")).getStringList("index.boot.hosts").forEach((host) -> { + CONFIG.getStringList("index.boot.hosts").forEach((host) -> { try { CLIENT.addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(host), 9300)); } catch (UnknownHostException e) { diff --git a/baseline.sh b/baseline.sh new file mode 100644 index 0000000..1c38b7d --- /dev/null +++ b/baseline.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -uo pipefail # See http://redsymbol.net/articles/unofficial-bash-strict-mode/ +# Call on server like: setsid nohup bash baseline.sh > baseline.log 2>&1 & + +# 
details and defaults are configured in conf/application.conf + +export TODAY=$(date +'%Y%m%d') + +# get entityfacts baseline file +cd data/entityfacts/ +wget https://data.dnb.de/opendata/authorities_entityfacts.jsonld.gz +gunzip < authorities_entityfacts.jsonld.gz > authorities_entityfacts.jsonld +cd ../.. + +# index entityfacts JSON +sbt \ + -Dindex.entityfacts.index=entityfacts_$TODAY \ + "runMain apps.Index entityfacts" \ + > IndexEntityfacts_$TODAY.log 2>&1 + +# clean up entityfacts baseline file +mv data/entityfacts/authorities_entityfacts.jsonld.gz data/entityfacts/authorities_entityfacts_$TODAY.jsonld.gz + +# get gnd_lds baseline files +cd data/gnd_lds +wget https://data.dnb.de/opendata/authorities-{geografikum,koerperschaft,kongress,person,sachbegriff,werk}_lds.rdf.gz +cd ../.. +mkdir data/index/gnd_lds_$TODAY + +# convert RDF_XML to JSON lines +sbt \ + -Dindex.entityfacts.index=entityfacts_$TODAY \ + -Dindex.prod.name=gnd_$TODAY \ + -Ddata.jsonlines=data/index/gnd_lds_$TODAY \ + -Dindex.delete.baseline=GND-deprecated-baseline_$TODAY.txt \ + "runMain apps.ConvertBaseline" \ + > ConvertBaseline_$TODAY.log 2>&1 + +# clean up gnd_lds baseline files +mkdir data/gnd_lds/gnd_lds_$TODAY +mv data/gnd_lds/authorities-*_lds.rdf.gz data/gnd_lds/gnd_lds_$TODAY + +# index JSON lines +sbt \ + -Dindex.prod.name=gnd_$TODAY \ + -Ddata.jsonlines=data/index/gnd_lds_$TODAY \ + -Dindex.delete.baseline=GND-deprecated-baseline_$TODAY.txt \ + "runMain apps.Index baseline" \ + > IndexBaseline_$TODAY.log 2>&1 + +# index updates since last baseline (currently manual process) +# export LAST_BASE=20201013 # get date from https://data.dnb.de/opendata/? 
+# mkdir data/index/gnd_since_$LAST_BASE +# cp data/backup/GND-updates_2021*.jsonl data/index/gnd_since_$LAST_BASE # etc.; alt: OAI-PMH +# setsid nohup sbt \ +# -Dindex.prod.name=gnd_$TODAY \ +# -Ddata.jsonlines=data/index/gnd_since_$LAST_BASE \ +# -Dindex.delete.baseline=GND-deprecated-updates.txt \ +# "runMain apps.Index baseline" \ +# > IndexBaseline_since_$LAST_BASE.log 2>&1 & + +# a more automatable alternative might be to use OAI-PMH updates +# export LAST_BASE=2020-10-13 # get date from https://data.dnb.de/opendata/? +# sbt "runMain apps.ConvertUpdates $LAST_BASE" +# sbt "runMain apps.Index updates" + +## finally, switch 'gnd' alias to 'gnd_$TODAY' diff --git a/conf/application.conf b/conf/application.conf index 0baa623..321d369 100644 --- a/conf/application.conf +++ b/conf/application.conf @@ -43,7 +43,7 @@ index { settings: "conf/index-settings.json" content: "application/json; charset=utf-8" entityfacts { - index: "entityfacts-20190816-1600" + index: "entityfacts" type: "entityfacts" }, delete.baseline: "GND-deprecated-baseline.txt"