Skip to content

Commit

Permalink
Merge #1921 from remote-tracking branch 'origin/1058-enrichWithCultur…
Browse files Browse the repository at this point in the history
…egraphRvkWithFix'
  • Loading branch information
dr0i committed Jun 4, 2024
2 parents 189679f + e297026 commit eb223f8
Show file tree
Hide file tree
Showing 11 changed files with 2,324 additions and 2 deletions.
23 changes: 22 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,12 @@
</dependency>
<dependency>
<groupId>org.metafacture</groupId>
<artifactId>metafacture-triples</artifactId>
<artifactId>metafacture-elasticsearch</artifactId>
<version>6.0.0</version>
</dependency>
<dependency>
<groupId>org.metafacture</groupId>
<artifactId>metafacture-csv</artifactId>
<version>6.0.0</version>
</dependency>
<dependency>
Expand Down Expand Up @@ -124,6 +129,21 @@
<artifactId>core</artifactId>
<version>1.47.1</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.9.1</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-1.2-api</artifactId>
<version>2.9.1</version>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
<version>1.5.1</version>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
Expand Down Expand Up @@ -263,6 +283,7 @@
</executions>
<configuration>
<excludes>
<exclude>tmp/</exclude>
<exclude>web/public/javascripts/leaflet.js</exclude>
<exclude>**/*.woff2</exclude>
<exclude>web/conf/context.jsonld</exclude>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/* Copyright 2020 hbz, Pascal Christoph. Licensed under the EPL 2.0*/

package org.lobid.resources.run;

import java.io.File;
import java.io.IOException;

import org.metafacture.biblio.marc21.MarcXmlHandler;
import org.metafacture.csv.CsvEncoder;
import org.metafacture.json.JsonDecoder;
import org.metafacture.json.JsonEncoder;
import org.metafacture.io.FileOpener;
import org.metafacture.io.ObjectWriter;
import org.metafacture.xml.XmlDecoder;
import org.metafacture.metafix.Metafix;

/**
 * Reads culturegraph MARCXML, transforms it with the Fix
 * {@code src/main/resources/rvk/cg-to-rvk-csv.fix} (which calls {@code reject()} on
 * records lacking RVK notations or hbz ids) and writes the result as a CSV file.
 *
 * @author Pascal Christoph (dr0i)
 * @author Tobias Bülte (TobiasNx)
 **/
public final class CulturegraphXmlFilterHbzRvkToCsv {
  /** Fix definition applied to every MARCXML record. */
  private static final String FIX_FILE = "src/main/resources/rvk/cg-to-rvk-csv.fix";
  /** Output file used when no second argument is given. */
  private static final String DEFAULT_OUTPUT_FILE = "cg-concordance.csv";

  /**
   * Runs the MARCXML to CSV transformation.
   *
   * @param args {@code args[0]}: path of the MARCXML input file (required);
   *        {@code args[1]}: path of the CSV output file (optional, defaults to
   *        {@code cg-concordance.csv})
   * @throws IllegalArgumentException if no input file is given
   * @throws IllegalStateException if the pipeline cannot be set up because of an
   *         {@link IOException}
   */
  public static void main(String... args) {
    if (args.length < 1) {
      throw new IllegalArgumentException(
          "Usage: CulturegraphXmlFilterHbzRvkToCsv <marcxml-input-file> [<csv-output-file>]");
    }
    final String xmlInputFile = new File(args[0]).getAbsolutePath();
    // Resolved per call, so a path passed to an earlier invocation is not reused.
    final String outputFile = args.length > 1 ? args[1] : DEFAULT_OUTPUT_FILE;

    final FileOpener opener = new FileOpener();
    // The Fix output is encoded as JSON and decoded again using the record path
    // "records" before it is handed to the CSV encoder.
    final JsonDecoder jsonDecoder = new JsonDecoder();
    jsonDecoder.setRecordPath("records");
    try {
      opener.setReceiver(new XmlDecoder())
          .setReceiver(new MarcXmlHandler())
          .setReceiver(new Metafix(FIX_FILE))
          .setReceiver(new JsonEncoder())
          .setReceiver(jsonDecoder)
          .setReceiver(new CsvEncoder())
          .setReceiver(new ObjectWriter<>(outputFile));
    } catch (final IOException e) {
      // Do not go on with a partially wired pipeline.
      throw new IllegalStateException(
          "Could not set up the transformation pipeline using " + FIX_FILE, e);
    }
    opener.process(xmlInputFile);
    try {
      opener.closeStream();
    } catch (final NullPointerException ignored) {
      // ignore, see https://github.com/hbz/lobid-resources/issues/1030
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/* Copyright 2020 hbz, Pascal Christoph. Licensed under the EPL 2.0*/

package org.lobid.resources.run;

import java.io.File;
import java.io.IOException;

import org.metafacture.biblio.marc21.MarcXmlHandler;
import org.metafacture.elasticsearch.JsonToElasticsearchBulk;
import org.metafacture.io.FileOpener;
import org.metafacture.io.ObjectWriter;
import org.metafacture.json.JsonEncoder;
import org.metafacture.xml.XmlDecoder;
import org.metafacture.metafix.Metafix;

/**
 * Reads culturegraph MARCXML, transforms it with the Fix
 * {@code src/main/resources/rvk/cg-to-rvk-json.fix} (which calls {@code reject()} on
 * records lacking RVK notations or hbz ids) into JSON and writes that as an
 * Elasticsearch bulk JSON file.
 *
 * @author Pascal Christoph (dr0i)
 * @author Tobias Bülte (TobiasNx)
 **/
public final class CulturegraphXmlFilterHbzToJson {
  private static final String ELASTICSEARCH_INDEX_NAME = "cg";
  public static final String ELASTICSEARCH_INDEX_TYPE_NAME = "rvk";
  /** Fix definition applied to every MARCXML record. */
  private static final String FIX_FILE = "src/main/resources/rvk/cg-to-rvk-json.fix";
  /** Output file used when no second argument is given. */
  private static final String DEFAULT_JSON_FILE = "bulk.ndjson";

  /**
   * Runs the MARCXML to Elasticsearch bulk JSON transformation.
   *
   * @param args {@code args[0]}: path of the MARCXML input file (required);
   *        {@code args[1]}: path of the bulk JSON output file (optional, defaults to
   *        {@code bulk.ndjson})
   * @throws IllegalArgumentException if no input file is given
   * @throws IllegalStateException if the pipeline cannot be set up because of an
   *         {@link IOException}
   */
  public static void main(String... args) {
    if (args.length < 1) {
      throw new IllegalArgumentException(
          "Usage: CulturegraphXmlFilterHbzToJson <marcxml-input-file> [<json-output-file>]");
    }
    final String xmlInputFile = new File(args[0]).getAbsolutePath();
    // Resolved per call, so a path passed to an earlier invocation is not reused.
    final String jsonFile = args.length > 1 ? args[1] : DEFAULT_JSON_FILE;

    final FileOpener opener = new FileOpener();
    try {
      opener.setReceiver(new XmlDecoder())
          .setReceiver(new MarcXmlHandler())
          .setReceiver(new Metafix(FIX_FILE))
          .setReceiver(new JsonEncoder())
          .setReceiver(new JsonToElasticsearchBulk(ELASTICSEARCH_INDEX_TYPE_NAME,
              ELASTICSEARCH_INDEX_NAME))
          .setReceiver(new ObjectWriter<>(jsonFile));
    } catch (final IOException e) {
      // Do not go on with a partially wired pipeline.
      throw new IllegalStateException(
          "Could not set up the transformation pipeline using " + FIX_FILE, e);
    }
    opener.process(xmlInputFile);
    try {
      opener.closeStream();
    } catch (final NullPointerException ignored) {
      // ignore, see https://github.com/hbz/lobid-resources/issues/1030
    }
  }
}
40 changes: 40 additions & 0 deletions src/main/resources/rvk/cg-to-rvk-csv.fix
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Builds a "records[]" array holding one entry per collected "(DE-605)" id; each entry
# gets the id and the record's RVK notations. Records without RVK notations or without
# such ids are rejected. Only "records[]" is retained.

# Create the arrays that are filled below.
set_array("records[]")
set_array("@id[]")
set_array("rvk[]")

# Collect subfield a of every 084 field whose subfield 2 matches "rvk".
do list(path: "084??", "var": "$i")
if any_match("$i.2", "rvk")
copy_field("$i.a","rvk[].$append")
end
end

# Remove duplicate notations, then join them with ",".
uniq("rvk[]")
join_field("rvk[]",",")


# Collect subfield a of every 035 field whose value starts with "(DE-605)".
do list(path: "035??", "var": "$i")
if any_match("$i.a", "^\\(DE-605\\)(.*)")
copy_field("$i.a","@id[].$append")
end
end
# NOTE(review): this targets "id[]", while the array filled above is named "@id[]".
# It presumably matches nothing; the prefix is stripped from "records[].*.id" further
# down. Confirm, then fix the path or remove the line.
replace_all("id[].*","^\\(DE-605\\)(.*)","$1")

# Append one entry to "records[]" per collected id and copy the RVK value into it.
do list(path: "@id[]", "var": "$i")
copy_field("$i","records[].$append.id")
copy_field("rvk[]","records[].$last.rvk[]")
end
# Strip the "(DE-605)" prefix from the ids, keeping only what follows it.
replace_all("records[].*.id","^\\(DE-605\\)(.*)","$1")

# Presumably drops empty fields so the exists() checks below only see populated
# values. TODO confirm against the Fix documentation.
vacuum()

# Filter records without RVK
unless exists("rvk[]")
reject()
end

# Filter records without hbz ids
unless exists("@id[]")
reject()
end

# Keep only the "records[]" array in the output.
retain("records[]")
29 changes: 29 additions & 0 deletions src/main/resources/rvk/cg-to-rvk-json.fix
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Reduces each record to its RVK notations ("rvk[]") and its joined "(DE-605)" ids ("id").
# Records lacking either of them are rejected.

set_array("rvk[]")

# Collect subfield a of every 084 field whose subfield 2 matches "rvk".
do list(path: "084??", "var": "$i")
if any_match("$i.2", "rvk")
copy_field("$i.a","rvk[].$append")
end
end
# Remove duplicate notations.
uniq("rvk[]")
set_array("id")
# Collect subfield a of every 035 field whose value starts with "(DE-605)".
do list(path: "035??", "var": "$i")
if any_match("$i.a", "^\\(DE-605\\)(.*)")
copy_field("$i.a","id.$append")
end
end
# Strip the "(DE-605)" prefix from the ids, then join them with ", ".
replace_all("id.*","^\\(DE-605\\)(.*)","$1")
join_field("id",", ")

# Keep only the two output fields. vacuum() presumably drops them when they are empty,
# so the exists() checks below only see populated values. TODO confirm.
retain("rvk[]","id")
vacuum()

# Filter records without RVK
unless exists("rvk[]")
reject()
end

# Filter records without hbz ids
unless exists("id")
reject()
end
4 changes: 3 additions & 1 deletion src/test/java/UnitTests.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
@RunWith(Suite.class)
@Suite.SuiteClasses({
TestGenerateContext.class,
org.lobid.resources.AlmaMarc21XmlToLobidJsonMetafixTest.class})
org.lobid.resources.AlmaMarc21XmlToLobidJsonMetafixTest.class,
org.lobid.resources.CulturegraphXmlFilterHbzRvkToCsvTest.class,
org.lobid.resources.CulturegraphXmlFilterHbzToJsonTest.class})

public final class UnitTests {
/* Suite class, groups tests via annotation above */
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/* Copyright 2020 hbz, Pascal Christoph. Licensed under the EPL 2.0*/

package org.lobid.resources;

import static org.junit.Assert.assertEquals;

import java.io.File;
import java.io.IOException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.junit.Test;
import org.lobid.resources.run.CulturegraphXmlFilterHbzRvkToCsv;

/**
 * Runs {@link CulturegraphXmlFilterHbzRvkToCsv} on a culturegraph MARCXML example file,
 * transforming it into a CSV file below the test resources directory.
 *
 * @author Pascal Christoph(dr0i)
 **/
public final class CulturegraphXmlFilterHbzRvkToCsvTest {

  private static final Logger LOG =
      LoggerFactory.getLogger(CulturegraphXmlFilterHbzRvkToCsvTest.class);

  /** Base directory of the test resources. */
  private static final String PATH_TO_TEST = "src/test/resources/";
  /** CSV file written by the test run. */
  public static final String OUTPUT_FILE = PATH_TO_TEST + "cg/output.csv";
  /** MARCXML example input, relative to {@link #PATH_TO_TEST}. */
  private static final String XML_INPUT_FILE = "cg/aggregate_20240507_example.marcxml";

  /**
   * Invokes the runner with the example input and the output path. The test contains
   * no assertions; it fails only if the runner throws.
   */
  @SuppressWarnings("static-method")
  @Test
  public void testExtractLookupTableFromCgAsHbzRvk() {
    final String inputPath = PATH_TO_TEST + XML_INPUT_FILE;
    CulturegraphXmlFilterHbzRvkToCsv.main(inputPath, OUTPUT_FILE);
  }

}
Loading

0 comments on commit eb223f8

Please sign in to comment.