-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge #1921 from remote-tracking branch 'origin/1058-enrichWithCultur…
…egraphRvkWithFix'
- Loading branch information
Showing
11 changed files
with
2,324 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
53 changes: 53 additions & 0 deletions
53
src/main/java/org/lobid/resources/run/CulturegraphXmlFilterHbzRvkToCsv.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
/* Copyright 2020 hbz, Pascal Christoph. Licensed under the EPL 2.0*/ | ||
|
||
package org.lobid.resources.run; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
|
||
import org.metafacture.biblio.marc21.MarcXmlHandler; | ||
import org.metafacture.csv.CsvEncoder; | ||
import org.metafacture.json.JsonDecoder; | ||
import org.metafacture.json.JsonEncoder; | ||
import org.metafacture.io.FileOpener; | ||
import org.metafacture.io.ObjectWriter; | ||
import org.metafacture.xml.XmlDecoder; | ||
import org.metafacture.metafix.Metafix; | ||
|
||
/** | ||
* Filter resources with hbz holdings from culturegraph's MARCXML while tranform it with reject() | ||
* into a CSV file. | ||
* | ||
* @author Pascal Christoph (dr0i) | ||
* @author Tobias Bülte (TobiasNx) | ||
**/ | ||
public final class CulturegraphXmlFilterHbzRvkToCsv { | ||
private static String OUTPUT_FILE="cg-concordance.csv"; | ||
|
||
public static void main(String... args) { | ||
String XML_INPUT_FILE = new File(args[0]).getAbsolutePath(); | ||
|
||
if (args.length > 1) OUTPUT_FILE = args[1]; | ||
|
||
final FileOpener opener = new FileOpener(); | ||
JsonDecoder jsonDecoder = new JsonDecoder(); | ||
jsonDecoder.setRecordPath("records"); | ||
try { | ||
opener.setReceiver(new XmlDecoder()).setReceiver(new MarcXmlHandler()) | ||
.setReceiver(new Metafix("src/main/resources/rvk/cg-to-rvk-csv.fix")) | ||
.setReceiver(new JsonEncoder()) | ||
.setReceiver(jsonDecoder) | ||
.setReceiver(new CsvEncoder()) | ||
.setReceiver(new ObjectWriter<>(OUTPUT_FILE)); | ||
} catch (IOException e) { | ||
e.printStackTrace(); | ||
} | ||
opener.process( | ||
new File(XML_INPUT_FILE).getAbsolutePath()); | ||
try { | ||
opener.closeStream(); | ||
} catch (final NullPointerException e) { | ||
// ignore, see https://github.com/hbz/lobid-resources/issues/1030 | ||
} | ||
} | ||
} |
51 changes: 51 additions & 0 deletions
51
src/main/java/org/lobid/resources/run/CulturegraphXmlFilterHbzToJson.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
/* Copyright 2020 hbz, Pascal Christoph. Licensed under the EPL 2.0*/ | ||
|
||
package org.lobid.resources.run; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
|
||
import org.metafacture.biblio.marc21.MarcXmlHandler; | ||
import org.metafacture.elasticsearch.JsonToElasticsearchBulk; | ||
import org.metafacture.io.FileOpener; | ||
import org.metafacture.io.ObjectWriter; | ||
import org.metafacture.json.JsonEncoder; | ||
import org.metafacture.xml.XmlDecoder; | ||
import org.metafacture.metafix.Metafix; | ||
|
||
/** | ||
* Filter resources with hbz holdings from culturegraph's MARCXML while tranform it with reject() | ||
* into JSON and write this as an elasticsearch bulk json file. | ||
* | ||
* @author Pascal Christoph (dr0i) | ||
* @author Tobias Bülte (TobiasNx) | ||
**/ | ||
public final class CulturegraphXmlFilterHbzToJson { | ||
private static final String ELASTICSEARCH_INDEX_NAME = "cg"; | ||
public static final String ELASTICSEARCH_INDEX_TYPE_NAME="rvk"; | ||
private static String JSON_FILE="bulk.ndjson"; | ||
|
||
public static void main(String... args) { | ||
String XML_INPUT_FILE = new File(args[0]).getAbsolutePath(); | ||
|
||
if (args.length > 1) JSON_FILE = args[1]; | ||
|
||
final FileOpener opener = new FileOpener(); | ||
try { | ||
opener.setReceiver(new XmlDecoder()).setReceiver(new MarcXmlHandler()) | ||
.setReceiver(new Metafix("src/main/resources/rvk/cg-to-rvk-json.fix")) | ||
.setReceiver(new JsonEncoder()) | ||
.setReceiver(new JsonToElasticsearchBulk(ELASTICSEARCH_INDEX_TYPE_NAME, ELASTICSEARCH_INDEX_NAME)) | ||
.setReceiver(new ObjectWriter<>(JSON_FILE)); | ||
} catch (IOException e) { | ||
e.printStackTrace(); | ||
} | ||
opener.process( | ||
new File(XML_INPUT_FILE).getAbsolutePath()); | ||
try { | ||
opener.closeStream(); | ||
} catch (final NullPointerException e) { | ||
// ignore, see https://github.com/hbz/lobid-resources/issues/1030 | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# For each incoming MARC record: collect the RVK notations (084 $a where $2 matches "rvk") | ||
# and the hbz ids (035 $a starting with "(DE-605)"), then emit one "records[]" entry per | ||
# hbz id carrying that id and the joined RVK notations. Records lacking either are rejected. | ||
# Presumably this is the script loaded as cg-to-rvk-csv.fix - TODO confirm. | ||
set_array("records[]") | ||
set_array("@id[]") | ||
set_array("rvk[]") | ||
|
||
# Collect subfield a of every 084 field whose subfield 2 matches "rvk". | ||
do list(path: "084??", "var": "$i") | ||
if any_match("$i.2", "rvk") | ||
copy_field("$i.a","rvk[].$append") | ||
end | ||
end | ||
|
||
# De-duplicate the notations and join them with ",". | ||
uniq("rvk[]") | ||
join_field("rvk[]",",") | ||
|
||
|
||
# Collect subfield a of every 035 field that starts with the "(DE-605)" prefix. | ||
do list(path: "035??", "var": "$i") | ||
if any_match("$i.a", "^\\(DE-605\\)(.*)") | ||
copy_field("$i.a","@id[].$append") | ||
end | ||
end | ||
# NOTE(review): this targets "id[]", but the array filled above is "@id[]", so this | ||
# looks like a no-op; the prefix is stripped on "records[].*.id" further down - confirm. | ||
replace_all("id[].*","^\\(DE-605\\)(.*)","$1") | ||
|
||
# One "records[]" entry per collected id; each entry also gets the RVK value. | ||
do list(path: "@id[]", "var": "$i") | ||
copy_field("$i","records[].$append.id") | ||
copy_field("rvk[]","records[].$last.rvk[]") | ||
end | ||
# Strip the "(DE-605)" prefix, keeping only the remainder of the id. | ||
replace_all("records[].*.id","^\\(DE-605\\)(.*)","$1") | ||
|
||
# Presumably removes empty fields so the exists() checks below fail for arrays that | ||
# never received a value - TODO confirm against the Metafix documentation. | ||
vacuum() | ||
|
||
# Filter records without RVK | ||
unless exists("rvk[]") | ||
reject() | ||
end | ||
|
||
# Filter records without hbz ids | ||
unless exists("@id[]") | ||
reject() | ||
end | ||
|
||
# Only "records[]" is passed on; the helper fields "@id[]" and "rvk[]" are dropped. | ||
retain("records[]") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# For each incoming MARC record: keep only the RVK notations (084 $a where $2 matches | ||
# "rvk") as "rvk[]" and the hbz ids (035 $a starting with "(DE-605)") joined into "id". | ||
# Records lacking either are rejected. | ||
# Presumably this is the script loaded as cg-to-rvk-json.fix - TODO confirm. | ||
set_array("rvk[]") | ||
|
||
# Collect subfield a of every 084 field whose subfield 2 matches "rvk". | ||
do list(path: "084??", "var": "$i") | ||
if any_match("$i.2", "rvk") | ||
copy_field("$i.a","rvk[].$append") | ||
end | ||
end | ||
# De-duplicate the notations. | ||
uniq("rvk[]") | ||
set_array("id") | ||
# Collect subfield a of every 035 field that starts with the "(DE-605)" prefix. | ||
do list(path: "035??", "var": "$i") | ||
if any_match("$i.a", "^\\(DE-605\\)(.*)") | ||
copy_field("$i.a","id.$append") | ||
end | ||
end | ||
# Strip the "(DE-605)" prefix from every collected id, then join the ids with ", ". | ||
replace_all("id.*","^\\(DE-605\\)(.*)","$1") | ||
join_field("id",", ") | ||
|
||
# Drop everything except the two output fields. vacuum() presumably removes them again | ||
# when they are empty, so the exists() checks below can fail - TODO confirm. | ||
retain("rvk[]","id") | ||
vacuum() | ||
|
||
# Filter records without RVK | ||
unless exists("rvk[]") | ||
reject() | ||
end | ||
|
||
# Filter records without hbz ids | ||
unless exists("id") | ||
reject() | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
45 changes: 45 additions & 0 deletions
45
src/test/java/org/lobid/resources/CulturegraphXmlFilterHbzRvkToCsvTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
/* Copyright 2020 hbz, Pascal Christoph. Licensed under the EPL 2.0*/ | ||
|
||
package org.lobid.resources; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import org.junit.Test; | ||
import org.lobid.resources.run.CulturegraphXmlFilterHbzRvkToCsv; | ||
|
||
/** | ||
* Test of filtering resources with hbz holdings from culturegraph MARCXML, | ||
* tranforming into a CSV file. | ||
* | ||
* @author Pascal Christoph(dr0i) | ||
**/ | ||
public final class CulturegraphXmlFilterHbzRvkToCsvTest { | ||
|
||
private static final Logger LOG = | ||
LoggerFactory.getLogger(CulturegraphXmlFilterHbzRvkToCsvTest.class); | ||
|
||
private static final String PATH_TO_TEST = "src/test/resources/"; | ||
public static final String OUTPUT_FILE = | ||
PATH_TO_TEST + "cg/output.csv"; | ||
|
||
private static final String XML_INPUT_FILE = "cg/aggregate_20240507_example.marcxml"; | ||
|
||
@SuppressWarnings("static-method") | ||
@Test | ||
public void testExtractLookupTableFromCgAsHbzRvk() { | ||
CulturegraphXmlFilterHbzRvkToCsv.main(PATH_TO_TEST + XML_INPUT_FILE, | ||
OUTPUT_FILE); | ||
} | ||
|
||
/**private static void ingest() throws IOException { | ||
File jsonFile = new File(OUTPUT_FILE); | ||
}*/ | ||
|
||
|
||
} |
Oops, something went wrong.