Skip to content

Commit

Permalink
Hebrew processing scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
andrew-morrison committed Feb 5, 2018
1 parent bb89e1a commit 206448d
Show file tree
Hide file tree
Showing 10 changed files with 447 additions and 27 deletions.
65 changes: 65 additions & 0 deletions processing/batch_conversion/add-keys.xsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns="http://www.tei-c.org/ns/1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:tei="http://www.tei-c.org/ns/1.0"
xmlns:saxon="http://saxon.sf.net/"
exclude-result-prefixes="xs"
version="2.0">

<!-- DON'T FORGET TO SET XSLT TRANSFORMER TO IGNORE THE SCHEMA (TO AVOID ADDING DEFAULT ATTRIBUTES) -->

<xsl:output method="xml" encoding="UTF-8"/>

<!-- A single line-feed character; the root template writes it between the top-level nodes it emits. -->
<xsl:variable name="newline" select="'&#10;'"/>

<!-- Authority entries, loaded from files addressed relative to this stylesheet. -->
<!-- $works is only referenced by the commented-out msItem branch of the element template. -->
<xsl:variable name="works" select="document('../../authority/works_master.xml')//tei:TEI/tei:text/tei:body/tei:listBibl/tei:bibl"/>
<!-- tei:person entries; their tei:persName children are compared with author and persName text. -->
<xsl:variable name="people" select="document('../../authority/persons_master.xml')//tei:TEI/tei:text/tei:body/tei:listPerson/tei:person"/>
<!-- tei:place entries; their tei:placeName children are compared with placeName text. -->
<xsl:variable name="places" select="document('../../authority/places_master.xml')//tei:TEI/tei:text/tei:body/tei:listPlace/tei:place"/>

<!--
    Root template. Writes two fresh xml-model processing instructions (RELAX NG and
    Schematron, both pointing at msdesc.rng) and then processes every top-level node
    of the input except any xml-model processing instructions it already carries,
    so the schema association is replaced rather than duplicated.
-->
<xsl:template match="/">
<xsl:value-of select="$newline"/>
<xsl:processing-instruction name="xml-model"><xsl:text>href="https://raw.githubusercontent.com/bodleian/consolidated-tei-schema/master/msdesc.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"</xsl:text></xsl:processing-instruction><xsl:value-of select="$newline"/>
<xsl:processing-instruction name="xml-model"><xsl:text>href="https://raw.githubusercontent.com/bodleian/consolidated-tei-schema/master/msdesc.rng" type="application/xml" schematypens="http://purl.oclc.org/dsdl/schematron"</xsl:text></xsl:processing-instruction><xsl:value-of select="$newline"/>
<!-- Select all top-level nodes, not just the root element, so that top-level comments and
     other processing instructions survive. The self:: test is what filters out the old
     xml-model instructions; the previous predicate looked for such an instruction as a
     child of the root element and therefore never filtered anything. -->
<xsl:apply-templates select="node()[not(self::processing-instruction('xml-model'))]"/>
</xsl:template>

<!--
    Element template. Copies each element and its attributes, dropping any existing
    key attribute, then recurses into the children. For tei:author, tei:persName and
    tei:placeName the whitespace-normalized text content is looked up in the matching
    authority list ($people or $places); when an entry with the same name is found,
    its xml:id is written as a new key attribute.
-->
<xsl:template match="*">
<xsl:copy>
<xsl:apply-templates select="@*[not(name()='key')]"/> <!-- A very, very few already have keys, and they don't relate to anything, so strip them out. -->
<!-- Don't do msItems
<xsl:when test="self::tei:msItem">
<xsl:variable name="thisid" select="@xml:id"/>
<xsl:if test="$thisid = $works/tei:ref/@target">
<xsl:attribute name="key" select="$works[tei:ref/@target = $thisid]/@xml:id"/>
</xsl:if>
</xsl:when>-->
<xsl:if test="self::tei:author or self::tei:persName or self::tei:placeName">
<!-- All descendant text joined with single spaces and whitespace-normalized. -->
<!-- NOTE(review): for tei:author this includes the text of any nested tei:persName; confirm that matches how names are recorded in persons_master.xml. -->
<xsl:variable name="thisval" select="normalize-space(string-join(.//text(), ' '))"/>
<!-- Scan the authority list once; authors and persNames share the people list. -->
<xsl:variable name="matchedids" as="attribute()*"
    select="if (self::tei:placeName)
            then $places[tei:placeName = $thisval]/@xml:id
            else $people[tei:persName = $thisval]/@xml:id"/>
<xsl:if test="exists($matchedids)">
<!-- Write exactly one ID. Passing the whole sequence would produce a space-separated list whenever several authority entries carry the same name. -->
<xsl:attribute name="key" select="$matchedids[1]"/>
</xsl:if>
<xsl:if test="count($matchedids) gt 1">
<!-- Report the ambiguity so the authority file can be corrected; xsl:message adds nothing to the result tree. -->
<xsl:message>WARNING: ambiguous authority match for "<xsl:value-of select="$thisval"/>" - using the first of: <xsl:value-of select="$matchedids"/></xsl:message>
</xsl:if>
</xsl:if>
<xsl:apply-templates/>
</xsl:copy>
</xsl:template>

<!-- Attributes, comments and processing instructions are passed through to the output unchanged. -->
<xsl:template match="@*|comment()|processing-instruction()">
<xsl:copy-of select="."/>
</xsl:template>

</xsl:stylesheet>
103 changes: 103 additions & 0 deletions processing/batch_conversion/build-interim-person-authority-file.xquery
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
declare namespace tei="http://www.tei-c.org/ns/1.0";
declare option saxon:output "indent=yes";


(:
    Writes a log line to the trace output and returns the empty string, so the call
    can sit inside element content without adding anything to the result.
    $level  - severity label; upper-cased in the message
    $msg    - message text
    $values - sequence of values appended to the message, separated by spaces
:)
declare function local:logging($level, $msg, $values)
{
    let $line := concat(upper-case($level), ' ', $msg, ' ', string-join($values, ' '), ' ')
    (: trace() returns its first argument (here ''), and the zero-length substring keeps the result empty. :)
    return substring(trace('', $line), 0, 0)
};

(:
    Main query. Emits a skeleton TEI document whose listPerson holds one person entry
    per distinct name found in persName and author elements of the Hebrew collection
    and in persName elements of the Genizah collection. Each entry carries a generated
    xml:id, the name as a display persName, and the distinct catalogue references
    (record ID and first msIdentifier idno) of the records where the name occurs.
:)
<TEI xmlns="http://www.tei-c.org/ns/1.0">
    <teiHeader>
        <fileDesc>
            <titleStmt>
                <title>Title</title>
            </titleStmt>
            <publicationStmt>
                <p>Publication Information</p>
            </publicationStmt>
            <sourceDesc>
                <p>Information about the source</p>
            </sourceDesc>
        </fileDesc>
    </teiHeader>
    <text>
        <body>
            <listPerson>
{

    (: Values to leave out of the generated list; currently empty, so nothing is skipped. :)
    let $skipids := ()

    (: Bind each collection once so the Hebrew files are not requested twice. :)
    let $hebrewdocs := collection('../../collections/?select=*.xml;recurse=yes')
    let $genizahdocs := collection('../../../genizah-mss/collections/?select=*.xml;recurse=yes')

    (: First build an in-memory nodeset temporarily storing titles, IDs and the files they come from. :)
    (: Unprefixed element names inside this expression (name, ref, persName, ...) resolve to the TEI
       namespace, because the enclosing TEI constructor declares it as the default namespace. :)
    let $hebrewpeople := (
        for $x in $hebrewdocs//tei:persName[not(ancestor::tei:revisionDesc or ancestor::tei:respStmt)]
        return
            (: General comparison (=) rather than eq: eq raises a type error as soon as $skipids
               holds more than one item. With the empty list the test is simply false.
               NOTE(review): this compares the element's own text with $skipids, although the
               comment below speaks of @key - confirm what $skipids is meant to hold before populating it. :)
            if ($x = $skipids) then
                ( (: This @key is in one of the manually-maintained authority files, so don't include it in the generated list :) )
            else
                <person>
                    <name>{ normalize-space(string-join($x//text(), ' ')) }</name>
                    <file>{ base-uri($x) }</file>
                    <ref>/catalog/{ $x/ancestor::tei:TEI/@xml:id/data() }|{ ($x/ancestor::tei:TEI/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:idno)[1]/text() }</ref>
                </person>
    )

    (: NOTE: In the Hebrew catalogue, persName when used inside author marks up a translated variant of the same name. :)
    (: For that reason the author's name is built from its text excluding anything inside a nested persName. :)
    let $hebrewauthors := (
        for $x in $hebrewdocs//tei:author
        return
            if ($x = $skipids) then
                ( (: This @key is in one of the manually-maintained authority files, so don't include it in the generated list :) )
            else
                <person>
                    <name>{ normalize-space(string-join($x//text()[not(ancestor::persName)], ' ')) }</name>
                    <file>{ base-uri($x) }</file>
                    <ref>/catalog/{ $x/ancestor::tei:TEI/@xml:id/data() }|{ ($x/ancestor::tei:TEI/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:idno)[1]/text() }</ref>
                </person>
    )

    let $genizahpeople := (
        for $x in $genizahdocs//tei:persName[not(ancestor::tei:revisionDesc or ancestor::tei:respStmt)]
        return
            if ($x = $skipids) then
                ( (: This @key is in one of the manually-maintained authority files, so don't include it in the generated list :) )
            else
                <person>
                    <name>{ normalize-space(string-join($x//text(), ' ')) }</name>
                    <file>{ base-uri($x) }</file>
                    <ref>/catalog/{ $x/ancestor::tei:TEI/@xml:id/data() }|{ ($x/ancestor::tei:TEI/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:idno)[1]/text() }</ref>
                </person>
    )

    let $allpeople := ($hebrewpeople, $hebrewauthors, $genizahpeople)
    (: NOTE: No author TEI elements used in Genizah :)

    (: One entry per distinct name, sorted by name. $pos is the position in the distinct-values()
       sequence and is bound before the ordering is applied, so the generated IDs are unique but
       do not follow the sorted output order. Names with no text are not listed. :)
    let $dedupedpeople := (
        for $t at $pos in distinct-values($allpeople/name/text())
        order by $t
        return
            <person xml:id="{ concat('person_', $pos) }">
                <persName type="display">{ $t }</persName>
                {
                    for $s in distinct-values($allpeople[name = $t]/ref/text())
                    order by $s
                    return
                        <ref>{ $s }</ref>
                }
            </person>

    )

    return $dedupedpeople

}
            </listPerson>
        </body>
    </text>
</TEI>





Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
declare namespace tei="http://www.tei-c.org/ns/1.0";
declare option saxon:output "indent=yes";


(:
    Writes a log line to the trace output and returns the empty string, so the call
    can sit inside element content without adding anything to the result.
    $level  - severity label; upper-cased in the message
    $msg    - message text
    $values - sequence of values appended to the message, separated by spaces
:)
declare function local:logging($level, $msg, $values)
{
    let $line := concat(upper-case($level), ' ', $msg, ' ', string-join($values, ' '), ' ')
    (: trace() returns its first argument (here ''), and the zero-length substring keeps the result empty. :)
    return substring(trace('', $line), 0, 0)
};

(:
    Main query. Emits a skeleton TEI document whose listPlace holds one place entry per
    distinct name found in placeName elements of the Hebrew and Genizah collections.
    Each entry carries a generated xml:id, the name as an index placeName whose source
    attribute lists the schemes it was found under, and the distinct catalogue
    references (record ID and first msIdentifier idno) of the records where it occurs.
:)
<TEI xmlns="http://www.tei-c.org/ns/1.0">
    <teiHeader>
        <fileDesc>
            <titleStmt>
                <title>Title</title>
            </titleStmt>
            <publicationStmt>
                <p>Publication Information</p>
            </publicationStmt>
            <sourceDesc>
                <p>Information about the source</p>
            </sourceDesc>
        </fileDesc>
    </teiHeader>
    <text>
        <body>
            <listPlace>
{

    (:
    TODO:
    Pick up on @role attributes, if any
    Batch convert the TEI to add @key to all persNames then use that here (blow away existing ones - there's only about a dozen, and they contain spaces)
    :)

    (: Values to leave out of the generated list; currently empty, so nothing is skipped. :)
    let $skipids := () (: TODO:)

    (: Bind each collection to a variable so the source of every loop is visible at a glance. :)
    let $hebrewdocs := collection('../../collections/?select=*.xml;recurse=yes')
    let $genizahdocs := collection('../../../genizah-mss/collections/?select=*.xml;recurse=yes')

    (: First build an in-memory nodeset temporarily storing titles, IDs and the files they come from. :)
    (: Unprefixed element names inside this expression (name, ref, scheme, keywords, ...) resolve to the
       TEI namespace, because the enclosing TEI constructor declares it as the default namespace. :)
    let $hebrewplaces := (
        for $x in $hebrewdocs//tei:placeName
        return
            (: General comparison (=) rather than eq: eq raises a type error as soon as $skipids
               holds more than one item. With the empty list the test is simply false.
               NOTE(review): this compares the element's own text with $skipids, although the
               comment below speaks of @key - confirm what $skipids is meant to hold before populating it. :)
            if ($x = $skipids) then
                ( (: This @key is in one of the manually-maintained authority files, so don't include it in the generated list :) )
            else
                <place>
                    <name>{ normalize-space(string-join($x//text(), ' ')) }</name>
                    <file>{ base-uri($x) }</file>
                    <ref>/catalog/{ $x/ancestor::tei:TEI/@xml:id/data() }|{ ($x/ancestor::tei:TEI/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:idno)[1]/text() }</ref>
                    <scheme>{ ($x/ancestor::keywords/@scheme/data(), 'bodl')[1] }</scheme>
                </place>
    )

    let $genizahplaces := (
        for $x in $genizahdocs//tei:placeName
        return
            if ($x = $skipids) then
                ( (: This @key is in one of the manually-maintained authority files, so don't include it in the generated list :) )
            else
                <place>
                    <name>{ normalize-space(string-join($x//text(), ' ')) }</name>
                    <file>{ base-uri($x) }</file>
                    <ref>/catalog/{ $x/ancestor::tei:TEI/@xml:id/data() }|{ ($x/ancestor::tei:TEI/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:idno)[1]/text() }</ref>
                    <scheme>{ ($x/ancestor::keywords/@scheme/data(), 'bodl')[1] }</scheme>
                </place>
    )

    let $allplaces := ($hebrewplaces, $genizahplaces)

    (: One entry per distinct name, sorted by name. $pos is the position in the distinct-values()
       sequence and is bound before the ordering is applied, so the generated IDs are unique but
       do not follow the sorted output order. Names with no text are not listed. :)
    let $dedupedplaces := (
        for $t at $pos in distinct-values($allplaces/name/text())
        order by $t
        return
            <place xml:id="{ concat('place_', $pos) }">
                <placeName type="index" source="{ string-join(distinct-values($allplaces[name = $t]/scheme/text()), ' ') }">{ $t }</placeName>
                {
                    for $s in distinct-values($allplaces[name = $t]/ref/text())
                    order by $s
                    return
                        <ref>{ $s }</ref>
                }
            </place>
    )

    return $dedupedplaces

}
            </listPlace>
        </body>
    </text>
</TEI>





Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
declare namespace tei="http://www.tei-c.org/ns/1.0";
declare option saxon:output "indent=yes";


(:
    Writes a log line to the trace output and returns the empty string, so the call
    can sit inside element content without adding anything to the result.
    $level  - severity label; upper-cased in the message
    $msg    - message text
    $values - sequence of values appended to the message, separated by spaces
:)
declare function local:logging($level, $msg, $values)
{
    let $line := concat(upper-case($level), ' ', $msg, ' ', string-join($values, ' '), ' ')
    (: trace() returns its first argument (here ''), and the zero-length substring keeps the result empty. :)
    return substring(trace('', $line), 0, 0)
};

(:
    Returns a single node chosen from $nodeset in a repeatable, pseudo-arbitrary way.
    $nodeset - nodes whose text content is used as the selection key
               (presumably file names or paths; confirm against the callers).
    The nodes are sorted in descending order of one character of their text and the
    first node of that ordering is returned.
:)
declare function local:pickrandom($nodeset)
{
(: Cannot generate a random number, so pick the node based on the last char of the filename :)
(: How the key character is obtained: replace() appends the private-use character U+E0F1 after every
   character of the text, and tokenize() then splits on it, giving one token per character plus a
   trailing empty token. Position last()-5 is therefore the fifth character from the end of the text,
   which is the last character before the extension when the text ends in a four-character
   extension such as '.xml'. :)
let $returnnode := (for $n in $nodeset order by tokenize(replace($n/text(), '(.)', '$1&#xE0F1;'), '&#xE0F1;')[position() = last()-5] descending return $n)[1]
return $returnnode
};

<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<fileDesc>
Expand All @@ -24,8 +38,9 @@ declare option saxon:output "indent=yes";

(: First build an in-memory nodeset temporarily storing titles, IDs and the files they come from. :)
let $hebrewworks := (
for $x in collection('../collections/?select=*.xml;recurse=yes')//tei:msItem[tei:title]/@xml:id
let $langs := $x/parent::tei:msItem/tei:textLang
for $x in collection('../../collections/?select=*.xml;recurse=yes')//tei:msItem[tei:title]/@xml:id
(: let $langs := $x/parent::tei:msItem/tei:textLang :)
let $langs := ($x/ancestor::tei:msItem/tei:textLang)[1]
return
if ($x eq $skipids) then
( (: This @key is in one of the manually-maintained authority files, so don't include it in the generated list :) )
Expand Down Expand Up @@ -54,29 +69,40 @@ declare option saxon:output "indent=yes";
)

let $genizahworks := (
for $x in collection('../../genizah-mss/collections/?select=*.xml;recurse=yes')//tei:msItem[tei:title and not(tei:msItem)]/@xml:id
for $x in collection('../../../genizah-mss/collections/?select=*.xml;recurse=yes')//tei:msItem[tei:title and (not(tei:msItem) or tei:msItem[not(tei:title)])]/@xml:id
(: To be regarded as a work, an msItem must have a title, and either not have child msItems or have child items without titles :)

let $langs := $x/parent::tei:msItem/tei:textLang
return
if ($x eq $skipids) then
( (: This @key is in one of the manually-maintained authority files, so don't include it in the generated list :) )
else
<work id="{ $x }">
{
for $y in $x/parent::tei:msItem/tei:title
let $titletext := normalize-space(string-join($y//text()[not(ancestor::foreign)], ' '))
let $foreigntext := normalize-space(string-join($y//text()[ancestor::foreign], ' '))
return
if (string-length($titletext) eq 0 and string-length($foreigntext) eq 0) then
()
else if (string-length($titletext) gt 0 and string-length($foreigntext) eq 0) then
<title lang="{ $y/@xml:lang }">{ $titletext }</title>
else if (string-length($titletext) eq 0 and string-length($foreigntext) gt 0) then
<title lang="{ $y/foreign/@xml:lang }">{ $foreigntext }</title>
else if (string-length($titletext) gt 0 and string-length($foreigntext) gt 0) then
(<title lang="{ $y/@xml:lang }">{ $titletext }</title>,
<title lang="{ $y/foreign/@xml:lang }">{ $foreigntext }</title>)
else
if ($x/parent::tei:msItem/tei:title) then
for $y in $x/parent::tei:msItem/tei:title
let $titletext := normalize-space(string-join($y//text()[not(ancestor::foreign)], ' '))
let $foreigntext := normalize-space(string-join($y//text()[ancestor::foreign], ' '))
return
if (string-length($titletext) eq 0 and string-length($foreigntext) eq 0) then
()
else if (string-length($titletext) gt 0 and string-length($foreigntext) eq 0) then
<title lang="{ $y/@xml:lang }">{ $titletext }</title>
else if (string-length($titletext) eq 0 and string-length($foreigntext) gt 0) then
<title lang="{ $y/foreign/@xml:lang }">{ $foreigntext }</title>
else if (string-length($titletext) gt 0 and string-length($foreigntext) gt 0) then
(<title lang="{ $y/@xml:lang }">{ $titletext }</title>,
<title lang="{ $y/foreign/@xml:lang }">{ $foreigntext }</title>)
else
<title>{ normalize-space(string-join($y//text(), ' ')) }</title>
else
(: No title, so try building one
if ($x/parent::tei:msItem//parent::tei:msItem/tei:title and $x/parent::tei:msItem/@n) then
<title>{ concat(($x/parent::tei:msItem//parent::tei:msItem/tei:title)[1], ' - ', $x/parent::tei:msItem/@n) }</title>
else
local:logging('info', 'Skipping msItem', $x)
:)
()
}
{ $langs }
<file>{ base-uri($x) }</file>
Expand All @@ -87,6 +113,7 @@ declare option saxon:output "indent=yes";

let $dedupedworks := (
for $t at $pos in distinct-values($allworks/title[not(preceding-sibling::title)]/text())
order by $t
let $variants := (
for $r in $allworks[title = $t]
return
Expand All @@ -112,10 +139,12 @@ declare option saxon:output "indent=yes";
let $mainlang := (distinct-values($allworks[title = $t]/textLang/@mainLang))[1]
let $otherlangs := (distinct-values($allworks[title = $t]/textLang/@otherLangs/tokenize(., ' ')))[not(. eq $mainlang)]
return
if (count($otherlangs) gt 0) then
if (count($otherlangs) gt 0 and count($mainlang) gt 0) then
<textLang mainLang="{ $mainlang }" otherLangs="{ $otherlangs }"/>
else
else if (count($mainlang) gt 0) then
<textLang mainLang="{ $mainlang }"/>
else
()
}
</bibl>

Expand Down
Loading

0 comments on commit 206448d

Please sign in to comment.