-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrunlit-fetch.pl
executable file
·387 lines (352 loc) · 14 KB
/
runlit-fetch.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
#!/usr/bin/env perl
use strict;
use warnings;
use utf8;
use 5.016;
use autodie qw(:file);
use threads;
use open qw(:utf8 :std);
use DBI;
use Digest::SHA3 qw(sha3_224_hex);
use HTML::TreeBuilder::XPath;
use MCE::Loop chunk_size => 'auto', max_workers => 'auto';
use URI::Escape qw(uri_escape);
use WWW::Mechanize;
use YAML::XS qw(DumpFile LoadFile);
#################################################################################
# Scrape the Svensk Runbibliografi for bibliographic data and links, by signum. #
# (I checked their robots.txt, and they seem to be cool with that.) #
#################################################################################
# Initialise a hash to hold all the bibliographic details (keys are SHA3-hashed concatenation of title, author, year):
my %bib;
MCE::Loop::init {
gather => sub { accession(@_, \%bib) }
};
mce_loop {
for (@{$_}) {
MCE->gather(browse_listing($_));
}
} list_signa();
MCE::Loop::finish;
#for (list_signa()) {accession(browse_listing($_), \%bib)} # Non-MCE version, for testing.
# Clean up URLs:
fix_urls(\%bib);
# Serialise unique list of works to YAML:
print 'Dumping results to file… ';
my %conf = %{LoadFile('../config/config.yml')} or die "Failed to read config file: $!\n";
DumpFile(join('', $conf{'path'}, 'srb-lit.yml'), values %bib) or die "Failed to write output file: $!\n";
say 'done.';
# Exeunt omnes, laughing.
#################################################################################
# Query the database for all runic signa, ancient and modern:
sub list_signa {
print 'Gathering list of signa… ';
# Get configuration info:
my %conf = %{LoadFile('../config/config.yml')} or die "Failed to read config file: $!\n";
my $dsn = "DBI:$conf{dsn}{dbms}:database=$conf{dsn}{database};host=$conf{dsn}{hostname};port=$conf{dsn}{port}";
my $dbh = DBI->connect($dsn, $conf{dsn}{username}, $conf{dsn}{password}, {RaiseError => 1, AutoCommit => 1, pg_enable_utf8 => 1, pg_server_prepare => 1});
my @signa;
my $sth = $dbh->prepare(q{
SELECT signum1, signum2
FROM objects_signa_unique
});
$sth->execute();
while (my $record = $sth->fetchrow_arrayref) {
push(@signa, join(' ', @{$record}));
}
say 'done.';
return (@signa);
}
# Request a listing of results for each signum, find the first, and pass its URL off to browse_results():
sub browse_listing {
my $signum = shift;
my $mech = WWW::Mechanize->new();
my $search_uri = join('', q{http://fornsvenskbibliografi.ra.se/pub/views/runbibl/hitlist.aspx?ccl=find%20runsig%20}, uri_escape($signum));
$mech->get($search_uri);
unless ($mech->success()) {
warn "Could not fetch results page for '$signum'; returned status $mech->status()\n";
return(0, $signum);
}
my $tree = HTML::TreeBuilder::XPath->new_from_content($mech->content());
if ($tree->exists(q{//div[@id='ctl00_cphContent_pnlMessageTop']}) &&
($tree->findvalue(q{//div[@id='ctl00_cphContent_pnlMessageTop']}) eq ' Din sökning gav inga träffar. ')) {
$tree->delete;
return(0, $signum); # No results. :(
}
elsif ($tree->exists(q{//table[@id='ctl00_cphContent_gwHitList']})) {
if (my $hit = $mech->find_link(class => 'hitnumber', text => '1')) {
$tree->delete;
my $records; # Hashref of records for this signum.
return(browse_results($signum, $records, $mech, $hit));
}
else {
warn "Could not locate first result link for '$signum', <" . $mech->uri()->as_string() . ">\n";
$tree->delete;
return(0, $signum);
}
}
else {
warn "No 'no hits' message, but no result listing either, for '$signum'. :/\n"; # WTF?
$tree->delete;
return(0, $signum);
}
}
# Request a result URL, check it for validity, pass the content off for recording and line up the next result for processing:
sub browse_results {
my ($signum, $records, $mech, $hit) = @_;
while (1) {
$mech->add_header(Referer => $mech->uri()->as_string());
$mech->get($hit);
unless ($mech->success()) {
warn "Could not fetch hit page for '$signum'; returned status " . $mech->status() . "\n";
return(0, $signum);
}
my $tree = HTML::TreeBuilder::XPath->new_from_content($mech->content());
if ($tree->exists(q{//div[@id='ctl00_cphContent_pnlMessageTop']}) &&
($tree->findvalue(q{//div[@id='ctl00_cphContent_pnlMessageTop']}) eq ' Input string was not in a correct format. ')) {
warn "No referer for '$signum', <" . $mech->uri()->as_string() . ">\n";
$tree->delete;
return(0, $signum);
}
elsif (!$tree->exists(q{//div[@id='ctl00_cphContent_pnlShowRecord']/table})) {
warn "Unable to locate record table for '$signum', <" . $mech->uri()->as_string() . ">\n";
$tree->delete;
return(0, $signum);
}
else {
my $signa = $tree->findvalue(q{//div[@id='ctl00_cphContent_pnlShowRecord']/table/tr[@id='RUNSIG']/td[@class='fieldContents']/div});
if ($signa =~ m/\s$signum;?\s/) { # Are these results actually accurate – is the sought signum really described by this work? (I'm looking at you, Orcadian inscriptions!)
($signum, $records) = extract_fields($signum, $records, $tree);
}
if ($tree->exists(q{//span[@id='ctl00_cphContent_lblResultInfo']})) {
my $xofy = $tree->findvalue(q{//span[@id='ctl00_cphContent_lblResultInfo']});
if ($xofy =~ /^(?:Visar|Record) (?<x>\d+) (?:av|of) (?<y>\d+)\.?$/) {
if ($+{x} >= $+{y}) { # End if this is the last result in the set.
$tree->delete;
return(1, $signum, $records);
}
$tree->delete;
$hit = $mech->uri()->as_string();
$hit =~ s/_(\d+)$/'_' . ($1 + 1)/e; # The next result in the set is this one, +1:
next;
}
else {
warn "Unable to read record number for '$signum', <" . $mech->uri()->as_string() . ">\n";
$tree->delete;
return(0, $signum);
}
}
else {
warn "Unable to locate record number for '$signum', <" . $mech->uri()->as_string() . ">\n";
$tree->delete;
return(0, $signum);
}
}
}
}
# For a given result (work), extract its fields:
sub extract_fields {
my ($signum, $records, $tree) = @_;
my %record; # Here we'll temporarily store the record for this work.
my %fields = (
AU => {
label => 'authors',
action => \&splitpeople,
},
NFIPL => {
label => 'form of name in publication',
action => \&splitpeople,
},
ED => {
label => 'editors',
action => \&splitpeople,
},
TI => {
label => 'title',
action => sub {return(trim(shift))},
},
VT => { # Seems now no longer to occur, but safest to keep this here.
label => 'title of work',
action => sub {return(trim(shift))},
},
PUBF => { # Seems to occur only once, for the record of <http://libris.kb.se/resource/bib/12102929>
label => 'object of publication',
action => sub {return(trim(shift))},
},
YEAR => {
label => 'year',
action => sub {return(trim(shift))},
},
ING => {
label => 'part of',
action => sub {return(trim(shift))},
},
FOR => {
label => 'place and publisher',
action => sub {
my $value = trim(shift);
my %publisher;
@publisher{qw(place publisher)} = split(/\s*:\s*/, $value, 2);
return(\%publisher);
},
},
SE => {
label => 'series',
action => sub {return(trim(shift))},
},
VOL => {
label => 'volume',
action => sub {return(trim(shift))},
},
PUB => {
label => 'type of publication',
action => sub {return(trim(shift))},
},
PAG => {
label => 'pages',
action => sub {return(trim(shift))},
},
KOM => {
label => 'comments',
action => sub {return(trim(shift))},
},
REC => {
label => 'review',
action => sub {return(trim(shift))},
},
UPL => {
label => 'edition',
action => sub {return(trim(shift))},
},
URL => {
label => 'urls',
action => sub {
my $value = trim(shift);
$value =~ s/(?<=\S)http:/, http:/g; # Some URLs appear to be run together.
my @uris = grep {m|^http://|} split(/,? +/, $value); # Sometimes there are notes or empty fields.
return(\@uris);
},
},
);
# Iterate over the table rows, and for each get the label as key, contents as value. Skip signa. Then hash them.
my @fields = $tree->findnodes(q{//div[@id='ctl00_cphContent_pnlShowRecord']/table/tr});
for (@fields) {
my $id = $_->attr('id');
next if (($id eq 'RUNSIG') || ($id eq 'KLASRUN'));
if (exists $fields{$id}) {
$record{$fields{$id}{label}} = $fields{$id}{action}->($tree->findvalue(join('', q{//div[@id='ctl00_cphContent_pnlShowRecord']/table/tr[@id='}, $id, q{']/td[@class='fieldContents']/div})));
}
else {
warn "'$signum' features unknown id '$id' in a record.\n";
}
}
# Create a unique hash key (SHA3 hash) for this work:
my @hashfields = (
(exists $record{authors}) ? join('', map {join('', $_->{surname} // '', $_->{firstname} // '')} @{$record{authors}}) : '',
(exists $record{year}) ? $record{year} : '',
(exists $record{title}) ? $record{title} : '',
(exists $record{editors}) ? join('', map {join('', $_->{surname} // '', $_->{firstname} // '')} @{$record{editors}}) : '',
(exists $record{urls}) ? join('', @{$record{urls}}) : ''
);
for (@hashfields) {utf8::encode($_)} # SHA3 does not like wide characters
$records->{sha3_224_hex(@hashfields)} = \%record;
return($signum, $records);
}
# For records (works) of a given signum, add them if they do not already exist, and append the current signum to their records:
sub accession {
my ($status, $signum, $records, $bib) = @_;
return(0) if ($status == 0);
for (keys %{$records}) {
unless (exists $bib->{$_}) {
$bib->{$_} = $records->{$_};
}
push(@{$bib->{$_}{signa}}, $signum);
}
return(1);
}
# For a record concerning multiple people (authors, editors) return a listref of people (hashrefs of firstname/surname):
sub splitpeople {
my $value = trim(shift);
my @people;
for (split(/\s*[;:]\s*/, $value)) {
my %person;
@person{qw(surname firstname)} = split(/, ?/, $_, 2);
push(@people, \%person);
}
return(\@people);
}
# Remove whitespace from the beginning and end of a string:
sub trim {
my $value = shift;
for ($value) {
s/^\s+//;
s/\s+$//;
}
return($value);
}
# Remove params from Libris URIs, and replace DiVA URLs with URNs resolved via Kb
sub fix_urls {
print 'Resolving URNs… ';
my %bib = %{shift()};
my %uris; # Original as key, permalink as value
# Phase 1: Collect URIs
for my $record (keys %bib) { # For each record (work)…
next unless (exists $bib{$record}{urls}); # Skip the work if it has no URLs…
for my $uri (@{$bib{$record}{urls}}) { # For each URL the work is associated with…
if ((($uri =~ m|^http://libris\.kb\.se/bib/|) &&
($uri =~ m|[?&]|)) ||
($uri =~ m|^http://[a-z]+\.diva-portal.org/smash/|)) {
$uris{$uri} = undef; # Collect the URL, if it maches our criteria.
}
}
}
# Phase 2: ?
for my $uri (keys %uris) {
if ($uri =~ m|^(?<libris>http://libris\.kb\.se/bib/[^\?&]+).*$|) {
# Strip params:
$uris{$uri} = $+{libris};
}
elsif ($uri =~ m|^http://[a-z]+\.diva-portal.org/smash/|) {
if ($uri =~ m|^http://[a-z]+\.diva-portal.org/smash/record\.jsf\?pid=|) {
# Get the permanent URI:
$uris{$uri} = resolve($uri);
}
elsif ($uri =~ m|^http://(?<divaprovider>[a-z]+)\.diva-portal.org/smash/get/diva2:(?<divaid>[0-9]+)/FULLTEXT01|) {
# Get the permanent URI (not the PDF!):
$uris{$uri} = resolve(join('', 'http://', $+{divaprovider}, '.diva-portal.org/smash/record.jsf?pid=diva2:', $+{divaid}));
}
}
}
# Phase 3: Profit
for my $record (keys %bib) { # For each record (work)…
next unless (exists $bib{$record}{urls}); # Skip the work if it has no URLs…
for my $uri (@{$bib{$record}{urls}}) { # For each URL the work is associated with…
if (exists $uris{$uri}) {
$uri = $uris{$uri}; # Replace the URL with a corrected version, where appropriate.
}
$uri =~ s|^http://libris\.kb\.se/bib/|http://libris\.kb\.se/resource/bib/|; # Replace Libris URLs with proper Libris URIs
}
}
say 'done.';
}
# For a DiVA URL, return a permanent URL to Kb's URN resolver:
sub resolve {
my $uri = shift;
my $mech = WWW::Mechanize->new();
$mech->get($uri);
unless ($mech->success()) {
warn "Could not fetch page for '$uri'; returned status $mech->status()\n";
}
my $tree = HTML::TreeBuilder::XPath->new_from_content($mech->content());
if ($tree->exists(q{/html/head/meta[@name='DC.Identifier.url']})) {
for ($tree->findvalues(q{/html/head/meta[@name='DC.Identifier.url']/@content})) {
next unless ($_ =~ m|http://urn\.kb\.se/resolve\?urn=|);
$uri = $_;
}
}
else {
warn "Unable to locate permanent URI for '$uri'.\n";
}
$tree->delete;
return $uri;
}