-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnew-qualifiers.rb
executable file
·65 lines (49 loc) · 1.76 KB
/
new-qualifiers.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/bin/env ruby
# frozen_string_literal: true
# Check a Wikipedia scraper outfile against what's currently in
# Wikidata, creating wikibase-cli commands for any qualifiers to add.
require 'csv'
require 'pry'
require_relative 'lib/inputfile'
# Both input files are required. Fail fast with a usage message instead of
# a TypeError from Pathname.new(nil) (no arguments), or silently treating
# one file as both inputs (a single argument makes ARGV.first == ARGV.last).
# TODO: further sanity checking of the input beyond the argument count
abort "Usage: #{$PROGRAM_NAME} WIKIPEDIA_CSV WIKIDATA_JSON" if ARGV.size < 2

wikipedia_file = Pathname.new(ARGV.first) # output of scraper
wikidata_file = Pathname.new(ARGV.last) # `wd sparql term-members.sparql`

# Wrap each file in its InputFile reader (defined in lib/inputfile).
wikipedia = InputFile::CSV.new(wikipedia_file)
wikidata = InputFile::JSON.new(wikidata_file)
# Compare one scraped Wikipedia row against one Wikidata record, property
# by property. Only keys of +wp+ that start with "P" followed by digits are
# considered.
#
# For each such property whose Wikipedia value is non-empty:
# * equal string forms on both sides: nothing is emitted;
# * Wikidata holds a different, non-empty value: a two-line MISMATCH
#   warning goes to stderr and nothing is printed;
# * otherwise "<statement> <property> <value>" is printed to stdout.
#
# wp - scraped row; must respond to #keys and #[] (:name is used in warnings)
# wd - Wikidata record; must respond to #[] (:statement is used in output)
#
# Any error raised while reading +wd+ propagates to the caller; it is no
# longer rescued into an interactive debugger session.
def compare(wp, wd)
  wp.keys.select { |key| key.match?(/\AP\d+/) }.each do |property|
    wp_value = wp[property]
    next if wp_value.to_s.empty?

    wd_value = wd[property]
    # Already in agreement (compared as strings): nothing to add.
    next if wp_value.to_s == wd_value.to_s

    # Wikidata already has a conflicting value: report it, never overwrite.
    if !wd_value.to_s.empty? && wp_value != wd_value
      label = InputFile::MAPPING[property].first
      warn "*** MISMATCH for #{wp[:name]} #{label}: #{wd_value} → #{wp_value}"
      warn "\t#{[wd[:statement], property.to_s, wd_value, wp_value].join(' ')}"
      next
    end

    # Wikidata has no value for this property yet: emit the addition.
    puts [wd[:statement], property.to_s, wp_value].join(' ')
  end
end
# Walk every scraped row and pair it with a Wikidata record to compare.
wikipedia.data.each do |wp|
  item = wp[:id]

  # Nothing can be done until the person has at least one relevant P39 in
  # Wikidata; those have to be created separately first.
  next unless wikidata.tally[item]

  candidates = wikidata.find(item)

  if (wikipedia.tally[item] == 1) && (wikidata.tally[item] == 1)
    # Exactly one record on each side: compare them whatever they contain.
    compare(wp, candidates.first)
  else
    # Ambiguous: narrow down to the records sharing this row's start date.
    same_start = candidates.select { |wd| wd[:P580] == wp[:P580] }
    if same_start.count == 1
      compare(wp, same_start.first)
    else
      warn "NO SUITABLE MATCH for #{wp}"
    end
  end
end