-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathreconcile.py
183 lines (154 loc) · 5.12 KB
/
reconcile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# -*- coding: utf-8 -*-
"""
An OpenRefine reconciliation service for the AAT API.
This code is adapted from https://github.com/lawlesst/fast-reconcile
"""
from flask import Flask, request, jsonify
from fuzzywuzzy import fuzz
import getopt
import json
from operator import itemgetter
import re
import requests
from sys import version_info
import urllib
import xml.etree.ElementTree as ET
# Help text processing
import text
import requests_cache
# Cache calls to the API.
# install_cache() is given the cache name 'getty_cache'. Presumably this makes
# the later `requests.get` call in search() go through that cache -- confirm
# against the requests_cache documentation. No expiry is configured here.
requests_cache.install_cache('getty_cache')
# The Flask application object. The rest of this module uses it for routing
# (`@app.route`), logging (`app.logger`) and building JSONP responses
# (`app.make_response`).
app = Flask(__name__)
# True when running under Python 3; drives the unicode/str handling decisions.
PY3 = version_info[0] >= 3

# Endpoint queried for term matches, and the URI template for an AAT record.
api_base_url = 'http://vocabsservices.getty.edu/AATService.asmx/AATGetTermMatch?term='
aat_base_url = 'http://vocab.getty.edu/aat/{0}'

# The AAT query index exposed as a reconciliation service type.
default_query = dict(id="AATGetTermMatch", name="AAT term", index="term")

# Every service definition offered by this module. Other Getty vocabularies
# (TGN, ULAN...) may be added to this list in the future.
full_query = [default_query]

# Reduced copy of the service definitions: only the id and name of each one.
query_types = [
    dict(id=entry['id'], name=entry['name'])
    for entry in full_query
]
def make_uri(getty_id):
    """
    Build the AAT URI for an identifier returned by the API.

    The identifier is substituted into the module-level ``aat_base_url``
    template and the resulting string is returned.
    """
    return aat_base_url.format(getty_id)
# Basic service metadata. There are a number of other documented options
# but this is all we need for a simple service.
metadata = {
    "name": "Getty Reconciliation Service",
    # Advertise the AAT query types defined by this module. The dict literal
    # used to repeat the "defaultTypes" key further down with a leftover
    # "/organization/organization" entry; since the last duplicate key wins,
    # that entry silently replaced these types. The duplicate was removed.
    "defaultTypes": query_types,
    "identifierSpace": "http://localhost/identifier",
    "schemaSpace": "http://localhost/schema",
    # "{{id}}" is emitted literally; it is not formatted on the Python side.
    "view": {
        "url": "http://vocab.getty.edu/aat/{{id}}"
    },
    "preview": {
        "url": "http://vocab.getty.edu/aat/{{id}}",
        "width": 430,
        "height": 300
    },
    # NOTE(review): these suggest endpoints point at opencorporates.com, not
    # at a Getty service -- they look like leftovers from a template. Kept
    # unchanged so clients see the same "suggest" section; confirm whether
    # they should be removed or replaced.
    "suggest": {
        "entity": {
            "service_url": "http://opencorporates.com",
            "service_path": "/reconcile/suggest",
            "flyout_service_path": "/reconcile/flyout"
        },
        "property": {
            "service_url": "http://opencorporates.com",
            "service_path": "/reconcile/suggest/properties",
            "flyout_service_path": "/reconcile/flyout/properties"
        }
    },
}
def jsonpify(obj):
    """
    Serialize obj for the client, honouring an optional JSONP callback.

    When the request's query string carries a ``callback`` argument, the
    JSON text is wrapped as ``callback(json)`` and returned with the
    ``text/javascript`` mimetype. Without it, a regular ``jsonify``
    response is returned.
    """
    callback_name = request.args.get('callback')
    if callback_name is None:
        # No JSONP requested: plain JSON response.
        return jsonify(obj)

    padded_body = "%s(%s)" % (callback_name, json.dumps(obj))
    jsonp_response = app.make_response(padded_body)
    jsonp_response.mimetype = "text/javascript"
    return jsonp_response
# Matches a bracketed token such as "[300015045]" inside a Preferred_Parent
# label; group 1 is the text between the brackets.
_BRACKETED_ID = re.compile(r'\[(.+?)\]')


def _parse_preferred_parent(parent_text):
    """
    Extract (name, identifier) from the text of a Preferred_Parent element.

    Only the part before the first comma is considered. The identifier is
    the first bracketed token in that part; the name is that part with all
    bracketed tokens removed and surrounding whitespace stripped.

    Returns None when the text is missing/empty or holds no bracketed token.
    """
    if not parent_text:
        return None
    first_parent = parent_text.split(',')[0]
    id_match = _BRACKETED_ID.search(first_parent)
    if id_match is None:
        return None
    name = _BRACKETED_ID.sub('', first_parent).strip()
    return name, id_match.group(1)


def search(raw_query):
    """
    Query the AAT term-match API and return reconciliation candidates.

    The raw query is normalized with ``text.normalize`` and stripped,
    URL-quoted, and sent to ``api_base_url``. Each ``Preferred_Parent``
    element of the XML response yields one candidate, scored against the
    query with ``fuzz.token_sort_ratio``.

    Parameters:
        raw_query: the query string as received from the client.

    Returns:
        A list of at most 10 dicts with the keys ``id``, ``name``,
        ``score``, ``match`` and ``type``, sorted by descending score.
        ``match`` is True when the score is above 95. An empty list is
        returned when the HTTP request or the XML parsing fails; the
        error is logged as a warning.
    """
    out = []
    query = text.normalize(raw_query, PY3).strip()
    query_type_meta = list(full_query)

    # Import the quoting helper explicitly: a bare `import urllib` does not
    # guarantee that the `urllib.parse` submodule is loaded on Python 3.
    if PY3:
        from urllib.parse import quote
    else:
        from urllib import quote

    # The service expects "&notes=" as its last parameter.
    url = api_base_url + quote(query) + '&logop=and&notes='
    app.logger.debug("AAT url is " + url)

    # Get the results. Only the network call and the XML parsing are
    # guarded, and with the exceptions they can actually raise.
    try:
        resp = requests.get(url)
        results = ET.fromstring(resp.content)
    except (requests.RequestException, ET.ParseError) as e:
        app.logger.warning(e)
        return out

    for child in results.iter('Preferred_Parent'):
        # The termid is NOT the ID! It has to be taken from the first
        # preferred parent. Entries that cannot be parsed are skipped
        # instead of reusing values left over from a previous iteration.
        parsed = _parse_preferred_parent(child.text)
        if parsed is None:
            continue
        name, getty_id = parsed
        score = fuzz.token_sort_ratio(query, name)
        match = score > 95
        app.logger.debug("Label is " + name + " Score is " +
                         str(score) + " URI is " + getty_id)
        out.append({
            "id": getty_id,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        })

    # Sort this list of preferred terms by score, best first.
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    # Refine only will handle top 10 matches.
    return sorted_out[:10]
@app.route("/", methods=['POST', 'GET'])
def reconcile():
    """
    Entry point of the reconciliation service.

    If a 'queries' parameter is supplied, it is a JSON dictionary of
    (key, query) pairs representing a batch of queries; the response is a
    dictionary of (key, {"result": [...]}) pairs, one per query, built with
    ``search``. Otherwise the service metadata is returned.

    The parameter is read from ``request.values`` so that it is honoured
    both in a POSTed form and in the query string of a GET request (the
    route accepts both methods, and JSONP callbacks are GET-based).
    Responses go through ``jsonpify``.
    """
    queries = request.values.get('queries')
    if queries:
        queries = json.loads(queries)
        results = {
            key: {"result": search(query['query'])}
            for key, query in queries.items()
        }
        return jsonpify(results)
    # If no 'queries' parameter is supplied then we should return the
    # service metadata.
    return jsonpify(metadata)
if __name__ == '__main__':
    # Script entry point: parse the command line, then start the Flask app.
    from optparse import OptionParser

    cli_parser = OptionParser()
    # -d / --debug switches the Flask application into debug mode.
    cli_parser.add_option('-d', '--debug', action='store_true', default=False)
    options, _positional = cli_parser.parse_args()

    app.debug = options.debug
    # Host '0.0.0.0' is passed through to app.run unchanged.
    app.run(host='0.0.0.0')