ezgeo.py
# Module for resolving location fields from Twitter to as specific a location as possible. Returns structured location information as a dict.
import geocoder
import re
import pprint
import blacklist
import json
import os

BLACKLIST_CUTOFF = 0.8
prettyPrint = pprint.PrettyPrinter(indent=4).pprint

# Load the blacklist and nickname dictionaries bundled with the module
blacklist_words = open(os.path.dirname(__file__) + "/dictionaries/blacklist.txt").read().split('\n')
b = blacklist.Blacklist(blacklist_words)
json_data = open(os.path.dirname(__file__) + "/dictionaries/nicknames.json")
nicknames = json.load(json_data)
json_data.close()
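
# Illustrative (assumed) shape of dictionaries/nicknames.json: a flat JSON object
# mapping informal place names to canonical names that Nominatim can resolve, e.g.
#   {"nyc": "new york city", "philly": "philadelphia"}
# The actual file contents may differ; this only sketches the structure that
# match_phrase() relies on when it checks `phrase in nicknames`.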

# Tries to resolve a location string to a particular country. Returns summary information on the location if found.
# Uses the OSM Nominatim API for both natural-language strings and lat/lon queries.
def resolve_location_string(locstr, verbose=False):
    # Return null object for empty locations
    if locstr is None or locstr == '':
        return None
    G = None
    # Clean location string and make appropriate substitutions
    locstr = clean_location_string(locstr)
    # First step: attempt to run the (unicode-removed) query as a lat/lon
    try:
        G = match_gps(locstr, verbose)
        if G['countrycode']:
            return G
    except ValueError:
        pass
    # Second step: clean the string of blacklisted tokens and attempt to match the entire string
    # Tokenize location string
    clean_tokens = get_relevant_tokens(locstr, verbose)
    if verbose:
        print clean_tokens
    # Reconstitute valid tokens into a single cleaned string
    clean_locstr = ""
    for (tok, sep) in clean_tokens:
        clean_locstr += tok + sep
    if verbose:
        print "Cleaned string: " + repr(clean_locstr)
    # Try to match the entire cleaned string
    try:
        G = match_phrase(clean_locstr, verbose)
        if G['countrycode']:
            return G
    except:
        if verbose:
            print "Could not match entire phrase. Attempting to match tokens."
    # Third step: try to match the tokens individually. If all resolvable tokens match the same country, return the matched dataset.
    # TODO: In this case, return the intersection of the different tokens' datasets (i.e., only what they have in common).
    if (len(clean_tokens) > 1):
        if verbose:
            print "Trying to match tokens in \"%s\"." % (repr(clean_locstr))
        final_match = None
        for token_pairs in clean_tokens:
            tok = token_pairs[0]
            current_match = match_phrase(tok, verbose)
            if verbose:
                print "Trying to match " + repr(tok)
            # Try to directly resolve the token
            if current_match and current_match['country']:
                if verbose:
                    print "** Resolved token \"%s\" to %s" % (repr(tok), repr(current_match['country']))
                # If we have already made a match, reconcile it with the current one
                if (final_match is not None):
                    # If the lat/lon don't match, clear the lat/lon
                    if (current_match['latitude'] != ""):
                        if (not (current_match['latitude'] == final_match['latitude'] and current_match['longitude'] == final_match['longitude'])):
                            final_match['latitude'] = final_match['longitude'] = ""
                    # If the city doesn't match, clear the city
                    if (current_match['city'] != ""):
                        if (current_match['city'] != final_match['city']):
                            final_match['city'] = ""
                    # If the state doesn't match, clear the state
                    if (current_match['state'] != ""):
                        if (current_match['state'] != final_match['state']):
                            final_match['state'] = final_match['statecode'] = ""
                    # If the country doesn't match, clear the country and return
                    if (current_match['country'] != ""):
                        if (current_match['country'] != final_match['country']):
                            final_match['country'] = final_match['countrycode'] = ""
                            return final_match
                else:
                    final_match = current_match
        return final_match
    return None
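
# Illustrative example (behaviour depends on the actual Nominatim responses): for a
# profile string like "paris / london", assuming the GPS parse and the full-phrase
# lookup both fail and neither token is blacklisted, the tokens are geocoded one by
# one; if they resolve to different countries, the conflicting country/countrycode
# fields are cleared in the dict that is returned.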

# Converts to lowercase, removes emoticons, and standardizes some common whitespace items
def clean_location_string(locstr):
    locstr = locstr.strip().lower()
    # Replace certain character sequences no matter where they appear in the query.
    substitutions = {
        '\t': ' ',
        '\r': ' ',
        '\n': ' ',
        ';)': '',
        ':)': ''
    }
    for (k, v) in substitutions.items():
        if k in locstr:
            locstr = locstr.replace(k, v)
    return locstr
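
# Illustrative example: clean_location_string("Boston,\tMA") -> "boston, ma"
# (lowercased, tab replaced with a space); emoticon sequences like ":)" are
# simply dropped, and any whitespace they leave behind is not re-stripped.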

# Convert a string into a list of relevant tokens, deleting any blacklisted tokens.
# Tokens are defined as being separated by any of: [,/\&| and or to]
# Returns an array of (token, separator) pairs.
def get_relevant_tokens(locstr, verbose=False):
    token_pairs = []
    # Split on all possible delimiters, keeping the separators
    tokens_and_separators = re.split('(,|/|\\\\|&|\|| and | or | to )', locstr)
    if verbose:
        print tokens_and_separators
    tokens = tokens_and_separators[0:len(tokens_and_separators):2]
    if verbose:
        print tokens
    # Map non-empty and non-numeric tokens to their separators
    re_num = re.compile(r'\d+$')
    for idx, tok in enumerate(tokens):
        if (tok != "" and not re_num.match(tok)):
            score, match = b.checkWord(repr(tok))  # Check the non-unicode version of the string
            if score < BLACKLIST_CUTOFF:
                if (2*idx+1 < len(tokens_and_separators)):
                    sep = tokens_and_separators[2*idx+1]
                else:
                    sep = ""
                token_pairs.append((tok.strip(), sep))
            else:
                if verbose:
                    print "Found blacklisted token: %s as %s, score %s" % (repr(tok), repr(match), str(score))
        else:
            if verbose:
                print "Found numeric or null token: " + tok
    return token_pairs
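
# Illustrative example (assuming neither token is in the blacklist):
#   get_relevant_tokens("boston / cambridge") -> [('boston', '/'), ('cambridge', '')]
# The separator is kept alongside each token so the caller can rebuild the cleaned string.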

# Resolve a single phrase via the nickname table and Nominatim.
# Returns the geocoder result, or None if nothing matched.
def match_phrase(phrase, verbose=False):
    G = None
    # First step: attempt to map nicknames to standard place names
    if (phrase in nicknames):
        if verbose:
            print "Resolved nickname \"%s\" as \"%s\": " % (repr(phrase), repr(nicknames[phrase]))
        phrase = nicknames[phrase]
    # Second step: attempt to match the mapped string in Nominatim
    try:
        G = geocoder.geocode(phrase, site='osm')
        if G['countrycode']:
            return G
    except ValueError:
        pass
    # Third step: attempt to run the query with any (parenthesized) expressions removed
    paren_regex = "\(.+?\)"
    noparens = re.sub(paren_regex, '', phrase)
    if noparens != phrase:
        if verbose:
            print "* Removing parenthetical expression: \"%s\" -> \"%s\"" % (repr(phrase), repr(noparens))
        try:
            G = geocoder.geocode(noparens, site='osm')
            if G['countrycode']:
                if verbose:
                    print "** Resolved \"%s\" as \"%s\"" % (repr(phrase), repr(noparens))
                return G
        except ValueError:
            pass
    return G

# Parse the trailing "lat,lon" pair out of a location string (e.g. "foo: 40.7,-74.0"
# or "(40.7, -74.0)") and reverse-geocode it through Nominatim.
# Raises ValueError if the string does not end in a numeric lat/lon pair.
def match_gps(locstr, verbose=False):
    suffix = locstr.split(':')[-1]
    (lat, lon) = [float(x) for x in suffix.strip('( )').split(',')]
    query = "%s,%s" % (lat, lon)
    if verbose:
        print "Trying to match string \"%s\" as lat/lon..." % repr(query)
    return geocoder.geocode(query, site='osm')
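
# Minimal usage sketch (not part of the original module): exercises the resolver on a
# few sample strings. It assumes the bundled dictionaries are present and that the
# geocoder backend (Nominatim) is reachable; the sample inputs and any resulting
# matches are illustrative only.
if __name__ == '__main__':
    for sample in ["Boston, MA", "somewhere: 48.8566, 2.3522", "worldwide"]:
        print "Resolving %s" % repr(sample)
        result = resolve_location_string(sample, verbose=True)
        if result:
            prettyPrint(result)
        else:
            print "No location resolved."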