Skip to content

Commit

Permalink
Keep upper case letters in addresses
Browse files: browse the repository at this point in the history
  • Loading branch information
thdg committed Aug 26, 2022
1 parent d2b8acc commit 6862386
Showing 1 changed file with 13 additions and 6 deletions.
19 changes: 13 additions & 6 deletions collect_addresses.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -5,7 +5,9 @@
from spell_number import get_is_number


ALLOWED_CHARACTERS = string.ascii_lowercase + "áéíúýóðþæö"
SPECIAL_CHARS = "áéíúýóðþæö"
SPECIAL_CHARS_UPPER = SPECIAL_CHARS.upper()
ALLOWED_CHARACTERS = string.ascii_uppercase + string.ascii_lowercase + SPECIAL_CHARS + SPECIAL_CHARS_UPPER
ABBR = {
"v.": "við",
"nr.": "númer",
Expand All @@ -20,7 +22,7 @@
"útiv.sv.": "útivistarsvæði",
"íb.lóð": "íbúðarlóð",
"mhl.": "mhl",
"hreðavatnsl.": "hreðavatnsl",
"hreðavatnsl.": "Hreðavatnsl",
"bakki.": "bakki",
"v.hl.": "við hlið",
"aðk.": "aðkoma",
Expand Down Expand Up @@ -143,7 +145,7 @@ def normalize_address(address):
address = address.replace("4.", "fjórða ")
address = address.replace("5.", "fimmta ")
address = address.replace("17.", "sautjánda ")
tokens = list(tokenize(address.lower()))
tokens = list(tokenize(address))
new_tokens = []
for i, token in enumerate(tokens):
if is_int(token):
Expand All @@ -158,8 +160,8 @@ def normalize_address(address):
return ""
elif token == "nr.":
new_tokens.append("númer")
elif token in ABBR.keys():
new_tokens.append(ABBR[token])
elif token.lower() in ABBR.keys():
new_tokens.append(ABBR[token.lower()])
elif token == ".":
pass
elif token.lower().strip() in IGNORE:
Expand Down Expand Up @@ -188,7 +190,12 @@ def print_help():
except Exception:
print_help()
exit()
ofile = open(ofile, "w")

if ofile == "-":
ofile = sys.stdout
else:
ofile = open(ofile, "w")

for address in read_registry(fname):
a = address["VEF_BIRTING"]
a = clean_address(a)
Expand Down

0 comments on commit 6862386

Please sign in to comment.