Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated docstrings, and some small fixes #132

Merged
merged 5 commits into from
Jun 28, 2021
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 92 additions & 36 deletions adapt/tools/text/trie.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,29 @@ def __init__(self, data=None, is_terminal=False):

def lookup(self, iterable, index=0, gather=False, edit_distance=0, max_edit_distance=0, match_threshold=0.0, matched_length=0):
"""
TODO: Implement trie lookup with edit distance

Args:
iterable(list?): key used to find what is requested this could
be a generator.
index(int): index of what is requested
gather(bool): of whether to gather or not
edit_distance(int): the distance -- currently not used
max_edit_distance(int): the max distance -- not currently used
iterable(hashable): a list of items used to traverse the Trie.
This represents the position of a node in the Trie, matching the
iterable used at insertion time.
For example:
trie.insert('foo', {'bar': 'baz'})
list(trie.lookup('foo')) == [TrieNode(data={'bar': 'baz'}, is_terminal=True)]

index(int): index of item for current position in traversal.
we pass the original iterable and an index to avoid
the cost of repeatedly copying the original iterable
gather(bool): whether to return intermediate results (gather
algorithm)
edit_distance(int): current edit distance in the traversal.
max_edit_distance(int): maximum edit distance
match_threshold(float): minimum confidence of match for discovery
matched_length(int): related to edit distance, for calculating
confidence of match where
confidence = (length - abs(matched_length - length)) / length

yields:
object: yields the results of the search
generator[TrieNode]: a generator that vends the results of the
lookup, of type TrieNode
"""
if self.is_terminal:
if index == len(iterable) or \
Expand All @@ -61,7 +72,8 @@ def lookup(self, iterable, index=0, gather=False, edit_distance=0, max_edit_dist
edit_distance=edit_distance, max_edit_distance=max_edit_distance, matched_length=matched_length + 1):
yield result

# if there's edit distance remaining and it's possible to match a word above the confidence threshold
# if there's edit distance remaining and it's possible to
# match a word above the confidence threshold, continue searching
potential_confidence = float(index - edit_distance + (max_edit_distance - edit_distance)) / \
(float(index) + (max_edit_distance - edit_distance)) if index + max_edit_distance - edit_distance > 0 else 0.0
if edit_distance < max_edit_distance and potential_confidence > match_threshold:
Expand All @@ -87,7 +99,12 @@ def insert(self, iterable, index=0, data=None, weight=1.0):
"""Insert new node into tree

Args:
iterable(hashable): key used to find in the future.
iterable(hashable): a list of items used to traverse the Trie.
This represents the position of a node in the Trie, matching the
iterable used at insertion time.
For example:
trie.insert('foo', {'bar': 'baz'})
list(trie.lookup('foo')) == [TrieNode(data={'bar': 'baz'}, is_terminal=True)]
data(object): data associated with the key
index(int): an index used for insertion.
weight(float): the weight given to the item added.
Expand All @@ -113,7 +130,12 @@ def remove(self, iterable, data=None, index=0):
"""Remove an element from the trie

Args:
iterable(hashable): key used to find what is to be removed
iterable(hashable): a list of items used to traverse the Trie.
This represents the position of a node in the Trie, matching the
iterable used at insertion time.
For example:
trie.insert('foo', {'bar': 'baz'})
list(trie.lookup('foo')) == [TrieNode(data={'bar': 'baz'}, is_terminal=True)]
data(object): data associated with the key
index(int): index of what is to be removed

Expand Down Expand Up @@ -141,12 +163,22 @@ def remove(self, iterable, data=None, index=0):


class Trie(object):
"""Interface for the tree
"""Recursive implementation of a prefix trie (Trie)
https://en.wikipedia.org/wiki/Trie
Additionally supports #gather, a traversal whose results include
any terminal nodes visited.

Attributes:
root(TrieNode): parent node to start the tree
max_edit_distance(int): ?
match_threshold(int): ?
max_edit_distance(int): values > 0 allow for fuzzy matching
with a maximum levenshtein edit distance
https://en.wikipedia.org/wiki/Edit_distance
match_threshold(float): only return values with a higher confidence
than this value

While most frequently used with strings, the Trie can be populated with any
iterable (arrays of ints, arrays of objects, arrays of strings) as long
as each value responds to `__hash__`.

"""

Expand All @@ -157,32 +189,46 @@ def __init__(self, max_edit_distance=0, match_threshold=0.0):
max_edit_distance and match_threshold.

Args:
max_edit_distance(int): ?
match_threshold(int): ?

Notes:
This never seems to get called with max_edit_distance or match_threshold
max_edit_distance(int): values > 0 allow for fuzzy matching
with a maximum levenshtein edit distance
https://en.wikipedia.org/wiki/Edit_distance
match_threshold(float): only return values with a higher confidence
than this value
"""
self.root = TrieNode('root')
self.max_edit_distance = max_edit_distance
self.match_threshold = match_threshold

def gather(self, iterable):
"""Calls the lookup with gather True Passing iterable and yields
the result.
"""Executes a "gather" traversal of the Trie
Result set will include any `is_terminal` nodes encountered during
the traversal

Args:
iterable(hashable): a list of items used to traverse the Trie
This represents the position of a node in the Trie, matching the
iterable used at insertion time.
For example:
trie.insert('foo', {'bar': 'baz'})
list(trie.lookup('foo')) == [TrieNode(data={'bar': 'baz'}, is_terminal=True)]

"""
for result in self.lookup(iterable, gather=True):
yield result

def lookup(self, iterable, gather=False):
"""Call the lookup on the root node with the given parameters.

Args
iterable(index or key): Used to retrive nodes from tree
gather(bool): this is passed down to the root node lookup
Args:
iterable(hashable): a list of items used to traverse the Trie
forslund marked this conversation as resolved.
Show resolved Hide resolved
This represents the position of a node in the Trie, matching the
iterable used at insertion time.
For example:
trie.insert('foo', {'bar': 'baz'})
list(trie.lookup('foo')) == [TrieNode(data={'bar': 'baz'}, is_terminal=True)]
gather(bool): flag to indicate whether gather results
should be included

Notes:
max_edit_distance and match_threshold come from the init
"""
for result in self.root.lookup(iterable,
gather=gather,
Expand All @@ -192,28 +238,38 @@ def lookup(self, iterable, gather=False):
yield result

def insert(self, iterable, data=None, weight=1.0):
"""Used to insert into he root node
"""Used to insert into the trie

Args
iterable(hashable): index or key used to identify
data(object): data to be paired with the key
Args:
iterable(hashable): a list of items used to traverse the Trie
This represents the position of a node in the Trie, matching the
iterable used at insertion time.
For example:
trie.insert('foo', {'bar': 'baz'})
list(trie.lookup('foo')) == [TrieNode(data={'bar': 'baz'}, is_terminal=True)]
data(object): data to be stored or merged for this iterable
"""
self.root.insert(iterable, index=0, data=data, weight=1.0)
self.root.insert(iterable, index=0, data=data, weight=weight)

def remove(self, iterable, data=None):
"""Used to remove from the root node

Args:
iterable(hashable): index or key used to identify
item to remove
data: data to be paired with the key
iterable(hashable): a list of items used to traverse the Trie
This represents the position of a node in the Trie, matching the
iterable used at insertion time.
For example:
trie.insert('foo', {'bar': 'baz'})
list(trie.lookup('foo')) == [TrieNode(data={'bar': 'baz'}, is_terminal=True)]
data: data to be removed. If None, or node is empty as a result,
remove the node.
"""
return self.root.remove(iterable, data=data)

def scan(self, match_func):
"""Traverse the trie scanning for end nodes with matching data.

Arguments:
Args:
match_func (callable): function used to match data.

Returns:
Expand Down