diff --git a/adapt/tools/text/trie.py b/adapt/tools/text/trie.py index 9352fa9..faada62 100644 --- a/adapt/tools/text/trie.py +++ b/adapt/tools/text/trie.py @@ -30,18 +30,29 @@ def __init__(self, data=None, is_terminal=False): def lookup(self, iterable, index=0, gather=False, edit_distance=0, max_edit_distance=0, match_threshold=0.0, matched_length=0): """ - TODO: Implement trie lookup with edit distance - Args: - iterable(list?): key used to find what is requested this could - be a generator. - index(int): index of what is requested - gather(bool): of whether to gather or not - edit_distance(int): the distance -- currently not used - max_edit_distance(int): the max distance -- not currently used + iterable(hashable): a list of items used to traverse the Trie. + This represents the position of a node in the Trie, matching the + iterable used at insertion time. + For example: + trie.insert('foo', {'bar': 'baz'}) + list(trie.lookup('foo')) == [TrieNode(data={'bar': 'baz'}, is_terminal=True)] + + index(int): index of item for current position in traversal. + we pass the original iterable and an index to avoid + the cost of repeatedly copying the original iterable + gather(bool): whether to return intermediate results (gather + algorithm) + edit_distance(int): current edit distance in the traversal. 
+ max_edit_distance(int): maximum edit distance + match_threshold(float): minimum confidence of match for discovery + matched_length(int): related to edit distance, for calculating + confidence of match where + confidence = (length - abs(matched_length - length)) / length yields: - object: yields the results of the search + generator[TrieNode]: a generator that vends the results of the + lookup, of type TrieNode """ if self.is_terminal: if index == len(iterable) or \ @@ -61,7 +72,8 @@ def lookup(self, iterable, index=0, gather=False, edit_distance=0, max_edit_dist edit_distance=edit_distance, max_edit_distance=max_edit_distance, matched_length=matched_length + 1): yield result - # if there's edit distance remaining and it's possible to match a word above the confidence threshold + # if there's edit distance remaining and it's possible to + # match a word above the confidence threshold, continue searching potential_confidence = float(index - edit_distance + (max_edit_distance - edit_distance)) / \ (float(index) + (max_edit_distance - edit_distance)) if index + max_edit_distance - edit_distance > 0 else 0.0 if edit_distance < max_edit_distance and potential_confidence > match_threshold: @@ -87,7 +99,12 @@ def insert(self, iterable, index=0, data=None, weight=1.0): """Insert new node into tree Args: - iterable(hashable): key used to find in the future. + iterable(hashable): a list of items used to traverse the Trie. + This represents the position of a node in the Trie, matching the + iterable used at insertion time. + For example: + trie.insert('foo', {'bar': 'baz'}) + list(trie.lookup('foo')) == [TrieNode(data={'bar': 'baz'}, is_terminal=True)] data(object): data associated with the key index(int): an index used for insertion. weight(float): the wait given for the item added. 
@@ -113,7 +130,12 @@ def remove(self, iterable, data=None, index=0): """Remove an element from the trie Args - iterable(hashable): key used to find what is to be removed + iterable(hashable): a list of items used to traverse the Trie. + This represents the position of a node in the Trie, matching the + iterable used at insertion time. + For example: + trie.insert('foo', {'bar': 'baz'}) + list(trie.lookup('foo')) == [TrieNode(data={'bar': 'baz'}, is_terminal=True)] data(object): data associated with the key index(int): index of what is to me removed @@ -141,12 +163,22 @@ def remove(self, iterable, data=None, index=0): class Trie(object): - """Interface for the tree + """Recursive implementation of a prefix trie (Trie) + https://en.wikipedia.org/wiki/Trie + Additionally supports #gather, a traversal whose results include + any terminal nodes visited. Attributes: root(TrieNode): parent node to start the tree - max_edit_distance(int): ? - match_threshold(int): ? + max_edit_distance(int): values > 0 allow for fuzzy matching + with a maximum Levenshtein edit distance + https://en.wikipedia.org/wiki/Edit_distance + match_threshold(float): only return values with a higher confidence + than this value + + While most frequently used with strings, the Trie can be populated with any + iterable (arrays of ints, arrays of objects, arrays of strings) as long + as each value responds to `__hash__`. """ @@ -157,19 +189,29 @@ def __init__(self, max_edit_distance=0, match_threshold=0.0): max_edit_distance and match_threshold. Args: - max_edit_distance(int): ? - match_threshold(int): ?
- - Notes: - This never seems to get called with max_edit_distance or match_threshold + max_edit_distance(int): values > 0 allow for fuzzy matching + with a maximum Levenshtein edit distance + https://en.wikipedia.org/wiki/Edit_distance + match_threshold(float): only return values with a higher confidence + than this value """ self.root = TrieNode('root') self.max_edit_distance = max_edit_distance self.match_threshold = match_threshold def gather(self, iterable): - """Calls the lookup with gather True Passing iterable and yields - the result. + """Executes a "gather" traversal of the Trie + Result set will include any `is_terminal` nodes encountered during + the traversal + + Args: + iterable(hashable): a list of items used to traverse the Trie + This represents the position of a node in the Trie, matching the + iterable used at insertion time. + For example: + trie.insert('foo', {'bar': 'baz'}) + list(trie.lookup('foo')) == [TrieNode(data={'bar': 'baz'}, is_terminal=True)] + """ for result in self.lookup(iterable, gather=True): yield result @@ -177,12 +219,16 @@ def gather(self, iterable): def lookup(self, iterable, gather=False): """Call the lookup on the root node with the given parameters. - Args - iterable(index or key): Used to retrive nodes from tree - gather(bool): this is passed down to the root node lookup + Args: + iterable(hashable): a list of items used to traverse the Trie + This represents the position of a node in the Trie, matching the + iterable used at insertion time.
+ For example: + trie.insert('foo', {'bar': 'baz'}) + list(trie.lookup('foo')) == [TrieNode(data={'bar': 'baz'}, is_terminal=True)] + gather(bool): flag to indicate whether gather results + should be included - Notes: - max_edit_distance and match_threshold come from the init """ for result in self.root.lookup(iterable, gather=gather, @@ -192,28 +238,38 @@ def lookup(self, iterable, gather=False): yield result def insert(self, iterable, data=None, weight=1.0): - """Used to insert into he root node + """Used to insert into the trie - Args - iterable(hashable): index or key used to identify - data(object): data to be paired with the key + Args: + iterable(hashable): a list of items used to traverse the Trie + This represents the position of a node in the Trie, matching the + iterable used at insertion time. + For example: + trie.insert('foo', {'bar': 'baz'}) + list(trie.lookup('foo')) == [TrieNode(data={'bar': 'baz'}, is_terminal=True)] + data(object): data to be stored or merged for this iterable """ - self.root.insert(iterable, index=0, data=data, weight=1.0) + self.root.insert(iterable, index=0, data=data, weight=weight) def remove(self, iterable, data=None): """Used to remove from the root node Args: - iterable(hashable): index or key used to identify - item to remove - data: data to be paired with the key + iterable(hashable): a list of items used to traverse the Trie + This represents the position of a node in the Trie, matching the + iterable used at insertion time. + For example: + trie.insert('foo', {'bar': 'baz'}) + list(trie.lookup('foo')) == [TrieNode(data={'bar': 'baz'}, is_terminal=True)] + data: data to be removed. If None, or node is empty as a result, + remove the node. """ return self.root.remove(iterable, data=data) def scan(self, match_func): """Traverse the trie scanning for end nodes with matching data. - Arguments: + Args: match_func (callable): function used to match data.
Returns: diff --git a/setup.py b/setup.py index 0220902..8e66cce 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def required(requirements_file): setup( name="adapt-parser", - version="0.4.2", + version="0.4.3", author="Sean Fitzgerald", author_email="sean@fitzgeralds.me", description=("A text-to-intent parsing framework."),