Merge pull request #23 from intuit/linear-cost-optimization

Linear cost optimization
intuit · Mar 25, 2020 · 81d7b61 · 81d7b61
2 parents 6a81cf3 + 316f95f
commit 81d7b61
Show file tree

Hide file tree

Showing 38 changed files with 1,049 additions and 1,436 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,27 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## 1.0.0 - 2020-03-25
+### Added
+- In Element ability to set MatchType (this replaces similarityMatchFunction)
+- In Element, ability to set NeighborhoodRange for the NEAREST_NEIGHBOR MatchType
+- New classes added - TokenRepo (replaces TokenMatch) and MatchType (replaces SimilarityMatchFunction)
+
+### Removed
+- Document and Element classes no longer allow a ScoringFunction to be defined externally. It now defaults to
+SimpleAverage (in Element) and ExponentialWeightedAverage (in Document)
+- Element no longer allows similarityMatchFunction to be defined externally. It is replaced by MatchType
+- Element no longer allows matchOptimizerFunction to be defined externally. Together, these removals allow the library to guarantee its performance
+- These classes are removed - TokenMatch, NGram, MatchOptimizerFunction, SimilarityMatchFunction
+
+### Changed
+- Significant performance improvements along with reduced memory utilization
+- Soundex match is no longer a matching function; it is now a tokenization function, and the Soundex-encoded tokens are used for matching.
+- Element is now generic: its `value` is typed by the generic parameter instead of being an Object.
+- ElementType of TEXT is matched by word equality instead of Soundex matching function by default
+- ElementTypes NUMBER and DATE are matched using the NEAREST_NEIGHBOR MatchType. This gives similar results, but they are
+controlled by the NeighborhoodRange attribute defined in Element instead of Threshold
+
 ## 0.4.4 - 2019-12-23
 ### Fixed
 - Ability to configure scoring function in Element https://github.com/intuit/fuzzy-matcher/issues/19

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -6,6 +6,24 @@ Thanks for your interest in fuzzy-matcher.
 
 Readme.md file gives a good overview of the architecture. Make sure to review the readme.
 
+## Building the Library
+### Prerequisite
+You need Java SDK v1.8 or higher. Before you begin, you should check your current Java installation by using the following command:
+``` java -version ```
+
+fuzzy-matcher is compatible with Apache Maven 4.0 or above. If you do not already have Maven installed, you can follow the instructions at maven.apache.org.
+```
+On many operating systems, Maven can be installed with a package manager.
+If you use OSX Homebrew, try brew install maven.
+Ubuntu users can run sudo apt-get install maven.
+Windows users with Chocolatey can run choco install maven from an elevated (administrator) prompt.
+```
+### Compiling and installing locally
+After cloning the project locally, run this command to compile, test and install the project
+```
+mvn clean install
+```
+
 ## Contributions
 
 fuzzy-matcher welcomes contributions from everyone.

diff --git a/README.md b/README.md
diff --git a/fuzzy-match.png b/fuzzy-match.png
diff --git a/perf.png b/perf.png
diff --git a/pom.xml b/pom.xml
@@ -79,13 +79,6 @@
             <version>1.11</version>
         </dependency>
 
-        <!-- https://mvnrepository.com/artifact/com.googlecode.libphonenumber/libphonenumber -->
-        <dependency>
-            <groupId>com.googlecode.libphonenumber</groupId>
-            <artifactId>libphonenumber</artifactId>
-            <version>3.5</version>
-        </dependency>
-
         <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
         <dependency>
             <groupId>org.apache.lucene</groupId>

diff --git a/src/main/java/com/intuit/fuzzymatcher/component/DocumentMatch.java b/src/main/java/com/intuit/fuzzymatcher/component/DocumentMatch.java
@@ -8,15 +8,18 @@
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
-
 /**
  * <p>
  * Starts the Matching process by element level matching and aggregates the results back
  * This uses the ScoringFunction defined at each Document to get the aggregated Document score for matched Elements
  */
 public class DocumentMatch {
 
-    private static ElementMatch elementMatch = new ElementMatch();
+    private final ElementMatch elementMatch;
+
+    public DocumentMatch() {
+        this.elementMatch = new ElementMatch();
+    }
 
     /**
      * Executes matching of a document stream
@@ -25,39 +28,45 @@ public class DocumentMatch {
      * @return Stream of Match of Document type objects
      */
     public Stream<Match<Document>> matchDocuments(Stream<Document> documents) {
-        Stream<Element> elements = documents.flatMap(d -> d.getPreProcessedElement().stream());
-        Map<ElementClassification, List<Element>> elementMap = elements.collect(Collectors.groupingBy(Element::getElementClassification));
 
-        List<Match<Element>> matchedElements = new ArrayList<>();
-        elementMap.forEach((key, value) -> {
-            List<Match<Element>> result = elementMatch.matchElements(key, value.parallelStream()).collect(Collectors.toList());
-            matchedElements.addAll(result);
+        Stream<Match<Document>> documentMatch = documents.flatMap(document -> {
+            Set<Element> elements = document.getPreProcessedElement();
+            Set<Match<Element>> eleMatches = elements.stream()
+                    .flatMap(element -> elementMatch.matchElement(element).stream())
+                    .collect(Collectors.toSet());
+            return documentThresholdMatching(document, eleMatches);
         });
 
-        return rollupDocumentScore(matchedElements.parallelStream());
+        return documentMatch;
     }
 
-    private Stream<Match<Document>> rollupDocumentScore(Stream<Match<Element>> matchElementStream) {
-
-        Map<Document, Map<Document, List<Match<Element>>>> groupBy = matchElementStream
-                .collect(Collectors.groupingBy(matchElement -> matchElement.getData().getDocument(),
-                        Collectors.groupingBy(matchElement -> matchElement.getMatchedWith().getDocument())));
-
-        return groupBy.entrySet().parallelStream().flatMap(leftDocumentEntry ->
-                leftDocumentEntry.getValue().entrySet()
-                        .parallelStream()
-                        .flatMap(rightDocumentEntry -> {
-                            List<Score> childScoreList = rightDocumentEntry.getValue()
-                                    .stream()
-                                    .map(d -> d.getScore())
-                                    .collect(Collectors.toList());
-                            Match<Document> leftMatch = new Match<Document>(leftDocumentEntry.getKey(), rightDocumentEntry.getKey(), childScoreList);
-                            if (BooleanUtils.isNotFalse(rightDocumentEntry.getKey().isSource())) {
-                                Match<Document> rightMatch = new Match<Document>(rightDocumentEntry.getKey(), leftDocumentEntry.getKey(), childScoreList);
-                                return Stream.of(leftMatch, rightMatch);
-                            }
-                            return Stream.of(leftMatch);
-                        }))
-                .filter(match -> match.getResult() > match.getData().getThreshold());
+    private Stream<Match<Document>> documentThresholdMatching(Document document, Set<Match<Element>> matchingElements) {
+        Map<Document, List<Match<Element>>> mathes = matchingElements.stream()
+                .collect(Collectors.groupingBy(matchElement -> matchElement.getMatchedWith().getDocument()));
+
+        Stream<Match<Document>> result = mathes.entrySet().stream().flatMap(matchEntry -> {
+
+            List<Score> childScoreList = matchEntry.getValue()
+                    .stream()
+                    .map(d -> d.getScore())
+                    .collect(Collectors.toList());
+            //System.out.println(Arrays.toString(childScoreList.toArray()));
+            Match<Document> leftMatch = new Match<Document>(document, matchEntry.getKey(), childScoreList);
+
+            // Document match Found
+            if (leftMatch.getScore().getResult() > leftMatch.getData().getThreshold()) {
+
+                if (BooleanUtils.isNotFalse(matchEntry.getKey().isSource())) {
+                    Match<Document> rightMatch = new Match<Document>(matchEntry.getKey(), document, childScoreList);
+                    return Stream.of(leftMatch, rightMatch);
+                }
+                return Stream.of(leftMatch);
+            } else {
+                return Stream.empty();
+            }
+        });
+
+        return result;
     }
+
 }
diff --git a/src/main/java/com/intuit/fuzzymatcher/component/ElementMatch.java b/src/main/java/com/intuit/fuzzymatcher/component/ElementMatch.java
@@ -1,41 +1,55 @@
 package com.intuit.fuzzymatcher.component;
 
-import com.intuit.fuzzymatcher.domain.*;
+import com.intuit.fuzzymatcher.domain.Element;
+import com.intuit.fuzzymatcher.domain.Match;
+import com.intuit.fuzzymatcher.domain.Token;
+import org.apache.commons.lang3.BooleanUtils;
 
-import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
+import java.util.*;
 
-/**
- * Matches at element level with aggregated results from token.
- * This uses the ScoringFunction defined at each element to get the aggregated Element score for matched tokens
- */
 public class ElementMatch {
 
-    private static TokenMatch tokenMatch = new TokenMatch();
+    private final TokenRepo tokenRepo;
 
-    public Stream<Match<Element>> matchElements(ElementClassification elementClassification, Stream<Element> elements) {
-        Stream<Token> tokenStream = elements.flatMap(Element::getTokens);
-        Stream<Match<Token>> matchedTokens = tokenMatch.matchTokens(elementClassification, tokenStream);
-        return rollupElementScore(matchedTokens);
+    public ElementMatch() {
+        this.tokenRepo = new TokenRepo();
     }
 
-    private Stream<Match<Element>> rollupElementScore(Stream<Match<Token>> matchedTokenStream) {
+    public Set<Match<Element>> matchElement(Element element) {
+        Set<Match<Element>> matchElements = new HashSet<>();
+        Map<Element, Integer> elementTokenScore = new HashMap<>();
 
-        Map<Element, Map<Element, List<Match<Token>>>> groupBy = matchedTokenStream
-                .collect(Collectors.groupingBy((matchToken -> matchToken.getData().getElement()),
-                        Collectors.groupingBy(matchToken -> matchToken.getMatchedWith().getElement())));
+        List<Token> tokens = element.getTokens();
+        tokens.stream()
+                .filter(token -> BooleanUtils.isNotFalse(element.getDocument().isSource()))
+                .forEach(token -> {
+                    elementThresholdMatching(token, elementTokenScore, matchElements);
+                });
 
-        return groupBy.entrySet().parallelStream().flatMap(leftElementEntry ->
-                leftElementEntry.getValue().entrySet().parallelStream().map(rightElementEntry -> {
-                    List<Score> childScoreList = rightElementEntry.getValue()
-                            .stream().map(d -> d.getScore())
-                            .collect(Collectors.toList());
+        tokens.forEach(token -> tokenRepo.put(token));
 
-                    return new Match<Element>(leftElementEntry.getKey(), rightElementEntry.getKey(), childScoreList);
-                }).filter(match -> match.getResult() > match.getData().getThreshold()));
+        return matchElements;
     }
 
-
+    private void elementThresholdMatching(Token token, Map<Element, Integer> elementTokenScore, Set<Match<Element>> matchingElements) {
+        Set<Element> matchElements = tokenRepo.get(token);
+        Element element = token.getElement();
+
+        // Token Match Found
+        if (matchElements != null) {
+            matchElements.forEach(matchElement -> {
+                int score = elementTokenScore.getOrDefault(matchElement, 0) + 1;
+                elementTokenScore.put(matchElement, score);
+                // Element Score above threshold
+                double elementScore = element.getScore(score, matchElement);
+
+                // Element match Found
+                if (elementScore > element.getThreshold()) {
+                    Match<Element> elementMatch = new Match<>(element, matchElement, elementScore);
+                    matchingElements.remove(elementMatch);
+                    matchingElements.add(elementMatch);
+                }
+            });
+        }
+    }
 }
diff --git a/src/main/java/com/intuit/fuzzymatcher/component/MatchService.java b/src/main/java/com/intuit/fuzzymatcher/component/MatchService.java
@@ -20,8 +20,6 @@
  */
 public class MatchService {
 
-    private static DocumentMatch documentMatch = new DocumentMatch();
-
     /**
      * Use this for De-duplication of data, where for a given list of documents it finds duplicates
      * Data is aggregated by a given Document
@@ -30,7 +28,8 @@ public class MatchService {
      * @return a map containing the grouping of each document and its corresponding matches
      */
     public Map<Document, List<Match<Document>>> applyMatch(List<Document> documents) {
-        return documentMatch.matchDocuments(documents.parallelStream())
+        DocumentMatch documentMatch = new DocumentMatch();
+        return documentMatch.matchDocuments(documents.stream())
                 .collect(Collectors.groupingBy(Match::getData));
     }
 
@@ -43,13 +42,15 @@ public Map<Document, List<Match<Document>>> applyMatch(List<Document> documents)
      * @return a map containing the grouping of each document and its corresponding matches
      */
     public Map<Document, List<Match<Document>>> applyMatch(List<Document> documents, List<Document> matchWith) {
+        DocumentMatch documentMatch = new DocumentMatch();
         return documentMatch.matchDocuments(Stream.concat(
-                documents.parallelStream().map(document -> {
-                    document.setSource(true);
-                    return document;
-                }), matchWith.parallelStream().map(document -> {
+                matchWith.stream().map(document -> {
                     document.setSource(false);
                     return document;
+                }),
+                documents.stream().map(document -> {
+                    document.setSource(true);
+                    return document;
                 })))
                 .collect(Collectors.groupingBy(Match::getData));
     }
@@ -58,23 +59,25 @@ public Map<Document, List<Match<Document>>> applyMatch(List<Document> documents,
      * Use this to check duplicate for a new record, where it checks whether a new Document is a duplicate in existing list
      * Data is aggregated by a given Document
      *
-     * @param document the document to match
+     * @param document  the document to match
      * @param matchWith the list of documents to match against
      * @return a map containing the grouping of each document and its corresponding matches
      */
     public Map<Document, List<Match<Document>>> applyMatch(Document document, List<Document> matchWith) {
+        DocumentMatch documentMatch = new DocumentMatch();
         return applyMatch(Arrays.asList(document), matchWith);
     }
 
     /**
      * Use this to check duplicate for a new record, where it checks whether a new Document is a duplicate in existing list
      * Data is aggregated by a given Document Id
      *
-     * @param document the document to match
+     * @param document  the document to match
      * @param matchWith the list of documents to match against
      * @return a map containing the grouping of each document id and its corresponding matches
      */
     public Map<String, List<Match<Document>>> applyMatchByDocId(Document document, List<Document> matchWith) {
+        DocumentMatch documentMatch = new DocumentMatch();
         return applyMatchByDocId(Arrays.asList(document), matchWith);
     }
 
@@ -86,7 +89,8 @@ public Map<String, List<Match<Document>>> applyMatchByDocId(Document document, L
      * @return a map containing the grouping of each document id and its corresponding matches
      */
     public Map<String, List<Match<Document>>> applyMatchByDocId(List<Document> documents) {
-        return documentMatch.matchDocuments(documents.parallelStream())
+        DocumentMatch documentMatch = new DocumentMatch();
+        return documentMatch.matchDocuments(documents.stream())
                 .collect(Collectors.groupingBy(match -> match.getData().getKey()));
     }
 
@@ -99,13 +103,14 @@ public Map<String, List<Match<Document>>> applyMatchByDocId(List<Document> docum
      * @return a map containing the grouping of each document id and its corresponding matches
      */
     public Map<String, List<Match<Document>>> applyMatchByDocId(List<Document> documents, List<Document> matchWith) {
+        DocumentMatch documentMatch = new DocumentMatch();
         return documentMatch.matchDocuments(Stream.concat(
-                documents.parallelStream().map(document -> {
-                    document.setSource(true);
-                    return document;
-                }), matchWith.parallelStream().map(document -> {
+                matchWith.stream().map(document -> {
                     document.setSource(false);
                     return document;
+                }), documents.stream().map(document -> {
+                    document.setSource(true);
+                    return document;
                 })))
                 .collect(Collectors.groupingBy(match -> match.getData().getKey()));
     }

diff --git a/src/main/java/com/intuit/fuzzymatcher/component/TokenMatch.java b/src/main/java/com/intuit/fuzzymatcher/component/TokenMatch.java