Skip to content

Commit

Permalink
AICORE-569: add threshold/distance parameter to find similar API
Browse files Browse the repository at this point in the history
  • Loading branch information
Vladimir Pasquier committed Feb 4, 2022
1 parent 394da14 commit 0c6996d
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ public class FindSimilar {
@Param(name = "xpath", required = false)
protected String xpath = FILE_CONTENT;

/**
 * Optional "distance" operation parameter, initialised to 0. The value is handed unchanged to the
 * similar content service as the last argument of {@code findSimilar}, whose contract describes
 * it as the max distance between entries.
 */
@Param(name = "distance", required = false)
protected int distance = 0;

@Param(name = "batchId", description = "Batch id required when running this operation without any input", required = false)
protected String batchId;

Expand All @@ -71,7 +74,7 @@ public List<DocumentModel> run(DocumentModel doc) throws IOException {
return emptyList();
}

return scs.findSimilar(session, doc, xpath);
return scs.findSimilar(session, doc, xpath, distance);
}

@OperationMethod
Expand All @@ -88,6 +91,6 @@ public List<DocumentModel> run(Blob blob) throws OperationException, IOException
throw new OperationException("Blob is too large; size = " + blob.getLength());
}

return scs.findSimilar(session, blob, xpath);
return scs.findSimilar(session, blob, xpath, distance);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,30 @@ public interface SimilarContentService {
*/
List<DocumentModel> findSimilar(CoreSession session, DocumentModel doc, String xpath) throws IOException;

/**
 * Find Similar {@link DocumentModel}[s] for the provided {@link DocumentModel}
 *
 * @param session {@link CoreSession} for obtaining user and repository info
 * @param doc {@link DocumentModel} based on which similar search will run
 * @param xpath {@link String} xpath of the indexed blob in {@link DocumentModel}
 * @param distance {@code int} max distance between entries.
 * @return list of {@link DocumentModel} that are similar to the given {@link DocumentModel}
 * @throws IOException in case of processing issues
 */
List<DocumentModel> findSimilar(CoreSession session, DocumentModel doc, String xpath, int distance) throws IOException;

/**
 * Find Similar {@link DocumentModel}[s] for the provided {@link Blob}
 *
 * @param session {@link CoreSession} for obtaining user and repository info
 * @param blob {@link Blob} based on which similar search will run
 * @param xpath {@link String} xpath of the indexed {@link Blob}
 * @param distance {@code int} max distance between entries.
 * @return list of {@link DocumentModel} that are similar to the given {@link Blob}
 * @throws IOException in case of processing issues
 */
List<DocumentModel> findSimilar(CoreSession session, Blob blob, String xpath, int distance) throws IOException;

/**
* Find Similar {@link DocumentModel}[s] for the provided {@link Blob}
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import static java.util.Collections.emptyList;
import static java.util.Collections.singletonList;
import static org.nuxeo.ai.sdk.rest.Common.DISTANCE_PARAM;
import static org.nuxeo.ai.sdk.rest.Common.UID;
import static org.nuxeo.ai.sdk.rest.Common.XPATH_PARAM;
import static org.nuxeo.ai.similar.content.DedupConstants.DEDUPLICATION_FACET;
Expand Down Expand Up @@ -205,21 +206,43 @@ public List<DocumentModel> findSimilar(CoreSession session, DocumentModel doc, S
parameters.put(UID, doc.getId());
parameters.put(XPATH_PARAM, xpath);

return findSimilar(session, doc, xpath, 0);
}

/**
 * Finds documents similar to {@code doc} by invoking {@code API.Dedup.FIND} on the Insight client.
 * The request parameters carry the document id, the xpath and the requested distance; no tensor
 * payload is sent. The returned ids are turned into documents by {@code resolveDocuments}.
 *
 * @param session  session used to obtain the Insight client and to resolve the returned ids
 * @param doc      document whose id is sent as the {@code UID} parameter
 * @param xpath    value sent as the {@code XPATH_PARAM} parameter
 * @param distance value sent as the {@code DISTANCE_PARAM} parameter
 * @return the documents resolved from the ids returned by the call
 * @throws IOException declared by the service contract for processing issues
 */
@Override
public List<DocumentModel> findSimilar(CoreSession session, DocumentModel doc, String xpath, int distance)
        throws IOException {
    InsightClient insight = getInsightClient(session);

    // Request parameters identifying the source document and the search distance.
    Map<String, Serializable> query = new HashMap<>();
    query.put(UID, doc.getId());
    query.put(XPATH_PARAM, xpath);
    query.put(DISTANCE_PARAM, distance);

    // The document is referenced by id, so the payload argument stays null.
    List<String> similarIds = insight.api(API.Dedup.FIND).call(query, null);
    return resolveDocuments(session, similarIds);
}

@Override
public List<DocumentModel> findSimilar(CoreSession session, Blob blob, String xpath) throws IOException {
public List<DocumentModel> findSimilar(CoreSession session, Blob blob, String xpath, int distance)
        throws IOException {
    InsightClient insight = getInsightClient(session);

    // Request parameters: the xpath and the requested distance (no document id for a raw blob).
    Map<String, Serializable> query = new HashMap<>();
    query.put(XPATH_PARAM, xpath);
    query.put(DISTANCE_PARAM, distance);

    // The blob itself travels as the tensor payload built by constructTensor.
    TensorInstances payload = constructTensor(blob, xpath);

    List<String> similarIds = insight.api(API.Dedup.FIND).call(query, payload);
    return resolveDocuments(session, similarIds);
}

/**
 * Finds documents similar to {@code blob} using a distance of 0.
 * <p>
 * This overload only delegates to
 * {@link #findSimilar(CoreSession, Blob, String, int)}. The Insight client, the parameter map and
 * the tensor are built by that overload, so they are not created here; doing so would repeat the
 * work and construct the tensor from the blob twice.
 *
 * @param session session passed through to the delegate
 * @param blob    blob on which the similar search is based
 * @param xpath   xpath passed through to the delegate
 * @return the documents returned by the delegate
 * @throws IOException in case of processing issues in the delegate
 */
@Override
public List<DocumentModel> findSimilar(CoreSession session, Blob blob, String xpath) throws IOException {
    return findSimilar(session, blob, xpath, 0);
}

@Override
public boolean delete(DocumentModel doc, String xpath) throws IOException {
InsightClient client = getInsightClient(doc.getCoreSession());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ private void indexDocument() throws OperationException {
OperationContext ctx = new OperationContext(session);
ctx.setInput(fileDoc);
ctx.put("xpath", FILE_CONTENT);
ctx.put("distance", 0);
@SuppressWarnings("unchecked")
List<DocumentModel> response = (List<DocumentModel>) automationService.run(ctx, FindSimilar.ID);
assertThat(response).isNotEmpty();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@
See the License for the specific language governing permissions and
limitations under the License.
Example: <nuxeo-ai-dedup-grid property="file:content" doc=[[document]]/>
Example: <nuxeo-ai-dedup-grid property="file:content" doc=[[document]] threshold="1"/>
or
<nuxeo-ai-dedup-grid property="file:content" doc=[[document]]>
<nuxeo-ai-dedup-grid property="file:content" doc=[[document]] threshold="1">
<slot name="dedup-content">
custom template for each similar document accessible via [[item]]
</slot>
Expand Down Expand Up @@ -229,6 +229,10 @@
type: String,
value: 'file:content',
},
/**
 * Numeric threshold for the similar-document search; defaults to 0.
 * NOTE(review): the layout helper appears to read this property and forward it as the
 * `distance` parameter of the AI.DeduplicationFindSimilar operation — confirm.
 */
threshold: {
type: Number,
value: 0,
},
similars: {
type: Array,
value: [],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@
}
}

function _findSimilarsWithBatch(doc, value, propTarget) {
function _findSimilarsWithBatch(doc, value, propTarget, threshold) {
if (!op) {
op = document.createElement('nuxeo-operation');
op.op = 'AI.DeduplicationFindSimilar';
Expand All @@ -296,11 +296,12 @@
op.params = {
batchId: batchInfo.batchId,
fileId: batchInfo.fileId,
distance: threshold,
};
return op.execute();
}

function _findSimilarsWithProperty(doc, propTarget) {
function _findSimilarsWithProperty(doc, propTarget, threshold) {
if (!op) {
op = document.createElement('nuxeo-operation');
op.op = 'AI.DeduplicationFindSimilar';
Expand All @@ -310,6 +311,7 @@
op.input = doc;
op.params = {
xpath: propTarget,
distance: threshold,
};
return op.execute();
}
Expand All @@ -332,6 +334,7 @@
if (!propTarget) {
return;
}
const threshold = deduplicationWidget.threshold;
_clearSimilars(deduplicationWidget);
const batchValue = layout.root.querySelector('nuxeo-dropzone').value;
if ((!layout.document.properties[propTarget]
Expand All @@ -342,7 +345,7 @@
_dedupDebouncer = Polymer.Debouncer.debounce(
_dedupDebouncer, Polymer.Async.timeOut.after(500),
() => {
_findSimilarsWithBatch(layout.document, batchValue, propTarget).then((response) => {
_findSimilarsWithBatch(layout.document, batchValue, propTarget, threshold).then((response) => {
const entries = response.entries || [];
deduplicationWidget.set('similars', entries);
}).catch((error) => {
Expand All @@ -352,7 +355,7 @@
},
/**
* For a given layout:
* 1) fetch similars for the document blob of the given property
* 1) fetch similars for the document blob of the given property and threshold
* 2) update similars on deduplication widget in metadata mode
*/
updateMetadataWidget: (layout) => {
Expand All @@ -367,8 +370,9 @@
if (!propTarget) {
return;
}
const threshold = deduplicationWidget.threshold;
_clearSimilars(deduplicationWidget);
_findSimilarsWithProperty(layout.document, propTarget).then((response) => {
_findSimilarsWithProperty(layout.document, propTarget, threshold).then((response) => {
const entries = response.entries || [];
deduplicationWidget.set('similars', entries);
}).catch((error) => {
Expand Down

0 comments on commit 0c6996d

Please sign in to comment.