From 01bee8b4ea782f40b72f31e969eb0bf254bc9bf4 Mon Sep 17 00:00:00 2001 From: ChaseDuncan Date: Thu, 30 Aug 2018 13:11:16 -0500 Subject: [PATCH 1/5] Fix 665: SpanLabelView may not enforce non-overlap constraint - Changed SpanLabelView.addConstituent(Constituent) to enforce nonoverlapping constituents when appropriate, i.e. the flag is set. - Added a class with two basic tests to make sure that the changes work correctly. --- .../textannotation/SpanLabelView.java | 11 +- .../cogcomp/annotation/SpanLabelViewTest.java | 100 ++++++++++++++++++ 2 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 core-utilities/src/test/java/edu/illinois/cs/cogcomp/annotation/SpanLabelViewTest.java diff --git a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java index f77feca47..86e9939c0 100644 --- a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java +++ b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java @@ -14,6 +14,7 @@ import java.util.Collections; import java.util.List; + /** * A SpanLabelView is a specialized view which corresponds to contiguous chunks of tokens that have * a label. Each chunk corresponds to a single {@code Consituent}. 
In this view, there will be no @@ -60,6 +61,13 @@ public SpanLabelView(String viewName, String viewGenerator, TextAnnotation text, @Override public void addConstituent(Constituent constituent) { + + int start = constituent.getStartSpan(); + int end = constituent.getEndSpan(); + + if (!allowOverlappingSpans && this.getConstituentsCoveringSpan(start, end).size() != 0) + throw new IllegalArgumentException("Span [" + start + ", " + end + "] already labeled."); + super.addConstituent(constituent); // this sort is grossly inefficient when appending contiguous tokens one at a time. @@ -95,9 +103,6 @@ public Constituent addSpanLabel(int start, int end, String label, double score) new Constituent(label, score, this.getViewName(), this.getTextAnnotation(), start, end); - if (!allowOverlappingSpans && this.getConstituentsCoveringSpan(start, end).size() != 0) - throw new IllegalArgumentException("Span [" + start + ", " + end + "] already labeled."); - this.addConstituent(c); return c; diff --git a/core-utilities/src/test/java/edu/illinois/cs/cogcomp/annotation/SpanLabelViewTest.java b/core-utilities/src/test/java/edu/illinois/cs/cogcomp/annotation/SpanLabelViewTest.java new file mode 100644 index 000000000..0ab4976c3 --- /dev/null +++ b/core-utilities/src/test/java/edu/illinois/cs/cogcomp/annotation/SpanLabelViewTest.java @@ -0,0 +1,100 @@ +package edu.illinois.cs.cogcomp.annotation; + +import edu.illinois.cs.cogcomp.core.datastructures.IntPair; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; +import edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer; +import org.junit.Before; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +public class SpanLabelViewTest { + // test that addConstituent(Constituent) does not allow overlapping spans + SpanLabelView 
overlappingSpansView; + SpanLabelView noOverlappingSpansView; + TextAnnotation ta; + Constituent baseConstituent; + Constituent overlappingConstituent; + + private Tokenizer.Tokenization tokenization; + + String viewName = "VIEWNAME"; + String viewGenerator = "VIEW-GENERATOR"; + String text = "This is a test string; do not pay it any mind."; + String corpusId = "TEST"; + String textId = "ID"; + + double score = 42.0; + int baseStart = 0; + int baseEnd = 5; + int overStart = 2; + int overEnd = 6; + + private Tokenizer.Tokenization getTokenization(String text) { + String[] tokens = text.split("\\s"); + List characterOffsets = new ArrayList<>(); + int[] sentenceEndArray = {tokens.length}; + + int charOffsetBegin = 0; + int charOffsetEnd = 0; + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (Character.isWhitespace(c)) { + charOffsetEnd = i; + IntPair tokenOffsets = new IntPair(charOffsetBegin, charOffsetEnd); + characterOffsets.add(tokenOffsets); + charOffsetBegin = charOffsetEnd + 1; + } + } + IntPair tokenOffsets = new IntPair(charOffsetBegin, text.length()); + characterOffsets.add(tokenOffsets); + + IntPair[] charOffsetArray = new IntPair[characterOffsets.size()]; + + for (int i = 0; i < characterOffsets.size(); i++) { + charOffsetArray[i] = characterOffsets.get(i); + } + Tokenizer.Tokenization tokenization = + new Tokenizer.Tokenization(tokens, charOffsetArray, sentenceEndArray); + return tokenization; + } + + @Before + public void init(){ + TextAnnotationBuilder taBuilder = new BasicTextAnnotationBuilder(); + ta = taBuilder.createTextAnnotation(this.corpusId, this.textId, this.text, getTokenization(this.text)); + boolean allowOverlappingSpans = true; + overlappingSpansView = new SpanLabelView(this.viewName, this.viewGenerator, + ta, this.score, allowOverlappingSpans); + allowOverlappingSpans = false; + noOverlappingSpansView = new SpanLabelView(this.viewName, this.viewGenerator, + ta, this.score, allowOverlappingSpans); + + 
baseConstituent = new Constituent("BASE", this.score, this.viewName, ta, baseStart, baseEnd); + overlappingConstituent = new Constituent("OVER", this.score, this.viewName, ta, overStart, overEnd); + } + + @Test + public void testOverlappingSpans(){ + overlappingSpansView.addConstituent(baseConstituent); + overlappingSpansView.addConstituent(overlappingConstituent); + for(Constituent c : overlappingSpansView.getConstituents()){ + if(c.getLabel().equals("BASE")) { + assert c.getStartSpan() == this.baseStart; + assert c.getEndSpan() == this.baseEnd; + }else { + assert c.getStartSpan() == this.overStart; + assert c.getEndSpan() == this.overEnd; + } + } + } + + @Test(expected=IllegalArgumentException.class) + public void testNoOverlappingSpans(){ + noOverlappingSpansView.addConstituent(baseConstituent); + noOverlappingSpansView.addConstituent(overlappingConstituent); + } +} From a88173d7137de90089d9320d8580fe3c9e695d57 Mon Sep 17 00:00:00 2001 From: ChaseDuncan Date: Thu, 30 Aug 2018 13:18:25 -0500 Subject: [PATCH 2/5] Removed extra line and some superfluous, empty comment. 
--- .../core/datastructures/textannotation/SpanLabelView.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java index 86e9939c0..1aeb9f53e 100644 --- a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java +++ b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java @@ -5,16 +5,13 @@ * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign * http://cogcomp.cs.illinois.edu/ */ -/** - * - */ + package edu.illinois.cs.cogcomp.core.datastructures.textannotation; import java.util.ArrayList; import java.util.Collections; import java.util.List; - /** * A SpanLabelView is a specialized view which corresponds to contiguous chunks of tokens that have * a label. Each chunk corresponds to a single {@code Consituent}. 
In this view, there will be no From 01e8bd4a78fe6b64bee6725624ba037d60628586 Mon Sep 17 00:00:00 2001 From: ChaseDuncan Date: Tue, 4 Sep 2018 10:18:39 -0500 Subject: [PATCH 3/5] Addressing comments - Moved inline comment into JavaDoc comment above class in SpanLabelViewTest - Moved start, end variable initialization and overlap check into conditional statement so that they only occur when overlapping spans are not allowed - Moved SpanLabelViewTest into correct module --- .../datastructures/textannotation/SpanLabelView.java | 11 ++++++----- .../textannotation}/SpanLabelViewTest.java | 8 ++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) rename core-utilities/src/test/java/edu/illinois/cs/cogcomp/{annotation => core/datastructures/textannotation}/SpanLabelViewTest.java (93%) diff --git a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java index 1aeb9f53e..0988ddbad 100644 --- a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java +++ b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java @@ -59,11 +59,12 @@ public SpanLabelView(String viewName, String viewGenerator, TextAnnotation text, @Override public void addConstituent(Constituent constituent) { - int start = constituent.getStartSpan(); - int end = constituent.getEndSpan(); - - if (!allowOverlappingSpans && this.getConstituentsCoveringSpan(start, end).size() != 0) - throw new IllegalArgumentException("Span [" + start + ", " + end + "] already labeled."); + if (!allowOverlappingSpans) { + int start = constituent.getStartSpan(); + int end = constituent.getEndSpan(); + if (this.getConstituentsCoveringSpan(start, end).size() != 0) + throw new IllegalArgumentException("Span [" + start + ", " + end + "] already labeled."); + } 
super.addConstituent(constituent); diff --git a/core-utilities/src/test/java/edu/illinois/cs/cogcomp/annotation/SpanLabelViewTest.java b/core-utilities/src/test/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelViewTest.java similarity index 93% rename from core-utilities/src/test/java/edu/illinois/cs/cogcomp/annotation/SpanLabelViewTest.java rename to core-utilities/src/test/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelViewTest.java index 0ab4976c3..b086a7bd2 100644 --- a/core-utilities/src/test/java/edu/illinois/cs/cogcomp/annotation/SpanLabelViewTest.java +++ b/core-utilities/src/test/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelViewTest.java @@ -1,5 +1,7 @@ -package edu.illinois.cs.cogcomp.annotation; +package edu.illinois.cs.cogcomp.core.datastructures.textannotation; +import edu.illinois.cs.cogcomp.annotation.BasicTextAnnotationBuilder; +import edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder; import edu.illinois.cs.cogcomp.core.datastructures.IntPair; import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent; import edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView; @@ -11,8 +13,10 @@ import java.util.ArrayList; import java.util.List; +/** + * Test that addConstituent(Constituent) does not allow overlapping spans + */ public class SpanLabelViewTest { - // test that addConstituent(Constituent) does not allow overlapping spans SpanLabelView overlappingSpansView; SpanLabelView noOverlappingSpansView; TextAnnotation ta; From 786459cccefc18a718fe27ac29b2517c634a0db0 Mon Sep 17 00:00:00 2001 From: ChaseDuncan Date: Tue, 4 Sep 2018 14:10:00 -0500 Subject: [PATCH 4/5] Fixing TeamCity errors - Fixed multiple places in MD and one place in corpus readers where overlapping spans were implicitly or explicitly forbidden but still being used --- .../cs/cogcomp/nlp/corpusreaders/ereReader/ERENerReader.java | 2 +- 
md/src/main/java/org/cogcomp/md/BIOCombinedReader.java | 2 +- md/src/main/java/org/cogcomp/md/BIOReader.java | 2 +- md/src/main/java/org/cogcomp/md/ColumnFormatReader.java | 2 +- md/src/main/java/org/cogcomp/md/MentionAnnotator.java | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/corpusreaders/src/main/java/edu/illinois/cs/cogcomp/nlp/corpusreaders/ereReader/ERENerReader.java b/corpusreaders/src/main/java/edu/illinois/cs/cogcomp/nlp/corpusreaders/ereReader/ERENerReader.java index 4c4e7e62b..1ef07bd26 100644 --- a/corpusreaders/src/main/java/edu/illinois/cs/cogcomp/nlp/corpusreaders/ereReader/ERENerReader.java +++ b/corpusreaders/src/main/java/edu/illinois/cs/cogcomp/nlp/corpusreaders/ereReader/ERENerReader.java @@ -173,7 +173,7 @@ public List getAnnotationsFromFile(List corpusFileListE TextAnnotation ta = sourceTa.getTextAnnotation(); SpanLabelView tokens = (SpanLabelView) ta.getView(ViewNames.TOKENS); compileOffsets(tokens); - SpanLabelView nerView = new SpanLabelView(getMentionViewName(), NAME, ta, 1.0, false); + SpanLabelView nerView = new SpanLabelView(getMentionViewName(), NAME, ta, 1.0, true); // now pull all mentions we deal with. 
Start from file list index 1, as index 0 was source // text diff --git a/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java b/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java index 107f4c748..3f835fdce 100644 --- a/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java +++ b/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java @@ -172,7 +172,7 @@ private List getTokensFromTAs(){ mentionViewName = ViewNames.MENTION_ERE; } View mentionView = ta.getView(mentionViewName); - View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f); + View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f, true); String[] token2tags = new String[tokenView.getConstituents().size()]; for (int i = 0; i < token2tags.length; i++){ token2tags[i] = "O"; diff --git a/md/src/main/java/org/cogcomp/md/BIOReader.java b/md/src/main/java/org/cogcomp/md/BIOReader.java index 20c56617e..18971b21e 100644 --- a/md/src/main/java/org/cogcomp/md/BIOReader.java +++ b/md/src/main/java/org/cogcomp/md/BIOReader.java @@ -180,7 +180,7 @@ else if (_mode.equals("ColumnFormat")){ for (TextAnnotation ta : taList){ View tokenView = ta.getView(ViewNames.TOKENS); View mentionView = ta.getView(mentionViewName); - View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f); + View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f, true); String[] token2tags = new String[tokenView.getConstituents().size()]; for (int i = 0; i < token2tags.length; i++){ token2tags[i] = "O"; diff --git a/md/src/main/java/org/cogcomp/md/ColumnFormatReader.java b/md/src/main/java/org/cogcomp/md/ColumnFormatReader.java index 3290be1f0..8d6454d99 100644 --- a/md/src/main/java/org/cogcomp/md/ColumnFormatReader.java +++ b/md/src/main/java/org/cogcomp/md/ColumnFormatReader.java @@ -118,7 +118,7 @@ public TextAnnotation readSingleFile(String file){ tokens.add(curSentenceArr); } TextAnnotation ta = 
BasicTextAnnotationBuilder.createTextAnnotationFromTokens(tokens); - SpanLabelView mentionView = new SpanLabelView("MENTIONS", this.getClass().getCanonicalName(), ta, 1.0f); + SpanLabelView mentionView = new SpanLabelView("MENTIONS", this.getClass().getCanonicalName(), ta, 1.0f, true); if (mentionTypes.size() != mentions.size()){ System.out.println("ERROR"); } diff --git a/md/src/main/java/org/cogcomp/md/MentionAnnotator.java b/md/src/main/java/org/cogcomp/md/MentionAnnotator.java index eaadd196e..0834e5d0b 100644 --- a/md/src/main/java/org/cogcomp/md/MentionAnnotator.java +++ b/md/src/main/java/org/cogcomp/md/MentionAnnotator.java @@ -204,7 +204,7 @@ public void addView(TextAnnotation ta) throws AnnotatorException{ throw new AnnotatorException("Missing required view POS"); } View mentionView = new SpanLabelView(ViewNames.MENTION, MentionAnnotator.class.getCanonicalName(), ta, 1.0f, true); - View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f); + View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f, true); View tokenView = ta.getView(ViewNames.TOKENS); for (int i = tokenView.getStartSpan(); i < tokenView.getEndSpan(); i++){ Constituent currentToken = tokenView.getConstituentsCoveringToken(i).get(0).cloneForNewView("BIO"); From 4c33f4fd161d7fd4a974cafe57e8b97d207bed0e Mon Sep 17 00:00:00 2001 From: ChaseDuncan Date: Mon, 10 Sep 2018 15:39:38 -0500 Subject: [PATCH 5/5] Addressing CI failures - Added another ctor to TokenLabelView which has a parameter for specifying whether or not to allow overlapping spans - Changed the TokenLabelView ctor which is used in StanfordTrueCaseHandler since it was clear that overlapping spans are desired --- .../datastructures/textannotation/TokenLabelView.java | 9 +++++++++ .../pipeline/handlers/StanfordTrueCaseHandler.java | 2 +- .../nlp/utility/TokenizerTextAnnotationBuilder.java | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git 
a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/TokenLabelView.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/TokenLabelView.java index d03f13a3d..ec377d93c 100644 --- a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/TokenLabelView.java +++ b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/TokenLabelView.java @@ -31,10 +31,19 @@ public TokenLabelView(String viewName, TextAnnotation text) { this(viewName, viewName + "-annotator", text, 1.0); } + public TokenLabelView(String viewName, TextAnnotation text, boolean allowOverlappingSpans) { + this(viewName, viewName + "-annotator", text, 1.0, allowOverlappingSpans); + } + public TokenLabelView(String viewName, String viewGenerator, TextAnnotation text, double score) { super(viewName, viewGenerator, text, score); } + public TokenLabelView(String viewName, String viewGenerator, TextAnnotation text, double score, + boolean allowOverlappingSpans) { + super(viewName, viewGenerator, text, score, allowOverlappingSpans); + } + /** * Adds a label to a token and returns the newly created constituent. 
* diff --git a/external/stanford_3.3.1/src/main/java/edu/illinois/cs/cogcomp/pipeline/handlers/StanfordTrueCaseHandler.java b/external/stanford_3.3.1/src/main/java/edu/illinois/cs/cogcomp/pipeline/handlers/StanfordTrueCaseHandler.java index 7af3c7f8f..1db6f7357 100644 --- a/external/stanford_3.3.1/src/main/java/edu/illinois/cs/cogcomp/pipeline/handlers/StanfordTrueCaseHandler.java +++ b/external/stanford_3.3.1/src/main/java/edu/illinois/cs/cogcomp/pipeline/handlers/StanfordTrueCaseHandler.java @@ -54,7 +54,7 @@ public void initialize(ResourceManager rm) { public void addView(TextAnnotation ta) throws AnnotatorException { Annotation document = new Annotation(ta.text); pipeline.annotate(document); - TokenLabelView vu = new TokenLabelView(viewName, ta); + TokenLabelView vu = new TokenLabelView(viewName, ta, true); for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) { for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) { diff --git a/tokenizer/src/main/java/edu/illinois/cs/cogcomp/nlp/utility/TokenizerTextAnnotationBuilder.java b/tokenizer/src/main/java/edu/illinois/cs/cogcomp/nlp/utility/TokenizerTextAnnotationBuilder.java index 80ea8f250..e651eb443 100644 --- a/tokenizer/src/main/java/edu/illinois/cs/cogcomp/nlp/utility/TokenizerTextAnnotationBuilder.java +++ b/tokenizer/src/main/java/edu/illinois/cs/cogcomp/nlp/utility/TokenizerTextAnnotationBuilder.java @@ -138,7 +138,7 @@ public TextAnnotation createTextAnnotation(String corpusId, String textId, Strin TextAnnotation ta = new TextAnnotation(corpusId, textId, text, tokenization.getCharacterOffsets(), tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes()); SpanLabelView view = - new SpanLabelView(ViewNames.SENTENCE, NAME, ta, 1.0); + new SpanLabelView(ViewNames.SENTENCE, NAME, ta, 1); int start = 0; for (int s : tokenization.getSentenceEndTokenIndexes()) {