Created passive filter, targeting passive_1 and passive_2 in BLiMP

CLMBRs · Jan 28, 2024 · efc8eef · efc8eef
1 parent f195442
commit efc8eef
Show file tree

Hide file tree

Showing 4 changed files with 269 additions and 0 deletions.
diff --git a/corpus_filtering/filters/stanza_filters.py b/corpus_filtering/filters/stanza_filters.py
@@ -632,3 +632,91 @@ def _exclude_sent(self, sent: StanzaSentence) -> bool:
                         return True
 
         return False
+
+
+@register_filter("passive")
+class PassiveFilteredCorpusWriter(PickleStanzaDocCorpusFilterWriter):
+    """
+    A filter for sentences where a verb in a list of verbs appears in the passive. The
+    list of verbs is generated from the verbs used in the following BLiMP benchmark sets
+    (the ones targeted by this filter):
+        passive_1
+        passive_2
+
+    In English, within the Universal Dependencies standards and conventions, a passive
+    verb will have the "Voice=Pass" feature. However, in English there is an ambiguity
+    where a copula + adjective can have the same form as a copula + passive verb. For
+    example:
+
+        He was admired (by everyone).
+
+    Without the by-PP, "admired" is ambiguous- it can be analyzed as either an adjective
+    or a passive verb. Since, ceteris paribus, we prefer stronger filters to weaker
+    ones,  we choose to filter out such sentences even when Stanza parses them as
+    adjectives. In that case, we look for the following structure:
+
+        Copula [1]: deprel = cop, (dependency) head = [2]
+        Adjective [2]: text or lemma in our verb list
+
+    Example sentences targeted by this filter:
+        1. Lucille's sisters are confused by Amy.
+        2. Sherry's partners aren't escaped from by Elizabeth.
+        3. Jason's grandmothers weren't cared for by Joseph.
+        4. Most cashiers are disliked.
+        5. All pedestrians are cared for.
+
+    Example sentences NOT targeted by this filter:
+        1. Amy confuses Lucille's sisters.
+        2. Elizabeth escapes from Sherry's partners.
+        3. Joseph cares for Jason's grandmothers.
+        4. Most cashiers are assaulted.
+        5. All pedestrians care.
+    Note that (4) is not targeted because "assault" is not in the word list.
+
+    For more information and examples, refer to the UD English documentation on Voice:
+        https://universaldependencies.org/u/feat/Voice.html#Pass
+        https://universaldependencies.org/u/overview/morphology.html
+    """
+
+    cli_subcmd_constructor_kwargs = {
+        "description": f"Description:\n{__doc__}",
+        "formatter_class": argparse.RawDescriptionHelpFormatter,
+    }
+
+    verb_list_path = "data/blimp/passive/verbs.txt"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # read verb list
+        with open(self.verb_list_path, "r") as f:
+            self.verb_set: set[str] = {line.strip().lower() for line in f}
+
+    def _exclude_sent(self, sent: StanzaSentence) -> bool:
+        """Exclude a sentence if a verb from a list of verbs appearing in the BLiMP
+        passive_1 or passive_2 benchmark sets appears in the sentence as a passive. For
+        more information, see the class docstring.
+
+        Args:
+            sent: A stanza `Sentence` object that has been annotated with dependency
+            relations.
+
+        Returns:
+            True if the sentence has a verb from the verb list in the passive form;
+            False otherwise.
+        """
+        for head, deprel, word in sent.dependencies:
+            # for _, _, word in sent.dependencies:
+            if word.feats is not None and "Voice=Pass" in word.feats:
+                if (
+                    word.text.lower() in self.verb_set
+                    or word.lemma.lower() in self.verb_set
+                ):
+                    return True
+            # handle "copula + adjective" == "copula + passive" ambiguity
+            if deprel == "cop" and head.id > 0:
+                if (
+                    head.text.lower() in self.verb_set
+                    or head.lemma.lower() in self.verb_set
+                ):
+                    return True
+        return False
diff --git a/data/blimp/passive/verbs.txt b/data/blimp/passive/verbs.txt
@@ -0,0 +1,127 @@
+admired
+aggravated
+aided
+alarmed
+annoyed
+answered
+appreciated
+approached
+argued
+arrived
+ascended
+astounded
+attacked
+blinked
+boasted
+bored
+bothered
+bought
+boycotted
+brought
+cared
+chatted
+chuckled
+clashed
+cleaned
+collaborated
+come
+communicated
+competed
+complained
+compromised
+concurred
+conferred
+confused
+conspired
+cooperated
+coped
+corresponded
+cried
+criticized
+described
+died
+disagreed
+discussed
+disgusted
+disliked
+distracted
+disturbed
+embarrassed
+escaped
+examined
+exited
+explored
+fallen
+fired
+flirted
+forgotten
+gone
+gotten
+grinned
+harmed
+hated
+helped
+hindered
+hired
+hugged
+hurt
+impressed
+insulted
+interacted
+investigated
+irritated
+joked
+kissed
+known
+laughed
+left
+lied
+lifted
+liked
+littered
+loved
+murmured
+muttered
+negotiated
+nodded
+observed
+overwhelmed
+passed
+praised
+profited
+purchased
+reacted
+referenced
+remembered
+replied
+respected
+responded
+retaliated
+rotted
+scanned
+scared
+screamed
+seen
+shocked
+shouted
+shrugged
+sighed
+smiled
+sold
+spoken
+struggled
+stunned
+suffered
+talked
+testified
+toured
+trained
+underwhelmed
+upset
+visited
+watched
+waved
+wept
+worked
+worn
+worried
diff --git a/data/filtered_corpuses/.gitignore b/data/filtered_corpuses/.gitignore
@@ -7,3 +7,4 @@ binding-*/principle_A*.out
 re-*/*regular_plural_subject_verb_agreement_*.out
 *-quantifier/*_quantifiers_*.out
 det-adj-noun/determiner_noun_agreement_with_adj*.out
+passive/passive_*.out
diff --git a/scripts/gen_blimp_passive_verbs_list.py b/scripts/gen_blimp_passive_verbs_list.py
@@ -0,0 +1,53 @@
+# This script generates the verbs used as passives in the following BLiMP
+# benchmark sets:
+#   passive_1
+#   passive_2
+#
+# This script cannot be run as-is, within this directory, as it requires the BLiMP
+# data generation scripts.
+# How to run:
+#   1. `git clone` the `data_generation` repository made available by the BLiMP paper
+#       authors here: https://github.com/alexwarstadt/data_generation/
+#   2. Go to the directory where you cloned the repo, then run `git checkout blimp`
+#   3. Comment out line 16 of `data_generation/utils/vocab_table.py`. This is due to a
+#       bug/inconsistency in the data generation repo.
+#   4. Copy this script to the root of the `data_generation` folder.
+#   5. `pip install jsonlines` as required by the `data_generation` scripts.
+#   6. Run this script (within the data_generation folder) with python, followed by one
+#       argument: the output file where you want to write the list of verbs.
+
+import sys
+
+from utils.vocab_table import get_all  # , get_matches_of
+
+
+def get_pass_verbs():
+    """Get the verbs used in the passive_1 and passive_2 benchmark sets.
+
+    The transitive verbs below are used for the "good" sentences, while the intransitive
+    ones are used for the "bad" sentences. Thus, in principle, the intransitive ones
+    should never appear passively, but we include them anyways, just in case this
+    assumption doesn't hold up (and it very likely does not, due to polysemy and other
+    reasons).
+
+    Note that some expressions are multiple words e.g. "fallen asleep". Two assumptions
+    follow:
+
+    1. multi-word expressions are head-initial. This assumption has been manually
+    validated and found to be true for the relevant BLiMP benchmark sets.
+    2. The valency of the head verb and of the phrase as a whole are the same. This is
+    known to *not* be true, but the result will only be an excessively strong filter, so
+    this is acceptable.
+    """
+    en_verbs = get_all("en", "1")
+    intransitive = {_[0] for _ in get_all("passive", "0", en_verbs)}
+    transitive = {_[0] for _ in get_all("passive", "1", en_verbs)}
+    return transitive | intransitive
+
+
+if __name__ == "__main__":
+    out = sys.argv[1]
+    verbs = {_.split()[0] for _ in get_pass_verbs()}
+
+    with open(out, "w") as f:
+        print(*sorted(verbs), sep="\n", file=f)