Respond to PR comments + cleanup
Ubadub committed Feb 9, 2024
1 parent 15dc915 commit 4d4d420
Showing 3 changed files with 60 additions and 69 deletions.
3 changes: 1 addition & 2 deletions corpus_filtering/filters/stanza_filters.py
@@ -392,8 +392,7 @@ def _exclude_sent(self, sent: StanzaSentence) -> bool:
 
         for head, deprel, word in sent.dependencies:
             # look for members of first set
-            if word.lemma == "there" and deprel == "expl":  # existential there
-                if head.lemma == "be":  # probably a redundant check
+            if word.lemma == "there" and deprel == "expl" and head.lemma == "be":  # existential there
                 there_copulas.add(head.id)
             # look for members of second set
             elif head.head and word.lemma in self.quantifiers:
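For reference, the collapsed check walks Stanza's per-sentence dependency triples exactly as before, just without the nested branch. A minimal standalone sketch of the same test (the pipeline setup and the helper name `has_there_copula` are illustrative, not part of this commit):

    import stanza

    # stanza.download("en")  # one-time model download, if not already present
    nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")

    def has_there_copula(text: str) -> bool:
        for sent in nlp(text).sentences:
            # sent.dependencies yields (head_word, deprel, dependent_word) triples
            for head, deprel, word in sent.dependencies:
                # existential "there" attaches as an expletive to a form of "be"
                if word.lemma == "there" and deprel == "expl" and head.lemma == "be":
                    return True
        return False

    print(has_there_copula("There are two cats."))    # True
    print(has_there_copula("Two cats are outside."))  # False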
51 changes: 25 additions & 26 deletions scripts/gen_blimp_det_noun_agr_nouns_list.py
@@ -1,22 +1,23 @@
-# This script generates the nouns used as det+noun constructions in the following BLiMP
-# benchmark sets:
-# determiner_noun_agreement_1
-# determiner_noun_agreement_2
-# determiner_noun_agreement_irregular_1
-# determiner_noun_agreement_irregular_2
-#
-# This script cannot be run as-is, within this directory, as it requires the BLiMP
-# data generation scripts.
-# How to run:
-# 1. `git clone` the `data_generation` repository made available by the BLiMP paper
-# authors here: https://github.com/alexwarstadt/data_generation/
-# 2. Go to the directory where you cloned the repo, then run `git checkout blimp`
-# 3. Comment out line 16 of `data_generation/utils/vocab_table.py`. This is due to a
-# bug/inconsistency in the data generation repo.
-# 4. Copy this script to root `data_generation` folder.
-# 5. `pip install jsonlines` as required by the `data_generation` scripts.
-# 6. Run this script (within the data_generation folder) with python, followed by one
-# argument: the output file where you want to write the list of verbs.
"""This script generates the nouns used as det+noun constructions in the following BLiMP
benchmark sets:
determiner_noun_agreement_1
determiner_noun_agreement_2
determiner_noun_agreement_irregular_1
determiner_noun_agreement_irregular_2
This script cannot be run as-is, within this directory, as it requires the BLiMP
data generation scripts.
How to run:
1. `git clone` the `data_generation` repository made available by the BLiMP paper
authors here: https://github.com/alexwarstadt/data_generation/
2. Go to the directory where you cloned the repo, then run `git checkout blimp`
3. Comment out line 16 of `data_generation/utils/vocab_table.py`. This is due to a
bug/inconsistency in the data generation repo.
4. Copy this script to root `data_generation` folder.
5. `pip install jsonlines` as required by the `data_generation` scripts.
6. Run this script (within the data_generation folder) with python, followed by one
argument: the output file where you want to write the list of verbs.
"""

import sys

@@ -26,24 +27,22 @@
 from utils.vocab_table import get_all, get_all_conjunctive
 
 
-def get_determiner_noun_agreement_nouns():
+if __name__ == "__main__":
+    out = sys.argv[1]
     all_null_plural_nouns = get_all("sgequalspl", "1")
     all_missingPluralSing_nouns = get_all_conjunctive(
         [("pluralform", ""), ("singularform", "")]
     )
     all_unusable_nouns = np.union1d(all_null_plural_nouns, all_missingPluralSing_nouns)
     all_pluralizable_nouns = np.setdiff1d(all_common_nouns, all_unusable_nouns)
-    return (
+    det_noun_agr_nouns = (
         set(all_pluralizable_nouns["expression"])
         | set(all_pluralizable_nouns["singularform"])
         | set(all_pluralizable_nouns["pluralform"])
     ) - {""}
 
-
-if __name__ == "__main__":
-    out = sys.argv[1]
     # Assumption: in multi-word nouns, final word is the head
-    nouns = {_.split()[-1] for _ in get_determiner_noun_agreement_nouns()}
+    det_noun_agr_nouns = {_.split()[-1] for _ in det_noun_agr_nouns}
 
     with open(out, "w") as f:
-        print(*sorted(nouns), sep="\n", file=f)
+        print(*sorted(det_noun_agr_nouns), sep="\n", file=f)
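The restructuring above keeps the same numpy set logic: two exclusion lists are merged with `union1d` and then subtracted from the common-noun table with `setdiff1d`. A toy sketch of that pattern on plain string arrays (the real script operates on structured vocab-table rows returned by `get_all`; the sample words here are invented):

    import numpy as np

    all_common_nouns = np.array(["dog", "sheep", "oats"])
    null_plural_nouns = np.array(["sheep"])   # singular form equals plural form
    missing_form_nouns = np.array(["oats"])   # lacks a singular form

    # Merge the exclusion lists, then drop them from the full table.
    unusable = np.union1d(null_plural_nouns, missing_form_nouns)
    pluralizable = np.setdiff1d(all_common_nouns, unusable)
    print(pluralizable)  # ['dog']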
75 changes: 34 additions & 41 deletions scripts/gen_blimp_passive_verbs_list.py
@@ -1,53 +1,46 @@
-# This script generates the verbs used as passives in the following BLiMP
-# benchmark sets:
-# passive_1
-# passive_2
-#
-# This script cannot be run as-is, within this directory, as it requires the BLiMP
-# data generation scripts.
-# How to run:
-# 1. `git clone` the `data_generation` repository made available by the BLiMP paper
-# authors here: https://github.com/alexwarstadt/data_generation/
-# 2. Go to the directory where you cloned the repo, then run `git checkout blimp`
-# 3. Comment out line 16 of `data_generation/utils/vocab_table.py`. This is due to a
-# bug/inconsistency in the data generation repo.
-# 4. Copy this script to root `data_generation` folder.
-# 5. `pip install jsonlines` as required by the `data_generation` scripts.
-# 6. Run this script (within the data_generation folder) with python, followed by one
-# argument: the output file where you want to write the list of verbs.
"""This script generates the verbs used as passives in the following BLiMP
benchmark sets:
passive_1
passive_2
This script cannot be run as-is, within this directory, as it requires the BLiMP
data generation scripts.
How to run:
1. `git clone` the `data_generation` repository made available by the BLiMP paper
authors here: https://github.com/alexwarstadt/data_generation/
2. Go to the directory where you cloned the repo, then run `git checkout blimp`
3. Comment out line 16 of `data_generation/utils/vocab_table.py`. This is due to a
bug/inconsistency in the data generation repo.
4. Copy this script to root `data_generation` folder.
5. `pip install jsonlines` as required by the `data_generation` scripts.
6. Run this script (within the data_generation folder) with python, followed by one
argument: the output file where you want to write the list of verbs.
"""

import sys

from utils.vocab_table import get_all # , get_matches_of


-def get_pass_verbs():
-    """Get the verbs used in the passive_1 and passive_2 benchmark sets.
-    The transitive verbs below are used for the "good" sentences, while the intransitive
-    ones are used for the "bad" sentences. Thus, in principle, the intransitive ones
-    should never appear passively, but we include them anyways, just in case this
-    assumption doesn't hold up (and it very likely does not, due to polysemy and other
-    reasons).
-    Note that some expressions are multiple words e.g. "fallen asleep". Two assumptions
-    follow:
-    1. multi-word expressions are head-initial. This assumption has been manually
-       validated and found to be true for the relevant BLiMP benchmark sets.
-    2. The valency of the head verb and of the phrase as a whole are the same. This is
-       known to *not* be true, but the result will only be an excessively strong filter, so
-       this is acceptable.
-    """
+if __name__ == "__main__":
+    out = sys.argv[1]
     en_verbs = get_all("en", "1")
+    # The transitive verbs below are used for the "good" sentences, while the intransitive
+    # ones are used for the "bad" sentences. Thus, in principle, the intransitive ones
+    # should never appear passively, but we include them anyway, just in case this
+    # assumption doesn't hold up (and it very likely does not, due to polysemy and other
+    # reasons).
     intransitive = {_[0] for _ in get_all("passive", "0", en_verbs)}
     transitive = {_[0] for _ in get_all("passive", "1", en_verbs)}
-    return transitive | intransitive
 
-
-if __name__ == "__main__":
-    out = sys.argv[1]
-    verbs = {_.split()[0] for _ in get_pass_verbs()}
+    # Note that some expressions are multiple words, e.g. "fallen asleep". Two assumptions
+    # follow:
+
+    # 1. Multi-word expressions are head-initial. This assumption has been manually
+    #    validated and found to be true for the relevant BLiMP benchmark sets.
+    # 2. The valency of the head verb and of the phrase as a whole are the same. This is
+    #    known to *not* be true, but the result will only be an excessively strong filter, so
+    #    this is acceptable.
+    verbs = {_.split()[0] for _ in transitive | intransitive}
 
     with open(out, "w") as f:
         print(*sorted(verbs), sep="\n", file=f)
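Together, the two scripts encode opposite headedness assumptions: multi-word nouns are taken to be head-final (`split()[-1]` in the nouns script), while multi-word verbs are taken to be head-initial (`split()[0]` here). A tiny illustration with invented expressions:

    # Multi-word nouns: assume the final word is the head.
    nouns = {"dog", "high school"}
    print({n.split()[-1] for n in nouns})  # {'dog', 'school'}

    # Multi-word verbs: assume the first word is the head.
    verbs = {"fall", "fallen asleep"}
    print({v.split()[0] for v in verbs})   # {'fall', 'fallen'}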
