Respond to PR comments + cleanup
Ubadub committed Feb 9, 2024
1 parent 15dc915 commit 4d4d420
Showing 3 changed files with 60 additions and 69 deletions.
3 changes: 1 addition & 2 deletions corpus_filtering/filters/stanza_filters.py
@@ -392,8 +392,7 @@ def _exclude_sent(self, sent: StanzaSentence) -> bool:
 
         for head, deprel, word in sent.dependencies:
             # look for members of first set
-            if word.lemma == "there" and deprel == "expl":  # existential there
-                if head.lemma == "be":  # probably a redundant check
+            if word.lemma == "there" and deprel == "expl" and head.lemma == "be":  # existential there
                 there_copulas.add(head.id)
             # look for members of second set
             elif head.head and word.lemma in self.quantifiers:
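For reference, the collapsed check walks Stanza's per-sentence dependency triples exactly as before, just without the nested branch. A minimal standalone sketch of the same test (the pipeline setup and the helper name `has_there_copula` are illustrative, not part of this commit):

    import stanza

    # stanza.download("en")  # one-time model download, if not already present
    nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")

    def has_there_copula(text: str) -> bool:
        for sent in nlp(text).sentences:
            # sent.dependencies yields (head_word, deprel, dependent_word) triples
            for head, deprel, word in sent.dependencies:
                # existential "there" attaches as an expletive to a form of "be"
                if word.lemma == "there" and deprel == "expl" and head.lemma == "be":
                    return True
        return False

    print(has_there_copula("There are two cats."))    # True
    print(has_there_copula("Two cats are outside."))  # False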
51 changes: 25 additions & 26 deletions scripts/gen_blimp_det_noun_agr_nouns_list.py
@@ -1,22 +1,23 @@
-# This script generates the nouns used as det+noun constructions in the following BLiMP
-# benchmark sets:
-# determiner_noun_agreement_1
-# determiner_noun_agreement_2
-# determiner_noun_agreement_irregular_1
-# determiner_noun_agreement_irregular_2
-#
-# This script cannot be run as-is, within this directory, as it requires the BLiMP
-# data generation scripts.
-# How to run:
-# 1. `git clone` the `data_generation` repository made available by the BLiMP paper
-# authors here: https://github.com/alexwarstadt/data_generation/
-# 2. Go to the directory where you cloned the repo, then run `git checkout blimp`
-# 3. Comment out line 16 of `data_generation/utils/vocab_table.py`. This is due to a
-# bug/inconsistency in the data generation repo.
-# 4. Copy this script to root `data_generation` folder.
-# 5. `pip install jsonlines` as required by the `data_generation` scripts.
-# 6. Run this script (within the data_generation folder) with python, followed by one
-# argument: the output file where you want to write the list of verbs.
"""This script generates the nouns used as det+noun constructions in the following BLiMP
benchmark sets:
determiner_noun_agreement_1
determiner_noun_agreement_2
determiner_noun_agreement_irregular_1
determiner_noun_agreement_irregular_2
This script cannot be run as-is, within this directory, as it requires the BLiMP
data generation scripts.
How to run:
1. `git clone` the `data_generation` repository made available by the BLiMP paper
authors here: https://github.com/alexwarstadt/data_generation/
2. Go to the directory where you cloned the repo, then run `git checkout blimp`
3. Comment out line 16 of `data_generation/utils/vocab_table.py`. This is due to a
bug/inconsistency in the data generation repo.
4. Copy this script to root `data_generation` folder.
5. `pip install jsonlines` as required by the `data_generation` scripts.
6. Run this script (within the data_generation folder) with python, followed by one
argument: the output file where you want to write the list of verbs.
"""

import sys

@@ -26,24 +27,22 @@
 from utils.vocab_table import get_all, get_all_conjunctive
 
 
-def get_determiner_noun_agreement_nouns():
+if __name__ == "__main__":
+    out = sys.argv[1]
     all_null_plural_nouns = get_all("sgequalspl", "1")
     all_missingPluralSing_nouns = get_all_conjunctive(
         [("pluralform", ""), ("singularform", "")]
     )
     all_unusable_nouns = np.union1d(all_null_plural_nouns, all_missingPluralSing_nouns)
     all_pluralizable_nouns = np.setdiff1d(all_common_nouns, all_unusable_nouns)
-    return (
+    det_noun_agr_nouns = (
         set(all_pluralizable_nouns["expression"])
         | set(all_pluralizable_nouns["singularform"])
         | set(all_pluralizable_nouns["pluralform"])
     ) - {""}
 
-
-if __name__ == "__main__":
-    out = sys.argv[1]
     # Assumption: in multi-word nouns, final word is the head
-    nouns = {_.split()[-1] for _ in get_determiner_noun_agreement_nouns()}
+    det_noun_agr_nouns = {_.split()[-1] for _ in det_noun_agr_nouns}
 
     with open(out, "w") as f:
-        print(*sorted(nouns), sep="\n", file=f)
+        print(*sorted(det_noun_agr_nouns), sep="\n", file=f)
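The restructuring above keeps the same numpy set logic: two exclusion lists are merged with `union1d` and then subtracted from the common-noun table with `setdiff1d`. A toy sketch of that pattern on plain string arrays (the real script operates on structured vocab-table rows returned by `get_all`; the sample words here are invented):

    import numpy as np

    all_common_nouns = np.array(["dog", "sheep", "oats"])
    null_plural_nouns = np.array(["sheep"])   # singular form equals plural form
    missing_form_nouns = np.array(["oats"])   # lacks a singular form

    # Merge the exclusion lists, then drop them from the full table.
    unusable = np.union1d(null_plural_nouns, missing_form_nouns)
    pluralizable = np.setdiff1d(all_common_nouns, unusable)
    print(pluralizable)  # ['dog']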
75 changes: 34 additions & 41 deletions scripts/gen_blimp_passive_verbs_list.py
@@ -1,53 +1,46 @@
-# This script generates the verbs used as passives in the following BLiMP
-# benchmark sets:
-# passive_1
-# passive_2
-#
-# This script cannot be run as-is, within this directory, as it requires the BLiMP
-# data generation scripts.
-# How to run:
-# 1. `git clone` the `data_generation` repository made available by the BLiMP paper
-# authors here: https://github.com/alexwarstadt/data_generation/
-# 2. Go to the directory where you cloned the repo, then run `git checkout blimp`
-# 3. Comment out line 16 of `data_generation/utils/vocab_table.py`. This is due to a
-# bug/inconsistency in the data generation repo.
-# 4. Copy this script to root `data_generation` folder.
-# 5. `pip install jsonlines` as required by the `data_generation` scripts.
-# 6. Run this script (within the data_generation folder) with python, followed by one
-# argument: the output file where you want to write the list of verbs.
"""This script generates the verbs used as passives in the following BLiMP
benchmark sets:
passive_1
passive_2
This script cannot be run as-is, within this directory, as it requires the BLiMP
data generation scripts.
How to run:
1. `git clone` the `data_generation` repository made available by the BLiMP paper
authors here: https://github.com/alexwarstadt/data_generation/
2. Go to the directory where you cloned the repo, then run `git checkout blimp`
3. Comment out line 16 of `data_generation/utils/vocab_table.py`. This is due to a
bug/inconsistency in the data generation repo.
4. Copy this script to root `data_generation` folder.
5. `pip install jsonlines` as required by the `data_generation` scripts.
6. Run this script (within the data_generation folder) with python, followed by one
argument: the output file where you want to write the list of verbs.
"""

import sys

from utils.vocab_table import get_all # , get_matches_of


-def get_pass_verbs():
-    """Get the verbs used in the passive_1 and passive_2 benchmark sets.
-    The transitive verbs below are used for the "good" sentences, while the intransitive
-    ones are used for the "bad" sentences. Thus, in principle, the intransitive ones
-    should never appear passively, but we include them anyways, just in case this
-    assumption doesn't hold up (and it very likely does not, due to polysemy and other
-    reasons).
-    Note that some expressions are multiple words e.g. "fallen asleep". Two assumptions
-    follow:
-    1. multi-word expressions are head-initial. This assumption has been manually
-       validated and found to be true for the relevant BLiMP benchmark sets.
-    2. The valency of the head verb and of the phrase as a whole are the same. This is
-       known to *not* be true, but the result will only be an excessively strong filter, so
-       this is acceptable.
-    """
+if __name__ == "__main__":
+    out = sys.argv[1]
     en_verbs = get_all("en", "1")
+    # The transitive verbs below are used for the "good" sentences, while the intransitive
+    # ones are used for the "bad" sentences. Thus, in principle, the intransitive ones
+    # should never appear passively, but we include them anyway, just in case this
+    # assumption doesn't hold up (and it very likely does not, due to polysemy and other
+    # reasons).
     intransitive = {_[0] for _ in get_all("passive", "0", en_verbs)}
     transitive = {_[0] for _ in get_all("passive", "1", en_verbs)}
-    return transitive | intransitive
 
-
-if __name__ == "__main__":
-    out = sys.argv[1]
-    verbs = {_.split()[0] for _ in get_pass_verbs()}
+    # Note that some expressions are multiple words, e.g. "fallen asleep". Two assumptions
+    # follow:
+
+    # 1. Multi-word expressions are head-initial. This assumption has been manually
+    #    validated and found to be true for the relevant BLiMP benchmark sets.
+    # 2. The valency of the head verb and of the phrase as a whole are the same. This is
+    #    known to *not* be true, but the result will only be an excessively strong filter, so
+    #    this is acceptable.
+    verbs = {_.split()[0] for _ in transitive | intransitive}
 
     with open(out, "w") as f:
         print(*sorted(verbs), sep="\n", file=f)
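Together, the two scripts encode opposite headedness assumptions: multi-word nouns are taken to be head-final (`split()[-1]` in the nouns script), while multi-word verbs are taken to be head-initial (`split()[0]` here). A tiny illustration with invented expressions:

    # Multi-word nouns: assume the final word is the head.
    nouns = {"dog", "high school"}
    print({n.split()[-1] for n in nouns})  # {'dog', 'school'}

    # Multi-word verbs: assume the first word is the head.
    verbs = {"fall", "fallen asleep"}
    print({v.split()[0] for v in verbs})   # {'fall', 'fallen'}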
