diff --git a/corpus_filtering/filters/stanza_filters.py b/corpus_filtering/filters/stanza_filters.py
index 6f57d02..a6fdc7e 100755
--- a/corpus_filtering/filters/stanza_filters.py
+++ b/corpus_filtering/filters/stanza_filters.py
@@ -392,8 +392,7 @@
     def _exclude_sent(self, sent: StanzaSentence) -> bool:
         for head, deprel, word in sent.dependencies:
             # look for members of first set
-            if word.lemma == "there" and deprel == "expl":  # existential there
-                if head.lemma == "be":  # probably a redundant check
-                    there_copulas.add(head.id)
+            if word.lemma == "there" and deprel == "expl" and head.lemma == "be":  # existential there
+                there_copulas.add(head.id)
             # look for members of second set
             elif head.head and word.lemma in self.quantifiers:
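A note on the collapsed conditional: Stanza yields each parse as (head, deprel, word) triples, exactly as iterated above, and in the Universal Dependencies analysis an existential "there" attaches as `expl` to the verb "be" itself, which is why the nested lemma check folds into a single condition. A minimal sketch of what the filter sees, assuming the English Stanza models have already been downloaded:

    # Illustrative sketch, not part of the patch; assumes stanza.download("en")
    # has been run once.
    import stanza

    nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
    sent = nlp("There are many cats in the garden.").sentences[0]

    for head, deprel, word in sent.dependencies:
        if word.lemma == "there" and deprel == "expl" and head.lemma == "be":
            # UD makes existential "be" the head of the expletive, so this
            # prints the copula's index and text, e.g. "2 are".
            print(head.id, head.text)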
+""" import sys @@ -26,24 +27,22 @@ from utils.vocab_table import get_all, get_all_conjunctive -def get_determiner_noun_agreement_nouns(): +if __name__ == "__main__": + out = sys.argv[1] all_null_plural_nouns = get_all("sgequalspl", "1") all_missingPluralSing_nouns = get_all_conjunctive( [("pluralform", ""), ("singularform", "")] ) all_unusable_nouns = np.union1d(all_null_plural_nouns, all_missingPluralSing_nouns) all_pluralizable_nouns = np.setdiff1d(all_common_nouns, all_unusable_nouns) - return ( + det_noun_agr_nouns = ( set(all_pluralizable_nouns["expression"]) | set(all_pluralizable_nouns["singularform"]) | set(all_pluralizable_nouns["pluralform"]) ) - {""} - -if __name__ == "__main__": - out = sys.argv[1] # Assumption: in multi-word nouns, final word is the head - nouns = {_.split()[-1] for _ in get_determiner_noun_agreement_nouns()} + det_noun_agr_nouns = {_.split()[-1] for _ in det_noun_agr_nouns} with open(out, "w") as f: - print(*sorted(nouns), sep="\n", file=f) + print(*sorted(det_noun_agr_nouns), sep="\n", file=f) diff --git a/scripts/gen_blimp_passive_verbs_list.py b/scripts/gen_blimp_passive_verbs_list.py index 49d879f..907bb78 100644 --- a/scripts/gen_blimp_passive_verbs_list.py +++ b/scripts/gen_blimp_passive_verbs_list.py @@ -1,53 +1,46 @@ -# This script generates the verbs used as passives in the following BLiMP -# benchmark sets: -# passive_1 -# passive_2 -# -# This script cannot be run as-is, within this directory, as it requires the BLiMP -# data generation scripts. -# How to run: -# 1. `git clone` the `data_generation` repository made available by the BLiMP paper -# authors here: https://github.com/alexwarstadt/data_generation/ -# 2. Go to the directory where you cloned the repo, then run `git checkout blimp` -# 3. Comment out line 16 of `data_generation/utils/vocab_table.py`. This is due to a -# bug/inconsistency in the data generation repo. -# 4. Copy this script to root `data_generation` folder. -# 5. `pip install jsonlines` as required by the `data_generation` scripts. -# 6. Run this script (within the data_generation folder) with python, followed by one -# argument: the output file where you want to write the list of verbs. +"""This script generates the verbs used as passives in the following BLiMP +benchmark sets: + passive_1 + passive_2 + +This script cannot be run as-is, within this directory, as it requires the BLiMP +data generation scripts. +How to run: + 1. `git clone` the `data_generation` repository made available by the BLiMP paper + authors here: https://github.com/alexwarstadt/data_generation/ + 2. Go to the directory where you cloned the repo, then run `git checkout blimp` + 3. Comment out line 16 of `data_generation/utils/vocab_table.py`. This is due to a + bug/inconsistency in the data generation repo. + 4. Copy this script to root `data_generation` folder. + 5. `pip install jsonlines` as required by the `data_generation` scripts. + 6. Run this script (within the data_generation folder) with python, followed by one + argument: the output file where you want to write the list of verbs. +""" import sys from utils.vocab_table import get_all # , get_matches_of -def get_pass_verbs(): - """Get the verbs used in the passive_1 and passive_2 benchmark sets. - - The transitive verbs below are used for the "good" sentences, while the intransitive - ones are used for the "bad" sentences. 
diff --git a/scripts/gen_blimp_passive_verbs_list.py b/scripts/gen_blimp_passive_verbs_list.py
index 49d879f..907bb78 100644
--- a/scripts/gen_blimp_passive_verbs_list.py
+++ b/scripts/gen_blimp_passive_verbs_list.py
@@ -1,53 +1,46 @@
-# This script generates the verbs used as passives in the following BLiMP
-# benchmark sets:
-# passive_1
-# passive_2
-#
-# This script cannot be run as-is, within this directory, as it requires the BLiMP
-# data generation scripts.
-# How to run:
-# 1. `git clone` the `data_generation` repository made available by the BLiMP paper
-#    authors here: https://github.com/alexwarstadt/data_generation/
-# 2. Go to the directory where you cloned the repo, then run `git checkout blimp`
-# 3. Comment out line 16 of `data_generation/utils/vocab_table.py`. This is due to a
-#    bug/inconsistency in the data generation repo.
-# 4. Copy this script to root `data_generation` folder.
-# 5. `pip install jsonlines` as required by the `data_generation` scripts.
-# 6. Run this script (within the data_generation folder) with python, followed by one
-#    argument: the output file where you want to write the list of verbs.
+"""This script generates the verbs used as passives in the following BLiMP
+benchmark sets:
+    passive_1
+    passive_2
+
+This script cannot be run as-is, within this directory, as it requires the BLiMP
+data generation scripts.
+How to run:
+    1. `git clone` the `data_generation` repository made available by the BLiMP paper
+       authors here: https://github.com/alexwarstadt/data_generation/
+    2. Go to the directory where you cloned the repo, then run `git checkout blimp`
+    3. Comment out line 16 of `data_generation/utils/vocab_table.py`. This is due to a
+       bug/inconsistency in the data generation repo.
+    4. Copy this script to root `data_generation` folder.
+    5. `pip install jsonlines` as required by the `data_generation` scripts.
+    6. Run this script (within the data_generation folder) with python, followed by one
+       argument: the output file where you want to write the list of verbs.
+"""

 import sys

 from utils.vocab_table import get_all  # , get_matches_of


-def get_pass_verbs():
-    """Get the verbs used in the passive_1 and passive_2 benchmark sets.
-
-    The transitive verbs below are used for the "good" sentences, while the intransitive
-    ones are used for the "bad" sentences. Thus, in principle, the intransitive ones
-    should never appear passively, but we include them anyways, just in case this
-    assumption doesn't hold up (and it very likely does not, due to polysemy and other
-    reasons).
-
-    Note that some expressions are multiple words e.g. "fallen asleep". Two assumptions
-    follow:
-
-    1. multi-word expressions are head-initial. This assumption has been manually
-    validated and found to be true for the relevant BLiMP benchmark sets.
-    2. The valency of the head verb and of the phrase as a whole are the same. This is
-    known to *not* be true, but the result will only be an excessively strong filter, so
-    this is acceptable.
-    """
+if __name__ == "__main__":
+    out = sys.argv[1]
     en_verbs = get_all("en", "1")
+    # The transitive verbs below are used for the "good" sentences, while the intransitive
+    # ones are used for the "bad" sentences. Thus, in principle, the intransitive ones
+    # should never appear passively, but we include them anyways, just in case this
+    # assumption doesn't hold up (and it very likely does not, due to polysemy and other
+    # reasons).
     intransitive = {_[0] for _ in get_all("passive", "0", en_verbs)}
     transitive = {_[0] for _ in get_all("passive", "1", en_verbs)}
-    return transitive | intransitive
-
-
-if __name__ == "__main__":
-    out = sys.argv[1]
-    verbs = {_.split()[0] for _ in get_pass_verbs()}
+    # Note that some expressions are multiple words e.g. "fallen asleep". Two assumptions
+    # follow:
+
+    # 1. multi-word expressions are head-initial. This assumption has been manually
+    # validated and found to be true for the relevant BLiMP benchmark sets.
+    # 2. The valency of the head verb and of the phrase as a whole are the same. This is
+    # known to *not* be true, but the result will only be an excessively strong filter, so
+    # this is acceptable.
+    verbs = {_.split()[0] for _ in transitive | intransitive}
     with open(out, "w") as f:
         print(*sorted(verbs), sep="\n", file=f)
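The verbs script applies the mirror-image rule: verbal expressions are taken to be head-initial, so `_.split()[0]` keeps the first token ("fallen asleep" becomes "fallen"). Both scripts write one item per line, sorted, so a downstream reader can be as simple as this sketch (the file name is hypothetical):

    # Hypothetical consumer of either generated list; one item per line.
    with open("blimp_passive_verbs.txt") as f:
        passive_verbs = {line.strip() for line in f if line.strip()}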