Skip to content

Commit

Permalink
Created passive filter, targeting passive_1 and passive_2 in BLiMP
Browse files Browse the repository at this point in the history
  • Loading branch information
Ubadub committed Jan 28, 2024
1 parent f195442 commit efc8eef
Show file tree
Hide file tree
Showing 4 changed files with 269 additions and 0 deletions.
88 changes: 88 additions & 0 deletions corpus_filtering/filters/stanza_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,3 +632,91 @@ def _exclude_sent(self, sent: StanzaSentence) -> bool:
return True

return False


@register_filter("passive")
class PassiveFilteredCorpusWriter(PickleStanzaDocCorpusFilterWriter):
    """
    A filter for sentences where a verb in a list of verbs appears in the passive. The
    list of verbs is generated from the verbs used in the following BLiMP benchmark sets
    (the ones targeted by this filter):
        passive_1
        passive_2
    In English, within the Universal Dependencies standards and conventions, a passive
    verb will have the "Voice=Pass" feature. However, in English there is an ambiguity
    where a copula + adjective can have the same form as a copula + passive verb. For
    example:
        He was admired (by everyone).
    Without the by-PP, "admired" is ambiguous- it can be analyzed as either an adjective
    or a passive verb. Since, ceteris paribus, we prefer stronger filters to weaker
    ones, we choose to filter out such sentences even when Stanza parses them as
    adjectives. In that case, we look for the following structure:
        Copula [1]: deprel = cop, (dependency) head = [2]
        Adjective [2]: text or lemma in our verb list
    Example sentences targeted by this filter:
        1. Lucille's sisters are confused by Amy.
        2. Sherry's partners aren't escaped from by Elizabeth.
        3. Jason's grandmothers weren't cared for by Joseph.
        4. Most cashiers are disliked.
        5. All pedestrians are cared for.
    Example sentences NOT targeted by this filter:
        1. Amy confuses Lucille's sisters.
        2. Elizabeth escapes from Sherry's partners.
        3. Joseph cares for Jason's grandmothers.
        4. Most cashiers are assaulted.
        5. All pedestrians care.
    Note that (4) is not targeted because "assault" is not in the word list.
    For more information and examples, refer to the UD English documentation on Voice:
        https://universaldependencies.org/u/feat/Voice.html#Pass
        https://universaldependencies.org/u/overview/morphology.html
    """

    # The class docstring doubles as the CLI help text, so it is rendered verbatim
    # (RawDescriptionHelpFormatter keeps its line breaks).
    cli_subcmd_constructor_kwargs = {
        "description": f"Description:\n{__doc__}",
        "formatter_class": argparse.RawDescriptionHelpFormatter,
    }

    # Location of the verb list (one verb form per line). The path is relative, so it
    # is resolved against the current working directory when the filter is built.
    verb_list_path = "data/blimp/passive/verbs.txt"

    def __init__(self, *args, **kwargs):
        """Initialize the underlying writer and load the verb list into `verb_set`.

        All positional and keyword arguments are forwarded unchanged to the parent
        class constructor.

        Raises:
            OSError: if the file at `verb_list_path` cannot be opened.
        """
        super().__init__(*args, **kwargs)
        # Read the verb list with an explicit encoding so the result does not depend
        # on the platform's default locale.
        with open(self.verb_list_path, "r", encoding="utf-8") as f:
            normalized = (line.strip().lower() for line in f)
            # Drop blank lines so the empty string never ends up in the set.
            self.verb_set: set[str] = {verb for verb in normalized if verb}

    def _is_listed_verb(self, word) -> bool:
        """Return True if the word's surface text or lemma is in the verb list.

        Args:
            word: An object exposing `text` and `lemma` attributes (a stanza `Word`).

        Returns:
            True if the lowercased text or lemma is in `self.verb_set`.
        """
        # A missing (None) form is skipped rather than raising AttributeError on
        # `.lower()`; this mirrors the existing None-check on `feats`.
        return any(
            form is not None and form.lower() in self.verb_set
            for form in (word.text, word.lemma)
        )

    def _exclude_sent(self, sent: StanzaSentence) -> bool:
        """Exclude a sentence if a verb from a list of verbs appearing in the BLiMP
        passive_1 or passive_2 benchmark sets appears in the sentence as a passive. For
        more information, see the class docstring.

        Args:
            sent: A stanza `Sentence` object that has been annotated with dependency
                relations.

        Returns:
            True if the sentence has a verb from the verb list in the passive form;
            False otherwise.
        """
        for head, deprel, word in sent.dependencies:
            # Case 1: the parser marked the word itself as passive.
            is_passive = word.feats is not None and "Voice=Pass" in word.feats
            if is_passive and self._is_listed_verb(word):
                return True
            # Case 2: handle "copula + adjective" == "copula + passive" ambiguity.
            # The copula's head is the candidate "adjective"; `head.id > 0`
            # presumably skips the artificial root node (verify against stanza).
            if deprel == "cop" and head.id > 0 and self._is_listed_verb(head):
                return True
        return False
127 changes: 127 additions & 0 deletions data/blimp/passive/verbs.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
admired
aggravated
aided
alarmed
annoyed
answered
appreciated
approached
argued
arrived
ascended
astounded
attacked
blinked
boasted
bored
bothered
bought
boycotted
brought
cared
chatted
chuckled
clashed
cleaned
collaborated
come
communicated
competed
complained
compromised
concurred
conferred
confused
conspired
cooperated
coped
corresponded
cried
criticized
described
died
disagreed
discussed
disgusted
disliked
distracted
disturbed
embarrassed
escaped
examined
exited
explored
fallen
fired
flirted
forgotten
gone
gotten
grinned
harmed
hated
helped
hindered
hired
hugged
hurt
impressed
insulted
interacted
investigated
irritated
joked
kissed
known
laughed
left
lied
lifted
liked
littered
loved
murmured
muttered
negotiated
nodded
observed
overwhelmed
passed
praised
profited
purchased
reacted
referenced
remembered
replied
respected
responded
retaliated
rotted
scanned
scared
screamed
seen
shocked
shouted
shrugged
sighed
smiled
sold
spoken
struggled
stunned
suffered
talked
testified
toured
trained
underwhelmed
upset
visited
watched
waved
wept
worked
worn
worried
1 change: 1 addition & 0 deletions data/filtered_corpuses/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ binding-*/principle_A*.out
re-*/*regular_plural_subject_verb_agreement_*.out
*-quantifier/*_quantifiers_*.out
det-adj-noun/determiner_noun_agreement_with_adj*.out
passive/passive_*.out
53 changes: 53 additions & 0 deletions scripts/gen_blimp_passive_verbs_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# This script generates the verbs used as passives in the following BLiMP
# benchmark sets:
# passive_1
# passive_2
#
# This script cannot be run as-is, within this directory, as it requires the BLiMP
# data generation scripts.
# How to run:
# 1. `git clone` the `data_generation` repository made available by the BLiMP paper
# authors here: https://github.com/alexwarstadt/data_generation/
# 2. Go to the directory where you cloned the repo, then run `git checkout blimp`
# 3. Comment out line 16 of `data_generation/utils/vocab_table.py`. This is due to a
# bug/inconsistency in the data generation repo.
# 4. Copy this script to the root of the `data_generation` folder.
# 5. `pip install jsonlines` as required by the `data_generation` scripts.
# 6. Run this script (within the data_generation folder) with python, followed by one
# argument: the output file where you want to write the list of verbs.

import sys

from utils.vocab_table import get_all # , get_matches_of


def get_pass_verbs():
    """Collect every verb expression used by the passive_1 and passive_2 sets.

    Verbs flagged passive="1" (transitive) supply the "good" sentences and verbs
    flagged passive="0" (intransitive) supply the "bad" ones. In principle the
    intransitive verbs should never surface as passives, but they are included as
    well in case that assumption fails (which is likely, given polysemy and other
    factors).

    Some expressions span several words, e.g. "fallen asleep". Callers that keep only
    the first word rely on two assumptions:
        1. Multi-word expressions are head-initial. This was checked by hand and
           holds for the relevant BLiMP benchmark sets.
        2. The head verb has the same valency as the whole phrase. This is known to
           be *false* in general, but the only consequence is an overly strong
           filter, which is acceptable.

    Returns:
        A set containing the first field of every matching vocabulary entry, for
        both the passive="0" and passive="1" groups.
    """
    english_verbs = get_all("en", "1")
    collected = set()
    # Same query order as before: intransitive ("0") first, then transitive ("1").
    for passive_flag in ("0", "1"):
        for entry in get_all("passive", passive_flag, english_verbs):
            collected.add(entry[0])
    return collected


if __name__ == "__main__":
    # Fail with a usage hint instead of a bare IndexError when the output path is
    # missing. Extra arguments are still ignored, as before.
    if len(sys.argv) < 2:
        sys.exit(f"usage: python {sys.argv[0]} OUTPUT_FILE")
    out = sys.argv[1]

    # Keep only the first word of each (possibly multi-word) expression.
    verbs = {expression.split()[0] for expression in get_pass_verbs()}

    # Write one verb per line, sorted, with an explicit encoding so the output does
    # not depend on the platform's default locale.
    with open(out, "w", encoding="utf-8") as f:
        print(*sorted(verbs), sep="\n", file=f)

0 comments on commit efc8eef

Please sign in to comment.