forked from harveyxia/semantic_classification
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnoun_extractor.py
27 lines (24 loc) · 997 Bytes
/
noun_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import string
from collections import defaultdict
from nltk.tag import _pos_tag as pos_tag
from nltk.tokenize import word_tokenize
from nltk.tag.perceptron import PerceptronTagger
# Module-level tagger instance: loading the perceptron model is done once at
# import time and reused by get_nouns() on every call.
tagger = PerceptronTagger()
def get_nouns(filename):
    """Count occurrences of singular nouns in a text file.

    Each line is stripped of punctuation, tokenized, and POS-tagged; every
    token tagged 'NN' is tallied. Returns a plain dict mapping noun -> count.
    """
    counts = defaultdict(int)
    with open(filename, 'r') as f:
        for line in f:
            # Tokenize the punctuation-stripped line, then tag each token.
            words = word_tokenize(_strip_punctuation(line))
            tagged = pos_tag(words, None, tagger)  # tagset=None -> default tagset
            # Tally only tokens tagged as singular nouns ('NN').
            for word, pos in tagged:
                if pos == 'NN':
                    counts[word] += 1
    return dict(counts)
def _strip_punctuation(s):
printable = set(string.printable)
s = s.translate(string.maketrans("",""), string.punctuation)
return filter(lambda x: x in printable, s)