-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathclassif.py
24 lines (21 loc) · 958 Bytes
/
classif.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Creates term-frequency array for a given text.
# Note that this is tf, not tf*idf.
from sklearn.feature_extraction.text import TfidfVectorizer
from sources import topcorpuswords1000
# Use only vocabulary of top 10,000 most frequent words in corpus
# (out of the box, these scripts only consider the top 1,000,
# but you can change that number in the classifactory.classifydocs(...) function)
# Load the vocabulary: one word per line from the file named by
# ``topcorpuswords1000``. Only the trailing newline is stripped; blindly
# slicing off the last character would truncate the final word whenever the
# file does not end with a newline.
with open(topcorpuswords1000) as vocdoc:
    voc = [w.rstrip('\n') for w in vocdoc]
def tfidf(docs):
    '''tfidf(docs) -> converts a collection of documents to a tf matrix.

    docs is an iterable of documents (presumably raw text strings -- confirm
    against callers). The vectorizer is restricted to the module-level
    vocabulary ``voc`` and is built with use_idf=False, so the result holds
    term frequencies (tf), not tf*idf.

    Returns whatever TfidfVectorizer.fit_transform returns for the documents.
    '''
    tfidfer = TfidfVectorizer(vocabulary=voc,
                              min_df=1,
                              stop_words=None,
                              use_idf=False,
                              smooth_idf=True)
    # Materialise the input once so any iterable (including a generator)
    # is handed to the vectorizer as a plain list.
    alltexts = list(docs)
    return tfidfer.fit_transform(alltexts)