Naive: during classification we make the simplest possible assumption, namely that every feature is independent of the others.
Suppose a point (x, y) can belong to one of two classes, labelled 1 and 2. Let p1(x, y) denote the probability that it belongs to class 1 and p2(x, y) the probability that it belongs to class 2. The core idea of Bayesian decision theory is to compare p1 and p2 and assign the point to whichever class has the higher probability.
Bayes' rule:
p(c_i | x, y) = p(x, y | c_i) * p(c_i) / p(x, y)
Plugging the formula from 4.2 into the decision method of 4.1 gives the classifier: compute p(c_i | x, y) for each class and choose the class with the larger value.
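As a tiny illustration of the decision rule (the probability values below are invented purely for demonstration):

# hypothetical values standing in for p1(x, y) and p2(x, y)
p1 = 0.7
p2 = 0.3
label = 1 if p1 > p2 else 2
print(label)   # 1 -- assign the point to the class with the higher probability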
Next, convert the input text into vectors. Here we take the simplest approach and turn each document into a word vector. The input documents are stored in postinglist, the document labels (abusive / not abusive) in classvec, and vocablist is the deduplicated set of all words appearing in postinglist.
def load_data_set():
postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
['stop', 'posting', 'stupid', 'worthless', 'garbage'],
['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
classvec = [0, 1, 0, 1, 0, 1] # 1 is abusive, 0 not
return postinglist, classvec
def creat_vocablist(dataset):
vocabset = set([])
for document in dataset:
vocabset = vocabset | set(document)
return list(vocabset)
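As a quick sanity check, building the vocabulary from the toy data should give 32 distinct words (the order changes between runs because a set is used):

postinglist, classvec = load_data_set()
vocablist = creat_vocablist(postinglist)
print(len(vocablist))   # 32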
The next two functions convert a single document into a word vector. set_of_words_to_vec uses the Boolean (set-of-words) model, so every component of the vector is either 0 or 1. bag_of_words_to_vec uses the multinomial (bag-of-words) model, so each component is the number of times that word occurs in the document.
def set_of_words_to_vec(vocablist, inputset):
returnvec = [0]*len(vocablist)
for word in inputset:
if word in vocablist:
returnvec[vocablist.index(word)] = 1
else:
print "the word: %s is not in my vocabulary!" % word
return returnvec
def bag_of_words_to_vec(vocablist, inputset):
returnvec = [0]*len(vocablist)
for word in inputset:
if word in vocablist:
returnvec[vocablist.index(word)] += 1
    return returnvec
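The difference between the two models only shows up when a word is repeated; the small document below is made up just to demonstrate it:

vocablist = creat_vocablist(load_data_set()[0])
doc = ['stupid', 'garbage', 'stupid']           # invented document with a repeated word
setvec = set_of_words_to_vec(vocablist, doc)
bagvec = bag_of_words_to_vec(vocablist, doc)
print(setvec[vocablist.index('stupid')])        # 1 -- presence only
print(bagvec[vocablist.index('stupid')])        # 2 -- occurrence count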
Below is the Naive Bayes training function. Because the individual conditional probabilities can be very small, their product can underflow to zero, so we take logarithms, turning the product into a sum and avoiding underflow. Initializing the counts with ones and the denominators with 2.0 is a simple smoothing step so that a single unseen word does not produce a zero probability.
from numpy import ones, log, array

def train_nb0(train_matrix, train_category):
numtraindocs = len(train_matrix)
numwords = len(train_matrix[0])
    pabusive = sum(train_category) / float(numtraindocs)   # prior probability of the abusive class
    # start the counts at 1 and the denominators at 2.0 so that an unseen word
    # never contributes a zero probability (simple smoothing)
    p0num = ones(numwords)
    p1num = ones(numwords)
    p0denom = 2.0
    p1denom = 2.0
for i in range(numtraindocs):
if train_category[i] == 1:
p1num += train_matrix[i]
p1denom += sum(train_matrix[i])
else:
p0num += train_matrix[i]
p0denom += sum(train_matrix[i])
    p1vect = log(p1num / p1denom)   # log p(w_i | abusive)
    p0vect = log(p0num / p0denom)   # log p(w_i | not abusive)
    return pabusive, p1vect, p0vect
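A quick demonstration of the underflow problem that the logarithm avoids (the probabilities are made up; only the effect matters):

import math

probs = [0.01] * 200                      # 200 small made-up conditional probabilities
product = 1.0
for p in probs:
    product *= p
print(product)                            # 0.0 -- the product underflows to zero
print(sum(math.log(p) for p in probs))    # about -921.0 -- the same quantity in log space is fine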
def t_train_nb0():
postinglist, classvec = load_data_set()
vocablist = creat_vocablist(postinglist)
trainmat = []
for doc in postinglist:
trainmat.append(set_of_words_to_vec(vocablist, doc))
pab, p1v, p0v = train_nb0(trainmat, classvec)
    print(pab)
    print(p1v)
    print(p0v)
Classifier. The probability vectors returned by train_nb0 are already logarithms, so the product p(w | c_i) * p(c_i) becomes the sum of the selected log conditional probabilities plus log p(c_i); the class with the larger value wins.
def classify_nb(vec2classify, p0vec, p1vec, pclass1):
    # vec2classify is a word vector; multiplying it by the log-probability vector
    # selects (and, for the bag-of-words model, weights) the words that occur in the document
    p1 = sum(vec2classify * p1vec) + log(pclass1)
    p0 = sum(vec2classify * p0vec) + log(1.0 - pclass1)
if p1 > p0:
return 1
else:
return 0
def t_nb():
postinglist, classvec = load_data_set()
vocablist = creat_vocablist(postinglist)
trainmat = []
for doc in postinglist:
trainmat.append(set_of_words_to_vec(vocablist, doc))
pab, p1v, p0v = train_nb0(trainmat, classvec)
testentry = ['love', 'my', 'dalmation']
thisdoc = array(set_of_words_to_vec(vocablist, testentry))
    print(classify_nb(thisdoc, p0v, p1v, pab))   # expected: 0 (not abusive)
testentry = ['stupid', 'garbage']
thisdoc = array(set_of_words_to_vec(vocablist, testentry))
    print(classify_nb(thisdoc, p0v, p1v, pab))   # expected: 1 (abusive)
E-mails contain many numbers and punctuation marks, so we use a regular expression to split the text into tokens and additionally drop tokens that are two characters long or shorter.
import re

def text_parse(bigstring):
    listoftokens = re.split(r'\W+', bigstring)   # split on runs of non-word characters
    return [tok.lower() for tok in listoftokens if len(tok) > 2]
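For example (the input sentence is invented just to show the tokenization):

print(text_parse('This book costs $9.99, is it worth IT?'))
# ['this', 'book', 'costs', 'worth'] -- numbers, punctuation and short tokens are dropped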
Spam classifier test: randomly hold out part of the data as a test set (a simple form of cross-validation).
import random

def spam_test():
doclist = []
classlist = []
fulltext = []
    for i in range(1, 26):
        # encoding='latin-1' because some e-mails in the book's data set are not valid UTF-8
        wordlist = text_parse(open('email/spam/%d.txt' % i, encoding='latin-1').read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(1)
        wordlist = text_parse(open('email/ham/%d.txt' % i, encoding='latin-1').read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(0)
vocablist = creat_vocablist(doclist)
    trainingset = list(range(50))   # indices of all 50 e-mails (25 spam + 25 ham)
    testset = []
    for i in range(10):             # randomly hold out 10 e-mails as the test set
        randindex = int(random.uniform(0, len(trainingset)))
        testset.append(trainingset[randindex])
        del(trainingset[randindex])
trainmat = []
trainclass = []
for i in trainingset:
trainmat.append(set_of_words_to_vec(vocablist, doclist[i]))
trainclass.append(classlist[i])
pspam, p1, p0 = train_nb0(array(trainmat), array(trainclass))
errorcount = 0
for i in testset:
if classify_nb(array(set_of_words_to_vec(vocablist, doclist[i])), p0, p1, pspam) != classlist[i]:
errorcount += 1
            print(doclist[i])       # show the misclassified e-mail
    print('the error rate is:', float(errorcount) / len(testset))
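Because the ten test e-mails are chosen at random, a single run gives a noisy error estimate. A more stable number comes from repeating the random split and averaging; a minimal sketch, assuming spam_test is modified to return float(errorcount)/len(testset) instead of only printing it:

numruns = 10
total = 0.0
for _ in range(numruns):
    total += spam_test()             # assumes spam_test() returns its error rate
print('average error rate over %d runs:' % numruns, total / numruns)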
The RSS feed given in the book no longer seems to work, so I picked a new board at random: https://newyork.craigslist.org/search/mar?format=rss. The idea in this example is the same as in the previous one.
import feedparser

def t_feedparser():
    ny = feedparser.parse('https://newyork.craigslist.org/search/mar?format=rss')
    sf = feedparser.parse('https://sfbay.craigslist.org/search/mar?format=rss')
    print(min(len(ny['entries']), len(sf['entries'])))
def cal_most_freq(vocablist, fulltext):
    import operator
    freqdict = {}
    for i in vocablist:
        freqdict[i] = fulltext.count(i)   # total number of occurrences of each vocabulary word
    sortedfreq = sorted(freqdict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedfreq[:30]                # the 30 most frequent words
def local_words(ny, sf):
minlen = min(len(ny['entries']), len(sf['entries']))
doclist = []
classlist = []
fulltext = []
for i in range(minlen):
wordlist = text_parse(sf['entries'][i]['summary'])
doclist.append(wordlist)
fulltext.extend(wordlist)
classlist.append(1)
wordlist = text_parse(ny['entries'][i]['summary'])
doclist.append(wordlist)
fulltext.extend(wordlist)
classlist.append(0)
vocablist = creat_vocablist(doclist)
    trainingset = list(range(2 * minlen))
testset = []
for i in range(10):
randindex = int(random.uniform(0, len(trainingset)))
testset.append(trainingset[randindex])
del (trainingset[randindex])
    # remove the 30 most frequent words; they are mostly common structural words
    top30words = cal_most_freq(vocablist, fulltext)
    for i in top30words:
        if i[0] in vocablist:
            vocablist.remove(i[0])
    print('vocablist len', len(vocablist))
trainmat = []
trainclass = []
for i in trainingset:
trainmat.append(bag_of_words_to_vec(vocablist, doclist[i]))
trainclass.append(classlist[i])
pspam, p1, p0 = train_nb0(array(trainmat), array(trainclass))
errorcount = 0
for i in testset:
if classify_nb(array(bag_of_words_to_vec(vocablist, doclist[i])), p0, p1, pspam) != classlist[i]:
errorcount += 1
            print(doclist[i])
    print('the error rate is:', float(errorcount) / len(testset))
return vocablist, p1, p0
Get the words whose log conditional probability in each region exceeds a threshold (-6.0 here); since there are many of them, only the top five per region are printed.
def get_top_words(ny, sf):
import operator
vocablist, psf, pny = local_words(ny, sf)
topny = []
topsf = []
for i in range(len(psf)):
if psf[i] > -6.0:
topsf.append((vocablist[i], psf[i]))
if pny[i] > -6.0:
topny.append((vocablist[i], pny[i]))
sortedsf = sorted(topsf, key=lambda pair: pair[1], reverse=True)
    print('SF********')
    for i in sortedsf[:5]:
        print(i[0])
sortedny = sorted(topny, key=lambda pair: pair[1], reverse=True)
    print('NY********')
    for i in sortedny[:5]:
        print(i[0])
def t_localwords():
ny = feedparser.parse('https://newyork.craigslist.org/search/mar?format=rss')
sf = feedparser.parse('https://sfbay.craigslist.org/search/mar?format=rss')
get_top_words(ny, sf)