Chapter 4 Classifying with Probability Theory: Naive Bayes

"Naive" here means that the classification makes the simplest possible assumption: the features are treated as independent of one another.

4.1 Classifying with Bayesian Decision Theory

Suppose we have a point (x, y) that can belong to one of two classes, labeled 1 and 2. Let p1(x, y) be the probability that the point belongs to class 1, and p2(x, y) the probability that it belongs to class 2. The core idea of Bayesian decision theory is to compare p1 and p2 and assign the point to whichever class has the higher probability.
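
A minimal sketch of that decision rule; p1 and p2 here are placeholders for any functions that return the two class probabilities of a point:

def classify_point(p1, p2, x, y):
    # Whichever class has the higher probability wins.
    if p1(x, y) > p2(x, y):
        return 1
    else:
        return 2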

4.2 Conditional Probability

Bayes' rule: $p(c|x)=\frac{p(x|c)p(c)}{p(x)}$ gives the probability of c given x. It is typically used when p(c|x) is hard to compute directly but p(x|c), p(c), and p(x) are easier to obtain.

4.3 Classifying with Conditional Probabilities

Plug the formula from 4.2 into the decision procedure from 4.1: for an input x, compute the posterior probability of each class and pick the class with the larger value.
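
Since $p(x)$ is the same for every class, it cancels out of the comparison, so only the numerators of Bayes' rule are needed:

predict class 1 if $p(x|c_1)\,p(c_1) > p(x|c_2)\,p(c_2)$, and class 2 otherwise.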

4.5 Text Classification with Python

Convert the input text into vectors. Here we take the simplest approach and turn each input document into a word vector. The input documents are stored in postinglist, their labels (abusive vs. not abusive) in classvec, and vocablist is the deduplicated set of all words that appear in postinglist.

# Imports assumed by the code in this chapter: NumPy for the vector math,
# re/random for the spam-filter example, feedparser for the RSS example.
from numpy import array, ones, log
import re
import random
import feedparser

def load_data_set():
    postinglist=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classvec = [0, 1, 0, 1, 0, 1]    # 1 is abusive, 0 not
    return postinglist, classvec
def creat_vocablist(dataset):
    vocabset = set([])
    for document in dataset:
        vocabset = vocabset | set(document)
    return list(vocabset)

The next two functions convert a single input document into a word vector. set_of_words_to_vec uses the set-of-words (Boolean) model, so every entry of the vector is either 0 or 1. bag_of_words_to_vec uses the bag-of-words (multinomial) model, where each entry is the number of times that word occurs in the document.

def set_of_words_to_vec(vocablist, inputset):
    returnvec = [0]*len(vocablist)
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)] = 1
        else:
            print "the word: %s is not in my vocabulary!" % word
    return returnvec
def bag_of_words_to_vec(vocablist, inputset):
    returnvec = [0]*len(vocablist)
    for word in inputset:
        if word in vocablist:
            returnvec[vocablist.index(word)] += 1
    return returnvec
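
A quick usage sketch of the functions above (vocablist comes back in arbitrary order because it is built from a set; for the toy data it holds 32 distinct words):

postinglist, classvec = load_data_set()
vocablist = creat_vocablist(postinglist)
print len(vocablist)                                  # 32
print set_of_words_to_vec(vocablist, postinglist[0])  # 0/1 vector of length 32
print bag_of_words_to_vec(vocablist, postinglist[0])  # per-word counts instead of 0/1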

The Naive Bayes training function. The word counts are initialized to ones and the denominators to 2.0 (Laplace smoothing), so that a word never seen with a class does not zero out the whole product. And because the individual probabilities can be very small, we take logarithms, which turns the later multiplications into additions and avoids numerical underflow.

def train_nb0(train_matrix, train_category):
    numtraindocs = len(train_matrix)
    numwords = len(train_matrix[0])
    pabusive = sum(train_category)/float(numtraindocs)  # prior p(class = 1)
    p0num = ones(numwords)   # word counts start at 1 (Laplace smoothing)
    p1num = ones(numwords)
    p0denom = 2.0            # denominators start at 2 for the same reason
    p1denom = 2.0
    for i in range(numtraindocs):
        if train_category[i] == 1:
            p1num += train_matrix[i]
            p1denom += sum(train_matrix[i])
        else:
            p0num += train_matrix[i]
            p0denom += sum(train_matrix[i])
    p1vect = log(p1num/p1denom)
    p0vect = log(p0num/p0denom)
    return pabusive, p1vect, p0vect


def t_train_nb0():
    postinglist, classvec = load_data_set()
    vocablist = creat_vocablist(postinglist)
    trainmat = []
    for doc in postinglist:
        trainmat.append(set_of_words_to_vec(vocablist, doc))
    pab, p1v, p0v = train_nb0(trainmat, classvec)
    print pab
    print p1v
    print p0v
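
Written out, the quantities returned by train_nb0 are the Laplace-smoothed per-word conditional probabilities in log form; for the abusive class, for instance,

$\text{p1vect}[j] = \log\frac{1 + (\text{count of word } j \text{ in the class-1 training vectors})}{2 + (\text{total count of all words in the class-1 training vectors})}$

and pabusive is simply the fraction of training documents labeled 1.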

The classifier. Because the word probabilities are stored as logs, the product $p(x|c_i)\,p(c_i)$ becomes a sum: the element-wise product of the input word vector with the log-probability vector is summed, the log prior is added, and the class with the larger score wins.

def classify_nb(vec2classify, p0vec, p1vec, pclass1):
    p1 = sum(vec2classify * p1vec) + log(pclass1)
    p0 = sum(vec2classify * p0vec) + log(1.0 - pclass1)
    if p1 > p0:
        return 1
    else:
        return 0

def t_nb():
    postinglist, classvec = load_data_set()
    vocablist = creat_vocablist(postinglist)
    trainmat = []
    for doc in postinglist:
        trainmat.append(set_of_words_to_vec(vocablist, doc))
    pab, p1v, p0v = train_nb0(trainmat, classvec)
    testentry = ['love', 'my', 'dalmation']
    thisdoc = array(set_of_words_to_vec(vocablist, testentry))
    print classify_nb(thisdoc, p0v, p1v, pab)
    testentry = ['stupid', 'garbage']
    thisdoc = array(set_of_words_to_vec(vocablist, testentry))
    print classify_nb(thisdoc, p0v, p1v, pab)

4.6 Example: Filtering Spam Email with Naive Bayes

Because the emails contain many numbers and punctuation marks, we split them into tokens with a regular expression and drop every token of two characters or fewer.

def text_parse(bigstring):
    # Split on runs of non-word characters, keep only tokens longer than
    # two characters, and lowercase them.
    listoftokens = re.split(r'\W+', bigstring)
    return [tok.lower() for tok in listoftokens if len(tok) > 2]
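
For example (the sample sentence is made up for illustration):

print text_parse('Hello Mr. Smith, stop posting worthless garbage!!!')
# ['hello', 'smith', 'stop', 'posting', 'worthless', 'garbage']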

The classifier, evaluated with hold-out validation: 10 of the 50 emails are held out at random as a test set and the remaining 40 are used for training.

def spam_test():
    doclist = []
    classlist = []
    fulltext = []
    for i in range(1, 26):
        wordlist = text_parse(open('email/spam/%d.txt' % i).read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(1)
        wordlist = text_parse(open('email/ham/%d.txt' % i).read())
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(0)
    vocablist = creat_vocablist(doclist)
    trainingset = range(50)
    testset = []
    for i in range(10):
        randindex = int(random.uniform(0, len(trainingset)))
        testset.append(trainingset[randindex])
        del(trainingset[randindex])
    trainmat = []
    trainclass = []
    for i in trainingset:
        trainmat.append(set_of_words_to_vec(vocablist, doclist[i]))
        trainclass.append(classlist[i])
    pspam, p1, p0 = train_nb0(array(trainmat), array(trainclass))
    errorcount = 0
    for i in testset:
        if classify_nb(array(set_of_words_to_vec(vocablist, doclist[i])), p0, p1, pspam) != classlist[i]:
            errorcount += 1
            print doclist[i]
    print 'the error rate is: ', float(errorcount)/len(testset)

4.7 Using Naive Bayes to Reveal Regional Tendencies from Personal Ads

The RSS feeds given in the book no longer seem to work, so I picked another board at random: https://newyork.craigslist.org/search/mar?format=rss. The idea in this example is the same as in the previous one.

def t_feedparser():
    ny = feedparser.parse('https://newyork.craigslist.org/search/mar?format=rss')
    sf = feedparser.parse('https://sfbay.craigslist.org/search/mar?format=rss')
    print min(len(ny['entries']), len(sf['entries']))


def cal_most_freq(vocablist, fulltext):
    import operator
    freqdict = {}
    for i in vocablist:
        freqdict[i] = fulltext.count(i)
    sortedfreq = sorted(freqdict.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedfreq[:30]


def local_words(ny, sf):
    minlen = min(len(ny['entries']), len(sf['entries']))
    doclist = []
    classlist = []
    fulltext = []
    for i in range(minlen):
        wordlist = text_parse(sf['entries'][i]['summary'])
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(1)
        wordlist = text_parse(ny['entries'][i]['summary'])
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(0)
    vocablist = creat_vocablist(doclist)
    trainingset = range(2 * minlen)
    testset = []
    for i in range(10):
        randindex = int(random.uniform(0, len(trainingset)))
        testset.append(trainingset[randindex])
        del (trainingset[randindex])
    top30words = cal_most_freq(vocablist, fulltext)
    for i in top30words:
        if i[0] in vocablist:
            vocablist.remove(i[0])
    print 'vocablist len', len(vocablist)
    trainmat = []
    trainclass = []
    for i in trainingset:
        trainmat.append(bag_of_words_to_vec(vocablist, doclist[i]))
        trainclass.append(classlist[i])
    pspam, p1, p0 = train_nb0(array(trainmat), array(trainclass))
    errorcount = 0
    for i in testset:
        if classify_nb(array(bag_of_words_to_vec(vocablist, doclist[i])), p0, p1, pspam) != classlist[i]:
            errorcount += 1
            print doclist[i]
    print 'the error rate is: ', float(errorcount) / len(testset)
    return vocablist, p1, p0

For each region, collect the words whose log conditional probability exceeds a threshold (-6.0 here); because there are quite a few of them, only the top 5 per region are printed at the end.

def get_top_words(ny, sf):
    import operator
    vocablist, psf, pny = local_words(ny, sf)
    topny = []
    topsf = []
    for i in range(len(psf)):
        if psf[i] > -6.0:
            topsf.append((vocablist[i], psf[i]))
        if pny[i] > -6.0:
            topny.append((vocablist[i], pny[i]))
    sortedsf = sorted(topsf, key=lambda pair: pair[1], reverse=True)
    print 'SF********'
    for i in sortedsf[:5]:
        print i[0]
    sortedny = sorted(topny, key=lambda pair: pair[1], reverse=True)
    print 'NY********'
    for i in sortedny[:5]:
        print i[0]


def t_localwords():
    ny = feedparser.parse('https://newyork.craigslist.org/search/mar?format=rss')
    sf = feedparser.parse('https://sfbay.craigslist.org/search/mar?format=rss')
    # vocablist, psf, pny = local_words(ny, sf)
    get_top_words(ny, sf)