A Programmer's Guide to Data Mining -- Reading Notes 6

This article shows how to classify text with the naive Bayes algorithm, covering data preprocessing (stopword removal), the training phase (merging documents and counting words), and the classification phase (handling the probability computation). It walks through initializing, training, and testing a naive Bayes text classifier, and discusses strategies for handling stopwords. Suitable for beginners who want to understand and practice text classification.

Before We Start

The source code and data used in this book can be found at: http://guidetodatamining.com/

The theory in this book is fairly simple, it contains few errors, and it offers plenty of hands-on practice. If you write out every piece of code yourself, you will get a lot out of it. In short: a good introduction.

Feel free to repost this article; please credit the source. Corrections are welcome.

Collected notes: https://www.zybuluo.com/hainingwyx/note/559139

Naive Bayes and Text

Training phase:

Combine the documents labeled with the same hypothesis (category) into a single text file.

Count the number of word occurrences n in that file and build a vocabulary.

For each word w_k in the vocabulary, count its occurrences in the text, denoted n_k.

For each word w_k in the vocabulary (with stopwords removed), compute

P(w_k | h_i) = (n_k + 1) / (n + |Vocabulary|)
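To see the add-one (Laplace) smoothing in action, here is a minimal sketch with made-up counts; the training code below computes exactly this quantity as (count + 1) / denominator:

n_k = 5            # occurrences of word w_k in the category's combined text
n = 1000           # total word occurrences in that text
vocabSize = 250    # |Vocabulary|, the number of distinct words kept overall

# smoothed estimate of P(w_k | h_i)
p = float(n_k + 1) / (n + vocabSize)
print(p)           # 0.0048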

import os, codecs

class BayesText:

    def __init__(self, trainingdir, stopwordlist):
        """This class implements a naive Bayes approach to text
        classification.
        trainingdir is the training data. Each subdirectory of
        trainingdir is titled with the name of the classification
        category -- those subdirectories in turn contain the text
        files for that category.
        The stopwordlist is a file of words (one per line) that will
        be removed before any counting takes place.
        """
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}                    # stopword dictionary
        f = open(stopwordlist)
        for line in f:
            self.stopwords[line.strip()] = 1
        f.close()
        categories = os.listdir(trainingdir)
        # filter out files that are not directories
        self.categories = [filename for filename in categories
                           if os.path.isdir(trainingdir + filename)]
        print("Counting ...")
        for category in self.categories:
            print('    ' + category)
            # per-word counts and total word count for this category
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category)
        # I am going to eliminate any word in the vocabulary
        # (shared across all categories) that doesn't occur at least 3 times
        toDelete = []
        for word in self.vocabulary:
            if self.vocabulary[word] < 3:
                # mark word for deletion
                # can't delete now because you can't delete
                # from a dictionary you are currently iterating over
                toDelete.append(word)
        # now delete
        for word in toDelete:
            del self.vocabulary[word]
        # now compute probabilities
        vocabLength = len(self.vocabulary)
        print("Computing probabilities:")
        for category in self.categories:
            print('    ' + category)
            denominator = self.totals[category] + vocabLength
            for word in self.vocabulary:
                if word in self.prob[category]:
                    count = self.prob[category][word]
                else:
                    count = 1
                # conditional probability with add-one smoothing:
                # P(w_k | h_i) = (n_k + 1) / (n + |Vocabulary|)
                self.prob[category][word] = (float(count + 1)
                                             / denominator)
        print("DONE TRAINING\n\n")

    # input:  trainingdir -- directory of training files;
    #         category -- the category to train on
    # return: (counts, total) -- per-word counts for this category
    #         and the total number of words in it
    def train(self, trainingdir, category):
        """counts word occurrences for a particular category"""
        currentdir = trainingdir + category
        files = os.listdir(currentdir)
        counts = {}
        total = 0
        for file in files:
            #print(currentdir + '/' + file)
            f = codecs.open(currentdir + '/' + file, 'r', 'iso8859-1')
            for line in f:
                tokens = line.split()
                for token in tokens:
                    # get rid of punctuation and lowercase token
                    token = token.strip('\'".,?:-')
                    token = token.lower()
                    if token != '' and token not in self.stopwords:
                        self.vocabulary.setdefault(token, 0)
                        self.vocabulary[token] += 1   # counts over all documents
                        counts.setdefault(token, 0)
                        counts[token] += 1            # counts for this category
                        total += 1                    # total words in this category
            f.close()
        return (counts, total)

# test code
bT = BayesText(trainingDir, stoplistfile)
bT.prob['rec.motorcycles']["god"]
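The test call above assumes the book's 20 Newsgroups layout: one subdirectory per newsgroup under the training directory. The paths below are my own assumptions for illustration, not from the original post; point them at wherever you unpacked the data:

trainingDir = "20news-bydate-train/"   # hypothetical: one subdirectory per newsgroup
stoplistfile = "stopwords.txt"         # hypothetical: one stopword per line
bT = BayesText(trainingDir, stoplistfile)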

Classification phase:

Pick the category h_i that maximizes the product over the document's words of P(w_k | h_i); equivalently, maximize the sum of log P(w_k | h_i).

If the probabilities are very small, Python cannot represent their product (floating-point underflow), so we work with their logarithms instead.
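A quick sketch of why taking logs matters; the numbers here are invented, but the underflow is real:

import math

probs = [0.0001] * 500        # 500 modest word probabilities
product = 1.0
for p in probs:
    product *= p              # 1e-2000 is below the smallest float
print(product)                # 0.0 -- underflowed, useless for comparison

logScore = sum(math.log(p) for p in probs)
print(logScore)               # about -4605.2 -- still comparable across categories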

Stopwords: when stopwords are just noise, removing them reduces the amount of processing and improves performance. In some situations, though, the stopword list deserves a second look: sex offenders, for example, use words like "me" and "you" far more often than the general population.
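A tiny illustration of the filtering step (this stopword fragment is made up):

stopwords = {"i", "me", "you", "the", "a"}
tokens = "you and me saw the motorcycle".split()
kept = [t for t in tokens if t not in stopwords]
print(kept)    # ['and', 'saw', 'motorcycle']

For a task like the author-profiling example above, you would simply leave words such as "me" and "you" out of the stopword list.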

classify reads a test document, tokenizes it the same way as training, and scores each category by summing log probabilities:

# this method is added to the BayesText class above;
# it requires "import math" at the top of the file
def classify(self, filename):
    """Return the category we think the document in filename belongs to"""
    results = {}
    for category in self.categories:
        results[category] = 0
    f = codecs.open(filename, 'r', 'iso8859-1')
    for line in f:
        tokens = line.split()
        for token in tokens:
            token = token.strip('\'".,?:-').lower()
            if token in self.vocabulary:
                for category in self.categories:
                    if self.prob[category][token] == 0:
                        print("%s %s" % (category, token))
                    # sum logs instead of multiplying raw probabilities
                    # to avoid floating-point underflow
                    results[category] += math.log(
                        self.prob[category][token])
    f.close()
    results = list(results.items())
    results.sort(key=lambda tuple: tuple[1], reverse=True)
    # return the category with the highest (log) probability
    return results[0][0]

# test code (testDir is the matching test directory)
bT.classify(testDir + 'rec.motorcycles/104673')

10-fold cross-validation

The complete program below splits each category's data into ten buckets; each run holds one bucket out for testing and trains on the remaining nine, and the results are accumulated into a confusion matrix.

from __future__ import print_function
import os, codecs, math

class BayesText:

    # input: training-file directory, stopword file, bucket number to hold out
    def __init__(self, trainingdir, stopwordlist, ignoreBucket):
        """This class implements a naive Bayes approach to text
        classification.
        trainingdir is the training data. Each subdirectory of
        trainingdir is titled with the name of the classification
        category -- those subdirectories in turn contain the text
        files for that category.
        The stopwordlist is a file of words (one per line) that will
        be removed before any counting takes place.
        """
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}
        f = open(stopwordlist)
        for line in f:
            self.stopwords[line.strip()] = 1
        f.close()
        categories = os.listdir(trainingdir)
        # filter out files that are not directories (in this program: neg and pos)
        self.categories = [filename for filename in categories
                           if os.path.isdir(trainingdir + filename)]
        print("Counting ...")
        for category in self.categories:
            #print('    ' + category)
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category,
                                                 ignoreBucket)
        # I am going to eliminate any word in the vocabulary
        # that doesn't occur at least 3 times
        toDelete = []
        for word in self.vocabulary:
            if self.vocabulary[word] < 3:
                # mark word for deletion
                # can't delete now because you can't delete
                # from a dictionary you are currently iterating over
                toDelete.append(word)
        # now delete
        for word in toDelete:
            del self.vocabulary[word]
        # now compute probabilities
        vocabLength = len(self.vocabulary)
        #print("Computing probabilities:")
        for category in self.categories:
            #print('    ' + category)
            denominator = self.totals[category] + vocabLength
            for word in self.vocabulary:
                if word in self.prob[category]:
                    count = self.prob[category][word]
                else:
                    count = 1
                self.prob[category][word] = (float(count + 1)
                                             / denominator)
        #print("DONE TRAINING\n\n")

    def train(self, trainingdir, category, bucketNumberToIgnore):
        """counts word occurrences for a particular category"""
        ignore = "%i" % bucketNumberToIgnore
        currentdir = trainingdir + category
        directories = os.listdir(currentdir)
        counts = {}
        total = 0
        for directory in directories:
            if directory != ignore:
                currentBucket = trainingdir + category + "/" + directory
                files = os.listdir(currentBucket)
                #print("   " + currentBucket)
                for file in files:
                    f = codecs.open(currentBucket + '/' + file,
                                    'r', 'iso8859-1')
                    for line in f:
                        tokens = line.split()
                        for token in tokens:
                            # get rid of punctuation and lowercase token
                            token = token.strip('\'".,?:-')
                            token = token.lower()
                            if token != '' and token not in self.stopwords:
                                self.vocabulary.setdefault(token, 0)
                                self.vocabulary[token] += 1
                                counts.setdefault(token, 0)
                                counts[token] += 1
                                total += 1
                    f.close()
        return (counts, total)

    def classify(self, filename):
        results = {}
        for category in self.categories:
            results[category] = 0
        f = codecs.open(filename, 'r', 'iso8859-1')
        for line in f:
            tokens = line.split()
            for token in tokens:
                #print(token)
                token = token.strip('\'".,?:-').lower()
                if token in self.vocabulary:
                    for category in self.categories:
                        if self.prob[category][token] == 0:
                            print("%s %s" % (category, token))
                        results[category] += math.log(
                            self.prob[category][token])
        f.close()
        results = list(results.items())
        results.sort(key=lambda tuple: tuple[1], reverse=True)
        # for debugging I can change this to give me the entire list
        return results[0][0]

    # input:  direc -- the test directory for one category;
    #         category -- the current category; bucketNumber -- held-out bucket
    # return: classification results for this category, e.g. {'neg': 12, 'pos': 23}
    def testCategory(self, direc, category, bucketNumber):
        results = {}
        directory = direc + ("%i/" % bucketNumber)
        #print("Testing " + directory)
        files = os.listdir(directory)
        total = 0
        #correct = 0
        for file in files:
            total += 1
            result = self.classify(directory + file)
            results.setdefault(result, 0)
            results[result] += 1
            #if result == category:
            #    correct += 1
        return results

    # input:  testdir -- the test-file directory; bucketNumber -- held-out bucket
    # return: classification results for all categories,
    #         e.g. {'neg': {'neg': 12, 'pos': 23}, ...}
    def test(self, testdir, bucketNumber):
        """Test all files in the test directory--that directory is
        organized into subdirectories--each subdir is a classification
        category"""
        results = {}
        categories = os.listdir(testdir)
        # filter out files that are not directories
        categories = [filename for filename in categories if
                      os.path.isdir(testdir + filename)]
        for category in categories:
            #print(".", end="")
            results[category] = self.testCategory(
                testdir + category + '/', category, bucketNumber)
        return results

def tenfold(dataPrefix, stoplist):
    results = {}
    for i in range(0, 10):
        # hold out bucket i for testing; train on the other nine
        bT = BayesText(dataPrefix, stoplist, i)
        r = bT.test(dataPrefix, i)
        for (key, value) in r.items():
            results.setdefault(key, {})
            for (ckey, cvalue) in value.items():
                results[key].setdefault(ckey, 0)
                results[key][ckey] += cvalue
    # print the accumulated confusion matrix
    categories = list(results.keys())
    categories.sort()
    print("\n       Classified as: ")
    header = "        "
    subheader = "      +"
    for category in categories:
        header += "% 2s   " % category
        subheader += "-----+"
    print(header)
    print(subheader)
    total = 0.0
    correct = 0.0
    for category in categories:
        row = " %s  |" % category
        for c2 in categories:
            if c2 in results[category]:
                count = results[category][c2]
            else:
                count = 0
            row += " %3i |" % count
            total += count
            if c2 == category:
                correct += count
        print(row)
    print(subheader)
    print("\n%5.3f percent correct" % ((correct * 100) / total))
    print("total of %i instances" % total)

# change these to match your directory structure
prefixPath = "data/review_polarity/"
theDir = prefixPath + "txt_sentoken/"
stoplistfile = prefixPath + "stopwords25.txt"
tenfold(theDir, stoplistfile)
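The code above assumes each category directory (pos, neg) has already been split into ten bucket subdirectories named 0 through 9. If your copy of the review_polarity data is not pre-split, a minimal sketch along these lines would create that layout (the function name and round-robin split are my own assumptions, not from the book):

import os, shutil

def makeBuckets(categoryDir, numBuckets=10):
    """Split the plain files in categoryDir into subdirectories 0..9."""
    files = sorted(f for f in os.listdir(categoryDir)
                   if os.path.isfile(os.path.join(categoryDir, f)))
    for i in range(numBuckets):
        bucket = os.path.join(categoryDir, str(i))
        if not os.path.isdir(bucket):
            os.makedirs(bucket)
    for n, name in enumerate(files):
        bucket = str(n % numBuckets)   # deal files round-robin into buckets
        shutil.move(os.path.join(categoryDir, name),
                    os.path.join(categoryDir, bucket, name))

# makeBuckets("data/review_polarity/txt_sentoken/pos")
# makeBuckets("data/review_polarity/txt_sentoken/neg")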
