def load_data_set():
    """Create a small toy corpus of tokenized posts with labels.

    Returns:
        posting_list: list of documents, each a list of word tokens.
        class_vec: parallel labels; 1 = abusive post, 0 = normal post.
    """
    posting_list = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    # 1 marks an abusive post, 0 a normal one.
    class_vec = [0, 1, 0, 1, 0, 1]
    return posting_list, class_vec
def create_vocab_list(data_set):
    """Return the list of unique words occurring across all documents.

    data_set: iterable of tokenized documents (lists of words).
    """
    # Union every document's word set in one call; duplicates collapse
    # automatically.  Ordering of the result is unspecified, as before.
    return list(set().union(*data_set))
def set_of_words2vec(vocab_list, input_set):
    """Convert a tokenized document into a word-count vector over vocab_list.

    vocab_list: vocabulary words (output of create_vocab_list).
    input_set: tokenized document (list of words).

    Returns a list of ints, one slot per vocabulary word, holding how many
    times that word occurs in the document.  NOTE: despite the "set_of_words"
    name, repeated words increment the count (bag-of-words model).  Words not
    in the vocabulary are reported and otherwise ignored.
    """
    # Build a word -> slot map once instead of calling the O(n)
    # vocab_list.index() for every token.
    word_index = {word: i for i, word in enumerate(vocab_list)}
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        if word in word_index:
            return_vec[word_index[word]] += 1
        else:
            # Python 3 print call (original used a Python 2 print statement).
            print("the word: %s is not in my vocabulary!" % word)
    return return_vec
def train_nb0(train_matrix, train_category):
    """Estimate naive-Bayes parameters from word-count vectors.

    train_matrix: list of word-count vectors, one per document, all of
        vocabulary length.
    train_category: parallel list of labels; 1 = abusive, 0 = normal.

    Returns:
        p0vec: log10 P(word | class 0) per vocabulary word.
        p1vec: log10 P(word | class 1) per vocabulary word.
        p_abusive: P(class 1), the fraction of abusive documents.
    """
    num_train_docs = len(train_matrix)
    num_words = len(train_matrix[0])
    # Prior: fraction of training documents labelled abusive.
    p_abusive = sum(train_category) / float(num_train_docs)
    # Laplace smoothing: start every word count at 1 and each denominator at
    # 2 so an unseen word never produces a zero probability (log of zero).
    p0num = ones(num_words)   # per-word counts in normal documents
    p1num = ones(num_words)   # per-word counts in abusive documents
    p0denom = 2.0             # total word count in normal documents
    p1denom = 2.0             # total word count in abusive documents
    # zip replaces the original Python-2-only `for i in xrange(...)` loop.
    for doc_vec, label in zip(train_matrix, train_category):
        if label == 1:
            p1num += doc_vec
            p1denom += sum(doc_vec)
        else:
            p0num += doc_vec
            p0denom += sum(doc_vec)
    # Work in log space: classify_nb later sums these logs instead of
    # multiplying many tiny probabilities, avoiding float underflow.
    p1vec = log10(p1num / p1denom)
    p0vec = log10(p0num / p0denom)
    return p0vec, p1vec, p_abusive
# 3. Build the naive Bayes classifier
def classify_nb(vec2classify, p0vec, p1vec, p_class1):
    """Label a word-count vector with trained naive-Bayes parameters.

    vec2classify: word-count vector of the document to classify.
    p0vec, p1vec: per-word log10 probabilities for class 0 / class 1.
    p_class1: prior probability of class 1 (abusive).

    Returns 1 when the class-1 log posterior strictly wins, else 0.
    """
    # Sum of per-word log-likelihoods plus the log prior gives the
    # (unnormalized) log posterior for each class.
    log_post_1 = log10(p_class1) + sum(vec2classify * p1vec)
    log_post_0 = log10(1.0 - p_class1) + sum(vec2classify * p0vec)
    return 1 if log_post_1 > log_post_0 else 0
# 4. Test code
def main():
    """Train naive Bayes on the toy corpus and classify two sample posts."""
    list_posts, list_classes = load_data_set()
    my_vocab_list = create_vocab_list(list_posts)
    print("my_vocab_list=", my_vocab_list)
    list_posts0_vec = set_of_words2vec(my_vocab_list, list_posts[0])
    print("list_posts0_vec", list_posts0_vec)
    # Turn every post into a count vector to form the training matrix.
    train_mat = [set_of_words2vec(my_vocab_list, list_post)
                 for list_post in list_posts]
    p0v, p1v, pab = train_nb0(train_mat, list_classes)
    print("p0v=", p0v)
    print("p1v=", p1v)
    print("pab=", pab)
    # Classify two unseen posts: one benign, one abusive.
    test_entry = ['love', 'my', 'dalmation']
    this_doc = array(set_of_words2vec(my_vocab_list, test_entry))
    print(test_entry, 'classified as:', classify_nb(this_doc, p0v, p1v, pab))
    test_entry = ['stupid', 'garbage']
    this_doc = array(set_of_words2vec(my_vocab_list, test_entry))
    print(test_entry, 'classified as:', classify_nb(this_doc, p0v, p1v, pab))


if __name__ == '__main__':
    main()