from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
# Message-board comment classification demo (normal vs. abusive speech)
def testNB_skl():
    """Classify message-board comments as normal (0) or abusive (1).

    Builds a bag-of-words (term-frequency) representation of a tiny toy
    corpus, trains a multinomial Naive Bayes classifier on a train split,
    prints the prediction for each test document, and prints a
    classification report for the test split.
    """
    # sklearn.cross_validation was removed in scikit-learn 0.20; the stale
    # module-level import at the top of the file is shadowed here by the
    # maintained location.
    from sklearn.model_selection import train_test_split

    posting = ['my dog has flea problems help please', 'maybe not take him to dog park stupid',
               'my dalmation is so cute I love him', 'stop posting stupid worthless garbage',
               'mr licks ate my steak how to stop him', 'quit buying worthless dog food stupid']
    classVec = [0, 1, 0, 1, 0, 1]  # 0 = normal speech, 1 = abusive speech

    # Split into train/test sets. stratify keeps both classes represented in
    # each split (otherwise a 2-sample test set can contain a single class,
    # producing a degenerate report with support 0 for the missing class);
    # random_state makes the demo reproducible.
    train_data, test_data, train_y, test_y = train_test_split(
        posting, classVec, test_size=0.2, train_size=0.8,
        random_state=42, stratify=classVec)

    # Build the term-frequency matrix (bag-of-words model) from the training text.
    vectorizer = CountVectorizer()
    wordX = vectorizer.fit_transform(train_data)

    # Train the classifier; MultinomialNB accepts the sparse matrix directly.
    clf = MultinomialNB().fit(wordX, train_y)

    # Predict the test set, reusing the vocabulary learned on the training set.
    test_wordX = vectorizer.transform(test_data)
    predicted = clf.predict(test_wordX)
    for doc, category in zip(test_data, predicted):
        print(doc, ':', category)

    # Evaluate on the test set. Passing labels explicitly keeps target_names
    # aligned with both classes even if one is absent from the test split.
    classTarget_names = ['正常言论', '侮辱性言论']
    print(classification_report(test_y, predicted, labels=[0, 1],
                                target_names=classTarget_names))
# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    testNB_skl()
# Sample output (from one run; without a fixed random_state the split — and
# hence the report — varies between runs):
#   maybe not take him to dog park stupid : 0
#   stop posting stupid worthless garbage : 1
#                  precision    recall  f1-score   support
#       正常言论        0.00      0.00      0.00         0
#      侮辱性言论       1.00      0.50      0.67         2
#   avg / total        1.00      0.50      0.67         2