from sklearn.datasets import fetch_20newsgroups # 新闻数据
from sklearn.naive_bayes import MultinomialNB # 朴素贝叶斯模型
from sklearn.model_selection import train_test_split # 数据集分割模块
from sklearn.feature_extraction.text import CountVectorizer # 文本特征向量化模块
from sklearn.metrics import classification_report
# Step 1: load the 20 Newsgroups corpus (both the train and test subsets).
news = fetch_20newsgroups(subset='all')
print(len(news.data))

# Step 2: preprocessing -- hold out 25% of the documents for evaluation
# (train:test = 3:1, fixed seed), then convert raw text to token counts.
x_train, x_test, y_train, y_test = train_test_split(
    news.data,
    news.target,
    test_size=0.25,
    random_state=33,
)
vec = CountVectorizer()
# The vocabulary is learned from the training documents only; the test
# documents are then encoded with that same vocabulary.
x_train = vec.fit_transform(x_train)
x_test = vec.transform(x_test)

# Step 3: build and fit the multinomial naive Bayes classifier, then
# predict labels for the held-out documents.
mnb = MultinomialNB().fit(x_train, y_train)
y_pred = mnb.predict(x_test)

# Step 4: report the accuracy on the test set and the per-class metrics.
accuracy = mnb.score(x_test, y_test)
print('The Accuracy of Naive Bayes Classifier is: ', accuracy)
print(classification_report(y_test, y_pred))