第1关:分类器方法
1. C
2. AC
第2关:基于朴素贝叶斯的文本分类
任务描述
本关任务:根据本关所学有关朴素贝叶斯的知识,编写基于朴素贝叶斯理论进行文本分类的程序,并通过所有测试用例。
相关知识
为了完成本关任务,你需要掌握:
-
贝叶斯决策理论思想;
-
朴素贝叶斯分类器的实现。
from functools import reduce
import operator
from numpy import array, zeros
def trainNB(trainMatrix, trainCategory):
    """Estimate naive-Bayes word probabilities from a document-term matrix.

    Args:
        trainMatrix: list of word-count vectors, one per document; all rows
            share the vocabulary length of the first row.
        trainCategory: per-document labels, 1 = abusive/negative, 0 = normal.

    Returns:
        (p0Vect, p1Vect, pAbusive): per-word probability vectors for the
        normal (0) and abusive (1) classes, and the prior P(abusive).

    Note: no smoothing is applied, so a word absent from a class yields a
    zero probability — the companion classifier filters zeros out.
    """
    doc_count = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    # Prior: fraction of documents labeled abusive (labels are 0/1).
    p_abusive = sum(trainCategory) / float(doc_count)
    # Per-class word-count accumulators and per-class total word counts,
    # indexed 0 = normal, 1 = abusive.
    class_word_counts = [zeros(vocab_size), zeros(vocab_size)]
    class_totals = [0, 0]
    for doc_vec, label in zip(trainMatrix, trainCategory):
        cls = 1 if label == 1 else 0
        class_word_counts[cls] += doc_vec   # vector add of this doc's counts
        class_totals[cls] += sum(doc_vec)   # total words seen for the class
    # Per-word conditional probabilities P(word | class).
    p0_vect = class_word_counts[0] / class_totals[0]
    p1_vect = class_word_counts[1] / class_totals[1]
    return p0_vect, p1_vect, p_abusive
def classifyNB(vec2Classify, trainMatrix, trainCategory):
    """Classify one document vector with the naive-Bayes model.

    Trains on (trainMatrix, trainCategory) via trainNB, then scores the
    document `vec2Classify` (a word-count/presence vector) against both
    classes and returns 1 (abusive) or 0 (normal).

    Because trainNB applies no smoothing, zero entries in the elementwise
    products are skipped rather than zeroing out the whole product.
    """
    p0_vect, p1_vect, p_abusive = trainNB(trainMatrix, trainCategory)
    # Elementwise conditional probabilities for the words in this document.
    scored1 = vec2Classify * p1_vect
    scored0 = vec2Classify * p0_vect
    # Keep only the non-zero factors (words present in the doc AND the class).
    factors1 = [v for v in scored1 if v != 0]
    factors0 = [v for v in scored0 if v != 0]
    # Product of the surviving factors; an empty list means zero evidence.
    pc1 = reduce(operator.mul, factors1, 1) if factors1 else 0
    pc0 = reduce(operator.mul, factors0, 1) if factors0 else 0
    # Weight each class likelihood by its prior and pick the larger.
    p1 = pc1 * p_abusive
    p0 = pc0 * (1.0 - p_abusive)
    return 1 if p1 > p0 else 0
第3关:基于支持向量机的文本分类
任务描述
本关任务:根据本关所学有关支持向量机的知识,编写基于支持向量机理论进行文本分类的程序,并通过所有测试用例。
相关知识
为了完成本关任务,你需要掌握:
-
支持向量机理论的思想;
-
支持向量机分类器的实现。
import numpy as np
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from test import get_data,get_result
# Load the wine-review dataset supplied by the exercise harness.
df = get_data()

# Map the ten most common grape varieties to integer class ids 0..9.
variety_counts = Counter(df['variety'].tolist())
top_10_varieties = {name: idx
                    for idx, (name, _count) in enumerate(variety_counts.most_common(10))}

# Keep only rows whose variety is one of the top ten.
df = df[df['variety'].isin(top_10_varieties)]
description_list = df['description'].tolist()
varietal_list = np.array([top_10_varieties[v] for v in df['variety'].tolist()])

# Bag-of-words counts, then TF-IDF weighting of the raw token counts.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(description_list)
x_train_tfidf = TfidfTransformer().fit_transform(x_train_counts)

# 70/30 split, linear-kernel SVM, then accuracy on the held-out set.
train_x, test_x, train_y, test_y = train_test_split(x_train_tfidf, varietal_list,
                                                    test_size=0.3)
clf = SVC(kernel='linear').fit(train_x, train_y)
y_score = clf.predict(test_x)
n_right = sum(1 for predicted, actual in zip(y_score, test_y) if predicted == actual)
get_result(n_right, test_y)