将文本进行分类是自然语言处理当中最主要的工作之一。文本处理中很重要的一项工作就是对文本进行向量化,本文不做详细的介绍,只是采用TF-IDF的方法对文本进行向量化,然后分别采用SVM、Bayes、RandomForest、BP四种方法对文本进行分类。
训练语料是在excel中存储的,格式见下图:
# Load the labelled corpus: column 0 = class label, column 1 = raw text.
# NOTE: the `encoding` kwarg of read_excel was removed in modern pandas.
data = pd.read_excel('../corpus.xlsx', header=None)
data.columns = ['class_label', 'text']
data.dropna(inplace=True)

# Load the user dictionary so domain-specific terms segment as single tokens.
jieba.load_userdict('../dict_out.csv')

# Load the stop-word list (one word per line, UTF-8); also drop plain spaces.
with open("../stopwords.dat", encoding="utf-8") as f:
    stopkey = [line.strip() for line in f]
stopkey.append(" ")

# One comma-joined token string per corpus row, kept aligned with `data`.
list1 = []
for text in data["text"]:
    try:
        words = [w for w in jieba.cut(text) if w not in stopkey]
        list1.append(",".join(words).strip())
    except AttributeError:
        # Non-string cell (e.g. a number). Append a placeholder so rows stay
        # aligned — the original try/finally re-appended the PREVIOUS
        # iteration's stale result here, silently corrupting the dataset.
        list1.append("")

# Persist the segmentation result alongside the original columns.
data["tokens"] = list1
data.to_excel("1data.xls", header=None, index=False)
该文本训练库共有10000条数据,分为:'体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经'这10个类别。
本文的分类主要流程如下:
- 对文本内容进行分词处理,删除停用词,只留下有意义的词语。
# Load the labelled corpus: column 0 = class label, column 1 = raw text.
# NOTE: the `encoding` kwarg of read_excel was removed in modern pandas.
data = pd.read_excel('../corpus.xlsx', header=None)
data.columns = ['class_label', 'text']
data.dropna(inplace=True)

# Load the user dictionary so domain-specific terms segment as single tokens.
jieba.load_userdict('../dict_out.csv')

# Load the stop-word list (one word per line, UTF-8); also drop plain spaces.
with open("../stopwords.dat", encoding="utf-8") as f:
    stopkey = [line.strip() for line in f]
stopkey.append(" ")

# One comma-joined token string per corpus row, kept aligned with `data`.
list1 = []
for text in data["text"]:
    try:
        words = [w for w in jieba.cut(text) if w not in stopkey]
        list1.append(",".join(words).strip())
    except AttributeError:
        # Non-string cell (e.g. a number). Append a placeholder so rows stay
        # aligned — the original try/finally re-appended the PREVIOUS
        # iteration's stale result here, silently corrupting the dataset.
        list1.append("")

# Persist the segmentation result alongside the original columns.
data["tokens"] = list1
data.to_excel("1data.xls", header=None, index=False)
2.将语料库分为训练集和测试集
# Reload the segmented corpus produced by the preprocessing step.
data = pd.read_excel('1data.xls', header=None)
data.columns = ['class_label', 'text', 'tokens']
label = data['class_label']

# Encode string labels as integers. LabelEncoder assigns codes in SORTED order
# of the class names, so `categories` must come from the fitted encoder's
# classes_: the original first-occurrence list could disagree with the code
# order and mislabel every row in classification_report(target_names=...).
le = preprocessing.LabelEncoder()
data["class_label"] = le.fit_transform(label)
categories = list(le.classes_)
print(categories)

# Hold out 20% for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(data["tokens"],
                                                    data["class_label"],
                                                    test_size=0.2,
                                                    random_state=1)
3.对词组进行TF-IDF处理,将各个词组转换成词向量。具体理论可查看其他相关资料,这里不再做详细的阐述
# Declare the text feature-extraction method.
# Extract TF-IDF features: fit on the training split, then reuse the fitted
# vectorizer to transform the test split so both share one vocabulary.
# NOTE(review): `tfidf` is defined BELOW this call in the file — if this is
# executed top-to-bottom as a script it raises NameError; confirm the
# intended execution order (the def should precede this call).
X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
def tfidf(data):
    """Fit a TF-IDF vectorizer on *data*.

    Returns the (sparse) document-term matrix together with the fitted
    vectorizer, so callers can transform further text with the same vocabulary.
    """
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(data)
    return matrix, vectorizer
4.分别采用上面提到的分类方法进行训练和测试,并查看测试结果
def get_metrics(y_test, y_predicted):
    """Return (accuracy, precision, recall, f1) for a multi-class prediction.

    y_test: ground-truth labels
    y_predicted: predicted labels

    Precision/recall/F1 are support-weighted averages over the classes.
    The original passed pos_label=None, which is ignored when
    average='weighted' and rejected by recent scikit-learn — dropped here.
    """
    # precision = TP / (TP + FP), weighted by per-class support
    precision = precision_score(y_test, y_predicted, average='weighted')
    # recall = TP / (TP + FN), weighted by per-class support
    recall = recall_score(y_test, y_predicted, average='weighted')
    # F1 = harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, average='weighted')
    # accuracy = fraction of exactly correct predictions
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1
def BayesClassify():
    """Train a multinomial naive-Bayes classifier on the TF-IDF training
    features (module-level globals) and persist it to BayesModel.m."""
    model = MultinomialNB(alpha=0.01)
    model.fit(X_train_tfidf, y_train)
    joblib.dump(model, "BayesModel.m")
def BayesTest():
    """Load the saved Bayes model, predict the test split, print metrics,
    a per-class report and the confusion matrix."""
    model = joblib.load("BayesModel.m")
    predictions = model.predict(X_test_tfidf)
    acc, prec, rec, f1 = get_metrics(y_test, predictions)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f" % (
        acc, prec, rec, f1))
    # Per-class breakdown
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, predictions, target_names=categories))
    # Confusion matrix: rows = true class, columns = predicted class
    print("Confusion Matrix...")
    print(metrics.confusion_matrix(y_test, predictions))
def SVMClassify():
    """Train an RBF-kernel SVM on the TF-IDF training features
    (module-level globals) and persist it to SVMModel.m."""
    model = SVC(gamma=1, kernel='rbf', probability=True)
    model.fit(X_train_tfidf, y_train)
    joblib.dump(model, "SVMModel.m")
def SVMTest():
    """Load the saved SVM model, predict the test split, print metrics,
    a per-class report and the confusion matrix."""
    model = joblib.load("SVMModel.m")
    predictions = model.predict(X_test_tfidf)
    acc, prec, rec, f1 = get_metrics(y_test, predictions)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f" % (
        acc, prec, rec, f1))
    # Per-class breakdown
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, predictions, target_names=categories))
    # Confusion matrix: rows = true class, columns = predicted class
    print("Confusion Matrix...")
    print(metrics.confusion_matrix(y_test, predictions))
def RandomForestClassify():
    """Train a random-forest classifier on the TF-IDF training features
    (module-level globals) and persist it to RandomForestModel.m.

    Fixes the original's accidental duplicated assignment
    (``clf_tfidf = clf_tfidf = ...``).
    """
    clf_tfidf = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=0)
    clf_tfidf.fit(X_train_tfidf, y_train)
    joblib.dump(clf_tfidf, "RandomForestModel.m")
def RandomForestTest():
    """Load the saved random-forest model, predict the test split, print
    metrics, a per-class report and the confusion matrix."""
    model = joblib.load("RandomForestModel.m")
    predictions = model.predict(X_test_tfidf)
    acc, prec, rec, f1 = get_metrics(y_test, predictions)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f" % (
        acc, prec, rec, f1))
    # Per-class breakdown
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, predictions, target_names=categories))
    # Confusion matrix: rows = true class, columns = predicted class
    print("Confusion Matrix...")
    print(metrics.confusion_matrix(y_test, predictions))
def BPClassify(inputPoint):
    """Train and evaluate a single-hidden-layer MLP on the TF-IDF features.

    inputPoint: int — dimension of the TF-IDF feature vectors (input size).

    Fixes vs. the original: a softmax output with categorical cross-entropy
    replaces sigmoid + binary_crossentropy, which is the wrong objective for
    mutually-exclusive multi-class labels; the hand-rolled per-row argmax
    loop is replaced by a vectorized np.argmax.
    """
    net = Sequential()
    net.add(Dense(128, input_shape=(inputPoint,)))
    net.add(Activation('relu'))
    net.add(Dense(len(categories), input_shape=(128,)))
    # softmax + categorical_crossentropy: the proper multi-class objective
    net.add(Activation('softmax'))
    net.compile(optimizer='adam', loss='categorical_crossentropy')
    # NOTE(review): y_train_onehot is presumably the one-hot encoding of
    # y_train; it is not defined in this file — confirm it exists upstream.
    net.fit(X_train_tfidf, y_train_onehot, batch_size=128, epochs=2)

    probs = net.predict(X_test_tfidf)
    print(probs)
    # Most probable class per row; column vector matches the original shape.
    y_predicted_tfidf = np.argmax(probs, axis=1).reshape(-1, 1)

    accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf = get_metrics(y_test, y_predicted_tfidf)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f" % (
        accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf))
    # Per-class breakdown
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, y_predicted_tfidf, target_names=categories))
    # Confusion matrix: rows = true class, columns = predicted class
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_predicted_tfidf)
    print(cm)
最终的分类效果较为理想,准确率和召回率都在90%以上。其中SVM耗时稍长。
文本分类 svm 贝叶斯 随机森林 神经网络