中文文本分类问题:THUCNews数据集
1 THUCNews数据集与预处理
1.1 数据集下载
本文采用了清华NLP组提供的THUCNews新闻文本分类数据集的子集
数据下载链接:
THUCNews数据子集:https://pan.baidu.com/s/1hugrfRu 密码:qfud
1.2 数据量
该数据集使用了其中的10个分类,每个分类6500条,总共65000条新闻数据。
类别如下:体育、财经、房产、家居、教育、科技、时尚、时政、游戏、娱乐。
数据集共有三个文件,如下:
cnews.train.txt: 训练集(5000*10条)
cnews.val.txt: 验证集(500*10条)
cnews.test.txt: 测试集(1000*10条)
1.3 数据预处理
1.3.1 导入数据
# Load the training and test sets (tab-separated: label \t content).
import pandas as pd

# BUG FIX: the dataset file is named cnews.train.txt (see the file list
# above); the original read 'cnews_train.txt', which does not exist.
train_data = pd.read_csv('cnews.train.txt', sep='\t', names=['label', 'content'])
# NOTE(review): only one column name is given here, so pandas treats the
# first tab-separated field as the index — presumably the test file also has
# a label column; verify against the actual file layout.
test_data = pd.read_csv('cnews.test.txt', sep='\t', names=['content'])
train_data.info()
前五个数据样本如下:
1.3.2 将文字型的label 转为数字label
def read_category(y_train):
    """Map textual category labels to integer class ids.

    The category list is fixed (the 10 THUCNews classes used in this post).

    Args:
        y_train: sequence of category-name strings.

    Returns:
        list[int]: the id (0-9) of each label, in input order.

    Raises:
        KeyError: if a label is not one of the 10 known categories.
    """
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    # enumerate-based dict build replaces zip(range(len(...))); the original's
    # no-op list copy of `categories` is removed.
    cat_to_id = {cat: i for i, cat in enumerate(categories)}
    # Comprehension replaces the index-based append loop.
    return [cat_to_id[label] for label in y_train]
# Convert the training set's text labels into integer class ids (0-9).
train_target=train_data['label']
y_label=read_category(train_target)
2 特征工程
2.1 Jieba分词
def chinese_word_cut(mytext):
    """Segment a Chinese string with jieba; return tokens joined by spaces."""
    tokens = jieba.cut(mytext)
    return " ".join(tokens)
# Variant A — no word segmentation (raw text fed straight to TF-IDF):
#train_content = train_data['content']
#test_content = test_data['content']
# Variant B — with jieba word segmentation (used below; compare results in 3.1):
train_content =train_data['content'].apply(chinese_word_cut)
test_content = test_data['content'].apply(chinese_word_cut)
2.2 TF-IDF并将文件保存至本地
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit one shared vocabulary/IDF on all text (train + test) ...
f_all = pd.concat(objs=[train_data['content'], test_data['content']], axis=0)
tfidf_vect = TfidfVectorizer(max_df=0.9, min_df=3, token_pattern=r"(?u)\b\w+\b")
tfidf_vect.fit(f_all)
# ... then only transform. BUG FIX: the original called fit_transform()
# again on train and on test, which refits the vocabulary each time and
# produces feature matrices with incompatible column spaces.
X_train = tfidf_vect.transform(train_data['content'])
X_test = tfidf_vect.transform(test_data['content'])
将文件保存至本地
import pickle

# Persist features and labels so later runs can skip the expensive
# vectorization step.
data = (X_train, y_label, X_test)
# Context manager guarantees the handle is closed even if dump() raises
# (the original used explicit open/close with no finally).
with open('data_tfidf.pkl', 'wb') as fp:
    pickle.dump(data, fp)
3 训练模型
3.1 lightgbm模型
import lightgbm as lgb
from sklearn.model_selection import train_test_split  # missing in the original

# Hold out 20% of the training set for validation.
X_trn, X_val, y_trn, y_val = train_test_split(
    X_train, y_label, test_size=0.2, random_state=2019)

# BUG FIX: the original rebound `train_data` — the pandas DataFrame loaded
# in 1.3.1 — to an lgb.Dataset, silently clobbering it. Distinct names keep
# the DataFrame usable afterwards.
lgb_train = lgb.Dataset(X_trn, label=y_trn)
lgb_valid = lgb.Dataset(X_val, label=y_val)

params = {
    'objective': 'multiclass',
    'num_class': 10,        # ten news categories
    'boosting': 'gbdt',
    'num_threads': 1,
    'learning_rate': 0.3,
    'num_leaves': 31,
    'max_depth': 8,
    'max_bin': 200,
    'lambda_l1': 0,         # no L1/L2 regularization
    'lambda_l2': 0,
}
clf = lgb.train(params, lgb_train, valid_sets=[lgb_valid])
3.1.1 无Jieba分词直接TF-IDF后训练模型的结果
没有经过结巴分词,直接TF-IDF,X_train的特征有71655维
3.1.2 含Jieba分词直接TF-IDF后训练模型的结果
4 K 折交叉验证
对于多模型,进行交叉验证:
import time

import lightgbm as lgb
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.svm import LinearSVC
def train_model(X, X_test, y, folds, params=None, model_type='LSVC', plot_feature_importance=False):
    """Cross-validate a classifier and return majority-vote test predictions.

    Args:
        X: training feature matrix, row-indexable (ndarray / scipy sparse).
        X_test: test feature matrix with the same columns as X.
        y: training labels (array-like of int class ids).
        folds: a splitter (e.g. StratifiedKFold / RepeatedStratifiedKFold)
            whose split(X, y) yields (train_index, valid_index) pairs.
        params: LightGBM params dict, used only when model_type == 'lgb'.
        model_type: one of 'LSVC', 'lr', 'mnb', 'gnb', 'bnb', 'lgb'.
        plot_feature_importance: unused placeholder, kept for interface
            compatibility with the original signature.

    Returns:
        list[int]: per-test-sample class id chosen by majority vote
        across folds.
    """
    iteration = 3000                       # max boosting rounds for LightGBM
    # One prediction column per actual fold. BUG FIX: the original
    # hard-coded 5 folds * 2 repeats and indexed with a 1-based fold
    # counter, leaving column 0 as zeros and risking an out-of-range index.
    n_splits = folds.get_n_splits()
    prediction = np.zeros((X_test.shape[0], n_splits))
    # BUG FIX: these lists were appended to but never initialized
    # (NameError on the first fold in the original).
    f1_scores, accuracy_scores, recall_scores, precision_scores = [], [], [], []
    feature_importance = pd.DataFrame()
    y = np.asarray(y)                      # allow positional indexing below

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n + 1, 'started at', time.ctime())
        # BUG FIX: the original indexed with undefined trn_index/val_index.
        X_trn, X_val = X[train_index], X[valid_index]
        y_trn, y_val = y[train_index], y[valid_index]

        if model_type == 'LSVC':
            # BUG FIX: the original fit on undefined X_train/y_train and
            # predicted on undefined X_valid.
            model = LinearSVC()
            model.fit(X_trn, y_trn)
            y_val_pred = model.predict(X_val)
            y_pred = model.predict(X_test)
        elif model_type == 'lr':
            model = LogisticRegression()
            model.fit(X_trn, y_trn)
            y_val_pred = model.predict(X_val)
            y_pred = model.predict(X_test)
        elif model_type == 'mnb':          # BUG FIX: was the typo `mode_type`
            model = MultinomialNB()
            model.fit(X_trn, y_trn)
            y_val_pred = model.predict(X_val)
            y_pred = model.predict(X_test)
        elif model_type == 'gnb':
            model = GaussianNB()
            model.fit(X_trn, y_trn)
            y_val_pred = model.predict(X_val)
            y_pred = model.predict(X_test)
        elif model_type == 'bnb':
            model = BernoulliNB()
            model.fit(X_trn, y_trn)
            y_val_pred = model.predict(X_val)
            y_pred = model.predict(X_test)
        elif model_type == 'lgb':
            trn_data = lgb.Dataset(X_trn, label=y_trn)
            val_data = lgb.Dataset(X_val, label=y_val)
            model = lgb.train(params, trn_data, iteration,
                              valid_sets=[trn_data, val_data],
                              verbose_eval=25, early_stopping_rounds=200)
            # BUG FIX: multiclass predict() returns per-class probabilities;
            # argmax converts them to hard class ids like the other models.
            y_val_pred = np.argmax(
                model.predict(X_val, num_iteration=model.best_iteration), axis=1)
            y_pred = np.argmax(
                model.predict(X_test, num_iteration=model.best_iteration), axis=1)
        else:
            raise ValueError('unknown model_type: {}'.format(model_type))

        f1_scores.append(f1_score(y_val, y_val_pred, average='micro'))
        # BUG FIX: accuracy_score has no `average` keyword.
        accuracy_scores.append(accuracy_score(y_val, y_val_pred))
        # NOTE(review): the original also computed roc_auc_score on hard
        # multiclass label predictions, which raises — AUC needs probability
        # scores (and multi_class='ovr'); omitted here.
        recall_scores.append(recall_score(y_val, y_val_pred, average='micro'))
        precision_scores.append(precision_score(y_val, y_val_pred, average='micro'))
        prediction[:, fold_n] = y_pred     # 0-based column per fold

        if model_type == 'lgb':
            fold_importance = pd.DataFrame()
            # BUG FIX: sparse TF-IDF matrices have no .columns; use
            # positional feature indices instead.
            fold_importance["feature"] = np.arange(X.shape[1])
            fold_importance["importance"] = model.feature_importance()
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    # Majority vote across folds; bincount requires non-negative ints.
    y_test_pred = [int(np.argmax(np.bincount(prediction[i, :].astype(int))))
                   for i in range(prediction.shape[0])]

    print('CV mean f1_scores: {0:.4f}, std: {1:.4f}.'.format(np.mean(f1_scores), np.std(f1_scores)))
    print('CV mean accuracy_scores: {0:.4f}, std: {1:.4f}.'.format(np.mean(accuracy_scores), np.std(accuracy_scores)))
    print('CV mean recall_scores: {0:.4f}, std: {1:.4f}.'.format(np.mean(recall_scores), np.std(recall_scores)))
    print('CV mean precision_scores: {0:.4f}, std: {1:.4f}.'.format(np.mean(precision_scores), np.std(precision_scores)))
    return y_test_pred
嗯……Colab 跑崩溃了,接下来要考虑换其他服务器接着跑。