基于文本内容的敏感信息识别
一、数据获取与处理
# Load the sensitive and non-sensitive training sets, then merge them
# into one labelled DataFrame with columns ['content', 'label'].
import pandas as pd

sensitive_df = pd.read_csv('../训练数据集/train_sensitiveness.csv', encoding='gb18030')
normal_df = pd.read_csv('../训练数据集/train_insensitiveness.csv', encoding='gb18030')

# Stack the two sets vertically and renumber the rows from 0.
data = pd.concat([sensitive_df, normal_df], axis=0)
data.columns = ['content', 'label']
data.reset_index(inplace=True, drop=True)

# Notebook-style inspection of the merged size and class distribution.
data.shape
data.label.value_counts()
结果:
二、违规和非违规数据占比情况分析
代码块:
# Pie chart of the violating vs. non-violating class split.
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = 'Songti SC'  # CJK-capable font for the labels
label_counts = data['label'].value_counts()
plt.pie(label_counts, labels=['非违规', '违规'], autopct='%.2f%%')
plt.show()
运行程序生成饼图:
三、绘制词云图
# Tokenise the comments, remove stop words, count token frequencies and
# render a word cloud of the whole corpus.
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from tkinter import _flatten

# 1 Word-frequency statistics
# 1.1 Tokenise every comment with jieba (list of tokens per row).
data_cut = data['content'].apply(jieba.lcut)

# 1.2 Remove stop words.
# BUG FIX: the body of the `with` block was not indented in the original.
# Also use a set for O(1) membership tests instead of a list.
with open('./stoplist.txt', 'r', encoding='utf-8') as f:
    stop = set(f.read().split())
data_after = data_cut.apply(lambda x: [w for w in x if w not in stop])
# Drop bare-space tokens the tokeniser leaves behind.
data_after = data_after.apply(lambda x: [i for i in x if i != ' '])

# 1.3 Count how often each remaining token occurs over the whole corpus.
num = pd.Series(_flatten(list(data_after))).value_counts()

# 2 Word-cloud rendering
# 2.2 Configure the cloud (CJK font so Chinese tokens render correctly).
wc = WordCloud(font_path='./simhei.ttf',
               background_color='white')
wc.fit_words(num)
# 2.3 Display it.
plt.imshow(wc)
plt.axis('off')
plt.show()
运行结果:
# Per-document token counts, plotted over the corpus, then the tokenised
# comments split by label for per-class analysis.
# BUG FIX: the original read data['cut'] without ever creating that column;
# persist the cleaned token lists (data_after) as 'cut' first.
data['cut'] = data_after
num = data['cut'].apply(lambda x: len(x))
import matplotlib.pyplot as plt
plt.plot(range(len(num)), num)
plt.show()

# Token lists of the sensitive (label == 1) comments.
ind1 = data['label'] == 1
sens_comment = data.loc[ind1, 'cut']
# Token lists of the non-sensitive (label == 0) comments.
ind2 = data['label'] == 0
insens_comment = data.loc[ind2, 'cut']
def draw_wc(data):
    """Print the 10 most frequent tokens in *data* (an iterable of token
    lists) and display a word cloud of the full frequency table.

    BUG FIX: the function body was completely unindented in the original,
    which is a SyntaxError; indentation restored.
    """
    # 1.3 Frequency of every token across all documents.
    num = pd.Series(_flatten(list(data))).value_counts()
    print(num[:10])
    # 2 Word-cloud rendering (CJK font so Chinese tokens display).
    wc = WordCloud(font_path='../data/tmp/simhei.ttf',
                   background_color='white')
    wc.fit_words(num)
    # 2.3 Display it.
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
四、主题模型设计
```python
# LDA topic model on the sensitive class, then TF-IDF features, SMOTE
# class balancing, and a train/test split for the classifiers below.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

neg_dict = Dictionary(sens_comment)                       # token -> id mapping
neg_corpus = [neg_dict.doc2bow(i) for i in sens_comment]  # bag-of-words corpus
neg_lda = LdaModel(neg_corpus, num_topics=3, id2word=neg_dict)  # train LDA

# BUG FIX: the for-loop body was not indented in the original.
print("\n敏感信息:")
for i in range(3):
    print("主题%d : " % i)
    print(neg_lda.print_topic(i, topn=15))  # top 15 words of each topic

# Join the token lists back into whitespace-separated strings for TF-IDF.
data['cut'] = data['cut'].apply(lambda x: ' '.join(x))
data_new = data[['cut', 'label']]

from sklearn.feature_extraction.text import TfidfVectorizer
# token_pattern keeps single-character tokens (the default drops them),
# which matters for Chinese text.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
tfidf_data = vectorizer.fit_transform(data['cut'])
tfidf_data.toarray()

# Balance the classes with SMOTE oversampling.
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
model_smote = SMOTE()
model_smote_x, model_smote_y = model_smote.fit_resample(tfidf_data.toarray(),
                                                        data_new['label'].values)
model_smote_x = pd.DataFrame(model_smote_x)
len(model_smote_y)
model_smote_y.shape

# Hold out 20% of the balanced data for evaluation; fixed seed for
# reproducibility.
from sklearn.model_selection import train_test_split
test_ratio = 0.2
tfidf_train, tfidf_test, y_train, y_test = train_test_split(
    model_smote_x, model_smote_y, test_size=test_ratio, random_state=123)
模型一:构建决策树模型
# Model 1: decision tree classifier on the TF-IDF features.
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

# class_weight='balanced' compensates for any residual class imbalance.
DT_clf1 = DecisionTreeClassifier(class_weight='balanced').fit(tfidf_train, y_train)
res1 = DT_clf1.predict(tfidf_test)  # predict on the held-out set

print('DT test accuracy %s' % accuracy_score(y_test, res1))
# BUG FIX: the original label said "F1_score" while the value printed was
# recall_score — the label now matches the metric actually computed.
print('DT test recall %s' % recall_score(y_test, res1))
print(classification_report(y_test, res1))  # full per-class report
相关数据集与代码包请到主页资源库,欢迎大家评论交流指导!