场景:大量的文本留言,有短文本也有长文本,我们如何高效提取文本主题?如上图,已知 view,如何获取 topics?
解决办法:
1,文本分类
2,主题提取
3,主题聚类
4,主题输出
from sklearn.cluster import DBSCAN
import jieba.posseg
import jieba.analyse
import pandas as pd
import numpy as np
from baidu_aip import getCommentTag,get_emo
from kashgari.tasks.classification import CNNLSTMModel
import participle as p
# Step 1 - load the input table and classify each comment.
# The script reads the 'view' column of this CSV.
df = pd.read_csv('filename')

# Derive two helper columns through the project-local `participle` module.
# NOTE(review): presumably `transferred` cleans/escapes the text and
# `get_wordslist` splits it into a list of word strings - confirm against
# the participle module.
df['viewn'] = df['view'].apply(p.transferred)
df['vl'] = df['viewn'].apply(p.get_wordslist)

# Keep only rows whose token list joins to a non-empty string (this also
# rejects empty lists, since ''.join([]) == '').
# A boolean mask replaces the former NaN sentinel: the `np.NaN` alias was
# removed in NumPy 2.0, so the old expression raises AttributeError there.
has_tokens = df['vl'].apply(lambda tokens: ''.join(tokens) != '').astype(bool)
df = df[has_tokens].copy()

# Load the saved CNN-LSTM classifier and predict a label for every row.
model = CNNLSTMModel()
Model = model.load_model('modelfile')
# Hand predict() a plain list: after the filter above the frame's index may
# have gaps, so any `x[0]`-style access on a Series would be a label lookup.
df['pro'] = Model.predict(df['vl'].tolist())
from bert_serving.client import BertClient
# Client used later to turn topic strings into vectors; check_length=False
# is forwarded to BertClient unchanged.
bc = BertClient(check_length=False)

# Step 2 - annotate every 'view' value with the two baidu_aip helpers:
# 'name' stores getCommentTag's result, 'emo' stores get_emo's result.
raw_views = df['view']
df['name'] = raw_views.apply(getCommentTag)
df['emo'] = raw_views.apply(get_emo)
# Flatten the per-row tag collections in df['name'] into a long table with
# one (row id, topic) pair per extracted tag.
#
# Iterate with Series.items() instead of `df['name'][i]` over range(len(df)):
# the latter is a *label* lookup, and once rows have been filtered out of
# `df` the index has gaps, so it raises KeyError or reads the wrong row.
# 'id' therefore carries the row's index label, which equals the old
# positional value whenever the index is an unbroken 0..n-1 range.
n_id = []
n_name = []
for row_label, topics in df['name'].items():
    # NOTE(review): assumes getCommentTag yields an iterable of tags, or
    # None when nothing was extracted - confirm against baidu_aip.
    if topics is None:
        continue
    for word in topics:
        n_id.append(row_label)
        n_name.append(word)
dfn = pd.DataFrame({'id': n_id, 'topic': n_name})
# Step 3 - embed every topic string and cluster the embeddings.
# BertClient.encode validates that its input is a non-empty `list` of str,
# so pass a real list (a pandas Series is rejected) and skip the service
# call entirely when there is nothing to encode.
topic_texts = dfn['topic'].tolist()
clst = DBSCAN()  # default DBSCAN parameters, as before
if topic_texts:
    vec = bc.encode(topic_texts)
    predict_labels = clst.fit_predict(vec).tolist()
else:
    vec = np.empty((0, 0))
    predict_labels = []

# Step 4 - attach each topic's cluster label and write the result.
dfn['cl'] = predict_labels
# NOTE(review): this is the same literal path that read_csv() loads at the
# top of the script, so the input file is overwritten - confirm intended.
dfn.to_csv('filename')