# 【1】需要处理的评论数据已经存到MongoDB(评论数据总量约为3万条)
import jieba
import jieba.analyse
from pymongo import MongoClient
from snownlp import SnowNLP
# Read every stored review from MongoDB and merge them into one string.
# MongoClient() is created with its default connection settings; each
# document of the `jd.shouhuan` collection is expected to carry an iterable
# of review strings under the '商品总评论' key.
client = MongoClient()
results = client.jd.shouhuan.find({})
comment_parts = []
for result in results:
    for content in result['商品总评论']:
        # First noise-removal step: drop the last 26 characters of each
        # review (presumably a fixed-width trailer appended by the scraper
        # -- TODO confirm what those characters contain).
        comment_parts.append(content[:-26])
# Join once at the end instead of growing the string with `+=` per review,
# which is quadratic in the worst case for tens of thousands of reviews.
comments = ''.join(comment_parts)
# 【STEP 1 从数据库/文件读取评论内容】
# Register a custom dictionary -- goal: let jieba recognise new words.
user_dict_path = "/Users/macbookair/Desktop/NLP1221/dict.txt"
jieba.load_userdict(user_dict_path)
# ===START=============================
# 去除停用词 #【目标:】去除文本噪音
# ===================================
stopwords = {}.fromkeys(['一晃','准功','平理','一大','充好',';','?','*','**','??????','1','2','3','4','5','6','7','8','9','10','0','a','b','c','d','e','f','g','h','i','g','k','m','n','o','p','q','r','s','t','u','v','w','x','y','z','*^★*☆','丶','helliphellip',';','*?acute╰╯`?','hellip','哦','与','下次','~','!',"(',')",'�','\n','、','~','再','来','给','有','&','的', '包括', '等', '是', '了', '和','开始','用','怎么','说','呢','还是',',',' ','。',':','而且