import pandas as pd
import numpy as np
import re
import jieba
import jieba.posseg
import collections
import csv
import os
import pandas as pd
# Merge the per-attraction review spreadsheets into one labelled dataset:
# every .xlsx file in the reviews folder becomes one class, labelled by its
# (sorted) position, then rows without a rating and duplicate review texts
# are dropped and the result is written to 景区评论.xlsx.
path = r'A:\jupyter_code\模式识别期末论文\评论数据'  # raw string: backslashes are literal
dirs = os.listdir(path)
excels = []
i = 0
# sorted(): os.listdir order is arbitrary, so sort for reproducible labels.
for fname in sorted(dirs):
    # endswith is stricter than a substring test ('xlsx' in fname would also
    # match names like 'foo.xlsx.bak').
    if not fname.endswith('.xlsx'):
        continue
    print('%-22s' % fname, '\t', '标签取为:', i)
    # Join against the same directory we listed, instead of a separate
    # relative path that only works from one working directory.
    df = pd.read_excel(os.path.join(path, fname))
    df['标签'] = str(i)  # label kept as a string, matching the original output file
    excels.append(df)
    i += 1
a = pd.concat(excels)
a = a.dropna(subset=['评分'])  # drop reviews that have no rating
print('去重之前:', a.shape[0])
a = a.drop_duplicates(subset=['评论内容'])  # keep one row per distinct review text
print('去重之后:', a.shape[0])
a.to_excel('景区评论.xlsx', index=False)
丽江古城.xlsx 标签取为: 0
九寨沟.xlsx 标签取为: 1
伦敦眼.xlsx 标签取为: 2
卢浮宫博物馆.xlsx 标签取为: 3
张家界.xlsx 标签取为: 4
杭州西湖.xlsx 标签取为: 5
洱海.xlsx 标签取为: 6
都江堰景区.xlsx 标签取为: 7
雅典卫城.xlsx 标签取为: 8
鼓浪屿.xlsx 标签取为: 9
去重之前: 25789
去重之后: 23811
import pandas as pd
# Reload the merged, de-duplicated review dataset written by the previous cell.
data = pd.read_excel('景区评论.xlsx')
去重之前: 23811
去重之后: 23811
词频统计
import jieba
import jieba.posseg
import collections
import re
import csv
import pandas as pd

# Reload the merged review dataset for the word-frequency pass.
data = pd.read_excel('景区评论.xlsx')

# Each stop-word file holds one word per line. The separator is a sentinel
# string ('lipingliping') that never occurs in the files, so pandas reads
# every line as a single column. engine='python' is required for a
# multi-character separator — without it the C engine emits a ParserWarning
# and silently falls back to the python engine anyway.
stopWords = pd.read_csv('stopword.txt', encoding='utf-8',
                        sep='lipingliping', header=None, engine='python')
custom_stopWords = pd.read_csv('自定义停用词.txt', encoding='utf-8',
                               sep='lipingliping', header=None, engine='python')
adverbWords = pd.read_csv('程度副词.txt', encoding='utf-8',
                          sep='lipingliping', header=None, engine='python')
# Combined stop list: generic stop words + custom additions + degree adverbs.
stop = list(stopWords.iloc[:, 0]) + list(custom_stopWords.iloc[:, 0]) + list(adverbWords.iloc[:, 0])
# Output file names, one per attraction, indexed in label order (0-9).
name = ['丽江古城热词', '九寨沟热词', '伦敦眼热词', '卢浮宫博物馆热词', '张家界热词',
        '杭州西湖热词', '洱海热词', '都江堰景区热词', '雅典卫城热词', '鼓浪屿热词']
j = 0
for labels in range(0,10):
data01=(''.join(str(i) for i in data.loc[data["标签"]==labels,"评论内容"]))
pattern = re.compile(u'\t|\n|\.|-|:|;|\)|\(|\?|\ |"\|~·@¥……*|“”|‘’|()|{}|')
data02 = re.sub(pattern, '', data01)
data_cut = jieba.cut(data02,cut_all=False,HMM=True)
data_stop = []
for word in data_cut:
if word not in stop:
data_stop.append(word)
word_counts = collections.Counter(data_stop)
word_counts_top = word_counts.most_common(20)
print(word_counts_top)
words_cloud_A = open('A:/jupyter_code/模式识别期末论文/景区热词表/'+name[j]+'.csv', 'w', newline = '')
write = csv.writer(words_cloud_A)
write.writerow(['评论热词','