import pandas as pd

# Read the raw data
df = pd.read_csv('102.JDcomments.csv')

# drop_duplicates() keeps the first occurrence of each duplicate row by default
data = df.drop_duplicates()

# Set the column names (评论 = comment, 评分 = rating)
data.columns = ['评论', '评分']

# Save the deduplicated data (the next step reads this file)
data.to_csv('102.去重.csv', encoding='utf8', index=False)
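As a quick optional check (not part of the original pipeline), you can print how many duplicate rows were actually dropped:

# Optional sanity check: compare row counts before and after drop_duplicates()
print(f"removed {len(df) - len(data)} duplicate rows, {len(data)} rows remain")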
Removing newlines
The comments contain a lot of newline characters, and the stopword list used later does not cover them, so they need to be stripped out.
A simple string replacement is enough to remove them.
import csv

path = '102.去重.csv'
with open(path, encoding='utf-8') as fin:
    with open('103.去换行.csv', 'w', newline='', encoding='utf-8') as fout:
        r = csv.reader(fin)   # read the input file
        w = csv.writer(fout)  # write the cleaned rows to the output file
        for row in r:
            # replace "\n" and "\r" with nothing
            row = [col.replace('\n', '').replace('\r', '') for col in row]
            w.writerow(row)   # write the row to the new file
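The same cleanup can also be done directly in pandas; a minimal sketch, assuming the comment column is named 评论 as above:

import pandas as pd

df = pd.read_csv('102.去重.csv', encoding='utf-8')
# Strip newline and carriage-return characters from the comment text
df['评论'] = df['评论'].astype(str).str.replace('\n', '', regex=False).str.replace('\r', '', regex=False)
df.to_csv('103.去换行.csv', index=False, encoding='utf-8')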
At this point the data is more or less usable.
Removing stopwords
Stopwords are removed according to a stopword list.
import pandas as pd
import jieba

def load_stop_words(file="stopwords.txt"):
    # Load the stopword list, one word per line
    with open(file, "r", encoding="utf-8") as f:
        return f.read().split("\n")

def cut_words(commentSeries):
    stop_words = load_stop_words()
    result = []
    for words in commentSeries:          # one comment per csv row
        c_words = jieba.lcut(words)      # segment the comment with jieba
        # keep only the words that are not in the stopword list
        result.append([word for word in c_words if word not in stop_words])
    return result

data = pd.read_csv("103.去换行.csv", encoding="utf-8")
results = cut_words(data['评论'])

# Join each comment's remaining words back into a space-separated string
rst = []
for text in results:
    str_1 = ""
    for word in text:
        str_1 += word + " "
    rst.append(str_1)

# Store the segmented text in a new 分词 column and save (the next step reads it)
data['分词'] = rst
data.to_csv("104.去除停用词.csv", index=None)
Word segmentation
Next comes word segmentation. As everyone knows, segmentation is the step before word2vec; you have to segment first before word2vec can be trained.
jieba makes segmentation easy.
At the same time, the reviews are labelled as bad/neutral/good. Since the exact requirement wasn't known beforehand, ratings are grouped as 1-2 / 3 / 4-5 for now.
import pandas as pd

data = pd.read_csv("104.去除停用词.csv", encoding="utf-8")
comment = data['评论']
score = data['评分']
cuted = data['分词']

def sim_labels(scores):
    # Map ratings 1-2 -> 1 (bad), 3 -> 2 (neutral), 4-5 -> 3 (good)
    result = []
    for score in scores:
        if (score == 1) | (score == 2):
            result.append(1)
        elif score == 3:
            result.append(2)
        else:
            result.append(3)
    return result

rst_score = pd.Series(sim_labels(score))
d = {'评论': comment, '分数': rst_score, '分词': cuted}
df = pd.DataFrame(d)
df.to_csv("105.结果.csv", index=None)