'''
# 这段代码是从一个网站借用过来的,具体哪个网址一下子忘记了。
# (This code was adapted from a web article; the original URL was lost.)
# 可以直接运行。 (It can be run directly.)
'''
from selenium import webdriver
import time
import codecs
import jieba
import jieba.analyse as analyse
from wordcloud import WordCloud
from scipy.misc import imread
from os import path
def get_douban_comments(url):
    """Log in to Douban via Selenium/Firefox, then walk the short-comment
    pages of the movie page at *url*, appending every comment (one per
    line) to 'pjl_comment.txt'.

    The captcha must be typed by hand into the console prompt.

    NOTE(review): credentials are hard-coded below — replace them with
    your own account, or better, read them from the environment.
    """
    login_url = 'https://accounts.douban.com/login?source=movie'
    user_name = '15527546531'      # 这里替换成你的豆瓣用户名 (your Douban user name)
    password = '15898405110ABCD'   # 这里替换成你的密码 (your password)
    driver = webdriver.Firefox()   # start a Firefox session
    driver.get(login_url)
    driver.find_element_by_id('email').clear()                  # clear the input box
    driver.find_element_by_id('email').send_keys(user_name)     # type the user name
    driver.find_element_by_id('password').clear()
    driver.find_element_by_id('password').send_keys(password)   # type the password
    captcha_field = input('请打开浏览器输入验证码:')             # captcha entered by hand
    driver.find_element_by_id('captcha_field').send_keys(captcha_field)
    driver.find_element_by_class_name('btn-submit').click()     # click the login button
    time.sleep(5)              # wait for the post-login redirect
    driver.get(url)            # navigate to the target comments page
    driver.implicitly_wait(3)  # implicit wait of 3 seconds
    n = 1       # current page number
    count = 0   # comments collected so far
    # NOTE: the file is opened in append mode, so clear it (or use a
    # fresh name) before building a new word cloud from it.
    file = codecs.open("pjl_comment.txt", mode='a', encoding='utf-8')
    try:
        while True:
            try:
                comments_list = []  # comments found on this page
                results = driver.find_elements_by_class_name('comment')
                print("results:", len(results))
                for result in results:
                    comment = result.find_element_by_tag_name('p').text  # comment body
                    print(comment)
                    comments_list.append(comment + u'\n')
                    # increment BEFORE printing so the first comment is #1, not #0
                    count += 1
                    print(u"查找到第%d个评论" % count)
                # write this page before navigating, so the final page is
                # not lost when the "next" click below fails
                file.writelines(comments_list)
                driver.find_element_by_class_name('next').click()  # go to the next page
                print(u'第%d页查找完毕!' % n)
                n += 1
                time.sleep(2)
            except Exception as e:
                # `except Exception() as e` in the original raised a
                # TypeError at catch time; also stop instead of retrying
                # forever once the "next" button no longer exists.
                print(e)
                break
    finally:
        # the original leaked both the file handle and the browser session
        file.close()
        driver.quit()
# 得到所有关键词
def get_all_keywords(file_name):
    """Tokenize every line of *file_name* with jieba and write the
    occurrence count of each distinct token to 'count_word.txt'
    (one ``token:count次`` entry per line), printing progress as it goes.
    """
    from collections import Counter  # stdlib; O(n) counting vs the original O(n^2)

    word_lists = []  # every token, duplicates included
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        for line in f.readlines():  # one text line at a time
            word_lists.extend(jieba.cut(line))
    counts = Counter(word_lists)  # token -> number of occurrences
    print(u"共有%d个关键词" % len(counts))
    sort_count = []
    # enumerate from 1 replaces the original manual counter `k`
    for k, (w, c) in enumerate(counts.items(), start=1):
        # str(c): the original concatenated an int onto a str, which
        # raised TypeError on the very first keyword
        sort_count.append(w + u':' + str(c) + u"次\n")
        print(u"%d---" % k + w + u":" + str(c) + u"次")
    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(sort_count)
def get_top_keywords(file_name):
    """Print the top-20 TextRank keywords of *file_name*, each with its
    part-of-speech flag and its weight."""
    with codecs.open(file_name, 'r', encoding='utf-8') as fh:
        whole_text = fh.read()  # the whole file as one string
    ranked = analyse.textrank(whole_text, topK=20, withWeight=True, withFlag=True)
    for idx, item in enumerate(ranked, start=1):
        print(u"%d:" % idx)
        for part in item[0]:  # item[0] carries the keyword and its POS flag
            print(part, u" ")
        print(u"权重:" + str(item[1]))  # keyword weight
# 绘制词云
def draw_wordcloud(text_file='pjl_comment.txt',
                   asset_dir="E:\\pythonStudy_2\\machine-learning",
                   out_file="pjl_cloud2.jpg"):
    """Render a word cloud from the comments in *text_file* and save it
    to *out_file*.

    Parameters (all defaulted to preserve the original behavior):
        text_file: UTF-8 text file of scraped comments.
        asset_dir: directory holding 'tmp.png' (mask image) and
                   'simsun.ttc' (CJK-capable font).
        out_file:  path of the generated JPEG.
    """
    with codecs.open(text_file, encoding='utf-8') as f:
        comment_text = f.read()
    # join jieba tokens with spaces so WordCloud can split them
    cut_text = " ".join(jieba.cut(comment_text))
    # NOTE(review): scipy.misc.imread was removed in SciPy 1.2 — on a
    # modern SciPy, switch to imageio.v2.imread or PIL.Image.open.
    color_mask = imread(path.join(asset_dir, "tmp.png"))  # mask/background image
    cloud = WordCloud(font_path=path.join(asset_dir, 'simsun.ttc'),
                      background_color='white',
                      mask=color_mask,
                      max_words=2000,
                      max_font_size=40)
    word_cloud = cloud.generate(cut_text)  # build the cloud
    word_cloud.to_file(out_file)
if __name__ == '__main__':
    # "Dying to Survive" (我不是药神) short-comments page
    url = "https://movie.douban.com/subject/26752088/comments?status=P"
    get_douban_comments(url)
    # After scraping, uncomment to analyse / visualise the comments:
    # file_name = 'pjl_comment.txt'
    # get_top_keywords(file_name)
    # draw_wordcloud()
    # (removed two pasted webpage-artifact lines that were a NameError
    # and a syntax error, respectively)