爬取电影影评（HTML 页面词云）

最新推荐文章于 2022-10-29 08:28:02 发布

li_xp123

最新推荐文章于 2022-10-29 08:28:02 发布

阅读量2.9k

点赞数

分类专栏： python提高

本文链接：https://blog.csdn.net/li_xp123/article/details/99696003

版权

python提高专栏收录该内容

14 篇文章 1 订阅

订阅专栏

https://www.jianshu.com/p/779b8b23e08f

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Minimal demo: start a headless Chrome session and load one page.
chrome_options = Options()
# Start Chrome in headless mode (no visible browser window).
chrome_options.add_argument('--headless')
# The Chrome documentation mentioned adding this flag to work around a bug.
chrome_options.add_argument('--disable-gpu')
# Create the driver instance. `options=` replaces the deprecated
# `chrome_options=` keyword and matches the scraper script further down.
driver = webdriver.Chrome(options=chrome_options)
try:
    # Request the Baidu home page.
    driver.get("http://www.baidu.com")
finally:
    # Always shut the browser down so no headless Chrome process is left behind.
    driver.quit()

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import time

# Scrape every page of short comments for one Douban movie and save them,
# one comment per line, to comments.txt.
from selenium.webdriver.common.by import By

# XPath of each comment's text node on a comments page.
COMMENT_XPATH = '//div[@class="comment-item"]/div[@class="comment"]/p/span'
# XPath of the "next page" link in the paginator.
NEXT_PAGE_XPATH = '//*[@id="paginator"]/a[@class="next"]'

chrome_options = Options()
chrome_options.add_argument('--headless')
login_url = 'https://movie.douban.com/subject/26794435/comments'
driver = webdriver.Chrome(options=chrome_options)

comment_list = []

try:
    driver.get(login_url)
    while True:
        # Collect every comment on the current page.
        # find_elements(By.XPATH, ...) is used because the
        # find_elements_by_xpath helpers were removed in newer Selenium releases.
        comment_list.extend(
            comment.text + '\n'
            for comment in driver.find_elements(By.XPATH, COMMENT_XPATH)
        )
        try:
            # Look for the "next page" button.
            next_item = driver.find_element(By.XPATH, NEXT_PAGE_XPATH)
        except NoSuchElementException:
            # No such button means this was the last page.
            print('抓取结束！')
            break
        # Go to the next page and give it time to load before reading it.
        next_item.click()
        time.sleep(3)
finally:
    # Release the browser even if scraping fails part-way through.
    driver.quit()

print(len(comment_list))
with open('comments.txt', 'w', encoding='utf-8') as f:
    f.writelines(comment_list)


#以下是生成词云的代码
import jieba
from pyecharts.charts import WordCloud

def Generator():
    """Render a word cloud built from the text in ``comments.txt``.

    Reads ``comments.txt`` (UTF-8), segments the text with jieba, counts
    every word of two or more characters, and renders the 200 most frequent
    words to ``content.html`` as a pyecharts word cloud.
    """
    from collections import Counter

    # Words that should not appear in the cloud.
    excluded = ('电影', '故事', '哪吒')

    with open('comments.txt', 'r', encoding='utf-8') as f:
        text_body = f.read()

    # Segment the text with jieba after dropping the excluded words from its
    # dictionary.
    for word in excluded:
        jieba.del_word(word)
    words = jieba.cut(text_body.replace('\n', '').replace(' ', ''))

    # Count word frequencies, keeping only words of two or more characters.
    # NOTE(review): del_word alone may not stop the segmenter from emitting
    # a word, so the excluded words are also filtered here explicitly.
    counts = Counter(
        word for word in words if len(word) >= 2 and word not in excluded
    )
    # The 200 most frequent words as (word, count) pairs, highest count first.
    data = counts.most_common(200)

    # Build the word cloud from the selected words.
    # NOTE(review): the original comment mentioned a star shape, but no
    # `shape` argument is passed, so the library default shape is used.
    word_cloud = WordCloud()
    word_cloud.add("豆瓣电影词云", data)

    # Write the word cloud out as an HTML page.
    word_cloud.render('content.html')

# Build content.html from the comments.txt file written by the scraper above.
Generator()

li_xp123

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
爬取电影影评（HTML 页面词云）

https://www.jianshu.com/p/779b8b23e08f from selenium import webdriver from selenium.webdriver.chrome.options import Options chrome_options = Options() # 无头模式启动 chrome_options.add_argument('--headle...
复制链接

扫一扫