# Reference: https://www.jianshu.com/p/779b8b23e08f
from collections import Counter

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Minimal demo: start Chrome without a visible window and load one page.
chrome_options = Options()
# Start in headless mode.
chrome_options.add_argument('--headless')
# Google's documentation mentions this flag as a workaround for a bug.
chrome_options.add_argument('--disable-gpu')
# Create the browser instance. `options=` is the supported keyword (and the
# one used by the scraper further down in this file); the older
# `chrome_options=` spelling is deprecated and rejected by recent Selenium
# releases.
driver = webdriver.Chrome(options=chrome_options)
try:
    # Request the Baidu home page.
    driver.get("http://www.baidu.com")
finally:
    # Nothing else uses this session, and the name `driver` is rebound later
    # in the file, so shut the browser down here instead of leaking it.
    driver.quit()
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import time
# Scrape every page of short comments for one Douban movie and save them to
# comments.txt, one comment per line.
COMMENT_XPATH = '//div[@class="comment-item"]/div[@class="comment"]/p/span'
NEXT_PAGE_XPATH = '//*[@id="paginator"]/a[@class="next"]'


def _collect_page_comments(browser):
    """Return the text of each comment on the current page, newline-terminated.

    Uses ``find_elements(By.XPATH, ...)``: the ``find_elements_by_xpath``
    shortcut was removed from newer Selenium releases, while this form is
    accepted by old and new versions alike.
    """
    return [node.text + '\n' for node in browser.find_elements(By.XPATH, COMMENT_XPATH)]


chrome_options = Options()
chrome_options.add_argument('--headless')
login_url = 'https://movie.douban.com/subject/26794435/comments'
driver = webdriver.Chrome(options=chrome_options)
comment_list = []
try:
    driver.get(login_url)
    # Comments on the first page.
    comment_list.extend(_collect_page_comments(driver))
    while True:
        try:
            # Look for the "next page" link.
            next_item = driver.find_element(By.XPATH, NEXT_PAGE_XPATH)
        except NoSuchElementException:
            # No such link means this was the last page.
            print('抓取结束!')
            break
        # Go to the next page and collect its comments.
        next_item.click()
        # Fixed pause after the click, presumably to let the next page load.
        time.sleep(3)
        comment_list.extend(_collect_page_comments(driver))
finally:
    # Always close the browser, even if scraping fails part-way through.
    driver.quit()
print(len(comment_list))
# The file is only written here, so plain 'w' is sufficient.
with open('comments.txt', 'w', encoding='utf-8') as f:
    f.writelines(comment_list)
#以下是生成词云的代码
import jieba
from pyecharts.charts import WordCloud
#从文本文件中生成词云
def Generator(source_path: str = 'comments.txt',
              output_path: str = 'content.html',
              stopwords=('电影', '故事', '哪吒'),
              top_n: int = 200) -> None:
    """Render a word cloud of the most frequent words in a text file.

    The text is segmented with jieba, words shorter than two characters and
    words listed in ``stopwords`` are dropped, and the ``top_n`` most frequent
    remaining words are written to an HTML page as a pyecharts word cloud.

    Args:
        source_path: UTF-8 text file to read.
        output_path: HTML file the word cloud is rendered to.
        stopwords: Words to leave out of the cloud.
        top_n: Maximum number of words to include.

    Called with no arguments it reads ``comments.txt`` and writes
    ``content.html``.
    """
    with open(source_path, 'r', encoding='utf-8') as f:
        text_body = f.read()

    # Segment the text with jieba after removing newlines and spaces.
    words = jieba.cut(text_body.replace('\n', '').replace(' ', ''))

    # Unwanted words are filtered here, after segmentation, rather than with
    # jieba.del_word(): that call changes jieba's shared dictionary for the
    # whole process, and the deleted word may still be emitted by jieba's
    # new-word discovery.
    excluded = frozenset(stopwords)
    counts = Counter(
        word for word in words
        if len(word) >= 2 and word not in excluded
    )

    # (word, count) pairs, most frequent first.
    top_words = counts.most_common(top_n)

    # Build the word cloud. No shape argument is passed, so pyecharts'
    # default shape is used.
    word_cloud = WordCloud()
    word_cloud.add("豆瓣电影词云", top_words)
    # Write the chart to an HTML page.
    word_cloud.render(output_path)
# Build the word cloud from comments.txt, the file written by the scraping
# step earlier in this script.
Generator()