import re
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
# Log in to Douban through a real Chrome session and capture its cookies.
driver = webdriver.Chrome()
driver.get('https://www.douban.com/')
# The login widgets are looked up inside the first <iframe> of the page,
# so switch into that sub-frame before locating them.
# `find_element(By.X, ...)` is used instead of the `find_element_by_*`
# helpers because it is available on both old and current Selenium releases.
driver.switch_to.frame(driver.find_elements(By.TAG_NAME, "iframe")[0])
# Open the "password login" tab.
driver.find_element(By.CSS_SELECTOR, 'li.account-tab-account').click()
driver.find_element(By.ID, 'username').send_keys('my_user_name')
driver.find_element(By.ID, 'password').send_keys('my_passsword')
# Click the login button. Its class attribute is "btn btn-account"; a
# class-name locator takes a single class, so the first one is used.
driver.find_element(By.CLASS_NAME, 'btn').click()
# Wait for the login to go through BEFORE reading the cookies; reading them
# right after the click can only see the cookies of the pre-login session.
time.sleep(10)
# Collect the session cookies as a {name: value} dict.
cookies = {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}
print(cookies)
# Scrape the short comments of one Douban movie and export them to Excel.
# Collected on: 2021-01-07 17:33
# Even after a successful login only 500 comments can be reached
# (25 pages, paged with a start offset that grows by 20).
COMMENTS_URL = "https://movie.douban.com/subject/30313969/comments?start=%s"
PAGE_COUNT = 25
PAGE_SIZE = 20
COLUMNS = ['comment_time', 'user_name', 'user_id', 'star', 'title', 'comments']

# Patterns are compiled once, outside the page loop.
# Commenter link: group 1 is the user name, group 2 the user id.
USER_PATTERN = re.compile('<a title="(.*?)" href="https://www.douban.com/people/(.*?)/', re.S)
# Comment timestamp (the span's title attribute).
COMMENT_TIME_PATTERN = re.compile('<span class="comment-time " title="(.*?)">', re.S)
# Rating: group 1 is the star value (10, 20, 30, 40, 50), group 2 the
# recommendation label shown in the span's title attribute.
RATING_PATTERN = re.compile('<span class="allstar(.*?) rating" title="(.*?)"></span>', re.S)
# Comment text.
COMMENT_TEXT_PATTERN = re.compile('<span class="short">(.*?)</span>', re.S)


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or None."""
    match = pattern.search(text)
    return match.group(1) if match else None


def _parse_comments(page_html):
    """Extract one record per comment from a comments page.

    The page is cut into segments that each start at a commenter link and end
    at the next one; every field is then searched inside its own segment.
    A field that is absent from a segment (for example a comment posted
    without a star rating) becomes None instead of shifting the values of the
    following comments or making the columns unequal in length.

    NOTE(review): this assumes each comment's user link precedes its time,
    rating and text in the page source -- confirm against the live markup.

    Args:
        page_html: Source of one comments page.

    Returns:
        A list of dicts keyed by the names in COLUMNS, in page order.
    """
    anchors = list(USER_PATTERN.finditer(page_html))
    records = []
    for position, anchor in enumerate(anchors):
        if position + 1 < len(anchors):
            segment_end = anchors[position + 1].start()
        else:
            segment_end = len(page_html)
        segment = page_html[anchor.end():segment_end]
        rating = RATING_PATTERN.search(segment)
        records.append({
            'comment_time': _first_group(COMMENT_TIME_PATTERN, segment),
            'user_name': anchor.group(1),
            'user_id': anchor.group(2),
            'star': rating.group(1) if rating else None,
            'title': rating.group(2) if rating else None,
            'comments': _first_group(COMMENT_TEXT_PATTERN, segment),
        })
    return records


all_records = []
try:
    for page in range(PAGE_COUNT):
        driver.get(COMMENTS_URL % (page * PAGE_SIZE))
        all_records.extend(_parse_comments(driver.page_source))
finally:
    # Always close the browser, even when a page load fails.
    driver.quit()

# Build the frame once from the collected rows: no placeholder first row, a
# single running index, and no reliance on DataFrame.append, which newer
# pandas releases no longer provide.
df = pd.DataFrame(all_records, columns=COLUMNS)
# NOTE(review): writing the legacy .xls format depends on an Excel writer
# engine (xlwt) that recent pandas versions may not support -- confirm, or
# switch the target to .xlsx.
df.to_excel("Dou_Luo_Continent.xls", index=True)