# Python获取豆瓣《斗罗大陆》500条热门短评写入excel-CSDN博客
#
# 本文链接：https://blog.csdn.net/lu_xuying/article/details/113753171
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

# Log in to Douban through a real Chrome session and capture its cookies.
#
# Module-level results:
#   driver  -- the Selenium Chrome driver, reused by the code that follows.
#   cookies -- {cookie name: cookie value} snapshot taken after the login wait.
#
# NOTE(review): the user name and password below are placeholders; supply
# real credentials from outside the source (environment, prompt, ...) rather
# than committing them.
driver = webdriver.Chrome()
driver.get('https://www.douban.com/')

# The login form is rendered inside the first iframe of the home page.
driver.switch_to.frame(driver.find_elements(By.TAG_NAME, "iframe")[0])

# Switch to the "password login" tab, then fill in the form.
driver.find_element(By.CSS_SELECTOR, 'li.account-tab-account').click()
driver.find_element(By.ID, 'username').send_keys('my_user_name')
driver.find_element(By.ID, 'password').send_keys('my_passsword')

# Click the "log in" button. The element's class attribute is
# "btn btn-account"; a class-name locator takes a single class, so the
# first one is used.
driver.find_element(By.CLASS_NAME, 'btn').click()

# Give the login request time to finish BEFORE reading cookies; reading them
# right after the click would snapshot the pre-login session.
time.sleep(10)

# Leave the login iframe so the cookies are read from the top-level page.
# NOTE(review): presumably the iframe is discarded once login succeeds --
# switching back to the default content is valid either way.
driver.switch_to.default_content()

# Flatten Selenium's list of cookie dicts into a name -> value mapping.
cookies = {i['name']: i['value'] for i in driver.get_cookies()}
print(cookies)


# Scrape the short comments of Douban subject 30313969 and export them.
#
# Collected on 2021-01-07 17:33. Even after logging in, only 500 comments
# could be found, hence 25 pages of 20 comments each.
#
# Module-level result:
#   df -- one row per comment with the columns listed in COLUMNS.
COMMENTS_URL = "https://movie.douban.com/subject/30313969/comments?start=%s"
PAGE_COUNT = 25
PAGE_SIZE = 20
COLUMNS = ['comment_time', 'user_name', 'user_id',
           'star', 'title', 'comments']

# Patterns are compiled once, outside the page loop.
# Link carrying the user's display name (group 1) and user id (group 2).
USER_PATTERN = re.compile(
    r'<a title="(.*?)" href="https://www.douban.com/people/(.*?)/', re.S)
# Timestamp of the comment.
COMMENT_TIME_PATTERN = re.compile(
    r'<span class="comment-time " title="(.*?)">', re.S)
# Star rating (group 1: 10, 20, 30, 40, 50) and its recommendation label
# (group 2).
STAR_PATTERN = re.compile(
    r'<span class="allstar(.*?) rating" title="(.*?)"></span>', re.S)
# Text of the short comment.
SHORT_PATTERN = re.compile(r'<span class="short">(.*?)</span>', re.S)


def _parse_comments(html):
    """Return one dict per comment found in *html*, keyed by COLUMNS.

    The page is cut into per-comment chunks at each user link, and every
    other field is looked up inside its own chunk only. A field that is
    absent (for example, a comment posted without a star rating) becomes
    ``None`` instead of shifting the values of the following comments.

    NOTE(review): assumes the user link is the first of these elements in
    each comment's markup -- confirm against the live page.
    """
    anchors = list(USER_PATTERN.finditer(html))
    # Each chunk runs from the end of one user link to the start of the next.
    chunk_ends = [m.start() for m in anchors[1:]] + [len(html)]

    rows = []
    for anchor, chunk_end in zip(anchors, chunk_ends):
        chunk = html[anchor.end():chunk_end]
        time_match = COMMENT_TIME_PATTERN.search(chunk)
        star_match = STAR_PATTERN.search(chunk)
        text_match = SHORT_PATTERN.search(chunk)
        rows.append({
            'comment_time': time_match.group(1) if time_match else None,
            'user_name': anchor.group(1),
            'user_id': anchor.group(2),
            'star': star_match.group(1) if star_match else None,
            'title': star_match.group(2) if star_match else None,
            'comments': text_match.group(1) if text_match else None,
        })
    return rows


# Accumulate plain rows and build the frame once at the end: no placeholder
# seed row, and no repeated DataFrame.append (removed in pandas 2.0).
rows = []
for page in range(PAGE_COUNT):
    driver.get(COMMENTS_URL % (page * PAGE_SIZE))
    rows.extend(_parse_comments(driver.page_source))

df = pd.DataFrame(rows, columns=COLUMNS)

# NOTE(review): writing the legacy ".xls" format relies on the xlwt engine,
# which recent pandas releases no longer ship support for -- consider moving
# to ".xlsx" if consumers of this file allow it.
df.to_excel("Dou_Luo_Continent.xls", index=True)