韩剧荒,于是去爬了下豆瓣~~

第一步:利用 Selenium 编写爬虫

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
from tkinter import *
import warnings
warnings.filterwarnings('ignore')
import json
import pandas as pd
import time


# Chrome configuration shared by both driver instances created below.
option = webdriver.ChromeOptions()
option.add_argument('disable-infobars')
option.add_argument('--headless')  # run Chrome without a visible window
# `options=` is the supported keyword; `chrome_options=` is deprecated.
browser = webdriver.Chrome(options=option)
# Second driver instance; not used in the visible part of this file
# (presumably for detail pages, judging by the name -- TODO confirm).
browser_detail = webdriver.Chrome(options=option)
browser.maximize_window()
# Explicit waits on `browser` give up after 10 seconds.
wait = WebDriverWait(browser, 10)
keyword = '韩剧'  # value of the `tag` query parameter built in index()
rank = 'recommend'  # value of the `sort` query parameter built in index()
# Number of times index() clicks the "load more" link.
# (A module-level `global` statement has no effect, so none is used here.)
click_times = 5
# Module-level result lists; they start empty and are not filled in the
# visible part of this file.
all_directors = []
all_actors = []
all_stars5_rate = []


def login(username='#####', password='#####'):
    """Log in to Douban through the account/password form.

    Uses the module-level ``browser`` and ``wait`` objects.

    Args:
        username: Account name typed into the ``#username`` field.
            Defaults to the placeholder value the script shipped with.
        password: Password typed into the ``#password`` field.
            Defaults to the placeholder value the script shipped with.

    Returns:
        The handle of the window in which the login page was opened.

    Raises:
        selenium.common.exceptions.TimeoutException: If any of the form
            elements does not become usable within the ``wait`` timeout.
    """
    # Load the login page and remember which window it lives in.
    login_url = 'https://accounts.douban.com/passport/login'
    browser.get(login_url)
    login_handle = browser.current_window_handle

    # Select the tab matched by `.account-tab-account`. Waiting for
    # "clickable" rather than mere presence avoids clicking an element
    # that is in the DOM but not yet displayed.
    account_tab = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '.account-tab-account'))
    )
    account_tab.click()

    # The input fields must be visible before keys can be sent to them.
    username_input = wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, '#username'))
    )
    password_input = wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, '#password'))
    )

    # Enter the account credentials.
    username_input.send_keys(username)
    password_input.send_keys(password)

    # Submit the form once the button can actually be clicked.
    submit_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.btn:nth-child(1)'))
    )
    submit_button.click()
    return login_handle


def index(login_handle):
    """Load the Douban TV listing page and return its expanded HTML.

    The URL is built from the module-level ``keyword`` and ``rank``
    values. After the page is loaded, the "load more" link is clicked
    ``click_times`` times so that more entries are present in the DOM.

    Args:
        login_handle: Window handle returned by ``login()``. Kept for
            interface compatibility; this function does not read it.

    Returns:
        ``browser.page_source`` after the final "load more" click.

    Raises:
        selenium.common.exceptions.TimeoutException: If the "load more"
            link is not found within the ``wait`` timeout.
    """
    url = 'https://movie.douban.com/tv/#!type=tv&tag=' + keyword + '&sort=' + rank + '&page_limit=20&page_start=0'

    # Open the listing in a new window, then also load it in the current one.
    browser.execute_script('window.open("{}");'.format(url))
    browser.get(url)

    # Close the extra window and return focus to the original one.
    # `switch_to.window` replaces the deprecated `switch_to_window`.
    windows = browser.window_handles
    browser.switch_to.window(windows[1])
    browser.close()
    browser.switch_to.window(windows[0])
    # Fixed pause before interacting with the page
    # (presumably to let the listing render -- TODO confirm).
    time.sleep(3)

    # Click "load more" repeatedly to pull additional entries into the page.
    for _ in range(click_times):
        more = wait.until(
            EC.presence_of_element_located((
                By.CSS_SELECTOR,
                '#content > div > div.article > div > div.list-wp > a'
            ))
        )
        # Scroll to the bottom of the page before clicking the link.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(2)
        more.click()

    return browser.page_source


def parse_index(html):
    etree_html = etree.HTML(html)
    names = etree_html.xpath('//div[@class="list-wp"]/div[@class="list&#
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值