# Step 1: crawl the pages with Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
from tkinter import *
import warnings
warnings.filterwarnings('ignore')
import json
import pandas as pd
import time
# --- Browser setup ---------------------------------------------------------
# One ChromeOptions instance is shared by both drivers, so both run headless.
option = webdriver.ChromeOptions()
option.add_argument('disable-infobars')
option.add_argument('--headless')  # hide the browser window
# `options=` is the supported keyword; the older `chrome_options=` spelling is
# deprecated and rejected by newer Selenium releases.
browser = webdriver.Chrome(options=option)
# Second driver; not used in the visible part of this file
# (presumably for detail pages -- TODO confirm).
browser_detail = webdriver.Chrome(options=option)
browser.maximize_window()
wait = WebDriverWait(browser, 10)  # explicit waits on `browser`, 10 s timeout

# --- Search parameters -------------------------------------------------------
keyword = '韩剧'      # value of the `tag` parameter in the listing URL
rank = 'recommend'   # value of the `sort` parameter in the listing URL
# Number of times index() clicks the "load more" link.  A module-level
# `global` statement is a no-op, so a plain assignment is all that is needed.
click_times = 5

# --- Result accumulators -----------------------------------------------------
# Module-level lists; the code that fills them is outside this section.
all_directors = []
all_actors = []
all_stars5_rate = []
def login():
    """Submit the Douban login form and return the login window's handle.

    Loads the login page in the module-level ``browser``, clicks the element
    matching ``.account-tab-account`` (presumably the account/password tab --
    TODO confirm), locates the username, password and submit elements, types
    the credentials and clicks submit.

    Returns:
        The value of ``browser.current_window_handle`` read right after the
        login page was requested.
    """

    def locate(css):
        # Block (via the shared `wait`) until an element matching `css`
        # is present in the DOM, then hand it back.
        return wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css))
        )

    # Load the page.
    browser.get('https://accounts.douban.com/passport/login')
    login_handle = browser.current_window_handle

    locate('.account-tab-account').click()

    # Resolve all three form elements before typing anything.
    name_box = locate('#username')
    pwd_box = locate('#password')
    login_button = locate('a.btn:nth-child(1)')

    # Enter the account credentials.
    name_box.send_keys('#####')
    pwd_box.send_keys('#####')

    # Simulate the login click.
    login_button.click()
    return login_handle
def index(login_handle):
    """Load the Douban TV listing page, expand it, and return its HTML.

    The listing URL is built from the module-level ``keyword`` (tag) and
    ``rank`` (sort order).  After the page is loaded, the "load more" link is
    clicked ``click_times`` times, scrolling to the bottom of the page and
    pausing two seconds before each click.

    Args:
        login_handle: Presumably the window handle returned by ``login()``.
            Kept for interface compatibility; this function does not read it.

    Returns:
        ``browser.page_source`` captured immediately after the final click.

    Raises:
        selenium.common.exceptions.TimeoutException: if the "load more" link
            does not appear within the timeout configured on ``wait``.
    """
    url = (
        'https://movie.douban.com/tv/#!type=tv&tag={}&sort={}'
        '&page_limit=20&page_start=0'
    ).format(keyword, rank)

    # Open the URL in a second window, load it in the current window as
    # well, then close the extra window and return to the first one.
    # NOTE(review): the purpose of the extra window is not evident from this
    # code (possibly a workaround for the `#!` fragment URL) -- the sequence
    # is kept as-is.
    browser.execute_script('window.open("{}");'.format(url))
    browser.get(url)
    windows = browser.window_handles
    # `switch_to.window()` replaces the deprecated `switch_to_window()`,
    # which no longer exists in Selenium 4.
    browser.switch_to.window(windows[1])
    browser.close()
    browser.switch_to.window(windows[0])
    time.sleep(3)

    # Click "load more" the configured number of times.
    for _ in range(click_times):
        more = wait.until(
            EC.presence_of_element_located((
                By.CSS_SELECTOR,
                '#content > div > div.article > div > div.list-wp > a',
            ))
        )
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(2)
        more.click()

    return browser.page_source
def parse_index(html):
etree_html = etree.HTML(html)
names = etree_html.xpath('//div[@class="list-wp"]/div[@class="list&#