#! /usr/bin/env python
# -*- coding:utf-8 -*-
'''
# Goal: fetch the movie detail data that Douban Movies loads dynamically as the list grows
# URL: https://movie.douban.com/typerank?type_name=%E5%96%9C%E5%89%A7&type=24&interval_id=100:90&action=
# Idea: the comedy ranking page loads 20 items per Ajax request, triggered by scrolling the page
# Open the URL in a browser -- simulate scrolling. How to scroll the page? Execute JS code.
# How to capture the data? No need to re-send requests; read the rendered page source.
'''
from selenium import webdriver as wd
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import random
'''
Download chromedriver and place it in the same directory as python.exe, or add it to PATH.
Check the browser version at chrome://version and pick a matching driver from
http://chromedriver.storage.googleapis.com/index.html?path=88.0.4324.96/
'''
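# Note (assumes Selenium 4+ is installed): the driver path can also be passed explicitly
# through a Service object instead of relying on PATH, e.g.:
#   from selenium.webdriver.chrome.service import Service
#   driver = wd.Chrome(service=Service('/path/to/chromedriver'), options=chrome_options)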
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = wd.Chrome(options=chrome_options)
driver.get('https://movie.douban.com/typerank?type_name=%E5%96%9C%E5%89%A7&type=24&interval_id=100:90&action=')
movies = []
time.sleep(3)
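# Alternative to the fixed sleep (a sketch, assuming the list items use the
# 'movie-list-item' class seen in the parsing step below): wait explicitly until
# at least one item is present instead of sleeping a hard-coded 3 seconds.
#   from selenium.webdriver.common.by import By
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#   WebDriverWait(driver, 10).until(
#       EC.presence_of_element_located((By.CLASS_NAME, 'movie-list-item')))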
# Get the initial page height
js = 'return document.body.scrollHeight'
height = driver.execute_script(js)
# Scroll to the bottom of the page
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(5)
# Record the starting timestamp (seconds)
t1 = int(time.time())
# Loop flag used to terminate the while loop
status = True
# Retry count used once the 30-second window has elapsed
num = 0
# Number of scroll checks performed
cnt = 0
# Number of checks where the page height did not change
equal_num = 0
while status and cnt < 2000:
    # Get the current timestamp (seconds)
    t2 = int(time.time())
    # If less than 30 s have passed since the last height change, keep scrolling;
    # otherwise fall through to the retry/terminate branches below
    # print(t2 - t1)
    if t2 - t1 < 30:
        cnt = cnt + 1
        # print('Scroll check {}'.format(cnt).center(100, '*'))
        n_height = driver.execute_script(js)
        if n_height > height:
            print('Loaded {} times, height {} -> {}'.format(cnt, height, n_height).center(100, '*'))
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(1)
            # Reset the reference page height
            height = n_height
            # Reset the reference timestamp and restart the 30-second timer
            t1 = int(time.time())
        elif n_height == height:
            equal_num = equal_num + 1
            # Height unchanged for 5 checks: assume everything has been loaded
            if equal_num >= 5:
                break
    elif num < 3:
        # Over 30 s without growth but retries remain: wait a little longer and check again
        time.sleep(3)
        num = num + 1
    else:
        # Timed out and exceeded the retry count: treat the page as fully loaded and stop
        print('The scrollbar is already at the bottom of the page!')
        status = False
        # Scroll back to the top of the page
        driver.execute_script('window.scrollTo(0,0)')
        break
# How to make the browser object execute JS code:
# scroll to the bottom of the page
# js = 'var q=document.documentElement.scrollTop=100000'  # works in Chrome
# # 'document.body.scrollTop=1500'  # also works in Chrome
# for i in range(5):
#     driver.execute_script(js)
#     time.sleep(random.randrange(5, 10, 1))
# driver.save_screenshot('./5.png')
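# A more compact variant of the scroll loop above (a sketch that reuses only calls
# already present in this script): scroll until the page height stops growing for
# several consecutive checks.
# def scroll_to_bottom(drv, pause=1.0, max_stalls=5):
#     stalls = 0
#     last_height = drv.execute_script('return document.body.scrollHeight')
#     while stalls < max_stalls:
#         drv.execute_script('window.scrollTo(0, document.body.scrollHeight)')
#         time.sleep(pause)
#         new_height = drv.execute_script('return document.body.scrollHeight')
#         if new_height == last_height:
#             stalls += 1
#         else:
#             stalls = 0
#             last_height = new_height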
# Get the page after the dynamic data has loaded: page_source returns the browser's
# current DOM, which is then parsed with BeautifulSoup below
page_text = driver.page_source
soup = BeautifulSoup(page_text, 'lxml')
movie_list_tags = soup.find_all(attrs={'class': 'movie-list-item'})
for item in movie_list_tags:
    # print(item)
    movie_content = item.find(attrs={'class': 'movie-content'})
    # Movie title: the text of the <a> tag inside the movie-name block
    movie_name = movie_content.find(attrs={'class': 'movie-name'})
    movie_name_text = movie_name.find(name='a').text
    # Additional fields are located here (crew, misc info, rating, comment count),
    # but only the title is collected below
    movie_crew = movie_content.find(attrs={'class': 'movie-crew'})
    movie_misc = movie_content.find(attrs={'class': 'movie-misc'})
    movie_rating_num = movie_content.find(attrs={'class': 'rating_num'})
    movie_rating_comment_num = movie_content.find(attrs={'class': 'comment-num'})
    movies.append(movie_name_text)
    # print(movie_name_text)
print(len(movies))
print(movies)
##print(movie_list_tags)
#print(page_text)
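# Shut down the headless browser now that the page source has been parsed
driver.quit()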