import requests, csv
import time
from lxml import etree
# Fetch one listing page and extract its film entries
def get_html(url):
    """Download one listing page and extract title, lead actor and year.

    Every ``<li>`` found under the page's ordered list is parsed. For each
    entry the title, the first word following the "主演:" marker, and the
    first token of the third-from-last "/"-separated field of the info line
    (stored as the year) are collected and printed as
    ``title----actor----year``.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list holding one ``[title, actor, year]`` list per parsed entry.
        ``'none'`` stands in for an actor or year that cannot be extracted;
        entries without a title are skipped.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds the timeout.
    """
    # Pose as a desktop browser via the User-Agent header.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/109.0.0.0 Safari/537.36'
    }
    # A timeout keeps a stalled connection from hanging the script forever.
    page_source = requests.get(url, headers=headers, timeout=10).text
    document = etree.HTML(page_source)
    entries = document.xpath('/html/body/div[3]/div[1]/div/div[1]/ol/li')

    films = []
    for entry in entries:
        title_nodes = entry.xpath('./div/div[2]/div[1]/a/span[1]/text()')
        if not title_nodes:
            # Without a title there is nothing meaningful to record.
            continue
        filename = title_nodes[0]

        # First text node of the info paragraph: the credits line.
        credit_nodes = entry.xpath('./div/div[2]/div[2]/p[1]/text()[1]')
        # Second text node: the "/"-separated line the year is taken from.
        year_nodes = entry.xpath('./div/div[2]/div[2]/p[1]/text()[2]')

        # Year: first token of the third field counted from the end.
        year_fields = year_nodes[0].split("/") if year_nodes else []
        year_tokens = year_fields[-3].split() if len(year_fields) >= 3 else []
        year = year_tokens[0] if year_tokens else 'none'

        # Actor: first word after the "主演:" marker, if the marker is there
        # and is followed by at least one word.
        credit_parts = credit_nodes[0].split("主演:") if credit_nodes else []
        actor_tokens = credit_parts[1].split() if len(credit_parts) > 1 else []
        actor = actor_tokens[0] if actor_tokens else 'none'

        films.append([filename, actor, year])
        print('{}----{}----{}'.format(filename, actor, year))

    # Pause once per downloaded page so consecutive requests are throttled.
    time.sleep(1)
    return films
# Run the scraper over every listing page: the 250 entries are served
# 25 per page, so the valid offsets are 0, 25, ..., 225. The former upper
# bound of 265 also requested offset 250, which lies past the last page.
for start in range(0, 250, 25):
    get_html(f'https://movie.douban.com/top250?start={start}&filter=')
# Python data collection: parsing the Douban Top 250 with XPath
# (Blog page footer: latest recommended article published 2023-11-26 20:11:43)