from urllib import request
from lxml import etree
# Request headers sent with every page fetch: a desktop Chrome User-Agent
# string is supplied explicitly on each Request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
}

# One listing URL per value of the `start` query parameter, for start = 0..9.
# NOTE(review): `start` is presumably an item offset rather than a page index;
# if so, consecutive values would return heavily overlapping listings. Confirm
# the site's page size before changing the step.
urls = ['https://movie.douban.com/review/best/?start={}'.format(i) for i in range(0, 10)]

for url in urls:
    page_request = request.Request(url, headers=headers)
    # Use the response as a context manager so the connection is closed as
    # soon as the body has been read, even if read/decode raises.
    with request.urlopen(page_request) as response:
        html = response.read().decode('utf-8')
    tree = etree.HTML(html)

    # Three parallel lists pulled from the same listing page:
    #   names     - text of <a class="name"> inside each entry's <header>
    #   titles    - `title` attribute of the <img> inside <a class="subject-img">
    #   headlines - text of the <h2><a> link inside <div class="main-bd">
    names = tree.xpath('//*[@id="content"]/div/div/div/div/div/header/a[@class="name"]/text()')
    titles = tree.xpath('//*[@id="content"]/div/div/div/div/div/a[@class="subject-img"]/img/@title')
    headlines = tree.xpath('//*[@id="content"]/div/div/div/div/div/div[@class="main-bd"]/h2/a/text()')

    # zip() stops at the shortest list, so an entry missing any one of the
    # three fields shortens (and may misalign) the printed rows.
    for name, title, headline in zip(names, titles, headlines):
        print(name, title, headline)
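# python urllib抓取豆瓣影评 (Scraping Douban movie reviews with Python urllib)
# 最新推荐文章于 2021-08-02 18:00:53 发布 (Latest recommended article published 2021-08-02 18:00:53)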