练习 XPath:爬取豆瓣正在上映的电影,获取电影信息,并按评分对电影排序。
import requests
from lxml import etree
import operator
def get_text():
    """Download the Douban now-playing page and save it as ``douan.html``.

    Sends a GET request for the Zhengzhou now-playing listing with a
    browser-like ``User-Agent`` and ``Referer`` header, then writes the
    decoded response body to ``douan.html`` (UTF-8) in the current
    working directory.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        'Referer': 'https://movie.douban.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0'
    }
    url = 'https://movie.douban.com/cinema/nowplaying/zhengzhou/'
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail loudly rather than saving an HTTP error page to disk.
    resp.raise_for_status()
    with open('douan.html', 'w', encoding='utf-8') as fp:
        fp.write(resp.text)
# Parse the saved HTML
def _first_or_none(nodes):
    """Return the first XPath result in *nodes*, or ``None`` if it is empty."""
    return nodes[0] if nodes else None


def parser_contents(html_path='douan.html'):
    """Parse the saved listing page and collect one info dict per movie.

    Args:
        html_path: Path of the HTML file to parse. Defaults to
            ``'douan.html'``.

    Returns:
        A list with one dict per ``<li>`` element matched by the listing
        XPath. The attribute-based fields (title, release date, score,
        duration, director, actors) hold the lists returned by the XPath
        attribute queries. The poster, details and ticket fields hold the
        first matching string, or ``None`` when the entry has no such node.
    """
    nowplaying = []
    parser = etree.HTMLParser(encoding='utf-8')
    html = etree.parse(html_path, parser=parser)
    movies = html.xpath('/html/body/div[3]/div[1]/div/div[1]/div[3]/div[2]/ul/li')
    for movie in movies:
        # The movie metadata is stored in data-* attributes; each query
        # returns a list of attribute values.
        title = movie.xpath('.//@data-title')
        release = movie.xpath('.//@data-release')
        score = movie.xpath('.//@data-score')
        duration = movie.xpath('.//@data-duration')
        director = movie.xpath('.//@data-director')
        actors = movie.xpath('.//@data-actors')
        # These come from nested <li> children; an entry lacking one of
        # them must not abort the whole parse, so fall back to None.
        picture = _first_or_none(movie.xpath('.//li[1]//img/@src'))
        details = _first_or_none(movie.xpath('.//li[2]/a//@href'))
        buy_ticket = _first_or_none(movie.xpath('.//li[4]/a//@href'))
        nowplaying.append({
            '电影名称': title,
            '上映时间': release,
            '评分': score,
            '时长': duration,
            '导演': director,
            '演员': actors,
            '海报': picture,
            '详细': details,
            '选座购票': buy_ticket,
        })
    return nowplaying
def _score_key(info):
    """Return the numeric value of ``info['评分']`` for use as a sort key.

    The score may be a string or a list of strings (the shape an XPath
    attribute query returns). An empty or non-numeric score sorts as 0.0.
    """
    score = info['评分']
    if isinstance(score, (list, tuple)):
        score = score[0] if score else None
    try:
        return float(score)
    except (TypeError, ValueError):
        return 0.0


if __name__ == '__main__':
    get_text()
    nowplaying = parser_contents()
    print(len(nowplaying))
    # Sort by rating, highest first. Compare as numbers: comparing the raw
    # strings would rank '10.0' below '9.5'.
    nowplaying = sorted(nowplaying, key=_score_key, reverse=True)
    for i in nowplaying:
        print(i)
        print('\n')
运行截图: