源码
from lxml import etree
import requests
url = 'https://movie.douban.com/cinema/nowplaying/shenzhen/'
headers = {
    'Referer': 'https://movie.douban.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}

# Movie fields in output-column order. Each one is read from the matching
# ``data-<field>`` attribute on a ``<li class="list-item">`` element.
FIELDS = ('title', 'score', 'release', 'duration', 'region', 'director', 'actors')


def fetch_html(page_url=url, request_headers=headers, timeout=10):
    """Download the now-playing page and return its decoded text.

    The request headers are actually sent (the original built them but
    never passed them to ``requests.get``), a timeout keeps the script
    from hanging forever, and a non-2xx response raises
    ``requests.HTTPError`` instead of being parsed as if it were the page.
    """
    response = requests.get(url=page_url, headers=request_headers, timeout=timeout)
    response.raise_for_status()
    return response.text


def parse_movies(page_source):
    """Return a list of movie dicts extracted from the page source.

    Each dict has the keys listed in ``FIELDS``. A missing ``data-*``
    attribute yields an empty string rather than an ``IndexError``.
    Empty or unparsable input yields an empty list.
    """
    if not page_source or not page_source.strip():
        return []
    html = etree.HTML(page_source)
    if html is None:
        return []
    items = html.xpath('//div[@id="nowplaying"]/div/ul/li[@class="list-item"]')
    return [
        {field: item.get('data-' + field, '') for field in FIELDS}
        for item in items
    ]


def format_row(movie):
    """Return one tab-separated output line for a movie dict."""
    return "\t".join(str(movie[field]) for field in FIELDS)


def main():
    """Fetch the page, parse it, and print a tab-separated table."""
    movies = parse_movies(fetch_html())
    print("电影\t评分\t上映时间\t时长\t地区\t导演\t演员")
    for movie in movies:
        # Single print: the original nested print(print(...)) also
        # printed the inner call's return value, i.e. a "None" line.
        print(format_row(movie))


if __name__ == "__main__":
    main()
运行结果
分析网页可知,上映电影的信息都在属性id="nowplaying"的div标签下的子孙标签ul中的li标签内。所以xpath为
//div[@id="nowplaying"]//ul/li[@class="list-item"]
这样就可以获得每个影片的大致信息了,接下来就可以更具体的进行筛选。
网页部分源码
<div id="nowplaying">
<div class="mod-hd">
<h2>正在上映</h2>
</div>
<div class="mod-bd">
<ul class="lists">
<li
id="26394152"
class="list-item"
data-title="大黄蜂"
data-score="7.2"
data-star="40"
data-release="2018"
data-duration="114分钟"
data-region="美国"
data-director="特拉维斯·奈特"
data-actors="海莉·斯坦菲尔德 / 小豪尔赫·兰登伯格 / 约翰·塞纳"
data-category="nowplaying"
data-enough="True"
data-showed="True"
data-votecount="120134"
data-subject="26394152"
>
<ul class="">
<li class="poster">
<a href="https://movie.douban.com/subject/26394152/?from=playing_poster" class=ticket-btn target="_blank" data-psource="poster">
<img src="https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2541662397.webp" alt="大黄蜂" rel="nofollow" class="" />
</a>
</li>
<li class="stitle">
<a href="https://movie.douban.com/subject/26394152/?from=playing_poster"
class="ticket-btn"
target="_blank"
title="大黄蜂"
data-psource="title">
大黄蜂
</a>
</li>
<li class="srating">
<span class="rating-star allstar40"></span>
<span class="subject-rate">7.2</span>
</li>
<li class="sbtn">
<a class="ticket-btn"
href="https://movie.douban.com/ticket/redirect/?url=https%3A%2F%2Fm.maoyan.com%2Fcinema%2Fmovie%2F1206875%3F_v_%3Dyes%26merCode%3D1000011"
target="_blank">
选座购票
</a>
</li>
</ul>
</li>
<li