# Step 1: Import the required libraries.
import requests
from lxml import etree
# Step 2: Fetch the page from the target website.
# Request headers: a desktop-Chrome User-Agent plus a Referer pointing at the
# site itself (presumably so the site serves the regular page to the script —
# not verifiable from this file).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Referer": "https://movie.douban.com/",
}
url = "https://movie.douban.com/cinema/nowplaying/shanghai/"

# Without an explicit timeout, requests waits indefinitely on a server that
# never answers.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx status so an error response is not mistaken for the
# page content.
response.raise_for_status()

# response.text:    the body decoded into a str (unicode).
# response.content: the raw body exactly as fetched, without any processing,
#                   as bytes.
text = response.text
# Step 3: Extract the wanted data from the fetched page according to fixed rules.
def _first(values, default=None):
    """Return the first item of an XPath result list, or *default* if it is empty."""
    return values[0] if values else default


html = etree.HTML(text)

# Only the first <ul class="lists"> in the document is scanned; when it is
# absent there is nothing to extract, so the result is an empty list rather
# than an IndexError.
lists = html.xpath('//ul[@class="lists"]')
lis = lists[0].xpath('.//li[@class="list-item"]') if lists else []

# One dict per <li class="list-item">. The metadata is read from the item's
# data-* attributes and the thumbnail from the first <img> inside it; any
# value that is missing is stored as None.
movies = []
for li in lis:
    movie = {
        "title": _first(li.xpath("@data-title")),
        "score": _first(li.xpath("@data-score")),
        "duration": _first(li.xpath("@data-duration")),
        "region": _first(li.xpath("@data-region")),
        "director": _first(li.xpath("@data-director")),
        "actor": _first(li.xpath("@data-actors")),
        "thumbnail": _first(li.xpath(".//img/@src")),
    }
    movies.append(movie)

print(movies)