'''
1.明确目标
a.抓取豆瓣电影的排行榜
b.需要提取的数据
I.movie_name
II.actors
III.score
2.伪装---进行伪装与请求
a.requests
3.交互---在响应中去提取我们需要的信息
a.正则 bs4 json() xpath
4.保存结果
a.文件
'''
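'''
Disabled one-time download step: fetches the chart page and saves it as
doubam.html, the file that the parsing code below reads.
NOTE: the User-Agent value is blank here; fill it in before re-enabling.

import requests
url='https://movie.douban.com/chart'
headers={
    'User-Agent':''
}
res=requests.get(url,headers=headers)
with open('doubam.html','w',encoding='utf-8') as f:
    f.write(res.text)
'''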
from lxml import etree

# Parse the locally saved chart page (doubam.html) rather than a live response.
with open('doubam.html', 'r', encoding='utf-8') as f:
    data = f.read()

page = etree.HTML(data)
# Every <tr class="item"> row is treated as one movie entry.
y = page.xpath('//tr[@class="item"]')


def _first(nodes, default=''):
    """Return the first XPath match, or *default* when nothing matched.

    XPath queries always return a list; indexing ``[0]`` directly raises
    IndexError for a row that lacks the requested node.
    """
    return nodes[0] if nodes else default


# Maps movie title -> {'演员': cast/info line, '评分': rating text}.
movie = {}

# Open the output file once for the whole run instead of once per row.
# Append mode is kept, so earlier runs' records are preserved.
with open('dianying.txt', 'a', encoding='utf8') as t:
    for i in y:
        movie_name = _first(i.xpath('./td/a[@class="nbg"]/@title'))
        if not movie_name:
            # Without a title there is nothing to key the record on.
            continue
        actors = _first(i.xpath('./td/div/p[@class="pl"]/text()'))
        movie_score = _first(
            i.xpath('./td/div/div/span[@class="rating_nums"]/text()')
        )
        movie[movie_name] = {'演员': actors, '评分': movie_score}
        # One tab-separated record per line; the fields were previously
        # concatenated with no delimiter, so records ran into each other.
        t.write('\t'.join((movie_name, actors, movie_score)) + '\n')

print(movie)
# xpath
# 于 2023-11-20 16:43:43 首次发布