Python爬虫爬取豆瓣图片及文字信息
import os

import requests
from bs4 import BeautifulSoup
# Scrape the Douban Top 250 movie list (https://movie.douban.com/top250) and
# save every movie poster to disk, named after its rank, title and rating.
#
# The list has 250 entries shown 25 per page; the page is selected with the
# `start` query parameter, which grows by 25 per page:
#   https://movie.douban.com/top250?start=25&filter=
#   https://movie.douban.com/top250?start=50&filter=
#   https://movie.douban.com/top250?start=75&filter=
urls = ['https://movie.douban.com/top250?start=' + str(n) + '&filter='
        for n in range(0, 250, 25)]

# Request headers that imitate a desktop browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
}

# Directory the poster files are written to; created on demand by main().
OUTPUT_DIR = 'C:/Users/Administrator/Desktop/douban_test/'

# Seconds to wait for the server before giving up on a request. Without a
# timeout a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 10

# Characters that are not allowed in Windows file names, mapped to '_'.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})


def build_file_name(rank, title, rating):
    """Return the poster file name ``<rank>、<title> <rating>分.jpg``.

    Args:
        rank: 1-based running number of the movie.
        title: Movie title as scraped from the page.
        rating: Rating text as scraped from the page (e.g. ``"9.6"``).

    Returns:
        The file name (no directory part). Characters in ``title`` that are
        illegal in Windows file names are replaced with ``_`` so a scraped
        title can never break the ``open`` call.
    """
    # NOTE(review): the file is always named ``.jpg`` even though the poster
    # URL may point to another format (e.g. .webp) -- kept as in the original.
    safe_title = title.translate(_UNSAFE_FILENAME_CHARS)
    return str(rank) + '、' + safe_title + ' ' + rating + '分.jpg'


def parse_movies(html):
    """Extract the movies listed on one result page.

    Args:
        html: HTML text of one Top 250 page.

    Returns:
        A list of dicts, one per movie in page order, with the keys
        ``'title'`` (list of the stripped strings inside the title link),
        ``'rote'`` (rating text) and ``'img'`` (poster ``src`` attribute,
        or ``None`` when the attribute is missing).
    """
    soup = BeautifulSoup(html, 'lxml')
    # Movie title: <a> directly below <div class="hd">.
    title_tags = soup.select('div.hd > a')
    # Rating: <span class="rating_num" property="v:average">9.6</span>
    rating_tags = soup.select('span.rating_num')
    # Poster: <img width="100" alt="..." src="https://...">
    poster_tags = soup.select('img[width="100"]')

    movies = []
    for title_tag, rating_tag, poster_tag in zip(title_tags, rating_tags, poster_tags):
        movies.append({
            'title': list(title_tag.stripped_strings),
            'rote': rating_tag.get_text(),
            'img': poster_tag.get('src'),
        })
    return movies


def main():
    """Download the poster of every movie in the Top 250 list.

    Raises:
        requests.RequestException: If a list page cannot be fetched or
            answers with an HTTP error status. A failing poster download is
            reported and skipped instead, so one broken image does not abort
            the whole run.
        OSError: If the output directory or a poster file cannot be written.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    rank = 0  # running number across all pages
    # One session reuses the connection and sends the browser headers on
    # every request, including the image downloads.
    with requests.Session() as session:
        session.headers.update(headers)
        for url in urls:
            page = session.get(url, timeout=REQUEST_TIMEOUT)
            # Fail loudly instead of parsing an error page as an empty list.
            page.raise_for_status()

            for data in parse_movies(page.text):
                rank += 1
                if not data['title'] or not data['img']:
                    print('Skipping entry %d: title or poster URL missing' % rank)
                    continue

                file_name = build_file_name(rank, data['title'][0], data['rote'])
                try:
                    pic = session.get(data['img'], timeout=REQUEST_TIMEOUT)
                    pic.raise_for_status()
                except requests.RequestException as err:
                    print('Skipping %s: %s' % (file_name, err))
                    continue

                # Images are binary data, so write the raw bytes (content).
                with open(os.path.join(OUTPUT_DIR, file_name), 'wb') as photo:
                    photo.write(pic.content)


if __name__ == '__main__':
    main()
结果: