测试爬取网站
https://www.qiushibaike.com/imgrank/page/
爬取目标是糗图百科的热图
一、判断图片url所在页面位置
通过浏览器自带的 F12 开发者工具(谷歌 Chrome 等)查看网页代码
找到图片url所在位置使用正则定位,代码如下
def get_img_url(url):
    """Fetch the listing page at *url* and return the thumbnail image URLs on it."""
    page_html = requests.get(url=url, headers=headers).text
    # Each thumbnail sits in <div class="thumb">...<img src="...">...</div>.
    # re.S (DOTALL) lets '.' match newlines, because each div spans several lines.
    pattern = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'
    return re.findall(pattern, page_html, re.S)
二、保存图片
将图片的二进制数据保存在指定文件夹下
def save_img(url):
    """Download one image and save it under ./qiutulibs.

    The scraped src attributes are protocol-relative ("//pic..."),
    so the https scheme is prepended before requesting.
    """
    src = f"https:{url}"
    img_data = requests.get(url=src, headers=headers).content
    # Use the last path segment of the URL as the local file name.
    filename = src.split('/')[-1]
    # Bug fix: the path previously contained a broken placeholder instead of
    # {filename}, so every image was written to the same file.
    with open(f'qiutulibs/{filename}', mode='wb') as f:
        f.write(img_data)
    print('图片下载成功', filename)
三、完整代码如下
import requests
import re
import os
headers ={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}
def get_img_url(url):
    """Return every thumbnail image URL found on the listing page at *url*."""
    html = requests.get(url=url, headers=headers).text
    # re.S (DOTALL) makes '.' cross newlines; each thumb div spans multiple lines.
    thumb_pattern = re.compile('<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>', re.S)
    return thumb_pattern.findall(html)
def save_img(url):
    """Download one image and save it under ./qiutulibs.

    *url* is a protocol-relative src ("//pic..."), so the https
    scheme is prepended before requesting.
    """
    src = f"https:{url}"
    img_data = requests.get(url=src, headers=headers).content
    # Use the last path segment of the URL as the local file name.
    filename = src.split('/')[-1]
    # Bug fix: the path previously contained a broken placeholder instead of
    # {filename}, so every image was written to the same file.
    with open(f'qiutulibs/{filename}', mode='wb') as f:
        f.write(img_data)
    print('图片下载成功', filename)
if __name__ == '__main__':
    # makedirs(exist_ok=True) replaces the exists()+mkdir() pair:
    # same result, but without the check-then-act race.
    os.makedirs('./qiutulibs', exist_ok=True)
    # Crawl listing pages 1..13 and download every image found on each.
    for page in range(1, 14):
        url = f"https://www.qiushibaike.com/imgrank/page/{page}/"
        for src in get_img_url(url):
            save_img(src)