本案例中抓取的图片是小姐姐
- 首先讲一下本案例概述:抓取的是AJAX,返回格式是JSON,从中提取出图片链接再去请求下载保存在当前工作目录下。
- 了解基本情况后,说一下使用到的模块(除 requests 是第三方模块外,其余均为标准库)
- os 用来判断目录是否存在并创建目录
- requests 用来get请求(下载也是)
- hashlib.md5 用来根据内容唯一命名图片
- multiprocessing.pool.Pool 用来多进程下载图片
- functools.partial 用来固定函数的部分参数(这里固定 keyword),得到只需传入图片链接的函数,方便交给 pool.map 调用
多进程:相当于同时下载多个任务,如果不使用就是一个任务下载完后再下载下一个(速度慢)
#coding:gbk
import os
import requests
from hashlib import md5
from multiprocessing.pool import Pool
from functools import partial
"""
请求头条页面
args:
num:页码(对应请求参数 page_num)
keyword:查询的关键字
"""
def get_page(num, keyword):
    """Request one page of Toutiao image-search results.

    Args:
        num: Page number, sent as the ``page_num`` query parameter.
        keyword: Search keyword, sent as the ``keyword`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, network error, timeout, or a
        body that is not valid JSON).
    """
    heads = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        # "Accept-Encoding": "gbk",
        "Accept-Language": 'zh-CN,zh;q=0.9',
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Cookie": "_S_DPR=1; _S_IPAD=0; MONITOR_WEB_ID=0; s_v_web_id=verify_a680fd8b18a941d42d4f60d2839e5376; _tea_utm_cache_2018=undefined; MONITOR_DEVICE_ID=70c6d547-96a3-4f12-8853-19d143f73462; _S_WIN_WH=1920_371",
        "Host": "so.toutiao.com",
        "sec-ch-ua": 'Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "Windows",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    }
    # Pass the query as ``params`` so requests percent-encodes the keyword;
    # interpolating it into the URL breaks on characters such as "&" or "#".
    params = {
        "wid_ct": "1637235733180",
        "keyword": keyword,
        "pd": "atlas",
        "source": "input",
        "dvpf": "pc",
        "aid": "4916",
        "page_num": num,
        "rawJSON": "1",
        "search_id": "202111221051510102121851655DB7FC10",
    }
    try:
        response = requests.get(
            "https://so.toutiao.com/search",
            params=params,
            headers=heads,
            timeout=10,  # without a timeout a stalled connection blocks forever
        )
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError):
        # RequestException covers network/timeout errors; ValueError covers
        # a 200 response whose body is not valid JSON.
        print("请求失败")
    return None
"""
解析返回的JSON,提取图片链接列表
args:
json:get_page 请求回来的json数据
"""
def get_urlImg(json):
    """Extract the image URLs from a search-result JSON payload.

    Args:
        json: Decoded response, expected to look like
            ``{"rawData": {"data": [{"img_url": ...}, ...]}}``. ``None`` or
            a payload without that structure is tolerated.

    Returns:
        A list with the ``img_url`` value of every entry that has one, or
        ``None`` (after printing a notice) when the payload carries no
        ``rawData.data`` entries.
    """
    # Walk the structure defensively: a failed request hands us None, and
    # the server may omit "rawData" entirely.
    raw_data = json.get("rawData") if isinstance(json, dict) else None
    items = raw_data.get("data") if isinstance(raw_data, dict) else None
    if not items:
        print("返回了假数据")
        return None
    # Skip entries without an image link instead of failing the whole page.
    return [
        item["img_url"]
        for item in items
        if isinstance(item, dict) and item.get("img_url")
    ]
"""
下载单张图片并保存到以关键字命名的目录
args:
keyword:查询的关键字
url:单张图片的链接
"""
def save_img(keyword, url):
    """Download one image and store it in a per-keyword directory.

    The file is written to ``头条{keyword}/<md5 of content>.jpg`` relative to
    the current working directory, so identical images map to the same file.

    Args:
        keyword: Search keyword; used to build the directory name.
        url: URL of a single image to download.

    Returns:
        None. A failed download is reported on stdout and skipped.
    """
    path = f"头条{keyword}"
    # exist_ok avoids FileExistsError when several pool workers try to
    # create the directory at the same moment.
    os.makedirs(path, exist_ok=True)
    try:
        res = requests.get(url, timeout=10)
        # Treat HTTP errors as failures so error pages are not saved as .jpg.
        res.raise_for_status()
    except requests.RequestException:
        print(f"图片\"{url}\"下载失败")
        # Nothing was downloaded; returning here prevents using an unbound
        # response object below.
        return
    fullPath = os.path.join(path, f"{md5(res.content).hexdigest()}.jpg")
    with open(fullPath, "wb") as f:
        f.write(res.content)
# Script entry point: ask for a page number and keyword, fetch that result
# page, then download every image it lists using a process pool.
if __name__ == "__main__":
    num = int(input("输入查询页数(一页40条)"))  # page number to request
    keyword = input("输入查询关键字")  # search keyword
    page = get_page(num, keyword)  # decoded JSON, or None on failure
    # Only parse when the request succeeded; get_page returns None otherwise.
    urlImg = get_urlImg(page) if page is not None else None
    if not urlImg:
        # Nothing to download (request failed or the page had no images);
        # handing None to pool.map would raise TypeError.
        print("没有可下载的图片")
    else:
        # Bind the keyword so pool.map only has to supply each URL.
        func = partial(save_img, keyword)
        # The context manager releases the workers even if map() raises.
        with Pool() as pool:
            pool.map(func, urlImg)  # blocks until every download finishes
            pool.close()  # accept no further tasks
            pool.join()  # wait for the worker processes to exit