##### 爬取今日头条照片 p247
import requests
import os
from multiprocessing.pool import Pool
from hashlib import md5
from urllib.parse import urlencode
def get_page(offset):
    """Fetch one page of Toutiao search results as parsed JSON.

    Args:
        offset: pagination offset passed to the Ajax search endpoint;
            successive requests advance it by 20 (the ``count`` value).

    Returns:
        The decoded JSON response on HTTP 200, otherwise ``None``
        (including on any request/decoding error, which is printed).
    """
    base_url = 'https://www.toutiao.com/api/search/content/?'
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,  # the only parameter that changes between Ajax requests
        'format': 'json',
        'keyword': '二战图片',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        "pd": " synthesis",
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
    url = base_url + urlencode(params)
    try:
        # timeout keeps a stalled connection from hanging a pool worker forever
        response = requests.get(url=url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
    except Exception as e:
        print(e)
    # non-200 status or raised exception: signal failure to the caller
    return None
def get_images(json):
    """Yield one record per image contained in a search-result page.

    Args:
        json: decoded response from ``get_page()``; may be ``None``
            when the page request failed.

    Yields:
        dicts with keys ``'image'`` (image URL) and ``'title'``
        (title of the article the image belongs to).
    """
    # get_page() returns None on failure; calling .get() on it would raise
    if not json:
        return
    for item in json.get('data') or []:
        image_list = item.get('image_list')
        title = item.get('title')
        # some items carry no image_list; iterating None would raise TypeError
        if image_list:
            for image in image_list:
                yield {
                    'image': image.get('url'),
                    'title': title,
                }
# Shared request headers for the image downloads in save_page();
# presents a desktop-Chrome User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36",
}
def save_page(result):
    """Download one image and store it in a folder named after its article title.

    Args:
        result: dict with keys ``'image'`` (image URL) and ``'title'``
            (folder name), as produced by ``get_images()``.

    Side effects:
        Creates the title directory if needed and writes
        ``<title>/<md5-of-content>.jpg``. Content already on disk (same
        md5 → same name) is skipped with a message; errors are printed,
        not raised, so one bad image doesn't abort the batch.
    """
    title = result.get('title')
    if not title:
        # a missing/empty title would make directory creation fail; skip the item
        return
    # exist_ok avoids the check-then-mkdir race when several pool
    # workers download images that share the same article title
    os.makedirs(title, exist_ok=True)
    try:
        img_url = result.get('image')
        img_data = requests.get(url=img_url, headers=headers, timeout=10).content
        # md5 of the bytes gives a stable name and de-duplicates identical content
        image_name = md5(img_data).hexdigest()
        img_path = '{0}/{1}.{2}'.format(title, image_name, 'jpg')
        if not os.path.exists(img_path):
            with open(img_path, 'wb') as fp:
                fp.write(img_data)
        else:
            print('Already Downloaded', img_path)
    except Exception as e:
        print(e)
def main(offset):
    """Worker entry point: fetch one result page and save every image on it.

    Args:
        offset: pagination offset forwarded to ``get_page()``.
    """
    page_json = get_page(offset)
    if page_json is None:
        # request failed for this offset; nothing to download
        return
    for result in get_images(page_json):
        save_page(result)
# Page range to crawl: offsets 20, 40, 60 (the endpoint pages in steps of 20).
GROUP_START = 1
GROUP_END = 3

if __name__ == '__main__':
    offsets = [20 * page for page in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()  # defaults to one worker process per CPU core
    # map() hands each offset to main() in its own process and blocks
    # until every page has been processed — parallel crawling for speed
    pool.map(main, offsets)
    pool.close()  # stop accepting new tasks
    pool.join()   # wait for all worker processes to finish
# Source article: "Ajax爬取今日头条照片" (crawling Toutiao photos via Ajax)
# 最新推荐文章于 2023-04-12 20:57:31 发布