# Fixed version of the Ajax-analysis Toutiao street-photo scraper from p.247 of
# Cui Qingcai's "Python 3 Web Scraping Development" — the book's code has a bug;
# this version completes the URL and adds an if-guard to prevent the crash.
import os
from _md5 import md5
import multiprocessing.pool
import requests
import urllib.parse
def get_page(offset):
    """Fetch one page of Toutiao '街拍' search results.

    Args:
        offset: pagination offset forwarded to the search API.

    Returns:
        The decoded JSON response as a dict on HTTP 200, otherwise
        None (including when the connection fails).
    """
    query = urllib.parse.urlencode({
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    })
    try:
        resp = requests.get('https://www.toutiao.com/search_content/?' + query)
    except requests.ConnectionError:
        return None
    if resp.status_code == 200:
        return resp.json()
    # Non-200 responses fall through to an implicit-equivalent None.
    return None
def get_images(json):
    """Yield one record per image found in a page of search results.

    Args:
        json: decoded API response from get_page(); may be None when
            the fetch failed.

    Yields:
        dicts with keys '图片' (protocol-relative image URL) and
        '标题' (the owning item's title). Items without a title are
        skipped entirely.
    """
    # Bug fix 1: get_page() returns None on connection errors and main()
    # passes that straight here — guard before calling .get().
    if not json:
        return
    for item in json.get('data') or []:
        title = item.get('title')
        if not title:
            continue
        # Bug fix 2: 'image_list' can be present but None (e.g. text-only
        # search hits); `or []` prevents "TypeError: 'NoneType' is not
        # iterable" that the original raised.
        for image in item.get('image_list') or []:
            yield {
                '图片': image.get('url'),
                '标题': title
            }
def save_image(item):
    """Download one image and save it under a directory named after its title.

    The file name is the MD5 hex digest of the image bytes, so identical
    images are downloaded but written only once; an existing file is
    reported and skipped.

    Args:
        item: dict with keys '标题' (directory name) and '图片'
            (protocol-relative image URL, e.g. '//p3.pstatp.com/...').
    """
    # Bug fix: the original os.path.exists() + os.mkdir() pair is a
    # check-then-create race — this function runs in multiprocessing.Pool
    # workers, so two workers could both see the directory as missing and
    # one mkdir() would raise FileExistsError. makedirs(exist_ok=True)
    # is safe regardless.
    os.makedirs(item.get('标题'), exist_ok=True)
    try:
        # get_images() yields protocol-relative URLs; prepend a scheme.
        picture_url = 'http:' + item.get('图片')
        response = requests.get(picture_url)
        if response.status_code == 200:
            file_path = '{0}/{1}.{2}'.format(
                item.get('标题'), md5(response.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('已经下载过了', file_path)
    except requests.ConnectionError:
        print('无法保存图片')
def main(offset):
    """Worker entry point: fetch the page at `offset`, then print and
    persist every image record found on it."""
    page = get_page(offset)
    for record in get_images(page):
        print(record)
        save_image(record)
# Pagination window: offsets 0, 20, ..., 200 (11 pages of 20 results each).
group_start = 0
group_end = 10

if __name__ == '__main__':
    # Fan the pages out across a process pool, one worker call per offset.
    pool = multiprocessing.pool.Pool()
    offsets = [page * 20 for page in range(group_start, group_end + 1)]
    pool.map(main, offsets)
    pool.close()
    pool.join()