需要说明的是并没爬取成功,怀疑是网站设置了反爬虫机制:
代码如下:
import requests
from urllib.parse import urlencode
from requests import exceptions
import json
import os
from hashlib import md5
from multiprocessing.pool import Pool
def get_page(offset):
params = {
'aid': '24',
'app_name': 'web_search',
'offset': offset,
'format': 'json',
'keyword': '街拍',
'autoload': 'true',
'count': '20',
'en_qc': '1',
'cur_tab': '1'
}
headers = {
'cookie': 'tt_webid = 6735572958447977996;WEATHER_CITY = % E5 % 8C % 97 % E4 % BA % AC;tt_webid = 6735572958447977996;csrftoken = c3e55dfe0045ef079b496f27dea17ad8;sso_uid_tt = 7c1088ddfdbc75cf58c03def3e61ffd5;toutiao_sso_user = 4c6dc455a3e3f01e1ccf726edeca4c85;login_flag = 580d00027b658521d3d2df09cfe584d4;sessionid = 512a4bcfb5d3cf26a9cd6c132fdb5f46;uid_tt = e8cd26d92023013dfba19509a87b9d42;sid_tt = 512a4bcfb5d3cf26a9cd6c132fdb5f46;sid_guard = "512a4bcfb5d3cf26a9cd6c132fdb5f46|1568712418|15552000|Sun\054 15-Mar-2020 09:26:58 GMT";s_v_web_id = 674345bdfb21d1c64a2c5a0cb15cce32;tasessionId = e03vknqeo1569288824259',
'User-Agent': 'Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 76.0.3809.132Safari / 537.36 '
}
url = 'https://www.toutiao.com/api/search/content/?'+urlencode(params)
try:
response = requests.get(url, params=params)
if response.status_code == 200:
return response.text
except requests.ConnectionError:
return None
def get_images(json):
if json.get('data'):
for item in json.get('data'):
title = item.get('title')
images = item.get('image_detail')
for image in images:
yield {
'image': image.get('url'),
'title': title
}
def save_image(item):
if not os.path.exists(item.get('title')):
os.mkdir(item.get('title'))
try:
headers = {
'cookie': 'tt_webid = 6735572958447977996;WEATHER_CITY = % E5 % 8C % 97 % E4 % BA % AC;tt_webid = 6735572958447977996;csrftoken = c3e55dfe0045ef079b496f27dea17ad8;sso_uid_tt = 7c1088ddfdbc75cf58c03def3e61ffd5;toutiao_sso_user = 4c6dc455a3e3f01e1ccf726edeca4c85;login_flag = 580d00027b658521d3d2df09cfe584d4;sessionid = 512a4bcfb5d3cf26a9cd6c132fdb5f46;uid_tt = e8cd26d92023013dfba19509a87b9d42;sid_tt = 512a4bcfb5d3cf26a9cd6c132fdb5f46;sid_guard = "512a4bcfb5d3cf26a9cd6c132fdb5f46|1568712418|15552000|Sun\054 15-Mar-2020 09:26:58 GMT";s_v_web_id = 674345bdfb21d1c64a2c5a0cb15cce32;tasessionId = e03vknqeo1569288824259',
'User-Agent': 'Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 76.0.3809.132Safari / 537.36 '
}
response = requests.get(item.get('image'))
if response.status_code == 200:
file_path = '{0}/{1}{2}'.format(item.get('title'), md5(response.content).hexdigest(), '.jpg')
if not os.path.exists(file_path):
with open(file_path, 'wb') as f:
f.write(response.content)
else:
print('Already Downloaded', file_path)
except requests.ConnectionError:
print('Failed to Save Image!')
def main(offset):
json = get_page(offset)
for item in get_images(json):
print(item)
save_image(item)
GROUP_START = 1
GROUP_END = 20
if __name__ == 'main':
pool = Pool()
groups = ([x*20 for x in range(GROUP_START, GROUP_END+1)])
pool.map(main, groups)
pool.close()
pool.join()
运行结果不符合预期,如果有有看到代码块的问题还请多多指教!