import requests
import re
import json
import os
from urllib import request
# Browser-like request headers; sent with every page request so the search
# endpoint and the article pages are queried consistently.
# NOTE: the misspelled name `hearders` is kept so existing references still work.
hearders = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# Search endpoint template; `{}` receives the result offset. The `keyword`
# parameter is the percent-encoded UTF-8 form of "街拍".
SEARCH_URL = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'
# Offsets of the result pages to crawl (20 results per page).
SEARCH_OFFSETS = range(0, 60, 20)
# Directory the images are saved into.
DOWNLOAD_DIR = 'download11'
# Seconds to wait for a page request before giving up instead of hanging.
REQUEST_TIMEOUT = 10
# Captures the argument of the `gallery: JSON.parse(...)` call embedded in an
# article page. Compiled once because it is applied to every article.
GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\((.*)\),')


def extract_gallery_image_urls(html_str):
    """Return the gallery image URLs embedded in an article page.

    Args:
        html_str: Text of the article page.

    Returns:
        A list with the ``url`` of every entry under ``sub_images``, or
        ``None`` when the page has no gallery marker or the captured payload
        cannot be decoded into the expected structure.
    """
    match_res = GALLERY_PATTERN.search(html_str)
    if match_res is None:
        return None
    try:
        # The captured text is a JSON string literal whose content is itself
        # JSON, so it has to be decoded twice.
        gallery = json.loads(json.loads(match_res.group(1)))
        return [image['url'] for image in gallery['sub_images']]
    except (ValueError, KeyError, TypeError):
        # Malformed payload: treat the page as having no usable gallery
        # rather than aborting the whole crawl.
        return None


def image_filename(image_url):
    """Return the local path for *image_url*: its last path segment plus '.jpg'."""
    return os.path.join(DOWNLOAD_DIR, image_url.split('/')[-1] + '.jpg')


def fetch_article_urls(offset):
    """Return the ``article_url`` of every search result at *offset*.

    Results without an ``article_url`` key are skipped. A response whose
    ``data`` entry is missing or null yields an empty list.
    """
    response = requests.get(
        SEARCH_URL.format(offset), headers=hearders, timeout=REQUEST_TIMEOUT
    )
    # response.json() gives the decoded object (a dict) directly.
    data_list = response.json().get('data') or []
    return [item['article_url'] for item in data_list if 'article_url' in item]


def download_article_images(article_url):
    """Download every gallery image of one article into ``DOWNLOAD_DIR``.

    Prints each image URL before fetching it. When the article has no usable
    gallery, prints the original placeholder message and returns.
    """
    response = requests.get(article_url, headers=hearders, timeout=REQUEST_TIMEOUT)
    image_urls = extract_gallery_image_urls(response.text)
    if image_urls is None:
        print('哈哈哈')
        return
    for image_url in image_urls:
        print(image_url)
        try:
            request.urlretrieve(image_url, image_filename(image_url))
        except OSError as err:
            # urllib's URLError/HTTPError derive from OSError; one broken
            # image should not stop the remaining downloads.
            print('failed to download {}: {}'.format(image_url, err))


def main():
    """Crawl the configured search pages and download all gallery images."""
    # Create the target directory once; exist_ok avoids the exists/mkdir race.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    for offset in SEARCH_OFFSETS:
        for article_url in fetch_article_urls(offset):
            download_article_images(article_url)


if __name__ == '__main__':
    main()