import requests
import re
import json
from urllib import request
import os
def download(n):
    """Download gallery images from the first ``n`` Toutiao search pages.

    For each page index ``0 .. n-1`` the search endpoint is queried with
    ``offset = 20 * page`` (the query asks for ``count=20`` results per
    page).  Every result entry that carries an ``article_url`` is fetched,
    and if the article HTML contains a ``gallery: JSON.parse(...)`` snippet,
    each image listed under its ``sub_images`` key is saved into the local
    ``download/`` directory.

    Args:
        n: Number of search result pages to walk.  A value of 0 or less
            performs no requests.

    Returns:
        None.  The results are the files written under ``download/`` and
        the progress lines printed to stdout.

    Raises:
        KeyError: If the search response has no ``data`` key, or a gallery
            payload lacks ``sub_images`` / ``url``.
        Exceptions raised by ``requests``, ``json`` and ``urllib`` on
        network or decoding failures are propagated unchanged.
    """
    search_prefix = 'https://www.toutiao.com/search_content/?offset='
    search_suffix = '&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    # Raw string so the regex escapes are not treated as (invalid) string
    # escapes; compiled once because it is reused for every article.
    gallery_pattern = re.compile(r'gallery: JSON\.parse\((.*)\),')

    # Loop variable controls the page: the offset must advance with the
    # page index, otherwise the same page is fetched on every iteration.
    for page in range(n):
        search_url = search_prefix + str(20 * page) + search_suffix
        search_response = requests.get(search_url)
        data_list = search_response.json()['data']

        # Not every search entry is an article; keep only those with a URL.
        article_urls = [
            data['article_url'] for data in data_list if 'article_url' in data
        ]

        for article_url in article_urls:
            # Created lazily so nothing is written when there are no articles.
            os.makedirs('download', exist_ok=True)

            article_response = requests.get(article_url, headers=headers)
            match = gallery_pattern.search(article_response.text)
            if not match:
                continue

            # The captured text is a JSON string literal that itself holds
            # JSON, hence the two decoding passes.
            gallery = json.loads(json.loads(match.group(1)))
            print(gallery)

            image_urls = [image['url'] for image in gallery['sub_images']]
            print(len(image_urls))

            for image_url in image_urls:
                print(image_url)
                # Write the image to disk, named after the last URL segment.
                filename = 'download/' + image_url.split('/')[-1] + '.jpg'
                request.urlretrieve(image_url, filename)
# Script entry: call download with n=6. There is no __main__ guard, so this
# call also runs whenever the module is imported.
download(6)