import re
import requests
import json
import os
from urllib import request
def sss():
    """Crawl Toutiao street-photo ("街拍") search results and download gallery images.

    Pages through the search API in steps of 20 (offsets 0, 20, 40), follows
    each article link found in the results, extracts the gallery JSON embedded
    in the article page, and saves every sub-image into the local ``down1/``
    directory.  Prints '下载失败' for article pages with no gallery match.

    Returns:
        None.  Side effects: creates ``down1/`` and writes image files there.
    """
    # One browser-like UA for every request (the original only sent it on the
    # article request; the search API benefits from it too).  Built once,
    # outside the loop.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    # Create the output directory once, up front (the original re-checked it
    # for every single article inside the loop).
    os.makedirs('down1', exist_ok=True)
    # The gallery data is embedded in the page as: gallery: JSON.parse("...");
    # compile once instead of re-scanning the pattern per article.
    gallery_pattern = re.compile(r'gallery: JSON\.parse\((.*)\),')

    offset = 0
    while offset <= 40:
        # 1. Fetch one page of search results.  format=json makes the API
        #    return JSON directly.  BUG FIX: the original template contained
        #    literal spaces around {} ('offset= {} &format'), corrupting the
        #    query string sent to the server.
        url = ('https://www.toutiao.com/search_content/?offset={}'
               '&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true'
               '&count=20&cur_tab=1&from=search_tab').format(offset)
        response = requests.get(url, headers=headers)
        # response.json() parses the body into a dict for us.
        html_json_dict = response.json()
        # 'data' can be missing or null when the API throttles / returns
        # nothing; fall back to an empty list instead of raising.
        data_list = html_json_dict.get('data') or []

        for data_item in data_list:
            # Only result items carrying an article link can hold a gallery.
            if 'article_url' not in data_item:
                continue
            article_url = data_item['article_url']
            # 2. Fetch the article page itself.
            response = requests.get(article_url, headers=headers)
            html_str = response.text
            match_res = gallery_pattern.search(html_str)
            if match_res:
                # The capture is a JSON *string literal* whose contents are
                # themselves JSON — hence the deliberate two-step decode.
                json_origin = match_res.group(1)
                res_buzhidao = json.loads(json_origin)  # -> str
                res_dict = json.loads(res_buzhidao)     # -> dict
                sub_images_list = res_dict['sub_images']
                # 3. Download every image in the gallery.
                for image in sub_images_list:
                    image_url = image['url']
                    filename = 'down1/' + image_url.split('/')[-1] + '.jpg'
                    request.urlretrieve(image_url, filename)
            else:
                print('下载失败')
        offset += 20
# Entry point: run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    sss()