import re
import requests
import json
import os
from urllib import request
def list_pare(url):
    """Crawl paginated Toutiao search results and download all gallery images.

    Args:
        url: Search-API URL template containing one ``{}`` placeholder,
            which is filled with the pagination offset (0, 20, 40, 60, 80).

    Side effects:
        Creates a ``download/`` directory (if missing) and writes one
        ``.jpg`` file per image found on each article's detail page.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    # makedirs(exist_ok=True) replaces the racy exists()/mkdir() pair.
    os.makedirs('download', exist_ok=True)
    # Page through results, 20 items per page.
    for offset in range(0, 100, 20):
        full_url = url.format(offset)
        # FIX: the original omitted the browser headers on this search
        # request (they were only sent on the detail request), which the
        # site is likely to reject or serve differently.
        response = requests.get(full_url, headers=headers)
        # Guard against a response with no 'data' key.
        items = response.json().get('data') or []
        for item in items:
            # FIX: the original rebound ``data_list`` (the list being
            # iterated) to the URL string here, and — when 'article_url'
            # was missing — could fetch a stale URL from a previous
            # iteration. Skip entries without a detail-page link instead.
            if 'article_url' not in item:
                continue
            detail = requests.get(item['article_url'], headers=headers)
            _download_gallery(detail.text)


def _download_gallery(html):
    """Extract the embedded gallery JSON from a detail page and save images.

    Args:
        html: Raw HTML text of an article detail page.
    """
    # The gallery payload is embedded as: gallery: JSON.parse(<string>),
    match = re.search(r'gallery: JSON\.parse\((.*)\),', html)
    if not match:
        print('没有url,无法分析')
        return
    # The payload is a JSON string *inside* a JSON string, hence two loads.
    sub_imgs = json.loads(json.loads(match.group(1)))
    for img in sub_imgs['sub_images']:
        img_url = img['url']
        # Name the local file after the last URL path segment.
        fam = 'download/' + img_url.split('/')[-1] + '.jpg'
        request.urlretrieve(img_url, fam)
if __name__ == '__main__':
    # Search-API template: {} is the pagination offset filled by list_pare.
    search_url = (
        'https://www.toutiao.com/search_content/?offset={}'
        '&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true'
        '&count=20&cur_tab=1&from=search_tab'
    )
    list_pare(search_url)