import re
import requests
import json
import os
import time
from urllib import request
for i in range(1,4):
url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'.format((i-1)*20)
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
response = requests.get(url)
html_json_dict = response.json()
# print(html_json_dict)
# 获取dict中的data数据
data_list = html_json_dict['data']
#
# 取列表中的article_url
for data_item in data_list:
if 'article_url' in data_item:
article_url = data_item['article_url']
response = requests.get(article_url,headers=headers)
# print(response.text)
html = response.text
#匹配文件名
file = re.search('<title>(.*)</title>',html)
#取出文件名
filename = file.group(1)
# print(filename)
if not os.path.exists(filename):
os.mkdir(filename)
# print(html)
res = re.search(r'gallery: JSON\.parse\((.*)\),',html)
if res:
# print(res.group(1))
res_str = json.loads(res.group(1))
# print(res_str)
# print(type(res_str))
res_dict = json.loads(res_str)
# print(res_dict)
# print(type(res_dict))
n = 0
for image in res_dict['sub_images']:
n = n+1
image_url = image['url']
# print(image_url)
img_name = '{}/'.format(filename)+image_url.split('/')[-1]+'.jpg'
# print(img_name)
request.urlretrieve(image_url,img_name)
print('第{}张图片下载完毕'.format(n))
time.sleep(1)
else:
print('正则不正确')
print('{}爬取完毕'.format(filename))
print('第{}页结束'.format(i))
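# Crawl Jinri Toutiao (今日头条) street-snap (街拍) images and store them as files.
# (Blog page footer: latest recommended article published 2020-01-04 15:41:56.)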