"""Download gallery images from Toutiao search results, one folder per article.

For every search-result entry that carries an ``article_url``, the article
page is fetched, the embedded ``gallery`` JSON is extracted with a regular
expression, and each image listed under ``sub_images`` is saved to
``download/<article title>/``.
"""
import json
import os
import re
from urllib import request

import requests

# Search endpoint; ``{}`` receives the result offset (20 results per page).
# The keyword is the percent-encoded UTF-8 form of "街拍".
url_template = (
    'https://www.toutiao.com/search_content/?offset={}&format=json'
    '&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1'
    '&from=search_tab'
)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
}

download_root = 'download'
# Without a timeout ``requests.get`` may block forever on a stalled server.
request_timeout = 10  # seconds

# The article page passes the gallery to ``JSON.parse(...)``; capture its argument.
gallery_pattern = re.compile(r'gallery: JSON\.parse\((.*)\),')
# Characters that cannot appear in a directory name on Windows and/or POSIX.
illegal_name_chars = re.compile(r'[\\/:*?"<>|]')

for i in range(1):
    url = url_template.format(20 * i)
    response = requests.get(url, headers=headers, timeout=request_timeout)
    res = response.json()
    # ``data`` may be absent or null when the search returns nothing.
    data_list = res.get('data') or []

    for data_item in data_list:
        if 'article_url' not in data_item:
            continue
        article_url = data_item['article_url']

        # One sub-folder per article title under the common download folder.
        # The title is sanitised because it is used as a path component.
        title = illegal_name_chars.sub('_', data_item.get('title') or 'untitled')
        article_dir = os.path.join(download_root, title)
        # makedirs also creates ``download`` itself on the first run.
        os.makedirs(article_dir, exist_ok=True)

        try:
            article_response = requests.get(
                article_url, headers=headers, timeout=request_timeout
            )
        except requests.RequestException as exc:
            # A single unreachable article must not abort the whole crawl.
            print('skipping {}: {}'.format(article_url, exc))
            continue

        match = gallery_pattern.search(article_response.text)
        if not match:
            # Not a gallery article; move on to the next search result.
            continue

        try:
            # The captured text is a JSON string literal whose content is
            # itself JSON, so it has to be decoded twice.
            gallery = json.loads(json.loads(match.group(1)))
        except (ValueError, TypeError):
            continue
        if not isinstance(gallery, dict):
            continue

        for image in gallery.get('sub_images', []):
            image_url = image['url']
            print(image_url)
            name = image_url.split('/')[-1] + '.jpg'
            filename = os.path.join(article_dir, name)
            # Download the image.
            try:
                request.urlretrieve(image_url, filename)
            except OSError as exc:
                # URLError/HTTPError are OSError subclasses; skip this image.
                print('failed to download {}: {}'.format(image_url, exc))
爬取图片并按标题建立文件夹存图
最新推荐文章于 2021-01-22 10:35:52 发布