今日头条三页内容-创建文件夹篇

import requests
import re
import json
import os
from urllib import request

# Browser-like request headers sent when fetching each article page.
# NOTE: the name is misspelled ("hearders") in the original script; it is kept
# unchanged so that any code referring to it keeps working.
hearders = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# Directory that receives every downloaded image.
DOWNLOAD_DIR = 'download11'

# Seconds to wait for the server before giving up; without a timeout
# `requests` may block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Captures the argument of the `gallery: JSON.parse(...)` call embedded in the
# article page. Compiled once because it is reused for every article.
GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\((.*)\),')

# Create the target directory once up front instead of re-checking it for
# every article; `exist_ok=True` makes re-runs of the script harmless.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Walk three result pages: offsets 0, 20 and 40, 20 results per page.
for i in range(0, 60, 20):
    # The original literal ended with a stray space after `from=search_tab`,
    # which leaked into the `from` query value; it is removed here.
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'.format(i)

    # `response.json()` decodes the JSON body straight into a Python object.
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    html_json_dict = response.json()

    # The list of search results lives under the 'data' key. Fall back to an
    # empty list when the key is missing or null so the loop is simply skipped
    # instead of raising KeyError / TypeError.
    data_list = html_json_dict.get('data') or []

    for data_item in data_list:
        # Only entries that carry an 'article_url' point at an article page.
        if 'article_url' not in data_item:
            continue

        article_url = data_item['article_url']
        response = requests.get(article_url, headers=hearders, timeout=REQUEST_TIMEOUT)
        html_str = response.text

        match_res = GALLERY_PATTERN.search(html_str)
        if not match_res:
            # No gallery payload found on this page; report it and move on.
            print('哈哈哈')
            continue

        # The captured text is decoded twice: the first decode is expected to
        # yield a string that is itself JSON (presumably because the page
        # passes a JSON string literal to JSON.parse — confirm against a live
        # page), and the second decode yields the gallery dict.
        match_json_one = json.loads(match_res.group(1))
        match_json_two = json.loads(match_json_one)

        # Each entry of 'sub_images' is a dict whose 'url' is one image.
        for v in match_json_two['sub_images']:
            image_url = v['url']
            print(image_url)
            # Name the file after the last path segment of the image URL.
            filename = DOWNLOAD_DIR + '/' + image_url.split('/')[-1] + '.jpg'
            request.urlretrieve(image_url, filename)

  

转载于:https://www.cnblogs.com/gxsmm/p/9490825.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值