# 爬取jinritoutiao街拍图片并且进行文件存储

import re
import requests
import json
import os
import time
from urllib import request

# Search endpoint queried for each result page; `{}` receives the result offset.
# The percent-encoded keyword decodes to "街拍".
SEARCH_URL_TEMPLATE = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'
# Number of results per page (matches `count=20` in the URL above).
PAGE_SIZE = 20
# Number of result pages to crawl (pages 1..PAGE_COUNT).
PAGE_COUNT = 3
# Browser-like headers sent with every `requests` call.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
# Seconds to wait for a server response before giving up on a request.
REQUEST_TIMEOUT = 10

# Compiled once instead of on every article.
TITLE_RE = re.compile(r'<title>(.*)</title>')
GALLERY_RE = re.compile(r'gallery: JSON\.parse\((.*)\),')
# Characters that cannot appear in a directory name on Windows and/or POSIX.
UNSAFE_NAME_CHARS_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def safe_dirname(title):
    """Return *title* converted into a string usable as one directory name.

    Path separators and other reserved characters are replaced with ``_``;
    surrounding whitespace and trailing dots/spaces are removed. When nothing
    usable remains, ``'untitled'`` is returned.
    """
    cleaned = UNSAFE_NAME_CHARS_RE.sub('_', title).strip()
    # Windows rejects names ending in a dot or space; this also turns the
    # special names '.' and '..' into an empty string.
    cleaned = cleaned.rstrip('. ')
    return cleaned or 'untitled'


def extract_title(html):
    """Return the text between ``<title>`` tags in *html*, or None if absent."""
    match = TITLE_RE.search(html)
    return match.group(1) if match else None


def extract_image_urls(html):
    """Return the gallery image URLs embedded in an article page.

    The page is expected to contain ``gallery: JSON.parse("<json string>"),``.
    The captured argument is a JSON string literal whose content is itself
    JSON, hence the two decoding passes.

    Returns a list of URLs, or None when the gallery marker is missing or its
    payload cannot be decoded.
    """
    match = GALLERY_RE.search(html)
    if match is None:
        return None
    try:
        gallery = json.loads(json.loads(match.group(1)))
    except (ValueError, TypeError):
        return None
    return [image['url'] for image in gallery.get('sub_images', [])]


def download_images(image_urls, dirname):
    """Download every URL in *image_urls* into the directory *dirname*.

    Each file is named after the last path segment of its URL plus ``.jpg``.
    A failed download is reported and skipped so the rest still get fetched.
    """
    for index, image_url in enumerate(image_urls, start=1):
        img_name = os.path.join(dirname, image_url.split('/')[-1] + '.jpg')
        try:
            request.urlretrieve(image_url, img_name)
        except OSError as err:  # URLError / HTTPError are OSError subclasses
            # Message: "image N failed to download"
            print('第{}张图片下载失败: {}'.format(index, err))
        else:
            print('第{}张图片下载完毕'.format(index))
        # One-second pause after every image, as in the original script
        # (presumably to avoid hammering the image server).
        time.sleep(1)


def crawl_article(article_url):
    """Fetch one article page and save its gallery images.

    A directory named after the (sanitised) page title is created in the
    current working directory and the gallery images are stored inside it.
    Articles that cannot be fetched or have no title are skipped.
    """
    try:
        response = requests.get(article_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # Message: "article request failed, skipped"
        print('文章请求失败,已跳过: {} ({})'.format(article_url, err))
        return
    html = response.text

    title = extract_title(html)
    if title is None:
        # Message: "no title matched, skipped"
        print('未匹配到标题,已跳过: {}'.format(article_url))
        return

    dirname = safe_dirname(title)
    os.makedirs(dirname, exist_ok=True)

    image_urls = extract_image_urls(html)
    if image_urls is None:
        print('正则不正确')
    else:
        download_images(image_urls, dirname)

    print('{}爬取完毕'.format(title))


def crawl_page(page):
    """Crawl every article listed on search-result page *page* (1-based)."""
    url = SEARCH_URL_TEMPLATE.format((page - 1) * PAGE_SIZE)
    # The headers are sent here too; the original built them but only passed
    # them to the article requests.
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)

    # 'data' may be missing or null in the response; treat both as "no results".
    data_list = response.json().get('data') or []
    for data_item in data_list:
        if 'article_url' in data_item:
            crawl_article(data_item['article_url'])

    print('第{}页结束'.format(page))


def main():
    """Crawl the first PAGE_COUNT search-result pages."""
    for page in range(1, PAGE_COUNT + 1):
        crawl_page(page)


if __name__ == '__main__':
    main()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值