python3爬取头条比基尼图片

最新推荐文章于 2022-10-18 16:57:24 发布

少陽君

最新推荐文章于 2022-10-18 16:57:24 发布

阅读量528

点赞数

分类专栏： python 文章标签： python 爬虫

本文链接：https://blog.csdn.net/u010674101/article/details/119150064

版权

python 专栏收录该内容

62 篇文章 0 订阅

订阅专栏

urlencode 的主要作用是对查询参数进行百分号编码（URL 编码）
HTTPAdapter 主要用于在 session 实例上挂载 Adapter 实例，目的是在请求异常时自动重试
可以修改keyword 进行下载其他关键字的图片

import json
import os
import urllib
from urllib.parse import urlencode
import requests
import time
from requests.adapters import HTTPAdapter

# from requests.api import get
# from requests.models import Response
#https://www.helloworld.net/p/1580796665


# Retry failed connections up to 3 times.
#
# The original assigned requests.adapters.DEFAULT_RETRIES = 3 here. That
# assignment does not give any adapter a retry budget (HTTPAdapter's
# max_retries default is bound when the class is defined), and HTTPAdapter
# compares its max_retries argument against that module value to detect
# "default", so leaving it at 3 would make max_retries=3 be treated as
# "no retries".  NOTE(review): behaviour as implemented in current requests
# releases -- confirm against the installed version.
_MAX_RETRIES = 3

s = requests.session()
# Mount adapters that actually carry the retry budget for both schemes.
s.mount('http://', HTTPAdapter(max_retries=_MAX_RETRIES))
s.mount('https://', HTTPAdapter(max_retries=_MAX_RETRIES))
# Kept for backward compatibility only: Session does not define a
# keep_alive setting, so this is a plain attribute (presumably meant to
# disable connection reuse -- get_page closes the session after each call).
s.keep_alive = False

def get_images(re_json):
    """Yield the image URLs found in one search-result payload.

    The payload is expected to look like
    ``{'rawData': {'data': [{'img_url': ...}, ...]}}``.

    Args:
        re_json: Parsed JSON response (a dict), or ``None`` when the page
            could not be fetched.

    Yields:
        Each non-empty ``img_url`` value, in payload order.  Every yielded
        link is also printed.

    A missing or malformed payload yields nothing instead of raising, and a
    single malformed entry is skipped without dropping the rest of the page.
    """
    raw_data = re_json.get('rawData') if isinstance(re_json, dict) else None
    images = raw_data.get('data') if isinstance(raw_data, dict) else None
    if not isinstance(images, (list, tuple)):
        # Best effort: report the problem and let the caller move on.
        print("-----------出错继续----------")
        return

    for image in images:
        if not isinstance(image, dict):
            continue
        link = image.get('img_url')
        if not link:
            # Entries without a URL cannot be downloaded; skip them.
            continue
        print(link)
        yield link


def get_page(page_num, keyword='比基尼', timeout=10):
    """Fetch one page of Toutiao image-search results as parsed JSON.

    Args:
        page_num: Page index sent as the ``page_num`` query parameter.
        keyword: Search keyword sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body when the server answers with HTTP 200;
        ``None`` when the request fails, times out, returns another status,
        or the body is not valid JSON.
    """
    # Example of the browser request being imitated:
    # https://so.toutiao.com/search?keyword=街拍&pd=atlas&dvpf=pc&aid=4916&page_num=0&search_json={"from_search_id":"202107261607330102121920454A86E71D","origin_keyword":"街拍","image_keyword":"街拍"}
    # The headers are published as a module global, as in the original code.
    global headers
    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://so.toutiao.com/search?keyword=%E8%A1%97%E6%8B%8D&pd=atlas&dvpf=pc&aid=4916&page_num=0&search_json={%22from_search_id%22:%22202107261607330102121920454A86E71D%22,%22origin_keyword%22:%22%E8%A1%97%E6%8B%8D%22,%22image_keyword%22:%22%E8%A1%97%E6%8B%8D%22}',
        'Accept-Language': 'zh-CN,zh;q=0.9,ja-CN;q=0.8,ja;q=0.7',
        'Cookie': 'passport_csrf_token=e12bf88609ec09f7b050695177292c19; ttwid=1%7CtKUX3oTeFsZd_HWDBF0xNAl1uCb6fAfHEskBGp4mgac%7C1627286233%7C5f3b5a0fa02e9a26deac7c709209b5516591d760cf3b0b9d6ce2163e41febfc0; _S_DPR=1; _S_IPAD=0; MONITOR_WEB_ID=6969408645571298846; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; _S_WIN_WH=230_637; ttwid=1%7CtKUX3oTeFsZd_HWDBF0xNAl1uCb6fAfHEskBGp4mgac%7C1627286233%7C5f3b5a0fa02e9a26deac7c709209b5516591d760cf3b0b9d6ce2163e41febfc0'
    }
    params = {
        'keyword': keyword,
        'pd': 'atlas',
        'dvpf': 'pc',
        'aid': 4916,
        'page_num': page_num,
        'search_json': urllib.parse.unquote("%7B%22from_search_id%22%3A%22202107261607330102121920454A86E71D%22%2C%22origin_keyword%22%3A%22%E8%A1%97%E6%8B%8D%22%2C%22image_keyword%22%3A%22%E8%A1%97%E6%8B%8D%22%7D"),
        'rawJSON': 1,
        'search_id': '2021072616100601015005901629CDDE2E'
    }
    url = 'https://so.toutiao.com/search?' + urlencode(params)
    print(url)
    try:
        # The query string is already part of `url`; passing params= as well
        # would append every parameter a second time.
        response = s.get(url=url, headers=headers, timeout=timeout)
        s.close()
        if response.status_code != 200:
            return None
        return response.json()
    except (requests.RequestException, ValueError):
        # RequestException covers connection errors and timeouts; ValueError
        # covers a body that cannot be decoded as JSON.
        return None

def saving_img(link, save_dir='./image1', timeout=10):
    """Download one image and store it under *save_dir*.

    Args:
        link: URL of the image.
        save_dir: Directory the file is written to; created when missing.
        timeout: Seconds to wait for the server before giving up.

    The file is named after the last component of the URL path.  A link
    without a usable file name, a failed request, or a non-success HTTP
    status is reported and skipped so one bad image does not stop the crawl.
    After a successful save the function pauses for one second.
    """
    print(f'-------正在下载第{link}张图片')
    if not link:
        return
    # Use the URL path only, so a query string does not end up in the name.
    filename = os.path.basename(urllib.parse.urlsplit(link).path)
    if not filename:
        print(f'-------无法确定文件名, 跳过: {link}')
        return

    try:
        response = requests.get(link, timeout=timeout)
        # Do not store an HTTP error page as if it were an image.
        response.raise_for_status()
    except requests.RequestException as ex:
        print(f'-------下载失败, 跳过: {link} ({ex})')
        return

    os.makedirs(save_dir, exist_ok=True)
    with open(os.path.join(save_dir, filename), 'wb') as f:
        f.write(response.content)
    # Throttle between downloads, after the file handle has been released.
    time.sleep(1)



def main(page_num):
    """Download every image listed on one search-result page.

    Args:
        page_num: Page index passed on to ``get_page``.
    """
    re_json = get_page(page_num)
    if re_json is None:
        # get_page returns None when the request fails or the status is not
        # 200; there is nothing to download for this page.
        print(f'-------第{page_num}页获取失败, 跳过')
        return
    for link in get_images(re_json):
        saving_img(link)



if __name__ == '__main__':
    # Crawl result pages 0 through 99 in order, then report completion.
    first_page, page_total = 0, 100
    page = first_page
    while page < page_total:
        main(page)
        page += 1

    print('-------------结束下载----------')