Python Crawler: Scraping Baidu Images by Keyword

The following changes were made to the original code:

1. Added error handling, so individual failures are reported instead of aborting the crawl

2. Added progress reporting

3. Added thread-pool parallelism for the downloads

Testing shows that fetching more than 30 pages triggers Baidu's anti-crawling measures (a simple throttling sketch is included after the script).

import requests
import os
import re
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed


def download_image(image_url, header, save_dir, n):
    """Download one image and save it under a zero-padded index."""
    try:
        response = requests.get(url=image_url, headers=header, timeout=10)
        response.raise_for_status()  # don't write an error page to disk as a .jpg
        with open(os.path.join(save_dir, f'{n:06d}.jpg'), 'wb') as fp:
            fp.write(response.content)
        return n, None
    except requests.RequestException as e:
        return n, f"Error downloading image {image_url}: {e}"


def get_images_from_baidu(keyword, page_num, save_dir):
    # Baidu rejects requests without a browser-like User-Agent.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # JSON endpoint behind Baidu image search; each request returns one page of 30 results.
    url = 'https://image.baidu.com/search/acjson?'
    n = 0  # running index used to name the downloaded files

    os.makedirs(save_dir, exist_ok=True)

    for pn in tqdm(range(0, 30 * page_num, 30), desc='Downloading Pages'):
        # Query parameters captured from a browser request;
        # 'pn' is the result offset and 'rn' the page size.
        params = {
            'tn': 'resultjson_com',
            'logid': '7603311155072595725',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': '',
            'hd': '',
            'latest': '',
            'copyright': '',
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': '1',
            'fr': '',
            'expermode': '',
            'force': '',
            'cg': '',
            'pn': pn,
            'rn': '30',
            'gsm': '1e',
            '1618827096642': ''
        }

        try:
            response = requests.get(url=url, headers=header, params=params, timeout=10)
            response.raise_for_status()
            response.encoding = 'utf-8'
            html = response.text
            # Regex-match the thumbnail URLs out of the raw response text;
            # this avoids depending on the exact structure of the payload.
            image_url_list = re.findall('"thumbURL":"(.*?)",', html, re.S)
        except requests.RequestException as e:
            print(f"Error during request: {e}")
            continue

        # Download this page's images in parallel; each task gets a unique index.
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(download_image, image_url, header, save_dir, n + i)
                       for i, image_url in enumerate(image_url_list)]
            for future in tqdm(as_completed(futures), desc=f'Downloading Images from Page {pn // 30 + 1}',
                               total=len(image_url_list)):
                _, error = future.result()
                if error:
                    print(error)
        # Advance the index by the whole page: futures complete out of order,
        # so updating n from individual results (the original n = result_n + 1)
        # could reuse file names and silently overwrite images on later pages.
        n += len(image_url_list)


if __name__ == "__main__":
    keyword = '趴'
    page_num = 30
    save_dir = os.path.join('图片', keyword)
    get_images_from_baidu(keyword, page_num, save_dir)
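If you do need more than the ~30-page limit mentioned above, a common mitigation is to slow the page requests down and retry on failure. The sketch below is only illustrative: the 2-5 second delay range and the retry count are assumptions, not values tuned against Baidu's actual rate limiting.

import random
import time

import requests


def get_page_with_retry(url, header, params, retries=3):
    """Fetch one result page with a randomized delay and simple retries.

    NOTE: the delay range and retry count are illustrative guesses,
    not values verified against Baidu's anti-crawling thresholds.
    """
    for attempt in range(retries):
        # Pause before every request so the traffic looks less bot-like.
        time.sleep(random.uniform(2, 5))
        try:
            response = requests.get(url=url, headers=header, params=params, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == retries - 1:
                raise

Swapping this in for the requests.get call inside the page loop makes the crawl much slower, but failed pages are retried instead of skipped.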

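As an aside: since the acjson endpoint nominally returns JSON, the thumbnail URLs can also be read from the parsed response rather than regex-matched out of the raw text. A minimal sketch, assuming a 'data' list of items carrying the same 'thumbURL' key the regex above relies on (Baidu may change this layout at any time), with the regex kept as a fallback:

import re

import requests


def extract_thumb_urls(response):
    """Return the thumbnail URLs from one acjson response.

    Assumes the body is a JSON object whose 'data' list holds items
    with a 'thumbURL' key, matching what the regex in the script
    above extracts.
    """
    try:
        data = response.json()
        return [item['thumbURL'] for item in data.get('data', []) if item.get('thumbURL')]
    except ValueError:
        # Some payloads fail to decode as JSON; fall back to the regex.
        return re.findall('"thumbURL":"(.*?)",', response.text, re.S)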