Python爬虫——关键字爬取百度图片

测试小胖

于 2024-07-03 20:53:16 发布

阅读量76

点赞数

文章标签： python 爬虫

原文链接：https://blog.csdn.net/qq_41301570/article/details/131593694

版权

针对源代码进行了以下修改：

1. 增加了报错兼容

2. 增加了进度打印

3. 增加了线程池并行

经过测试，爬取超过30页会触发反爬机制

import requests
import os
import re
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed


def download_image(image_url, header, save_dir, n):
    """Download one image and save it as a zero-padded ``.jpg`` file.

    Args:
        image_url: URL of the image to fetch.
        header: Dict of HTTP headers sent with the request.
        save_dir: Directory the file is written into; it must already exist.
        n: Integer index used to build the file name (``{n:06d}.jpg``).

    Returns:
        ``(n, None)`` on success, or ``(n, message)`` where ``message`` is a
        string describing why the download or the write failed.
    """
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        response = requests.get(url=image_url, headers=header, timeout=10)
        # Fail on 4xx/5xx so an error page is never saved as an image.
        response.raise_for_status()
        with open(os.path.join(save_dir, f'{n:06d}.jpg'), 'wb') as fp:
            fp.write(response.content)
        return n, None
    except (requests.RequestException, OSError) as e:
        # OSError covers failures while writing the file, so they are reported
        # through the return value instead of escaping the worker thread.
        return n, f"Error downloading image {image_url}: {e}"


def get_images_from_baidu(keyword, page_num, save_dir):
    """Download Baidu image-search thumbnails for a keyword into a directory.

    Result pages are requested in steps of 30 (``pn`` = 0, 30, 60, ...). The
    ``thumbURL`` values found in each response are downloaded in parallel with
    a thread pool and saved as sequentially numbered ``.jpg`` files.

    Args:
        keyword: Search term sent as both ``queryWord`` and ``word``.
        page_num: Number of result pages to request.
        save_dir: Output directory; created if it does not exist.

    Returns:
        None. Request and download errors are printed and then skipped.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    url = 'https://image.baidu.com/search/acjson?'
    # Index of the next file to write; it only ever moves forward.
    n = 0

    os.makedirs(save_dir, exist_ok=True)

    for pn in tqdm(range(0, 30 * page_num, 30), desc='Downloading Pages'):
        params = {
            'tn': 'resultjson_com',
            'logid': '7603311155072595725',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': '',
            'hd': '',
            'latest': '',
            'copyright': '',
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': '1',
            'fr': '',
            'expermode': '',
            'force': '',
            'cg': '',
            'pn': pn,
            'rn': '30',
            'gsm': '1e',
            '1618827096642': ''
        }

        try:
            # A timeout keeps one stalled request from hanging the whole crawl.
            response = requests.get(url=url, headers=header, params=params, timeout=10)
            response.raise_for_status()
            response.encoding = 'utf-8'
            html = response.text
            image_url_list = re.findall('"thumbURL":"(.*?)",', html, re.S)
        except requests.RequestException as e:
            print(f"Error during request: {e}")
            continue

        if not image_url_list:
            # Nothing to download on this page; skip creating a thread pool.
            continue

        with ThreadPoolExecutor(max_workers=10) as executor:
            # Every index is fixed at submit time, before any task completes.
            futures = [
                executor.submit(download_image, image_url, header, save_dir, n + i)
                for i, image_url in enumerate(image_url_list)
            ]
            for future in tqdm(as_completed(futures), desc=f'Downloading Images from Page {pn // 30 + 1}',
                               total=len(futures)):
                _, error = future.result()
                if error:
                    print(error)

        # Advance by the number of indices handed out on this page. Completion
        # order is arbitrary, so deriving the counter from whichever download
        # finished last could reuse indices and overwrite earlier files.
        # A failed download therefore leaves a gap in the numbering.
        n += len(image_url_list)


if __name__ == "__main__":
    # Script entry point: search for one keyword and save the results locally.
    keyword = '趴'
    page_num = 30
    # os.path.join picks the platform's separator. A hard-coded backslash
    # would create one oddly named directory on non-Windows systems.
    save_dir = os.path.join('.', '图片', keyword)
    get_images_from_baidu(keyword, page_num, save_dir)