Python图片爬虫（一般网站通用）

最新推荐文章于 2024-09-09 23:28:21 发布

颖落

最新推荐文章于 2024-09-09 23:28:21 发布

阅读量605

点赞数

文章标签： python 爬虫

本文链接：https://blog.csdn.net/qq_61572351/article/details/131722005

版权

# 2023/7/14 Ying

import re
import requests
import traceback
import os

def dowmloadPic(html, keyword, startNum, save_root=None):
    """Download every thumbnail referenced in a search-result page.

    Image URLs are extracted from ``html`` with a regex on the ``"thumbUrl"``
    field and saved as ``<save_root>/<keyword>/<n>.jpg``, where ``n``
    continues from ``startNum + 1``. An existing file with the same name is
    overwritten. A failed download is reported and skipped without consuming
    a number, so the saved files stay consecutively numbered.

    Args:
        html: Response text of the search request.
        keyword: Search keyword; also used as the sub-directory name.
        startNum: Number of images already saved for this keyword.
        save_root: Directory under which the keyword sub-directory is
            created. When omitted, the module-level ``root`` is used.

    Returns:
        ``startNum`` plus the number of images saved by this call.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    if save_root is None:
        # Backward-compatible fallback: the script entry point defines ``root``.
        save_root = root
    # The JSON field holding the image URL depends on the site:
    # Baidu uses hoverURL / middleURL / objURL, Sogou uses thumbUrl.
    pic_url = re.findall(r'"thumbUrl":"(.*?)",', html, re.S)
    i = startNum
    # Use the ``keyword`` argument rather than a global loop variable.
    subroot = os.path.join(save_root, keyword)
    print('找到关键词为' + keyword + '的图片，开始下载图片...')
    for each in pic_url:
        print('正在下载第' + str(i + 1) + '张图片')
        path = os.path.join(subroot, str(i + 1) + '.jpg')
        try:
            # Created lazily so that no directory appears when nothing is found.
            os.makedirs(subroot, exist_ok=True)
            pic = requests.get(each, headers=headers, timeout=10)
            # Do not store an HTTP error page as if it were an image.
            pic.raise_for_status()
            with open(path, 'wb') as f:
                f.write(pic.content)
        except Exception:
            # Best effort per image, but let KeyboardInterrupt/SystemExit
            # propagate so the user can still abort the run.
            traceback.print_exc()
            print('error，当前图片无法下载')
            continue
        i += 1
    return i


if __name__ == '__main__':
    # Script entry point: for each keyword, fetch several pages of image
    # search results and hand each page to dowmloadPic.
    #
    # NOTE: ``root`` and the loop variable ``word`` are module-level names
    # that dowmloadPic may read as globals; do not rename them.

    headers = {'user-agent': 'Mozilla/5.0'}
    # Search keywords
    words = ['树木', '花朵']
    root = './images_'
    os.makedirs(root, exist_ok=True)
    # Number of result pages to download per keyword
    pages = 3
    # Images per result page (Sogou: 48, Baidu: 60)
    page_size = 48
    for word in words:
        num = 0
        for page in range(pages):
            # Sogou image search. ``start`` is presumably the zero-based
            # offset of the first result (the original URL used 144 = 3 * 48)
            # -- verify against the API. Advancing it per page avoids
            # fetching the same page repeatedly.
            url = 'https://pic.sogou.com/napi/pc/searchList'
            params = {
                'mode': 1,
                'start': page * page_size,
                'xml_len': page_size,
                'query': word,
            }
            # Baidu image search (alternative):
            # url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + "&pn=30&gsm=?&ct=&ic=0&lm=-1&width=0&height=0"
            try:
                # ``params`` lets requests URL-encode the non-ASCII keyword.
                html = requests.get(url, params=params, headers=headers, timeout=10)
                html.raise_for_status()
            except requests.RequestException:
                traceback.print_exc()
                print('error，当前页面无法获取')
                continue
            # Download one page of images; ``num`` carries the running count
            # so file numbering continues across pages.
            num = dowmloadPic(html.text, word, num)