基于Python的网页图片爬虫

最新推荐文章于 2022-07-12 09:34:59 发布

喜碧CatBrother

最新推荐文章于 2022-07-12 09:34:59 发布

阅读量435

点赞数 1

分类专栏：工具　文章标签：爬虫 Python 图片

本文链接：https://blog.csdn.net/qq_30680871/article/details/88758002

版权

工具专栏收录该内容

6 篇文章 0 订阅

订阅专栏

两个爬虫代码参考了多篇资料，若有需要标注，请私信联系。闲言少叙，直接上代码。

# -*- coding:utf-8 -*-
import re
import requests

def dowmloadPic(html, keyword):
    """Download every image referenced by a Baidu image-search result page.

    Image URLs are taken from the ``"objURL":"..."`` entries found in *html*.
    Each successfully downloaded image is written to
    ``../images/<keyword>_<n>.jpg``; ``n`` starts at 1 and only advances after
    a successful download, so failed URLs leave no gaps in the numbering.

    Args:
        html: Text of the search result page.
        keyword: Search keyword; used in the log output and as the file-name
            prefix.

    Returns:
        None.
    """
    import os  # local import: this script's import block only brings in re/requests

    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    print('找到关键词:' + keyword + '的图片，现在开始下载图片...')
    if not pic_urls:
        return

    out_dir = '../images/'
    # Create the output folder up front so the first write cannot fail
    # just because the folder is missing.
    os.makedirs(out_dir, exist_ok=True)

    index = 1
    for url in pic_urls:
        print('正在下载第' + str(index) + '张图片，图片地址:' + str(url))
        try:
            pic = requests.get(url, timeout=10)
            # Treat 4xx/5xx answers as failures instead of saving the
            # server's error page under a .jpg name.
            pic.raise_for_status()
        except requests.exceptions.RequestException:
            # RequestException is the base class, so read timeouts, invalid
            # URLs and HTTP errors are skipped as well as connection errors.
            print('【错误】当前图片无法下载')
            continue

        target = out_dir + keyword + '_' + str(index) + '.jpg'
        with open(target, 'wb') as fp:
            fp.write(pic.content)
        index += 1


if __name__ == '__main__':
    # Script entry point: ask for a keyword, fetch one Baidu image-search
    # result page for it and download the images that page references.
    from urllib.parse import quote  # local import keeps this snippet self-contained

    word = input("Input key word: ").strip()

    # Only the first result page is requested here. Paging would be done by
    # adding a "pn" offset to the query and advancing it by 20 per page.
    # The keyword is percent-encoded so characters such as "&" or "#" cannot
    # break the query string.
    url = ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word='
           + quote(word) + '&ct=201326592&v=flip')

    # A timeout keeps the script from hanging forever on a stalled connection.
    result = requests.get(url, timeout=10)
    result.raise_for_status()
    dowmloadPic(result.text, word)

上面这个版本只下载搜索结果第一页中的图片，不会自动翻页（即不会自动滚动下滑条）。

# -*- coding:utf-8 -*-
import re
import requests
import traceback
import os


def dowmloadPic(html, keyword, startNum, root='L:/pics/'):
    """Download the images referenced by one Baidu image-search result page.

    Image URLs are taken from the ``"objURL":"..."`` entries found in *html*.
    Each image is stored as ``<root>/<keyword>v1_<n>.jpg`` where ``n`` starts
    at *startNum*. A target file that already exists is not downloaded again
    but still consumes its number, so a re-run continues where it left off.
    A failed download is reported and does not consume a number.

    Args:
        html: Text of the search result page.
        keyword: Search keyword; used in the log output and the file names.
        startNum: Number given to the first image of this page.
        root: Output directory; created if it does not exist.

    Returns:
        The number to use for the next image, to be passed as *startNum* of
        the following call.
    """
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    print('找到关键词:' + keyword + '的图片，现在开始下载图片...')

    index = startNum
    if not pic_urls:
        return index

    try:
        os.makedirs(root, exist_ok=True)
    except OSError:
        # Without the output directory no image of this page can be saved.
        traceback.print_exc()
        print('【错误】当前图片无法下载')
        return index

    headers = {'user-agent': 'Mozilla/5.0'}
    for url in pic_urls:
        print('正在下载第' + str(index + 1) + '张图片，图片地址:' + str(url))
        target = os.path.join(root, keyword + 'v1_' + str(index) + '.jpg')

        # Check the file that is actually written, so that the
        # "already downloaded" test can really match.
        if not os.path.exists(target):
            try:
                pic = requests.get(url, headers=headers, timeout=10)
                # Do not store a 4xx/5xx error page under a .jpg name.
                pic.raise_for_status()
                with open(target, 'wb') as f:
                    f.write(pic.content)
            except (requests.exceptions.RequestException, OSError):
                # Only network and file-system failures are skipped;
                # KeyboardInterrupt and programming errors still propagate.
                traceback.print_exc()
                print('【错误】当前图片无法下载')
                continue
        index += 1

    return index


if __name__ == '__main__':
    # Script entry point: crawl several result pages for each keyword and
    # download the images they reference, numbering the files continuously
    # across pages and keywords.
    from urllib.parse import quote  # local import keeps this snippet self-contained

    kv = {'user-agent': 'Mozilla/5.0'}
    lastNum = 0
    # Every keyword in this list is crawled in turn.
    words = ['篮球', '排球']
    for word in words:
        if word.strip() == "exit":
            break
        # 10 result pages per keyword; the "pn" offset advances by 20 per page.
        for pageId in range(0, 10 * 20, 20):
            url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + quote(word) + "&pn=" + str(
                pageId) + "&gsm=?&ct=&ic=0&lm=-1&width=0&height=0"
            try:
                # A timeout keeps one stalled page from hanging the whole crawl.
                result = requests.get(url, headers=kv, timeout=10)
            except requests.exceptions.RequestException:
                # Skip this page but keep crawling the remaining ones.
                traceback.print_exc()
                continue
            lastNum = dowmloadPic(result.text, word, lastNum)

上面这个版本会自动翻页（相当于滚动下滑条），适用于批量下载大量图片。

向IT工作者致敬，后丹之喜碧CatBrother欢迎吐槽：
后丹-喜碧CatBrother

喜碧CatBrother

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
基于Python的网页图片爬虫

闲言少叙，直接上代码。# -*- coding:utf-8 -*-import reimport requestsdef dowmloadPic(html, keyword): pic_url = re.findall('"objURL":"(.*?)",', html, re.S) #i是保存图片的编号 i = 226 print('找到关键词:' + ...
复制链接

扫一扫

专栏目录