Python image crawler

How to batch-crawl images from a website (anime images from 4chan's /a/ board)

1. First, write a small HTTP/HTML helper module

# -*- coding=utf-8 -*-
# author vvyun

import urllib.request
import urllib.error
import re


"""
http工具
"""


def getcontent(url):
    """获取链接html内容"""
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    data = response.read().decode("utf-8")
    return data


def savestr2file(filename, content):
    """Write a string to a file."""
    with open(filename, "w+", encoding="utf8") as output:
        output.write(content)


def getcatalogarray(content):
    """获取4chan - catalog数据   ---var catalog = {......};var style_group---"""
    pattern = r"var\scatalog\s=(.*).var\sstyle_group"
    res = re.search(pattern, content, re.M | re.S)
    return res.group(1)


def getimageurls(content):
    """Extract image links such as <a href="//i.4cdn.org/a/1553711154845s.jpg" from the page."""
    pattern = r"a\shref=\"(//i\.4cdn\.org/.*?\.(?:jpg|png|gif))\""
    res = re.findall(pattern, content)
    return res


def getHttpStatusCode(tempUrl):
    """Check whether a URL can be opened; returns 0 on success, non-zero on error."""
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/49.0.2")]
    try:
        opener.open(tempUrl)
        # print(tempUrl + " is reachable")
        return 0
    except urllib.error.HTTPError:
        print(tempUrl + " : HTTP error while opening the page")
        return 1
    except urllib.error.URLError:
        print(tempUrl + " : URL error while opening the page")
        return 2
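
A quick way to sanity-check the two regex helpers is to run them against small hand-written snippets before hitting the live site. The sample strings below are invented for illustration; only the function names and the module name (saved as tools.py, as the driver script in step 2 assumes) come from the code above.

# -*- coding=utf-8 -*-
# Illustrative check of the regex helpers (the sample strings are made up)

from tools import getcatalogarray, getimageurls

sample_index = 'var catalog = {"threads": {"185000001": {}}};var style_group = "x"'
print(getcatalogarray(sample_index))
# -> ' {"threads": {"185000001": {}}}'  (valid JSON, ready for json.loads)

sample_thread = '<a href="//i.4cdn.org/a/1553711154845s.jpg" target="_blank">'
print(getimageurls(sample_thread))
# -> ['//i.4cdn.org/a/1553711154845s.jpg']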

2. Next, fetch the index page HTML, find the thread links it contains, fetch each thread's page, extract every image link that matches, and download the images.

# -*- coding=utf-8 -*-
# author vvyun

import urllib.request as ureq
import re
import json
from tools import *


# Available board names
mokuaibase = ["a", "c", "e"]
# Default to the /a/ (anime) board
basec = "a"

urlindexbase = "https://boards.4chan.org/" + basec + "/catalog"
urlthreadbase = "https://boards.4chan.org/" + basec + "/thread/"


# Fetch the catalog page HTML
content = getcontent(urlindexbase)

# Save the catalog page HTML locally
filename = "image/content_4chan_a.html"
savestr2file(filename, content)

# Extract the catalog JSON
catalog_index = getcatalogarray(content)

# Save the catalog JSON locally
filenameimg = "image/catalog_4chan.json"
savestr2file(filenameimg, catalog_index)

# Get the threads data
catalog_threads = json.loads(catalog_index)["threads"]
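# Note: the parsed catalog is assumed (from the docstring pattern above) to be shaped
# like {"threads": {"<thread id>": {...}, ...}}, so iterating catalog_threads below
# yields thread-id strings that get appended to urlthreadbase.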

# Loop over each thread and fetch all of its images
for thread_url in catalog_threads:
    # Skip thread pages that cannot be opened
    if getHttpStatusCode(urlthreadbase + thread_url) > 0:
        continue
    print(urlthreadbase + thread_url)
    # Fetch the thread page HTML
    content_thread = getcontent(urlthreadbase + thread_url)
    # Save the thread page HTML
    filename = "image/html/" + thread_url + ".html"
    savestr2file(filename, content_thread)

    # Extract image links from the thread page
    imagedata = getimageurls(content_thread)

    # Download every image link found on this thread page
    for iu in imagedata:
        imd = "https:" + iu
        print(imd)
        try:
            if getHttpStatusCode(imd) < 1:
                ureq.urlretrieve(imd, "image/data/" + iu.replace("/", ""))
        except Exception as e:
            # Skip images that fail to download instead of aborting the whole crawl
            print(imd + " : download failed: " + str(e))
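
The script writes into image/, image/html/ and image/data/ but never creates those folders, so open() and urlretrieve() will fail with FileNotFoundError if they are missing. A minimal sketch to create them up front (the directory names are taken from the script above; everything else is an assumption, not part of the original code):

# -*- coding=utf-8 -*-
# Create the output directories the crawler writes to, if they do not exist yet

import os

for d in ("image", "image/html", "image/data"):
    os.makedirs(d, exist_ok=True)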

demo:

GitHub source code: cs-note/python/crawler/crawler4chan at main · nanonin/cs-note · GitHub
