如何批量爬取某个网站的图片(4chan/a的动漫图片)
1. 先写一个网页处理的工具
# -*- coding=utf-8 -*-
# author vvyun
import json
import re
import urllib.error
import urllib.request
"""
http工具
"""
def getcontent(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Absolute URL to request.

    Returns:
        The page body as a str.

    Raises:
        urllib.error.URLError / HTTPError: on network or HTTP failure.
    """
    # A browser-like User-Agent; some sites reject the default urllib one.
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}
    request = urllib.request.Request(url, headers=headers)
    # "with" closes the connection deterministically instead of leaking
    # the socket until garbage collection (original never closed it).
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8")
def savestr2file(filename, content):
    """Write *content* to *filename* as UTF-8, overwriting any existing file.

    Args:
        filename: Path of the file to (over)write.
        content: Text to store.
    """
    # Context manager guarantees the handle is closed even if write()
    # raises (the original open()/close() pair leaked it on error).
    # "w+" was used before, but read access is never needed here.
    with open(filename, "w", encoding="utf8") as output:
        output.write(content)
def getcatalogarray(content):
    """Extract the catalog JSON inlined in a 4chan catalog page.

    The page embeds its data as ``var catalog = {...};var style_group``;
    the text between those two markers is returned.

    Args:
        content: Full HTML of the catalog page.

    Returns:
        The raw JSON text of the catalog object.
    """
    # DOTALL so the JSON may span lines; the lone "." eats the ";".
    catalog_re = re.compile(r"var\scatalog\s=(.*).var\sstyle_group", re.M | re.S)
    match = catalog_re.search(content)
    return match.group(1)
def getimageurls(content):
    """Collect image links like <a href="//i.4cdn.org/a/...jpg"> from HTML.

    Args:
        content: HTML of a thread page.

    Returns:
        List of protocol-relative image URLs (//i.4cdn.org/...).
    """
    # Fixed regex: the original used character classes [j|p|g][p|n|i][g|f],
    # where "|" is a literal — it matched bogus extensions such as ".jig"
    # or ".|nf". A non-capturing alternation matches exactly jpg/png/gif,
    # and the dots in the host name are now escaped.
    pattern = r"a\shref=\"(//i\.4cdn\.org/.*?\.(?:jpg|png|gif))\""
    res = re.findall(pattern, content)
    return res
# Probe whether a URL is reachable.
def getHttpStatusCode(tempUrl):
    """Probe *tempUrl* and report reachability as a small code.

    Args:
        tempUrl: URL to test.

    Returns:
        0 if the request succeeds, 1 on an HTTP error response
        (4xx/5xx), 2 on a lower-level failure (DNS, connection refused).
    """
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/49.0.2")]
    try:
        # Close the response instead of leaking the socket (the original
        # discarded the open handle).
        with opener.open(tempUrl):
            pass
        return 0
    # HTTPError must be caught before URLError: it is a subclass.
    except urllib.error.HTTPError:
        print(tempUrl + "=访问页面出错")
        return 1
    except urllib.error.URLError:
        print(tempUrl + "=访问页面出错")
        return 2
2. 接下来先获取网页html内容,再其中查找所需的帖子链接,获取每个链接对应的网页内容,从中提取所有满足条件的图片下载链接并下载图片
# -*- coding=utf-8 -*-
# author vvyun
import urllib.request as ureq
import re
import json
from tools import *
# 板块名
# Known board names (only "a" is used below).
mokuaibase = ["a", "c", "e"]
# Default board: a (anime).
basec = "a"
urlindexbase = "https://boards.4chan.org/" + basec + "/catalog"
urlthreadbase = "https://boards.4chan.org/" + basec + "/thread/"

# Download the catalog page and keep a local copy of the raw HTML.
content = getcontent(urlindexbase)
filename = "image/content_4chan_a.html"
savestr2file(filename, content)

# Pull the inlined catalog JSON out of the page and save it as well.
catalog_index = getcatalogarray(content)
filenameimg = "image/catalog_4chan.json"
savestr2file(filenameimg, catalog_index)

# The "threads" object maps thread ids to metadata; iterating it
# yields the thread ids.
catalog_threads = json.loads(catalog_index)["threads"]

# Walk every thread and grab all matching images.
for thread_url in catalog_threads:
    # Skip threads whose page cannot be fetched.
    if getHttpStatusCode(urlthreadbase + thread_url) > 0:
        continue
    print(urlthreadbase + thread_url)

    # Fetch the thread page and archive its HTML locally.
    content_thread = getcontent(urlthreadbase + thread_url)
    filename = "image/html/" + thread_url + ".html"
    savestr2file(filename, content_thread)

    # Extract the protocol-relative image links and download each one.
    for img_link in getimageurls(content_thread):
        full_url = "https:" + img_link
        print(full_url)
        try:
            if getHttpStatusCode(full_url) < 1:
                # Flatten the path into a single file name.
                ureq.urlretrieve(full_url, "image/data/" + img_link.replace("/", ""))
        except Exception as e:
            raise e
demo:
github源代码 cs-note/python/crawler/crawler4chan at main · nanonin/cs-note · GitHub