如何批量爬取某个网站的图片(4chan/a的动漫图片)
1. 先写一个网页处理的工具
# -*- coding=utf-8 -*-
# author vvyun
import json
import re
import urllib.error
import urllib.request
"""
http工具
"""
def getcontent(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Absolute URL to request.

    Returns:
        The page body as a str.

    Raises:
        urllib.error.URLError / HTTPError: on network or HTTP failure.
    """
    # A browser-like User-Agent; some sites reject the default urllib one.
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}
    request = urllib.request.Request(url, headers=headers)
    # "with" closes the connection deterministically instead of leaking
    # the socket until garbage collection (original never closed it).
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8")
def savestr2file(filename, content):
    """Write *content* to *filename* as UTF-8, overwriting any existing file.

    Args:
        filename: Path of the file to (over)write.
        content: Text to store.
    """
    # Context manager guarantees the handle is closed even if write()
    # raises (the original open()/close() pair leaked it on error).
    # "w+" was used before, but read access is never needed here.
    with open(filename, "w", encoding="utf8") as output:
        output.write(content)
def getcatalogarray(content):
    """Extract the catalog JSON inlined in a 4chan catalog page.

    The page embeds its data as ``var catalog = {...};var style_group``;
    the text between those two markers is returned.

    Args:
        content: Full HTML of the catalog page.

    Returns:
        The raw JSON text of the catalog object.
    """
    # DOTALL so the JSON may span lines; the lone "." eats the ";".
    catalog_re = re.compile(r"var\scatalog\s=(.*).var\sstyle_group", re.M | re.S)
    match = catalog_re.search(content)
    return match.group(1)
def getimageurls(content):
    """Collect image links like <a href="//i.4cdn.org/a/...jpg"> from HTML.

    Args:
        content: HTML of a thread page.

    Returns:
        List of protocol-relative image URLs (//i.4cdn.org/...).
    """
    # Fixed regex: the original used character classes [j|p|g][p|n|i][g|f],
    # where "|" is a literal — it matched bogus extensions such as ".jig"
    # or ".|nf". A non-capturing alternation matches exactly jpg/png/gif,
    # and the dots in the host name are now escaped.
    pattern = r"a\shref=\"(//i\.4cdn\.org/.*?\.(?:jpg|png|gif))\""
    res = re.findall(pattern, content)
    return res
# Probe whether a URL is reachable.
def getHttpStatusCode(tempUrl):
    """Probe *tempUrl* and report reachability as a small code.

    Args:
        tempUrl: URL to test.

    Returns:
        0 if the request succeeds, 1 on an HTTP error response
        (4xx/5xx), 2 on a lower-level failure (DNS, connection refused).
    """
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/49.0.2")]
    try:
        # Close the response instead of leaking the socket (the original
        # discarded the open handle).
        with opener.open(tempUrl):
            pass
        return 0
    # HTTPError must be caught before URLError: it is a subclass.
    except urllib.error.HTTPError:
        print(tempUrl + "=访问页面出错")
        return 1
    except urllib.error.URLError:
        print(tempUrl + "=访问页面出错")
        return 2
2. 接下来先获取网页html内容,再其中查找所需的帖子链接,获取每个链接对应的网页内容,从中提取所有满足条件的图片下载链接并下载图片
# -*- coding=utf-8 -*-
# author vvyun
import urllib.request as ureq
import re
import json
from tools import *
# 板块名
# Known board names (only "a" is used below).
mokuaibase = ["a", "c", "e"]
# Default board: a (anime).
basec = "a"
urlindexbase = "https://boards.4chan.org/" + basec + "/catalog"
urlthreadbase = "https://boards.4chan.org/" + basec + "/thread/"

# Download the catalog page and keep a local copy of the raw HTML.
content = getcontent(urlindexbase)
filename = "image/content_4chan_a.html"
savestr2file(filename, content)

# Pull the inlined catalog JSON out of the page and save it as well.
catalog_index = getcatalogarray(content)
filenameimg = "image/catalog_4chan.json"
savestr2file(filenameimg, catalog_index)

# The "threads" object maps thread ids to metadata; iterating it
# yields the thread ids.
catalog_threads = json.loads(catalog_index)["threads"]

# Walk every thread and grab all matching images.
for thread_url in catalog_threads:
    # Skip threads whose page cannot be fetched.
    if getHttpStatusCode(urlthreadbase + thread_url) > 0:
        continue
    print(urlthreadbase + thread_url)

    # Fetch the thread page and archive its HTML locally.
    content_thread = getcontent(urlthreadbase + thread_url)
    filename = "image/html/" + thread_url + ".html"
    savestr2file(filename, content_thread)

    # Extract the protocol-relative image links and download each one.
    for img_link in getimageurls(content_thread):
        full_url = "https:" + img_link
        print(full_url)
        try:
            if getHttpStatusCode(full_url) < 1:
                # Flatten the path into a single file name.
                ureq.urlretrieve(full_url, "image/data/" + img_link.replace("/", ""))
        except Exception as e:
            raise e
demo:
github源代码 cs-note/python/crawler/crawler4chan at main · nanonin/cs-note · GitHub