I've redacted the address of the site being crawled here; showing it directly wouldn't be a good idea.
Anyone who hunts for wallpapers regularly can probably guess which site I crawled 🙄 (couldn't find the doge emoji)
from typing import Any, List, Union
# import requests
import random
import string
import urllib.request
import os
from bs4 import BeautifulSoup
from util.accessWebContent import accessWebContent
class _4kpicSpider:
    # Download helper (not implemented yet)
    def download(self):
        pass

    # Visit the site and crawl anime wallpapers from list page `page`
    def linkWebSit(self, page):
        result: List[Union[str, Any]] = []
        _base_url = '.....'
        file_path = 'D:/book/img'
        if not os.path.exists(file_path):
            # Create the download directory
            os.makedirs(file_path)
        if (page is None) or (page == 1):
            # First page of the list
            url = '.....'
        else:
            url = '.....' + str(page) + '.html'
        content = accessWebContent().accessContent(url)
        soup = BeautifulSoup(content, 'html.parser')
        pics = soup.find('ul', class_='clearfix').find_all('img')
        for pic in pics:
            next_url = _base_url + pic.attrs['src']
            result.append(next_url)
            # Download the image
            # pic_resp = requests.get(next_url, timeout=10)
            # Random 10-character filename (sample() draws without repeats)
            ran_str = ''.join(random.sample(string.ascii_letters + string.digits, 10))
            filename = 'x' + ran_str + '.jpg'
            print(filename)
            # urllib.request.urlretrieve(next_url, filename=filename)
            with urllib.request.urlopen(next_url, timeout=30) as response, \
                    open(file_path + '/' + filename, 'wb') as f_save:
                f_save.write(response.read())
        ''' The block below follows each list entry and tries to grab a
        higher-resolution version, but it failed: requests cannot fetch
        content rendered with JS, so doing this properly needs Selenium
        (see the sketch after this listing).
        linkList = soup.find('ul', class_='clearfix').find_all('a')
        for link in linkList:
            next_url = _base_url + link.attrs['href']
            result.append(next_url)
            next_content = accessWebContent().accessContent(next_url)
            next_html = BeautifulSoup(next_content, 'html.parser')
            imgEle = next_html.select_one('#img')
            print(imgEle)
        '''
        # print(result)
        return result
if __name__ == "__main__":
    spider = _4kpicSpider()
    for i in range(1, 147):
        res = spider.linkWebSit(i)
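
The commented-out block in linkWebSit fails because the detail pages fill in the high-resolution image with JavaScript, which plain requests/urllib never executes. Below is a minimal Selenium sketch of that idea, assuming Chrome plus a matching chromedriver are installed; the '#img' selector is carried over from the commented-out code, and detail_url stands in for the redacted site's detail-page URL:

from selenium import webdriver
from selenium.webdriver.common.by import By

def fetch_rendered_img_src(detail_url):
    # Run Chrome headless so no browser window pops up
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        # Selenium executes the page's JS, unlike requests/urllib
        driver.get(detail_url)
        # Locate the element the spider tried to reach with '#img'
        img = driver.find_element(By.CSS_SELECTOR, '#img')
        return img.get_attribute('src')
    finally:
        driver.quit()

The returned src could then be fed into the same urlopen-and-write download loop used above.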
import requests
import logging
class accessWebContent:
    # Fetch a page without custom request headers
    def accessContent(self, url):
        req = requests.get(url)
        if req.encoding == 'ISO-8859-1':
            # requests fell back to its default encoding; detect the real one
            # from the page content, or from the raw bytes as a last resort
            encodings = requests.utils.get_encodings_from_content(req.text)
            if encodings:
                encoding = encodings[0]
            else:
                encoding = req.apparent_encoding
            # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace')
            # With 'replace', undecodable bytes become the Unicode replacement character
            encode_content = req.content.decode(encoding, 'replace')
        else:
            encode_content = req.text
        # The default log level is WARNING, so this only shows with DEBUG enabled
        logging.debug(encode_content)
        return encode_content
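
As the comment notes, Python's root logger defaults to WARNING, so the logging.debug(encode_content) call above prints nothing out of the box. A minimal usage sketch (the URL here is just a placeholder) that lowers the level so the fetched HTML actually shows up:

import logging
from util.accessWebContent import accessWebContent

# Lower the root logger to DEBUG so logging.debug() output becomes visible
logging.basicConfig(level=logging.DEBUG)
html = accessWebContent().accessContent('https://example.com')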
Here's what the crawl results look like: