一、运行环境
pycharm 2020 社区版
python 3.7
beautifulsoup4 4.11.1
二、实际代码
import os
import re
import requests
from bs4 import BeautifulSoup
from access.sprider.SpriderAccess import SpriderAccess
from base.BaseConfig import BaseConfig
from base.BaseFrame import BaseFrame
from object.entity.SpriderEntity import SpriderEntity
from plugin.Tools import Tools
class Netbian:
    """Crawler that collects wallpaper images from pic.netbian.com (彼岸图网).

    For each configured column it walks every list page, follows each
    picture detail page, records the picture in the sprider database
    (skipping ones already downloaded) and saves the image to disk.
    """

    page_count = 1  # current page index within the column being crawled
    base_url = "http://pic.netbian.com"  # site root, e.g. http://pic.netbian.com/e/search/result/?searchid=2543
    save_path = BaseConfig().CORPUS_ROOT + os.sep + "Netbian"  # e.g. "/Users/zhangyu/Pictures/Wallpaper/"
    second_url = ("4kmeinv", "4kfengjing", "4kbeijing", "4kyouxi")  # column slugs to crawl

    # Detail-page link pattern, compiled once. Raw string fixes the
    # invalid "\d" escape the original non-raw literal relied on.
    _DETAIL_REGX = re.compile(r"tupian/\d{1,5}\.html")

    def __init__(self):
        pass

    def sprider_wall_paper(self):
        """Crawl every column in ``second_url``, paging to the last page of each."""
        BaseFrame.__log__("开始采集首页彼岸图网网站的图片...")
        for column in self.second_url:
            # BUGFIX: reset the page counter for every column. Previously the
            # class-level counter carried over, so every column after the
            # first started beyond its last page and was silently skipped.
            self.page_count = 1
            url = self.base_url + "/" + column + "/"
            response = requests.get(url)
            response.encoding = 'gbk'  # site serves GBK-encoded pages
            soup = BeautifulSoup(response.text, "html5lib")
            try:
                # The element after <span class="slh"> holds the last page number.
                page_end_url = soup.find('span', attrs={"class": 'slh'}).find_next_siblings()
                page_end_num = page_end_url[0].text
                while self.page_count <= int(page_end_num):  # stop after the final page
                    if self.page_count == 1:
                        # Page 1 is the column index page already fetched above.
                        self._sprider_list_page(soup, url, column)
                    else:
                        next_pager_url = self.base_url + "/" + column + "/index_" + str(self.page_count) + ".html"
                        try:
                            response = requests.get(next_pager_url)
                            response.encoding = 'gbk'
                            pager_soup = BeautifulSoup(response.text, "html5lib")
                            self._sprider_list_page(pager_soup, url, column)
                        except Exception as e:
                            # BUGFIX: str(e) — concatenating the Exception object
                            # itself raised TypeError inside the handler.
                            # Also fall through to the increment below instead of
                            # `continue`, which retried the same page forever.
                            BaseFrame.__log__("请求站点过程发生错误..." + str(e))
                    self.page_count = self.page_count + 1  # advance to the next page
            except Exception as e:
                BaseFrame.__err__(str(e))
                continue

    def _sprider_list_page(self, soup, url, column):
        """Process one list page: follow each detail link and download its picture.

        :param soup:   parsed list page (BeautifulSoup)
        :param url:    the column index URL (stored on the entity and logged)
        :param column: column slug, used as the save sub-directory
        """
        list_url = soup.find_all('a', attrs={"target": '_blank'})
        images_url = self._DETAIL_REGX.findall(str(list_url))
        for iurl in images_url:
            image_full_url = self.base_url + "/" + iurl
            response = requests.get(image_full_url)
            response.encoding = 'gbk'
            detail_soup = BeautifulSoup(response.text, "html5lib")
            # <a id="img"> wraps the full-size <img> on the detail page.
            for image_obj in detail_soup.find('a', attrs={"id": 'img'}).children:
                pic_url = self.base_url + image_obj.get("src")
                pic_title = image_obj.get("alt")
                BaseFrame.__log__("采集" + pic_title + "的图片..." + url)
                mmEntity = SpriderEntity()  # keyed by picture; already-downloaded pictures are skipped
                mmEntity.sprider_base_url = self.base_url
                mmEntity.create_datetime = Tools.get_current_datetime()
                mmEntity.sprider_url = url
                mmEntity.sprider_pic_title = pic_title
                mmEntity.sprider_pic_index = str(self.page_count)
                if SpriderAccess().query_sprider_entity_by_urlandtitle(pic_url, pic_title) is None:
                    SpriderAccess().save_sprider(mmEntity)
                    self.down_pic(pic_url, pic_title, column)
                else:
                    BaseFrame.__log__("下载过已经跳过。")

    # region download picture
    def down_pic(self, pic_url, pic_title, second_path):
        """Download one picture to ``save_path/second_path/<pic_title>.jpg``.

        :param pic_url:     absolute URL of the image
        :param pic_title:   picture title, used as the file name
        :param second_path: column slug, used as the sub-directory
        """
        try:
            headers = {"Referer": pic_url,
                       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                                     '(KHTML, like Gecko)Chrome/62.0.3202.94 Safari/537.36'}
            content = requests.get(pic_url, headers=headers)
            real_path = self.save_path + os.sep + second_path + os.sep
            # exist_ok avoids the check-then-create race of the original.
            os.makedirs(real_path, exist_ok=True)
            if content.status_code == 200:
                pic_cun = real_path + pic_title + '.jpg'
                # BUGFIX: `with` guarantees the handle is closed even if the
                # write fails; the original leaked it on error.
                with open(pic_cun, 'wb') as fp:
                    fp.write(content.content)
        except Exception as e:
            BaseFrame.__err__("下载图片出现错误" + str(e))
    # endregion
# Script entry point: kick off the full wallpaper crawl.
if __name__ == '__main__':
    Netbian().sprider_wall_paper()