# -*- coding: utf-8 -*-
"""
__author__ = 'Allen/Curry'
__time__ = '2019/11/22'
"""
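# For learning purposes only, with no other intent. If this infringes your
# rights, please get in touch and the post will be removed.
# Downloads the front-page image of each post on the site.
# Image quality is fairly low; adjust as needed.
# The site has almost no anti-scraping measures.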
import re
import requests
import time
from pyquery import PyQuery as pq
import os
# Listing pages to crawl. The site has 5000+ pages in total; only 10 are
# fetched here.
# NOTE(review): range(10) yields page=0..9 -- confirm whether the site's
# pagination is 0- or 1-based.
urls = ['https://www.buxiuse.com/?page={}'.format(i) for i in range(10)]
# Request headers sent with every HTTP call. Largely optional, since the site
# has almost no anti-scraping measures.
# NOTE(review): the Cookie below is a hard-coded browser session value; it is
# likely stale and should not live in source control -- confirm it is needed.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.17 Safari/537.36",
    "Cookie": "SESSION=ZTRiNjFkMzAtOWM4OS00NDVhLTlmN2YtMTBjY2I1MGUzZjQz; Hm_lvt_479b5d690f3b5d1eae450ce953f78480=1574324041,1574339475; Hm_lpvt_479b5d690f3b5d1eae450ce953f78480=1574339494",
}
# Output directory for downloaded images, relative to the working directory.
path = './图片/'
# Collect the image URLs on one listing page and download each of them.
def down(url):
    """Fetch one listing page and save every image found in its post list.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        None. Failures are reported on stdout instead of being raised, so a
        single bad page does not abort the whole crawl.
    """
    try:
        # A timeout keeps one stalled connection from hanging the crawl.
        response = requests.get(url, headers=headers, timeout=10)
        # Treat HTTP error pages as failures rather than parsing them.
        response.raise_for_status()
        doc = pq(response.text)
    except Exception as exc:
        # Unlike a bare ``except:``, this lets Ctrl-C / SystemExit through.
        print(url, "访问失败", exc)
        return

    for item in doc('#main .panel-body li').items():
        img = item.find(".img_single a img")
        img_url = img.attr('src')
        if not img_url:
            # List entry without an image: nothing to download.
            continue
        save(img.attr('title'), img_url)
# Save a single image to disk.
def save(title, imag_url):
    """Download ``imag_url`` and store it under ``path`` as ``<title>.jpg``.

    Args:
        title: Post title used as the file name; characters that are not
            allowed in file names are replaced with ``-``.
        imag_url: URL of the image to download.

    Returns:
        None. Any failure is reported on stdout instead of being raised.
    """
    try:
        if not os.path.exists(path):
            # exist_ok guards against the directory appearing between the
            # existence check and the creation.
            os.makedirs(path, exist_ok=True)
            print("path创建成功")
        # Characters that are illegal in file names (plus CR/LF); runs of them
        # collapse into a single "-".
        pattern = r'[\\/:*?"<>|\r\n]+'
        title_new = re.sub(pattern, '-', title)
        # Naming files after the post title is fragile (duplicate titles
        # overwrite each other); deriving the name from the image URL would be
        # more robust.
        img_name = os.path.join(path, title_new + '.jpg')
        # Download before opening the file so a failed request does not leave
        # an empty .jpg behind.
        response = requests.get(imag_url, headers=headers, timeout=10)
        response.raise_for_status()
        with open(img_name, 'wb') as f:
            f.write(response.content)
    except Exception as exc:
        # Unlike a bare ``except:``, this lets Ctrl-C / SystemExit through.
        print(title, imag_url, "保存失败", exc)
# Script entry point: crawl every listing page in ``urls`` in order.
if __name__ == '__main__':
    for url in urls:
        print("正在下载链接", url, "内图片")
        down(url)
        # Wait one second between pages (presumably to go easy on the server).
        time.sleep(1)
    print("下载完成")
# Reference: https://blog.csdn.net/qq_34908107/article/details/80476234