Xpath-彼岸图网高清图片获取
目标网站:彼岸图网
首先导入所需包
import os
import time
import requests
from lxml import etree
做好伪装(F12获取信息)
# Request headers copied from the browser devtools (F12).
# All three values are intentionally left blank — fill them in before running.
headers = {field: '' for field in ('User-Agent', 'Referer', 'Cookie')}
获取总页数(非固定)
# Get the total number of list pages on the site.
def get_last_page():
    """Fetch the home page and read the page count from the pager's
    "last page" link (the 10th <a> inside div.page).

    Returns the count as a string; it may be '' if the pager node
    is missing, so callers should be prepared for int() to fail.
    """
    # timeout added so a stalled connection cannot hang the crawl forever
    response = requests.get('http://pic.netbian.com/', headers=headers,
                            timeout=10)
    response.encoding = "GBK"  # the site serves GBK-encoded pages
    html = etree.HTML(response.text)
    last_page = html.xpath('//div[@class="page"]/a[10]/text()')
    # xpath returns a list of text nodes; join collapses it to one string
    # (and to '' when the node was not found)
    last = ''.join(last_page)
    print("本站一共有{0}页\n".format(last))
    return last
由于彼岸图网的高清图片需要再进入一个网页才能拿到,所以先获取小图链接
首页的链接并不规则,须单独设置
def get_main_page():
    """Walk every list page and hand each thumbnail link to get_img_link.

    Page 1 lives at the site root; later pages follow the
    /index_N.html pattern. Returns the thumbnail-link list of the
    last page that was fetched successfully (empty if none were).
    """
    last = get_last_page()
    biglink_list = []  # pre-bound so the return below cannot hit NameError
    for page in range(1, int(last) + 1):
        print('\n第{0}页'.format(page))
        if page == 1:
            url = 'http://pic.netbian.com/'
        else:
            # fixed: original built a malformed double-slash URL (com//index_)
            url = 'http://pic.netbian.com/index_{0}.html'.format(page)
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = "GBK"
        if response.status_code == 200:
            html = etree.HTML(response.text)
            biglink_list = html.xpath('//div[@class="slist"]//li/a/@href')
            for link in biglink_list:
                # hrefs are site-relative, e.g. /tupian/1234.html
                get_img_link('http://pic.netbian.com/' + link)
    return biglink_list
进一步访问小图链接,解析高清图链并输出图片名称和链接
# Resolve one detail page into its HD image and download it.
def get_img_link(link):
    """Visit a detail page, extract the HD image URL and title,
    bump the global counter and download the file.

    Any error is printed and swallowed on purpose: one bad page must
    not abort the whole crawl.
    """
    global pic_sum
    try:
        # timeout added so a stalled connection cannot hang the crawl
        response = requests.get(link, headers=headers, timeout=10)
        response.encoding = "GBK"
        if response.status_code == 200:
            html = etree.HTML(response.text)
            img_list = html.xpath('//div[@class="photo-pic"]/a/img/@src')
            title_list = html.xpath('//div[@class="photo-pic"]/a/img/@title')
            for title, img in zip(title_list, img_list):
                pic_sum = pic_sum + 1
                print('{0}:{1}👉http://pic.netbian.com{2}'.format(pic_sum, title, img))
                download(title, img)
    except Exception as error:
        # deliberate best-effort boundary: log and keep crawling
        print(error)
创建一个目录存储图片
# Create the storage directory and switch into it.
def mkdir():
    """Create the download directory (from the global `path`) if needed.

    NOTE(review): `path` is itself 'D:/彼岸图网/', so the join of two
    absolute-looking paths is quirky but kept for compatibility.
    chdir only happens on first creation; download() uses the absolute
    `path` prefix, so the working directory does not actually matter.
    """
    global path
    target = os.path.join("D:/彼岸图网/", path)  # hoisted: was computed 3x
    if not os.path.exists(target):
        os.makedirs(target)
        os.chdir(target)
    else:
        print(path, '\t已存在')
保存图片
# Download one image.
def download(title, img):
    """Save the HD image under the global `path` as '<title>.jpg'.

    `img` is a site-relative URL such as /uploads/....jpg; an empty
    value means the detail page had no image, so do nothing.
    """
    if img:
        filename = title + '.jpg'
        # fixed: send the spoofed headers (the site checks Referer) and a
        # timeout; fetch BEFORE opening the file so a failed request does
        # not leave an empty .jpg behind
        response = requests.get('http://pic.netbian.com' + img,
                                headers=headers, timeout=30)
        with open(path + filename, 'wb') as file:
            file.write(response.content)
主函数
由于VSCode使用Ctrl+C中止程序会报KeyboardInterrupt错,捕获错误并输出
# Main entry point.
def main():
    """Set up the globals, create the target directory and run the crawl.

    Ctrl+C raises KeyboardInterrupt (noticeable when run from an editor);
    it is caught so the summary in `finally` still prints.
    """
    global pic_sum, path
    # fixed: these were assigned inside try, so an early failure made the
    # `finally` print crash with NameError
    start = time.time()
    pic_sum = 0
    path = 'D:/彼岸图网/'
    try:
        mkdir()
        get_main_page()
    except KeyboardInterrupt:
        # fixed broken format string: was "第张{0}图"
        print("\n非正常退出\n注:会造成第{0}张图下载失败".format(pic_sum))
    except Exception as error:
        print('\n发现了错误:{0}'.format(error))
    finally:
        # the site lists ~20 images per page, hence pic_sum / 20
        print("\n本次用时:{0:.2f}秒\n共爬取{1}页\n共{2}张图\n图片存储于{3}".format(
            (time.time() - start), pic_sum / 20, pic_sum, path))
完整代码:
import os
import time
import requests
from lxml import etree
# Browser-spoofing headers taken from devtools (F12); fill in before running.
headers = {field: '' for field in ('User-Agent', 'Referer', 'Cookie')}
# Get the total number of list pages on the site.
def get_last_page():
    """Fetch the home page and read the page count from the pager's
    "last page" link (the 10th <a> inside div.page).

    Returns the count as a string; '' if the pager node is missing.
    """
    # timeout added so a stalled connection cannot hang the crawl forever
    response = requests.get('http://pic.netbian.com/', headers=headers,
                            timeout=10)
    response.encoding = "GBK"  # the site serves GBK-encoded pages
    html = etree.HTML(response.text)
    last_page = html.xpath('//div[@class="page"]/a[10]/text()')
    # list -> string ('' when the node was not found)
    last = ''.join(last_page)
    print("本站一共有{0}页\n".format(last))
    return last
# Collect the thumbnail links from every list page.
def get_main_page():
    """Walk every list page and hand each thumbnail link to get_img_link.

    Page 1 lives at the site root; later pages are /index_N.html.
    Returns the link list of the last successfully fetched page.
    """
    last = get_last_page()
    biglink_list = []  # pre-bound so the return below cannot hit NameError
    for page in range(1, int(last) + 1):
        print('\n第{0}页'.format(page))
        if page == 1:
            url = 'http://pic.netbian.com/'
        else:
            # fixed: original built a malformed double-slash URL (com//index_)
            url = 'http://pic.netbian.com/index_{0}.html'.format(page)
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = "GBK"
        if response.status_code == 200:
            html = etree.HTML(response.text)
            biglink_list = html.xpath('//div[@class="slist"]//li/a/@href')
            for link in biglink_list:
                # hrefs are site-relative, e.g. /tupian/1234.html
                get_img_link('http://pic.netbian.com/' + link)
    return biglink_list
# Resolve one detail page into its HD image and download it.
def get_img_link(link):
    """Visit a detail page, extract the HD image URL and title,
    bump the global counter and download the file.

    Errors are printed and swallowed on purpose: one bad page must
    not abort the whole crawl.
    """
    global pic_sum
    try:
        # timeout added so a stalled connection cannot hang the crawl
        response = requests.get(link, headers=headers, timeout=10)
        response.encoding = "GBK"
        if response.status_code == 200:
            html = etree.HTML(response.text)
            img_list = html.xpath('//div[@class="photo-pic"]/a/img/@src')
            title_list = html.xpath('//div[@class="photo-pic"]/a/img/@title')
            for title, img in zip(title_list, img_list):
                pic_sum = pic_sum + 1
                print('{0}:{1}👉http://pic.netbian.com{2}'.format(pic_sum, title, img))
                download(title, img)
    except Exception as error:
        # deliberate best-effort boundary: log and keep crawling
        print(error)
# Create the storage directory and switch into it.
def mkdir():
    """Create the download directory (from the global `path`) if needed.

    NOTE(review): `path` is itself 'D:/彼岸图网/', so joining it onto the
    same prefix is quirky but kept for compatibility. chdir only happens
    on first creation; download() writes via the absolute `path` prefix,
    so the working directory does not actually matter.
    """
    global path
    target = os.path.join("D:/彼岸图网/", path)  # hoisted: was computed 3x
    if not os.path.exists(target):
        os.makedirs(target)
        os.chdir(target)
    else:
        print(path, '\t已存在')
# Download one image.
def download(title, img):
    """Save the HD image under the global `path` as '<title>.jpg'.

    `img` is a site-relative URL such as /uploads/....jpg; an empty
    value means the detail page had no image, so do nothing.
    """
    if img:
        filename = title + '.jpg'
        # fixed: send the spoofed headers (the site checks Referer) and a
        # timeout; fetch BEFORE opening the file so a failed request does
        # not leave an empty .jpg behind
        response = requests.get('http://pic.netbian.com' + img,
                                headers=headers, timeout=30)
        with open(path + filename, 'wb') as file:
            file.write(response.content)
# Main entry point.
def main():
    """Set up the globals, create the target directory and run the crawl.

    Ctrl+C raises KeyboardInterrupt (noticeable when run from an editor);
    it is caught so the summary in `finally` still prints.
    """
    global pic_sum, path
    # fixed: these were assigned inside try, so an early failure made the
    # `finally` print crash with NameError
    start = time.time()
    pic_sum = 0
    path = 'D:/彼岸图网/'
    try:
        mkdir()
        get_main_page()
    except KeyboardInterrupt:
        # fixed broken format string: was "第张{0}图"
        print("\n非正常退出\n注:会造成第{0}张图下载失败".format(pic_sum))
    except Exception as error:
        print('\n发现了错误:{0}'.format(error))
    finally:
        # the site lists ~20 images per page, hence pic_sum / 20
        print("\n本次用时:{0:.2f}秒\n共爬取{1}页\n共{2}张图\n图片存储于{3}".format(
            (time.time() - start), pic_sum / 20, pic_sum, path))
# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()
注:仅供参考学习