Xpath-彼岸图网高清图片获取
目标网站:彼岸图网
首先导入所需包
import os
import time
import requests
from lxml import etree
做好伪装(F12获取信息)
# Request headers copied from the browser devtools (F12).
# All three values are intentionally left blank — fill them in before running.
headers = {field: '' for field in ('User-Agent', 'Referer', 'Cookie')}
获取总页数(非固定)
# Get the total number of list pages on the site.
def get_last_page():
    """Fetch the home page and read the page count from the pager's
    "last page" link (the 10th <a> inside div.page).

    Returns the count as a string; it may be '' if the pager node
    is missing, so callers should be prepared for int() to fail.
    """
    # timeout added so a stalled connection cannot hang the crawl forever
    response = requests.get('http://pic.netbian.com/', headers=headers,
                            timeout=10)
    response.encoding = "GBK"  # the site serves GBK-encoded pages
    html = etree.HTML(response.text)
    last_page = html.xpath('//div[@class="page"]/a[10]/text()')
    # xpath returns a list of text nodes; join collapses it to one string
    # (and to '' when the node was not found)
    last = ''.join(last_page)
    print("本站一共有{0}页\n".format(last))
    return last
由于彼岸图网的高清图片需要再进入一个网页才能拿到,所以先获取小图链接
首页的链接并不规则,须单独设置
def get_main_page():
    """Walk every list page and hand each thumbnail link to get_img_link.

    Page 1 lives at the site root; later pages follow the
    /index_N.html pattern. Returns the thumbnail-link list of the
    last page that was fetched successfully (empty if none were).
    """
    last = get_last_page()
    biglink_list = []  # pre-bound so the return below cannot hit NameError
    for page in range(1, int(last) + 1):
        print('\n第{0}页'.format(page))
        if page == 1:
            url = 'http://pic.netbian.com/'
        else:
            # fixed: original built a malformed double-slash URL (com//index_)
            url = 'http://pic.netbian.com/index_{0}.html'.format(page)
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = "GBK"
        if response.status_code == 200:
            html = etree.HTML(response.text)
            biglink_list = html.xpath('//div[@class="slist"]//li/a/@href')
            for link in biglink_list:
                # hrefs are site-relative, e.g. /tupian/1234.html
                get_img_link('http://pic.netbian.com/' + link)
    return biglink_list
进一步访问小图链接,解析高清图链并输出图片名称和链接
# Resolve one detail page into its HD image and download it.
def get_img_link(link):
    """Visit a detail page, extract the HD image URL and title,
    bump the global counter and download the file.

    Any error is printed and swallowed on purpose: one bad page must
    not abort the whole crawl.
    """
    global pic_sum
    try:
        # timeout added so a stalled connection cannot hang the crawl
        response = requests.get(link, headers=headers, timeout=10)
        response.encoding = "GBK"
        if response.status_code == 200:
            html = etree.HTML(response.text)
            img_list = html.xpath('//div[@class="photo-pic"]/a/img/@src')
            title_list = html.xpath('//div[@class="photo-pic"]/a/img/@title')
            for title, img in zip(title_list, img_list):
                pic_sum = pic_sum + 1
                print('{0}:{1}👉http://pic.netbian.com{2}'.format(pic_sum, title, img))
                download(title, img)
    except Exception as error:
        # deliberate best-effort boundary: log and keep crawling
        print(error)
创建一个目录存储图片
# Create the storage directory and switch into it.
def mkdir():
    """Create the download directory (from the global `path`) if needed.

    NOTE(review): `path` is itself 'D:/彼岸图网/', so the join of two
    absolute-looking paths is quirky but kept for compatibility.
    chdir only happens on first creation; download() uses the absolute
    `path` prefix, so the working directory does not actually matter.
    """
    global path
    target = os.path.join("D:/彼岸图网/", path)  # hoisted: was computed 3x
    if not os.path.exists(target):
        os.makedirs(target)
        os.chdir(target)
    else:
        print(path, '\t已存在')
保存图片
# Download one image.
def download(title, img):
    """Save the HD image under the global `path` as '<title>.jpg'.

    `img` is a site-relative URL such as /uploads/....jpg; an empty
    value means the detail page had no image, so do nothing.
    """
    if img:
        filename = title + '.jpg'
        # fixed: send the spoofed headers (the site checks Referer) and a
        # timeout; fetch BEFORE opening the file so a failed request does
        # not leave an empty .jpg behind
        response = requests.get('http://pic.netbian.com' + img,
                                headers=headers, timeout=30)
        with open(path + filename, 'wb') as file:
            file.write(response.content)
主函数
由于VSCode使用Ctrl+C中止程序会报KeyboardInterrupt错,捕获错误并输出
# Main entry point.
def main():
    """Set up the globals, create the target directory and run the crawl.

    Ctrl+C raises KeyboardInterrupt (noticeable when run from an editor);
    it is caught so the summary in `finally` still prints.
    """
    global pic_sum, path
    # fixed: these were assigned inside try, so an early failure made the
    # `finally` print crash with NameError
    start = time.time()
    pic_sum = 0
    path = 'D:/彼岸图网/'
    try:
        mkdir()
        get_main_page()
    except KeyboardInterrupt:
        # fixed broken format string: was "第张{0}图"
        print("\n非正常退出\n注:会造成第{0}张图下载失败".format(pic_sum))
    except Exception as error:
        print('\n发现了错误:{0}'.format(error))
    finally:
        # the site lists ~20 images per page, hence pic_sum / 20
        print("\n本次用时:{0:.2f}秒\n共爬取{1}页\n共{2}张图\n图片存储于{3}".format(
            (time.time() - start), pic_sum / 20, pic_sum, path))
完整代码:
import os
import time
import requests
from lxml import etree
# Browser-spoofing headers taken from devtools (F12); fill in before running.
headers = {field: '' for field in ('User-Agent', 'Referer', 'Cookie')}
# Get the total number of list pages on the site.
def get_last_page():
    """Fetch the home page and read the page count from the pager's
    "last page" link (the 10th <a> inside div.page).

    Returns the count as a string; '' if the pager node is missing.
    """
    # timeout added so a stalled connection cannot hang the crawl forever
    response = requests.get('http://pic.netbian.com/', headers=headers,
                            timeout=10)
    response.encoding = "GBK"  # the site serves GBK-encoded pages
    html = etree.HTML(response.text)
    last_page = html.xpath('//div[@class="page"]/a[10]/text()')
    # list -> string ('' when the node was not found)
    last = ''.join(last_page)
    print("本站一共有{0}页\n".format(last))
    return last
# Collect the thumbnail links from every list page.
def get_main_page():
    """Walk every list page and hand each thumbnail link to get_img_link.

    Page 1 lives at the site root; later pages are /index_N.html.
    Returns the link list of the last successfully fetched page.
    """
    last = get_last_page()
    biglink_list = []  # pre-bound so the return below cannot hit NameError
    for page in range(1, int(last) + 1):
        print('\n第{0}页'.format(page))
        if page == 1:
            url = 'http://pic.netbian.com/'
        else:
            # fixed: original built a malformed double-slash URL (com//index_)
            url = 'http://pic.netbian.com/index_{0}.html'.format(page)
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = "GBK"
        if response.status_code == 200:
            html = etree.HTML(response.text)
            biglink_list = html.xpath('//div[@class="slist"]//li/a/@href')
            for link in biglink_list:
                # hrefs are site-relative, e.g. /tupian/1234.html
                get_img_link('http://pic.netbian.com/' + link)
    return biglink_list
# Resolve one detail page into its HD image and download it.
def get_img_link(link):
    """Visit a detail page, extract the HD image URL and title,
    bump the global counter and download the file.

    Errors are printed and swallowed on purpose: one bad page must
    not abort the whole crawl.
    """
    global pic_sum
    try:
        # timeout added so a stalled connection cannot hang the crawl
        response = requests.get(link, headers=headers, timeout=10)
        response.encoding = "GBK"
        if response.status_code == 200:
            html = etree.HTML(response.text)
            img_list = html.xpath('//div[@class="photo-pic"]/a/img/@src')
            title_list = html.xpath('//div[@class="photo-pic"]/a/img/@title')
            for title, img in zip(title_list, img_list):
                pic_sum = pic_sum + 1
                print('{0}:{1}👉http://pic.netbian.com{2}'.format(pic_sum, title, img))
                download(title, img)
    except Exception as error:
        # deliberate best-effort boundary: log and keep crawling
        print(error)
# Create the storage directory and switch into it.
def mkdir():
    """Create the download directory (from the global `path`) if needed.

    NOTE(review): `path` is itself 'D:/彼岸图网/', so joining it onto the
    same prefix is quirky but kept for compatibility. chdir only happens
    on first creation; download() writes via the absolute `path` prefix,
    so the working directory does not actually matter.
    """
    global path
    target = os.path.join("D:/彼岸图网/", path)  # hoisted: was computed 3x
    if not os.path.exists(target):
        os.makedirs(target)
        os.chdir(target)
    else:
        print(path, '\t已存在')
# Download one image.
def download(title, img):
    """Save the HD image under the global `path` as '<title>.jpg'.

    `img` is a site-relative URL such as /uploads/....jpg; an empty
    value means the detail page had no image, so do nothing.
    """
    if img:
        filename = title + '.jpg'
        # fixed: send the spoofed headers (the site checks Referer) and a
        # timeout; fetch BEFORE opening the file so a failed request does
        # not leave an empty .jpg behind
        response = requests.get('http://pic.netbian.com' + img,
                                headers=headers, timeout=30)
        with open(path + filename, 'wb') as file:
            file.write(response.content)
# Main entry point.
def main():
    """Set up the globals, create the target directory and run the crawl.

    Ctrl+C raises KeyboardInterrupt (noticeable when run from an editor);
    it is caught so the summary in `finally` still prints.
    """
    global pic_sum, path
    # fixed: these were assigned inside try, so an early failure made the
    # `finally` print crash with NameError
    start = time.time()
    pic_sum = 0
    path = 'D:/彼岸图网/'
    try:
        mkdir()
        get_main_page()
    except KeyboardInterrupt:
        # fixed broken format string: was "第张{0}图"
        print("\n非正常退出\n注:会造成第{0}张图下载失败".format(pic_sum))
    except Exception as error:
        print('\n发现了错误:{0}'.format(error))
    finally:
        # the site lists ~20 images per page, hence pic_sum / 20
        print("\n本次用时:{0:.2f}秒\n共爬取{1}页\n共{2}张图\n图片存储于{3}".format(
            (time.time() - start), pic_sum / 20, pic_sum, path))
# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()
注:仅供参考学习