XPath: Fetching HD Images from 彼岸图网

Target site: 彼岸图网 (pic.netbian.com)

First, import the packages we need.

import os
import time

import requests
from lxml import etree

Set up headers to disguise the request (grab the values from your browser's developer tools, F12):

headers = {
    'User-Agent': '',
    'Referer': '',
    'Cookie': ''
}
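For reference, a filled-in header set might look like the sketch below; the values are placeholders, not working credentials. Copy your own from the Network tab of the developer tools.

# Placeholder values for illustration -- substitute what your browser sends
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Referer': 'http://pic.netbian.com/',
    'Cookie': '<your session cookie here>'
}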

Get the total number of pages (it isn't fixed, so read it from the site):

# Get the total page count
def get_last_page():
    response = requests.get('http://pic.netbian.com/', headers=headers)
    response.encoding = "GBK"  # the site is GBK-encoded
    html = etree.HTML(response.text)
    # the 10th <a> in the pager is the last-page link
    last_page = html.xpath('//div[@class="page"]/a[10]/text()')
    last = ''.join(last_page)  # list -> string
    print("The site has {0} pages in total\n".format(last))
    return last
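The a[10] index assumes the pager always renders exactly ten links, which is fragile. As a sketch of a sturdier approach (a hypothetical variant, reusing the same headers), you could take the largest numeric page link instead:

# Sketch: pick the largest numeric page link rather than a fixed position
def get_last_page_robust():
    response = requests.get('http://pic.netbian.com/', headers=headers)
    response.encoding = "GBK"
    html = etree.HTML(response.text)
    texts = html.xpath('//div[@class="page"]/a/text()')
    pages = [int(t) for t in texts if t.strip().isdigit()]
    return max(pages) if pages else 1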

The HD image on 彼岸图网 only appears after opening each picture's own detail page, so first collect the detail-page links from the thumbnail list.

The first page's URL doesn't follow the index_{N}.html pattern, so it has to be handled separately.

def get_main_page():
    last = get_last_page()
    biglink_list = []  # initialized so the return never hits an unbound name
    for page in range(1, int(last) + 1):
        print('\nPage {0}'.format(page))
        if page == 1:
            url = 'http://pic.netbian.com/'
        else:
            url = 'http://pic.netbian.com/index_{0}.html'.format(page)
        response = requests.get(url, headers=headers)
        response.encoding = "GBK"

        if response.status_code == 200:
            html = etree.HTML(response.text)
            biglink_list = html.xpath('//div[@class="slist"]//li/a/@href')
            for link in biglink_list:
                # the hrefs already begin with '/', so no trailing slash here
                get_img_link('http://pic.netbian.com' + link)
    return biglink_list
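Note that the loop above fires requests back to back; crawling every page this way can get an IP throttled or banned. A minimal sketch of a hypothetical helper you could swap in for the requests.get calls to pace the crawl:

# Hypothetical helper: sleep before each fetch to pace the crawl
def polite_get(url, delay=1.0):
    time.sleep(delay)  # one second between requests is a gentle default
    return requests.get(url, headers=headers, timeout=10)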

Then visit each detail-page link, parse out the HD image URL, and print the image's title and link.

# Extract the HD image link from a detail page
def get_img_link(link):
    global pic_sum
    try:
        response = requests.get(link, headers=headers)
        response.encoding = "GBK"

        if response.status_code == 200:
            html = etree.HTML(response.text)
            img_list = html.xpath('//div[@class="photo-pic"]/a/img/@src')
            title_list = html.xpath('//div[@class="photo-pic"]/a/img/@title')

            for title, img in zip(title_list, img_list):
                pic_sum += 1
                print('{0}:{1}👉http://pic.netbian.com{2}'.format(pic_sum, title, img))
                download(title, img)
    except Exception as error:
        print(error)

Create a directory to store the images.

# Create the storage directory. (The author notes that with a relative path
# the first run raises an error while creating the directory and only the
# second run succeeds, so an absolute path is used here.)
def mkdir():
    global path
    # path is already absolute, so there is nothing to join onto it
    if not os.path.exists(path):
        os.makedirs(path)
    else:
        print(path, '\talready exists')
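On Python 3.2+ the check-then-create pattern collapses into a single call, which also sidesteps the race between the existence check and the creation:

os.makedirs(path, exist_ok=True)  # no error if the directory already exists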

Save the images to disk.

# Download one image: img is the src path extracted by XPath, so prepend the host
def download(title, img):
    if img:
        filename = title + '.jpg'
        with open(path + filename, 'wb') as file:
            file.write(requests.get('http://pic.netbian.com' + img, headers=headers).content)
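Scraped titles can contain characters that Windows forbids in file names (\ / : * ? " < > |), which would make open() fail. A hedged sketch of a sturdier variant that sanitizes the title first (hypothetical helper, same globals as above):

import re

# Sketch: replace forbidden file-name characters before saving
def safe_download(title, img):
    if not img:
        return
    filename = re.sub(r'[\\/:*?"<>|]', '_', title) + '.jpg'
    resp = requests.get('http://pic.netbian.com' + img, headers=headers, timeout=10)
    if resp.status_code == 200:
        with open(path + filename, 'wb') as file:
            file.write(resp.content)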

The main function

Since stopping the program with Ctrl+C in VS Code raises a KeyboardInterrupt, catch it and report how far the crawl got.

# Main function
def main():
    try:
        start = time.time()
        global pic_sum, path
        pic_sum = 0
        path = 'D:/彼岸图网/'
        mkdir()
        get_main_page()

    except KeyboardInterrupt:
        print("\nAborted early\nNote: image {0} may not have been fully downloaded".format(pic_sum))
    except Exception as error:
        print('\nError encountered: {0}'.format(error))
    finally:
        # the page estimate assumes 20 images per listing page, as the author does
        print("\nElapsed: {0:.2f}s\nPages crawled: {1}\nImages: {2}\nSaved to: {3}".format(
            time.time() - start, pic_sum / 20, pic_sum, path))
The complete code:

import os
import time

import requests
from lxml import etree

headers = {
    'User-Agent': '',
    'Referer': '',
    'Cookie': ''
}

# Get the total page count
def get_last_page():
    response = requests.get('http://pic.netbian.com/', headers=headers)
    response.encoding = "GBK"  # the site is GBK-encoded
    html = etree.HTML(response.text)
    # the 10th <a> in the pager is the last-page link
    last_page = html.xpath('//div[@class="page"]/a[10]/text()')
    last = ''.join(last_page)  # list -> string
    print("The site has {0} pages in total\n".format(last))
    return last

# Walk every listing page and collect detail-page links
def get_main_page():
    last = get_last_page()
    biglink_list = []  # initialized so the return never hits an unbound name
    for page in range(1, int(last) + 1):
        print('\nPage {0}'.format(page))
        if page == 1:
            url = 'http://pic.netbian.com/'
        else:
            url = 'http://pic.netbian.com/index_{0}.html'.format(page)
        response = requests.get(url, headers=headers)
        response.encoding = "GBK"

        if response.status_code == 200:
            html = etree.HTML(response.text)
            biglink_list = html.xpath('//div[@class="slist"]//li/a/@href')
            for link in biglink_list:
                # the hrefs already begin with '/', so no trailing slash here
                get_img_link('http://pic.netbian.com' + link)
    return biglink_list

# Extract the HD image link from a detail page
def get_img_link(link):
    global pic_sum
    try:
        response = requests.get(link, headers=headers)
        response.encoding = "GBK"

        if response.status_code == 200:
            html = etree.HTML(response.text)
            img_list = html.xpath('//div[@class="photo-pic"]/a/img/@src')
            title_list = html.xpath('//div[@class="photo-pic"]/a/img/@title')

            for title, img in zip(title_list, img_list):
                pic_sum += 1
                print('{0}:{1}👉http://pic.netbian.com{2}'.format(pic_sum, title, img))
                download(title, img)
    except Exception as error:
        print(error)


# Create the storage directory. (The author notes that with a relative path
# the first run raises an error while creating the directory and only the
# second run succeeds, so an absolute path is used here.)
def mkdir():
    global path
    # path is already absolute, so there is nothing to join onto it
    if not os.path.exists(path):
        os.makedirs(path)
    else:
        print(path, '\talready exists')

# Download one image
def download(title, img):
    if img:
        filename = title + '.jpg'
        with open(path + filename, 'wb') as file:
            # img is just the src string extracted by XPath, not a response
            # object, so fetch the full URL and write the response body
            file.write(requests.get('http://pic.netbian.com' + img, headers=headers).content)

# Main function
def main():
    try:
        start = time.time()
        global pic_sum, path
        pic_sum = 0
        path = 'D:/彼岸图网/'
        mkdir()
        get_main_page()

    except KeyboardInterrupt:
        print("\nAborted early\nNote: image {0} may not have been fully downloaded".format(pic_sum))
    except Exception as error:
        print('\nError encountered: {0}'.format(error))
    finally:
        # the page estimate assumes 20 images per listing page, as the author does
        print("\nElapsed: {0:.2f}s\nPages crawled: {1}\nImages: {2}\nSaved to: {3}".format(
            time.time() - start, pic_sum / 20, pic_sum, path))



if __name__ == "__main__":
    main()

Note: for learning and reference only.
