使用 XPath 爬取某网站图片

效果图请自行脑补,此处不做展示(狗头保命)

from lxml import etree
import requests
import os

if __name__ == '__main__':
    # Scrape every listing page of the category and store each thumbnail in
    # ./picLibs, named after the image's alt text.

    # Image paths taken from <img src> are appended to the site root, not to
    # the listing-page URL (appending to ".../index_N.html" gives a bad URL).
    base_url = "http://pic.netbian.com"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
                      "85.0.4183.83 Safari/537.36"
    }

    # Page 1 is the bare category URL; pages 2..171 use the index_N.html form.
    page_urls = ["http://pic.netbian.com/4kmeinv/"]
    page_urls += [f"http://pic.netbian.com/4kmeinv/index_{x}.html" for x in range(2, 172)]

    # Create the output directory up front; no error if it already exists.
    os.makedirs("./picLibs", exist_ok=True)

    for page_url in page_urls:
        response = requests.get(url=page_url, headers=headers)
        # Let requests guess the charset from the body before decoding it.
        response.encoding = response.apparent_encoding
        tree = etree.HTML(response.text)

        for li in tree.xpath("//div[@class='slist']/ul/li"):
            src = li.xpath("./a/img/@src")
            alt = li.xpath("./a/img/@alt")
            if not src or not alt:
                # List items without an image or alt text cannot be saved.
                continue

            img_src = base_url + src[0]
            img_name = alt[0] + ".jpg"

            img_data = requests.get(url=img_src, headers=headers).content
            img_path = "picLibs/" + img_name

            with open(img_path, "wb") as fp:
                fp.write(img_data)
            print(img_name + "下载成功")

优化后(可能并没有优化)

from lxml import etree
import requests
import os


def get(url, headers):
    """Fetch *url* with the given request headers and return the body as text.

    Before decoding, the response encoding is replaced with the one requests
    detects from the content (``apparent_encoding``).
    """
    resp = requests.get(url=url, headers=headers)
    detected = resp.apparent_encoding
    resp.encoding = detected
    html = resp.text
    return html


def parse(url, headers):
    """Download the page at *url* and return its image list items.

    The page text comes from ``get``; it is parsed with lxml's HTML parser and
    the nodes matching ``//div[@class='slist']/ul/li`` are returned as a list.
    """
    html = get(url=url, headers=headers)
    document = etree.HTML(html)
    return document.xpath("//div[@class='slist']/ul/li")


def save(url):
    """Download every image in the module-level ``li_list`` into ``小姐姐图片/``.

    Reads ``li_list`` (the parsed ``<li>`` nodes) and ``headers`` from module
    scope. Each image is saved under its ``alt`` text with a ``.jpg`` suffix.

    Args:
        url: URL of the listing page the items came from; each image ``src``
            is resolved against it.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    from urllib.parse import urljoin

    for li in li_list:
        # Resolve the src against the page URL instead of concatenating:
        # "<page url>" + "<src path>" is not a valid image address when the
        # page URL ends in a directory or an index_N.html file name.
        img_src = urljoin(url, li.xpath("./a/img/@src")[0])
        img_name = li.xpath("./a/img/@alt")[0] + ".jpg"
        img_data = requests.get(url=img_src, headers=headers).content
        img_path = "小姐姐图片/" + img_name

        with open(img_path, "wb") as fp:
            fp.write(img_data)
        print(img_name + "下载成功")


if __name__ == '__main__':
    # Browser-like User-Agent sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
                      "85.0.4183.83 Safari/537.36"
    }
    url = "http://pic.netbian.com/4kmeinv/"

    # save() reads li_list and headers from module scope, so both names must
    # be bound here before it is called.
    li_list = parse(url=url, headers=headers)

    # Make sure the output directory exists before the first download.
    if not os.path.exists("./小姐姐图片"):
        os.mkdir("./小姐姐图片")

    # First page: the bare category URL.
    save(url)

    # Remaining pages follow the index_N.html naming, N = 2..171.
    for page_no in range(2, 172):
        url = f"http://pic.netbian.com/4kmeinv/index_{page_no}.html"
        li_list = parse(url=url, headers=headers)
        save(url)
  • 8
    点赞
  • 32
    收藏
    觉得还不错? 一键收藏
  • 4
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值