xpath 爬取某网站图片

Z0o1010

已于 2022-06-13 21:04:56 修改

阅读量10w+

点赞数 8

分类专栏： Spider_Practise 文章标签： python

于 2020-10-27 22:33:21 首次发布

本文链接：https://blog.csdn.net/weixin_48732879/article/details/109322565

版权

Spider_Practise 专栏收录该内容

8 篇文章 3 订阅

订阅专栏

效果图请自行脑补，此处不做展示（狗头保命）

from lxml import etree
import requests
import os

if __name__ == '__main__':
    # Scrape every listing page of the gallery and store each thumbnail in
    # ./picLibs, named after the image's alt text.

    # Imported here so this script section brings its own dependency.
    from urllib.parse import urljoin

    url = "http://pic.netbian.com/4kmeinv/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
                      "85.0.4183.83 Safari/537.36"
    }

    # exist_ok makes this a no-op when the folder is already there.
    os.makedirs("./picLibs", exist_ok=True)

    # Page 1 is the category root; pages 2..171 are index_{n}.html below it.
    page_urls = [url]
    page_urls += [f"http://pic.netbian.com/4kmeinv/index_{x}.html" for x in range(2, 172)]

    for page_url in page_urls:
        response = requests.get(url=page_url, headers=headers)
        # Decode with the encoding detected from the body itself.
        response.encoding = response.apparent_encoding
        tree = etree.HTML(response.text)

        for li in tree.xpath("//div[@class='slist']/ul/li"):
            src_values = li.xpath("./a/img/@src")
            alt_values = li.xpath("./a/img/@alt")
            if not src_values or not alt_values:
                # No thumbnail or no caption: nothing to fetch or to name.
                continue

            # Resolve the src against the page address. The previous code
            # glued it onto ".../index_{x}.html" for pages 2+, which never
            # produced a valid image URL.
            img_src = urljoin(page_url, src_values[0])
            img_name = alt_values[0] + ".jpg"

            img_data = requests.get(url=img_src, headers=headers).content
            img_path = "picLibs/" + img_name

            with open(img_path, "wb") as fp:
                fp.write(img_data)
            print(img_name + "下载成功")

优化后（可能并没有优化）

from lxml import etree
import requests
import os


def get(url, headers, timeout=30):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address to request.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Use the encoding detected from the body rather than the declared one.
    response.encoding = response.apparent_encoding
    return response.text


def parse(url, headers):
    """Download a listing page and return its gallery ``<li>`` elements.

    Args:
        url: Address of the listing page.
        headers: HTTP headers forwarded to ``get``.

    Returns:
        The list of elements matched by ``//div[@class='slist']/ul/li``.
    """
    html_text = get(url=url, headers=headers)
    document = etree.HTML(html_text)
    return document.xpath("//div[@class='slist']/ul/li")


def save(url):
    """Download every image of the current listing page into ``小姐姐图片/``.

    Reads the module-level ``li_list`` (the ``<li>`` elements to process) and
    ``headers`` (HTTP headers), which the script's main section assigns
    before calling this function.

    Args:
        url: Address of the listing page ``li_list`` was parsed from; image
            paths found in the page are resolved against it.
    """
    # Function-scope import so the file's import block stays untouched.
    from urllib.parse import urljoin

    for li in li_list:
        src_values = li.xpath("./a/img/@src")
        alt_values = li.xpath("./a/img/@alt")
        if not src_values or not alt_values:
            # No thumbnail or no caption: nothing to fetch or to name.
            continue

        # Resolve the src against the page address. Plain concatenation
        # appended the image path after ".../4kmeinv/" or
        # ".../index_N.html", which is not where the image lives.
        img_src = urljoin(url, src_values[0])
        img_name = alt_values[0] + ".jpg"
        img_data = requests.get(url=img_src, headers=headers).content
        img_path = "小姐姐图片/" + img_name

        with open(img_path, "wb") as fp:
            fp.write(img_data)
        print(img_name + "下载成功")


if __name__ == '__main__':
    # Walk all listing pages of the gallery and hand each one to save().
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
                      "85.0.4183.83 Safari/537.36"
    }

    # exist_ok replaces the separate exists()/mkdir() pair and is a no-op
    # when the folder is already present.
    os.makedirs("./小姐姐图片", exist_ok=True)

    # Page 1 is the category root; pages 2..171 are index_{n}.html below it.
    page_urls = ["http://pic.netbian.com/4kmeinv/"]
    page_urls += [f"http://pic.netbian.com/4kmeinv/index_{x}.html" for x in range(2, 172)]

    for url in page_urls:
        # save() receives only the page URL; it reads li_list and headers
        # from module scope, so li_list must be rebound before each call.
        li_list = parse(url=url, headers=headers)
        save(url)

Z0o1010

关注

8
点赞
踩
32

收藏

觉得还不错? 一键收藏
4
评论
xpath 爬取某网站图片

from lxml import etree import requests import os if __name__ == '__main__': url = "http://pic.netbian.com/4kmeinv/" headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/" …
复制链接

扫一扫