冰冻三尺,非一日之寒。数据解析——xpath(1)

xpath 的语法认识

闲话少说,直接上图
在这里插入图片描述

xpath 爬取图片

代码如下

import requests
from lxml import etree
import os

if __name__ == "__main__":
    # Download thumbnail images from the first two pages of the
    # qiushibaike image-ranking board into ./糗图(xpath)/.
    save_dir = "./糗图(xpath)"
    # makedirs(exist_ok=True) is race-free, unlike exists() + mkdir().
    os.makedirs(save_dir, exist_ok=True)

    # Spoof a browser User-Agent so the site does not reject the request.
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"
        }

    for index in range(1, 3):  # crawl pages 1 and 2
        # URL of one listing page.
        url = f"https://www.qiushibaike.com/imgrank/page/{index}/"
        # Fetch the listing page HTML.
        response = requests.get(url=url, headers=header).text
        # Parse with lxml and collect every thumbnail's src attribute.
        tree = etree.HTML(response)
        src_list = tree.xpath('//div[@class="thumb"]/a/img/@src')

        for src in src_list:
            # Use the last path segment of the src as the local file name.
            title = src.split("/")[-1]
            # src is protocol-relative ("//host/..."), so prepend the scheme.
            new_url = "https:" + src
            # Download the raw image bytes.
            photo = requests.get(url=new_url, headers=header).content
            # Write the image to disk in binary mode.
            with open(os.path.join(save_dir, title), "wb") as fp:
                fp.write(photo)
                print(title, "下载成功!!!")
    print("over!!!")

xpath 爬取三国演义

import requests
from lxml import etree

if __name__ == "__main__":
    # Scrape every chapter of "三国演义" (Romance of the Three Kingdoms)
    # from sanguo.5000yan.com and save the full text to one file.

    # Spoof a browser User-Agent so the site does not reject the request.
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"
        }

    # Table-of-contents page listing every chapter.
    url = "http://sanguo.5000yan.com/"

    # Fetch the TOC and force UTF-8 so Chinese text is not garbled.
    response = requests.get(url=url, headers=header)
    response.encoding = "utf-8"

    # Parse chapter links and their visible titles from the TOC.
    tree = etree.HTML(response.text)
    src_list = tree.xpath('//div[@class="sidamingzhu-list-mulu"]//li[@class="menu-item"]/a/@href')
    title_list = tree.xpath('//div[@class="sidamingzhu-list-mulu"]//li[@class="menu-item"]/a/text()')

    # Write next to the working directory instead of a machine-specific
    # absolute path, and open the file once ("w") rather than re-opening
    # in append mode per chapter — repeated runs no longer duplicate text.
    with open("./三国演义.txt", "w", encoding="utf-8") as fp:
        # Each chapter link is paired with its title.
        for new_url, title in zip(src_list, title_list):
            # Fetch one chapter page, again forcing UTF-8.
            page_resp = requests.get(url=new_url, headers=header)
            page_resp.encoding = "utf-8"

            # Extract every text node of the chapter body.
            new_tree = etree.HTML(page_resp.text)
            page = new_tree.xpath('//section[@class="section-body"]//div[@class="grap"]//text()')

            # Chapter heading followed by the joined body text
            # (single join instead of one write() per text node).
            fp.write("\n\n" + title + "\n\n")
            fp.write("".join(page))
            print(title, "下载成功!!!")

    print("over!!!")
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值