XPath（XML路径语言）

最新推荐文章于 2024-01-05 16:35:50 发布

scralet-moon

最新推荐文章于 2024-01-05 16:35:50 发布

阅读量1.1k

点赞数

分类专栏： python 文章标签：爬虫

本文链接：https://blog.csdn.net/weixin_43682329/article/details/100554652

版权

python 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

XPath（XML路径语言）

XPath 是一门在 XML 文档中查找信息的语言，可用来在 XML 文档中对元素和属性进行遍历。

W3School官方文档：http://www.w3school.com.cn/xpath/index.asp

XPath开发工具

开源的XPath表达式编辑工具：XMLQuire（XML格式文件可用）

Chrome插件XPath Helper

Firefox插件XPath Checker

XPath 语法

| 表达式 | 描述 | 用法 | 说明 |
| --- | --- | --- | --- |
| nodename | 选取当前节点下名为 nodename 的所有子节点。 | xpath('span') | 选取当前节点下的所有 span 子元素 |
| / | 从根节点选取。 | xpath('/div') | 从根节点上选取 div 节点 |
| // | 从匹配选择的当前节点选择文档中的节点，而不考虑它们的位置。 | xpath('//div') | 选取文档中所有的 div 节点，无论其位于何处 |
| . | 选取当前节点。 | xpath('./div') | 选取当前节点下的 div 标签 |
| .. | 选取当前节点的父节点。 | xpath('..') | 回到上一级节点 |
| @ | 选取属性。 | xpath("//div[@id='1001']") | 选取 id 属性值为 1001 的 div 标签 |

XPath的常见用法大全

    from lxml import etree

    # Sample bookstore document: four <book> elements carrying different
    # attributes (price, category, cover) for the XPath queries below to match.
    html = '''
    <bookstore> 
    <book price="100" category="cooking"> 
        <title lang="en">Everyday Italian</title>  
        <author>Giada De Laurentiis</author>  
        <year>2005</year>  
        <price>30.00</price> 
    </book>  

    <book category="children"> 
        <title lang="en">Harry Potter</title>  
        <author>J K. Rowling</author>  
        <year>2005</year>  
        <price>29.99</price> 
    </book>  

    <book category="web"> 
        <title category="web">XQuery Kick Start</title>  
        <author>James McGovern</author>  
        <author>Per Bothner</author>  
        <author>Kurt Cagle</author>  
        <author>James Linn</author>  
        <author>Vaidyanathan Nagarajan</author>  
        <year>2003</year>  
        <price>49.99</price> 
    </book> 

    <book category="web" cover="paperback"> 
        <title>Learning XML</title>  
        <author>Erik T. Ray</author>  
        <year>2003</year>  
        <price>39.95</price> 
    </book> 

    </bookstore>

    '''
    # etree.HTML() builds an XPath-capable tree from the string and repairs the
    # markup automatically; etree.tostring() outputs the repaired result as bytes.
    # To load from a file instead: html = etree.parse('temp.html')
    html = etree.HTML(html)

    # Build an XPath rule and extract data.  Exactly one query is left active so
    # that `res` is always defined when it is printed; swap in any of the
    # commented alternatives to try a different expression.
    res = html.xpath('//bookstore/book/title/text()')  # text of every book title
    # res = html.xpath('//book/@cover | //book/@category')  # union of two attribute selections
    # res = html.xpath('//bookstore/book[1]/price/text()')  # price text of the first book
    # res = html.xpath('//bookstore/book[position()<2]')  # the first book; position() is the index and starts at 1
    # res = html.xpath('//title[@lang]')  # titles that have a lang attribute
    # res = html.xpath('//title[@lang="en"]/text()')  # text of titles whose lang is "en"
    # res = html.xpath('//bookstore/book[price>35.00]/title/text()')  # titles of books priced above 35.00
    # res = html.xpath('//bookstore/*')  # all child elements of bookstore
    # res = html.xpath('//bookstore//*')  # all descendant elements of bookstore
    # res = html.xpath('//title[@*]')  # titles that have at least one attribute
    # res = html.xpath('//book/title | //book/price')  # union of book titles and prices
    # res = html.xpath('//*[@category="web"]')  # any element whose category is "web"

    print(res)

58房源案例1

    from lxml import etree
    import requests

    # Listing-page demo: download one 58.com rental listing page and print each
    # listing's title together with the text of its first matching <p>.
    base_url = 'http://bj.58.com/chuzu/?utm_source=market&spm=b-31580022738699-me-f-862.mingzhan&PGTID=0d100000-0000-17cd-3f99-94d590fc655b&ClickID=1'
    # A timeout keeps the script from blocking forever on an unresponsive server.
    response = requests.get(base_url, timeout=10)

    html = etree.HTML(response.text)

    # Every listing is an <li> under the <ul class="listUl"> container.
    li_list = html.xpath('//ul[@class="listUl"]/li')
    for li in li_list:
        # Entries without a title link are skipped.
        title = li.xpath('.//h2/a/text()')
        if not title:
            continue
        title = title[0].strip()

        # The <p> text may be missing; fall back to an empty string instead of
        # raising IndexError on the [0] lookup.
        square = li.xpath('.//p[1]/text()')
        square = square[0].replace(' ', '').replace('\xa0', '') if square else ''
        print(title, square)

58房源案例2

    from lxml import etree
    import requests
    import json

    # 详情页请求
    def get_detail(url,f):
        """Download one listing detail page and append its fields to *f*.

        Args:
            url: Address of the detail page to request.
            f: Writable text file object; one JSON object is written per line.

        Responses whose status is outside 200-299 are ignored.  If a required
        field cannot be found on the page, the error and the URL are printed
        and the listing is skipped.
        """
        # A timeout keeps the crawl from blocking forever on one slow page.
        response = requests.get(url, timeout=10)
        # Only 2xx counts as success (300 is already a redirect status), and
        # the body is parsed only after the status has been checked.
        if not 200 <= response.status_code < 300:
            return

        html = etree.HTML(response.text)
        try:
            title = html.xpath('//h1/text()')[0]
            price = html.xpath('//span[@class="c_ff552e"]/b/text()')[0]

            # This span is optional on the page; '无' marks it as absent.
            margin = html.xpath('//span[@class="c_333"]/text()')
            margin = margin[0] if margin else '无'

            rent_type = html.xpath('//ul[@class="f14"]/li[1]/span[2]/text()')[0]
            house_type = html.xpath('//ul[@class="f14"]/li[2]/span[2]/text()')[0]
            direction = html.xpath('//ul[@class="f14"]/li[3]/span[2]/text()')[0]
            host = html.xpath('//ul[@class="f14"]/li[4]/span[2]/a/text()')[0]

            # The area is spread over several links, e.g. ['昌平','立水桥'].
            area = ''.join(html.xpath('//ul[@class="f14"]/li[5]/span[2]/a/text()'))

            addr = html.xpath('//ul[@class="f14"]/li[6]/span[2]/text()')[0]
        except IndexError as e:
            # A [0] lookup found nothing: report the page and skip it rather
            # than terminating the whole program from inside a helper.
            print(e)
            print(url)
            return

        raw = {
            'title': title,
            'price': price,
            'margin': margin,
            'rent_type': rent_type,
            'house_type': house_type,
            'direction': direction,
            'host': host,
            'area': area,
            'addr': addr,
        }
        # Drop ordinary and non-breaking spaces, then any remaining outer whitespace.
        data = {
            key: value.replace(' ', '').replace('\xa0', '').strip()
            for key, value in raw.items()
        }

        # Save the record to the file, one JSON document per line.
        print(data['title'])
        f.write(json.dumps(data,ensure_ascii=False) + '\n')


    def getPage():
        """Crawl one 58.com rental listing page and save every listing's details.

        Each listing's detail link is handed to ``get_detail`` together with
        the module-level file object ``f``, which must already be open when
        this function is called.
        """
        base_url = 'http://bj.58.com/chuzu/pn1/'
        # A timeout keeps the crawl from blocking forever on an unresponsive server.
        response = requests.get(base_url, timeout=10)

        html = etree.HTML(response.text)

        # Every listing is an <li> under the <ul class="listUl"> container.
        for li in html.xpath('//ul[@class="listUl"]/li'):
            # Entries without a detail link are skipped.
            detail_url = li.xpath('.//h2/a/@href')
            if not detail_url:
                continue
            # Request the detail page and persist its fields.
            get_detail(detail_url[0], f)

    if __name__ == '__main__':
        # The with-block guarantees house.json is flushed and closed even when
        # the crawl raises; `f` is still bound at module level, which is where
        # getPage() looks it up.
        with open('house.json','w', encoding='utf-8') as f:
            getPage()

补充

    from lxml import etree

    # Markup fragment with irregular <a> tags: several of them carry a stray
    # "/" before the closing bracket and are still followed by </a>.
    html = '''
        <div class="php_zuopin_fenlei" style="height:70px;">
            <span style="line-height:60px;">>&nbsp;&nbsp;按学科：</span>
            <div style="margin-left:65px;color:gray;font-size:12px;">
                <a href="http://www.itxdl.cn/html/php/phparticles/" title="网络培训">PHP</a>
                <a href="http://www.itxdl.cn/html/java/javaarticles/" title="特服培训"/>Java</a>
                <a href="http://www.itxdl.cn/html/ui/uiuearticles/" title="散打培训"/>UI</a>
                <a href="http://www.itxdl.cn/html/h5/HTML5articles/" title="赛车培训"/>Html5</a>
                <a href="http://www.itxdl.cn/html/linux/linuxartices/" title="Linux培训"/>Linux</a>
            </div>
        </div>
    '''

    # Parse the fragment with an explicitly supplied (custom) parser.
    html_parser = etree.HTMLParser()
    html = etree.HTML(html, parser=html_parser)

    # To inspect the markup after parsing (note the irregular <a> tags), run:
    #   print(etree.tostring(html).decode('utf-8'))

    # Collect the text of every <a> inside the category <div> and show it.
    link_texts = html.xpath('//div[@class="php_zuopin_fenlei"]//a/text()')
    print(link_texts)