xpath的使用

最新推荐文章于 2023-07-05 19:50:36 发布

Tomatosky

最新推荐文章于 2023-07-05 19:50:36 发布

阅读量272

点赞数

分类专栏： python

python 专栏收录该内容

12 篇文章 0 订阅

订阅专栏

首先下载lxml

pip install lxml

使用的时候导入模块

from lxml import etree

>>> def getxpath(html):           #返回html的xml结构
    return etree.HTML(html)
>>> sample="""<html>
              <head>
                <title>My page</title>
              </head>
              <body>
                <h2>Welcome to my <a href="#" src="x">page</a></h2>
                <p>This is the first paragraph.</p>
                <!-- this is the end -->
              </body>
            </html>
            """
>>> s1=getxpath(sample)
>>> s1.xpath('//title/text()')          #相对路径：//表示从任意层级查找title
['My page']
>>> s1.xpath('/html/head/title/text()')         #绝对路径：从根节点html逐级定位
['My page']
>>> s1.xpath('//h2/a/@src')          #获取属性src
['x']
>>> s1.xpath('//@href')             #获取属性href
['#']
>>> s1.xpath('//text()')             #取出所有文本内容
['\n  ', '\n    ', 'My page', '\n  ', '\n  ', '\n    ', 'Welcome to my ', 'page', '\n    ', 'This is the first paragraph.', '\n    ', '\n  ', '\n']
>>> s1.xpath('//comment()')            #获取注释
[<!-- this is the end -->]

获取文本内容用text()，获取注释用comment()，获取属性值用@属性名（例如@href、@src）。

>>> sample2="""
<html>
<body>
<ul>
<li>Quote 1</li>
<li>Quote 2 with <a href="...">link</a></li>
<li>Quote 3 with <a href="...">another link</a></li>
<li><h2>Quote 4 title</h2> ... </li>
</ul>
</body>
</html>
"""
>>> s2=getxpath(sample2)      
>>> s2.xpath('//li/text()')           #获取li标签下的内容
['Quote 1', 'Quote 2 with ', 'Quote 3 with ', ' ... ']
>>> s2.xpath('//li[position()=1]/text()')         #第一个li的内容
['Quote 1']
>>> s2.xpath('//li[1]/text()')          #两种获取方式
['Quote 1']
>>> s2.xpath('//li[position()=2]/text()')
['Quote 2 with ']
>>> s2.xpath('//li[position() mod 2=1]/text()')   #获取所有奇数位li标签的内容
['Quote 1', 'Quote 3 with ']
>>> s2.xpath('//li[position() mod 2=0]/text()')    #偶数位li
['Quote 2 with ', ' ... ']
>>> s2.xpath('//li[-1]/text()')   #这个是错误的
[]
>>> s2.xpath('//li[last()]/text()')    #这个才是正确的获取最后一个li内容
[' ... ']
>>> s2.xpath('//li[a]/text()')         #获取含有a子标签的li的内容
['Quote 2 with ', 'Quote 3 with ']
>>> s2.xpath('//li[a or h2]/text()')    #获取li下面有a或者h2的内容
['Quote 2 with ', 'Quote 3 with ', ' ... ']
>>> s2.xpath('//a/text()|//h2/text()')     #获取所有a和h2的内容
['link', 'another link', 'Quote 4 title']

XPath的位置从1开始，第一个是[1]；最后一个要用[last()]，写成[-1]不会报错，但只会得到空列表。

>>> sample3 = """<html>
  <body>
    <ul>
      <li id="begin"><a href="https://scrapy.org">Scrapy</a>begin</li>
      <li><a href="https://scrapinghub.com">Scrapinghub</a></li>
      <li><a href="https://blog.scrapinghub.com">Scrapinghub Blog</a></li>
      <li id="end"><a href="http://quotes.toscrape.com">Quotes To Scrape</a>end</li>
      <li data-xxxx="end" abc="abc"><a href="http://quotes.toscrape.com">Quotes To Scrape</a>end</li>
    </ul>
  </body>
</html>
"""
>>> s3=getxpath(sample3)
>>> s3.xpath('//li/a[@href="https://scrapy.org"]/text()')   
['Scrapy']
>>> s3.xpath('//li[@id="begin"]/text()')
['begin']
>>> s3.xpath('//li/a[text()="Scrapinghub"]/text()')
['Scrapinghub']
>>> s3.xpath('//li[@data-xxxx="end"]/text()')   #根据自定义属性data-xxxx定位到对应的标签
['end']
>>> s3.xpath('//li[@abc="abc"]/text()')
['end']

可以根据属性或者文本直接定位到当前标签。

>>> sample4 = u"""
<html>
  <head>
    <title>My page</title>
  </head>
  <body>
    <h2>Welcome to my <a href="#" src="x">page</a></h2>
    <p>This is the first paragraph.</p>
    <p class="test">
    编程语言<a href="#">python</a>
    <img src="#" alt="test"/>javascript
    <a href="#"><strong>C#</strong>JAVA</a>
    </p>
    <p class="content-a">a</p>
    <p class="content-b">b</p>
    <p class="content-c">c</p>
    <p class="content-d">d</p>
    <p class="econtent-e">e</p>
    <p class="heh">f</p>
    <!-- this is the end -->
  </body>
</html>
"""
>>> s4=etree.HTML(sample4)
>>> s4.xpath('//p/text()')
['This is the first paragraph.', '\n    编程语言', '\n    ', 'javascript\n    ', '\n    ', 'a', 'b', 'c', 'd', 'e', 'f']
>>> s4.xpath('string(//p[@class="test"])').strip()           #获取p标签下的所有文本
'编程语言python\n    javascript\n    C#JAVA'
>>> s4.xpath('//p[starts-with(@class,"content")]/text()')     #获取class以content开头的p标签的文本
['a', 'b', 'c', 'd']
>>> s4.xpath(('//p[contains(@class,"content")]/text()'))     #获取class中包含content的p标签的文本（econtent-e也会匹配）
['a', 'b', 'c', 'd', 'e']