# encoding=utf-8
"""Examples of two special XPath usages with lxml.

1. ``starts-with(@attr, prefix)`` selects every element whose attribute
   value begins with a common prefix.
2. ``string(.)`` collects the text of an element together with the text of
   all tags nested inside it, whereas ``text()`` only yields the element's
   own direct text nodes.

The example queries at the bottom of the file are commented out; uncomment
the one you want to run.
"""
from lxml import etree

# Fixture for the starts-with() example: all three div ids ("test-1",
# "test-2", "testfault") begin with "test".
html1 = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title></title>
</head>
<body>
    <div id="test-1">需要内容1</div>
    <div id="test-2">需要内容2</div>
    <div id="testfault">需要内容3</div>
</body>
'''

# Fixture for the nested-tag examples: the div's text is interleaved with
# <span>, <ul> and <li> children that carry text of their own.
html2 = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title></title>
</head>
<body>
    <div id="test3">
        我左青龙,
        <span id="tiger">
            右白虎,
            <ul>上朱雀,
                <li>下玄武.</li>
            </ul>
            老牛在当中,
        </span>
        龙头在胸口.
    </div>
</body>
</html>
'''

# Example 1: starts-with(@attribute_name, shared_prefix_of_the_value)
# selector = etree.HTML(html1)
# content = selector.xpath('//div[starts-with(@id,"test")]/text()')
# for each in content:
#     print(each)

# Example 2: tags nested inside a tag. text() returns only the div's own
# text nodes; it cannot extract the content inside the nested tags.
# selector = etree.HTML(html2)
# content_1 = selector.xpath('//div[@id="test3"]/text()')
# for each in content_1:
#     print(each)

# Example 3: tags nested inside a tag, extracting everything with string(.).
# Reuses the `selector` built from html2 in example 2, so enable that line too.
# data = selector.xpath('//div[@id="test3"]')[0]
# info = data.xpath('string(.)')
# Strip newlines and spaces so the collected text prints on one line.
# content_2 = info.replace('\n','').replace(' ','')
# print(content_2)
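# XPath特殊应用 (article title: "Special uses of XPath")
# 最新推荐文章于 2022-01-29 12:16:53 发布 (page footer: "latest recommended article published 2022-01-29 12:16:53")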