爬虫入门09——XPath数据解析方法
(1)方法一:直接在python代码中解析html字符串
from lxml import etree
# Sample HTML fragment used as parser input.
# NOTE(review): the final <li> below has no closing </li>; this looks
# deliberate, to show the HTML parser coping with broken markup -- confirm.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">forth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
# Parse the HTML string with etree.HTML; the parsed result is bound to `html`.
html = etree.HTML(text)
(2)方法二:读取一个html文件并解析
from lxml import etree
# Method 2: read an HTML file from disk and parse it.
#
# NOTE(review): test.html presumably holds the same markup that method 1
# parsed from a string -- confirm against the actual file:
#   <div>
#    <ul>
#    <li class="item-0"><a href="link1.html">first item</a></li>
#    <li class="item-1"><a href="link2.html">second item</a></li>
#    <li class="item-inactive"><a href="link3.html">third item</a></li>
#    <li class="item-1"><a href="link4.html">forth item</a></li>
#    <li class="item-0"><a href="link5.html">fifth item</a>
#    </ul>
#   </div>

# An HTML parser is passed explicitly so the file is read as HTML.
html_parser = etree.HTMLParser()
html = etree.parse('C:/Users/PC/Desktop/test.html', html_parser)

# Serialize the parsed document, then show it twice: first the value returned
# by tostring() as-is, then that value decoded as UTF-8 text.
result = etree.tostring(html)
print(result)
decoded = result.decode('utf-8')
print(decoded)
(3)部分操作
from lxml import etree
# Part 3: common XPath queries against the parsed document.
#
# NOTE(review): test.html presumably holds the markup below (the expected
# outputs noted on each query are consistent with it) -- confirm:
#   <div>
#    <ul>
#    <li class="item-0"><a href="link1.html">first item</a></li>
#    <li class="item-1"><a href="link2.html">second item</a></li>
#    <li class="item-inactive"><a href="link3.html">third item</a></li>
#    <li class="item-1"><a href="link4.html">forth item</a></li>
#    <li class="item-0"><a href="link5.html">fifth item</a>
#    </ul>
#   </div>

# Read the HTML file and parse it with an explicit HTML parser.
html = etree.parse('C:/Users/PC/Desktop/test.html', etree.HTMLParser())

# Text of every <a> that is a direct child of an <li>, anywhere in the page.
# Expected: ['first item', 'second item', 'third item', 'forth item', 'fifth item']
result1 = html.xpath('//li/a/text()')
print(result1)

# Text of the link inside the <li> whose class is "item-inactive".
# Expected: ['third item']
result2 = html.xpath('//li[@class="item-inactive"]/a/text()')
print(result2)

# Value of the href attribute of that same link (the value, not the name).
# Expected: ['link3.html']
result3 = html.xpath('//li[@class="item-inactive"]/a/@href')
print(result3)

# Pair each matching link's text with its href.
# Both values are read from the same <a> element instead of zipping result2
# with result3: those lists come from independent queries, so a link that
# lacks text or an href would silently shift the pairing.
# Expected: [('third item', 'link3.html')]
inactive_links = html.xpath('//li[@class="item-inactive"]/a')
print([(link.text, link.get('href')) for link in inactive_links])

# XPath quick reference:
#   /        selects direct children of the current node
#   //       selects descendants at any depth (not only children/grandchildren)
#   text()   selects the text of a node
#   @attr    selects the value of the attribute named attr