from lxml import etree
data_str = """
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
"""
# NOTE: the sample markup above is deliberately malformed -- the last <li>
# has no closing tag.

# etree.HTML accepts a str (or bytes) and returns an Element object, which
# is what exposes the .xpath() method used below. Parse first, query after.
html = etree.HTML(data_str)
print(html)

# Serialize the parsed tree back to markup and print it. The output shows the
# document as lxml sees it (with the missing tag filled in), which is the
# structure the XPath expressions have to be written against.
result = etree.tostring(html).decode('utf-8')
print(result)

# Collect the href attribute of every <a> directly under an <li> whose class
# is exactly "item-0". xpath() always returns a list of all matches.
result = html.xpath('//li[@class="item-0"]/a/@href')
print(result)
#xpath练习二
from lxml import etree
import requests
# Author's notes on fetching the page (observations, not verified here):
#   - HTTPS involves CA certificate verification.
#   - HTTPS requests reportedly need extra request headers, otherwise the
#     site's anti-scraping measures reject them.
#   - Plain HTTP reportedly is not subject to that anti-scraping check.
# The URL must be a single-line literal: a plain "..." string cannot span
# lines, and stray newlines would also corrupt the request target.
url = "http://www.baidu.com/s?wd=python"
# Always bound the wait: without a timeout requests can block indefinitely
# if the server never responds.
reponse = requests.get(url, timeout=10)
# Debug aid: print(reponse.content.decode('utf-8')) to inspect the raw page.

# Convert the response body into an lxml Element; XPath queries are only
# available on that parsed type, not on the raw text.
heml = etree.HTML(reponse.content.decode('utf-8'))
# NOTE: `url` is rebound here from the request URL to the list of matching
# href values (the link inside the <h3> of the element whose id is "1").
url = heml.xpath('//*[@id ="1"]/h3/a/@href')
print(url)